diff options
Diffstat (limited to 'pkg/sentry/syscalls')
45 files changed, 10998 insertions, 0 deletions
diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD new file mode 100644 index 000000000..d667b42c8 --- /dev/null +++ b/pkg/sentry/syscalls/BUILD @@ -0,0 +1,43 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "syscalls", + srcs = [ + "epoll.go", + "polling.go", + "syscalls.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls", + visibility = ["//:sandbox"], + deps = [ + ":unimplemented_syscall_go_proto", + "//pkg/abi/linux", + "//pkg/eventchannel", + "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +proto_library( + name = "unimplemented_syscall_proto", + srcs = ["unimplemented_syscall.proto"], + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_proto"], +) + +go_proto_library( + name = "unimplemented_syscall_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto", + proto = ":unimplemented_syscall_proto", + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_go_proto"], +) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go new file mode 100644 index 000000000..01dd6fa71 --- /dev/null +++ b/pkg/sentry/syscalls/epoll.go @@ -0,0 +1,174 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package syscalls + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// CreateEpoll implements the epoll_create(2) linux syscall. +func CreateEpoll(t *kernel.Task, closeOnExec bool) (kdefs.FD, error) { + file := epoll.NewEventPoll(t) + defer file.DecRef() + + flags := kernel.FDFlags{ + CloseOnExec: closeOnExec, + } + fd, err := t.FDMap().NewFDFrom(0, file, flags, t.ThreadGroup().Limits()) + if err != nil { + return 0, err + } + + return fd, nil +} + +// AddEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_ADD. +func AddEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { + // Get epoll from the file descriptor. + epollfile := t.FDMap().GetFile(epfd) + if epollfile == nil { + return syscall.EBADF + } + defer epollfile.DecRef() + + // Get the target file id. + file := t.FDMap().GetFile(fd) + if file == nil { + return syscall.EBADF + } + defer file.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return syscall.EBADF + } + + // Try to add the entry. + return e.AddEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData) +} + +// UpdateEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_MOD. +func UpdateEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD, flags epoll.EntryFlags, mask waiter.EventMask, userData [2]int32) error { + // Get epoll from the file descriptor. + epollfile := t.FDMap().GetFile(epfd) + if epollfile == nil { + return syscall.EBADF + } + defer epollfile.DecRef() + + // Get the target file id. + file := t.FDMap().GetFile(fd) + if file == nil { + return syscall.EBADF + } + defer file.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return syscall.EBADF + } + + // Try to update the entry. + return e.UpdateEntry(epoll.FileIdentifier{file, fd}, flags, mask, userData) +} + +// RemoveEpoll implements the epoll_ctl(2) linux syscall when op is EPOLL_CTL_DEL. +func RemoveEpoll(t *kernel.Task, epfd kdefs.FD, fd kdefs.FD) error { + // Get epoll from the file descriptor. + epollfile := t.FDMap().GetFile(epfd) + if epollfile == nil { + return syscall.EBADF + } + defer epollfile.DecRef() + + // Get the target file id. + file := t.FDMap().GetFile(fd) + if file == nil { + return syscall.EBADF + } + defer file.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return syscall.EBADF + } + + // Try to remove the entry. + return e.RemoveEntry(epoll.FileIdentifier{file, fd}) +} + +// WaitEpoll implements the epoll_wait(2) linux syscall. +func WaitEpoll(t *kernel.Task, fd kdefs.FD, max int, timeout int) ([]epoll.Event, error) { + // Get epoll from the file descriptor. + epollfile := t.FDMap().GetFile(fd) + if epollfile == nil { + return nil, syscall.EBADF + } + defer epollfile.DecRef() + + // Extract the epollPoll operations. + e, ok := epollfile.FileOperations.(*epoll.EventPoll) + if !ok { + return nil, syscall.EBADF + } + + // Try to read events and return right away if we got them or if the + // caller requested a non-blocking "wait". + r := e.ReadEvents(max) + if len(r) != 0 || timeout == 0 { + return r, nil + } + + // We'll have to wait. Set up the timer if a timeout was specified and + // and register with the epoll object for readability events. + var haveDeadline bool + var deadline ktime.Time + if timeout > 0 { + timeoutDur := time.Duration(timeout) * time.Millisecond + deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) + haveDeadline = true + } + + w, ch := waiter.NewChannelEntry(nil) + e.EventRegister(&w, waiter.EventIn) + defer e.EventUnregister(&w) + + // Try to read the events again until we succeed, timeout or get + // interrupted. + for { + r = e.ReadEvents(max) + if len(r) != 0 { + return r, nil + } + + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syscall.ETIMEDOUT { + return nil, nil + } + + return nil, err + } + } +} diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD new file mode 100644 index 000000000..bc67ebf30 --- /dev/null +++ b/pkg/sentry/syscalls/linux/BUILD @@ -0,0 +1,103 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "linux_state", + srcs = [ + "sys_aio.go", + "sys_futex.go", + "sys_poll.go", + "sys_time.go", + ], + out = "linux_state.go", + package = "linux", +) + +go_library( + name = "linux", + srcs = [ + "error.go", + "flags.go", + "linux64.go", + "linux_state.go", + "sigset.go", + "sys_aio.go", + "sys_capability.go", + "sys_epoll.go", + "sys_eventfd.go", + "sys_file.go", + "sys_futex.go", + "sys_getdents.go", + "sys_identity.go", + "sys_inotify.go", + "sys_lseek.go", + "sys_mmap.go", + "sys_mount.go", + "sys_pipe.go", + "sys_poll.go", + "sys_prctl.go", + "sys_random.go", + "sys_read.go", + "sys_rlimit.go", + "sys_rusage.go", + "sys_sched.go", + "sys_sem.go", + "sys_signal.go", + "sys_socket.go", + "sys_stat.go", + "sys_sync.go", + "sys_sysinfo.go", + "sys_syslog.go", + "sys_thread.go", + "sys_time.go", + "sys_timer.go", + "sys_timerfd.go", + "sys_tls.go", + "sys_utsname.go", + "sys_write.go", + "timespec.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux", + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/bpf", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/metric", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/eventfd", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/syscalls", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go new file mode 100644 index 000000000..013b385bc --- /dev/null +++ b/pkg/sentry/syscalls/linux/error.go @@ -0,0 +1,117 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +var ( + partialResultMetric = metric.MustCreateNewUint64Metric("/syscalls/partial_result", true /* sync */, "Whether or not a partial result has occurred for this sandbox.") + partialResultOnce sync.Once +) + +// handleIOError handles special error cases for partial results. For some +// errors, we may consume the error and return only the partial read/write. +// +// op and f are used only for panics. +func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error { + switch err { + case nil: + // Typical successful syscall. + return nil + case io.EOF: + // EOF is always consumed. If this is a partial read/write + // (result != 0), the application will see that, otherwise + // they will see 0. + return nil + case syserror.ErrExceedsFileSizeLimit: + // Ignore partialResult because this error only applies to + // normal files, and for those files we cannot accumulate + // write results. + // + // Do not consume the error and return it as EFBIG. + // Simultaneously send a SIGXFSZ per setrlimit(2). + t.SendSignal(&arch.SignalInfo{ + Signo: int32(syscall.SIGXFSZ), + Code: arch.SignalInfoKernel, + }) + return syscall.EFBIG + case syserror.ErrInterrupted: + // The syscall was interrupted. Return nil if it completed + // partially, otherwise return the error code that the syscall + // needs (to indicate to the kernel what it should do). + if partialResult { + return nil + } + return intr + } + + if !partialResult { + // Typical syscall error. + return err + } + + switch err { + case syserror.EINTR: + // Syscall interrupted, but completed a partial + // read/write. Like ErrWouldBlock, since we have a + // partial read/write, we consume the error and return + // the partial result. + return nil + case syserror.EFAULT: + // EFAULT is only shown the user if nothing was + // read/written. If we read something (this case), they see + // a partial read/write. They will then presumably try again + // with an incremented buffer, which will EFAULT with + // result == 0. + return nil + case syserror.EPIPE: + // Writes to a pipe or socket will return EPIPE if the other + // side is gone. The partial write is returned. EPIPE will be + // returned on the next call. + // + // TODO: In some cases SIGPIPE should also be sent + // to the application. + return nil + case syserror.ErrWouldBlock: + // Syscall would block, but completed a partial read/write. + // This case should only be returned by IssueIO for nonblocking + // files. Since we have a partial read/write, we consume + // ErrWouldBlock, returning the partial result. + return nil + } + + switch err.(type) { + case kernel.SyscallRestartErrno: + // Identical to the EINTR case. + return nil + } + + // An unknown error is encountered with a partial read/write. + name, _ := f.Dirent.FullName(nil /* ignore chroot */) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) + partialResultOnce.Do(partialResultMetric.Increment) + return nil +} diff --git a/pkg/sentry/syscalls/linux/flags.go b/pkg/sentry/syscalls/linux/flags.go new file mode 100644 index 000000000..82bfd7c2a --- /dev/null +++ b/pkg/sentry/syscalls/linux/flags.go @@ -0,0 +1,95 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// flagsToPermissions returns a Permissions object from Linux flags. +// This includes truncate permission if O_TRUNC is set in the mask. +func flagsToPermissions(mask uint) (p fs.PermMask) { + switch mask & syscall.O_ACCMODE { + case syscall.O_WRONLY: + p.Write = true + case syscall.O_RDWR: + p.Write = true + p.Read = true + case syscall.O_RDONLY: + p.Read = true + } + return +} + +// fdFlagsToLinux converts a kernel.FDFlags object to a Linux representation. +func fdFlagsToLinux(flags kernel.FDFlags) (mask uint) { + if flags.CloseOnExec { + mask |= syscall.FD_CLOEXEC + } + return +} + +// flagsToLinux converts a FileFlags object to a Linux representation. +func flagsToLinux(flags fs.FileFlags) (mask uint) { + if flags.Direct { + mask |= syscall.O_DIRECT + } + if flags.NonBlocking { + mask |= syscall.O_NONBLOCK + } + if flags.Sync { + mask |= syscall.O_SYNC + } + if flags.Append { + mask |= syscall.O_APPEND + } + if flags.Directory { + mask |= syscall.O_DIRECTORY + } + switch { + case flags.Read && flags.Write: + mask |= syscall.O_RDWR + case flags.Write: + mask |= syscall.O_WRONLY + case flags.Read: + mask |= syscall.O_RDONLY + } + return +} + +// linuxToFlags converts linux file flags to a FileFlags object. +func linuxToFlags(mask uint) (flags fs.FileFlags) { + return fs.FileFlags{ + Direct: mask&syscall.O_DIRECT != 0, + Sync: mask&syscall.O_SYNC != 0, + NonBlocking: mask&syscall.O_NONBLOCK != 0, + Read: (mask & syscall.O_ACCMODE) != syscall.O_WRONLY, + Write: (mask & syscall.O_ACCMODE) != syscall.O_RDONLY, + Append: mask&syscall.O_APPEND != 0, + Directory: mask&syscall.O_DIRECTORY != 0, + } +} + +// linuxToSettableFlags converts linux file flags to a SettableFileFlags object. +func linuxToSettableFlags(mask uint) fs.SettableFileFlags { + return fs.SettableFileFlags{ + Direct: mask&syscall.O_DIRECT != 0, + NonBlocking: mask&syscall.O_NONBLOCK != 0, + Append: mask&syscall.O_APPEND != 0, + } +} diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go new file mode 100644 index 000000000..44db2d582 --- /dev/null +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -0,0 +1,376 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package linux provides syscall tables for amd64 Linux. +// +// NOTE: Linux i386 support has been removed. +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// AUDIT_ARCH_X86_64 identifies the Linux syscall API on AMD64, and is taken +// from <linux/audit.h>. +const _AUDIT_ARCH_X86_64 = 0xc000003e + +// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall +// numbers from Linux 3.11. The entries commented out are those syscalls we +// don't currently support. +var AMD64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Version: kernel.Version{ + Sysname: "Linux", + Release: "3.11.10", + Version: "#1 SMP Fri Nov 29 10:47:50 PST 2013", + }, + AuditNumber: _AUDIT_ARCH_X86_64, + Table: map[uintptr]kernel.SyscallFn{ + 0: Read, + 1: Write, + 2: Open, + 3: Close, + 4: Stat, + 5: Fstat, + 6: Lstat, + 7: Poll, + 8: Lseek, + 9: Mmap, + 10: Mprotect, + 11: Munmap, + 12: Brk, + 13: RtSigaction, + 14: RtSigprocmask, + 15: RtSigreturn, + 16: Ioctl, + 17: Pread64, + 18: Pwrite64, + 19: Readv, + 20: Writev, + 21: Access, + 22: Pipe, + 23: Select, + 24: SchedYield, + 25: Mremap, + 26: Msync, + 27: Mincore, + 28: Madvise, + // 29: Shmget, TODO + // 30: Shmat, TODO + // 31: Shmctl, TODO + 32: Dup, + 33: Dup2, + 34: Pause, + 35: Nanosleep, + 36: Getitimer, + 37: Alarm, + 38: Setitimer, + 39: Getpid, + 40: Sendfile, + 41: Socket, + 42: Connect, + 43: Accept, + 44: SendTo, + 45: RecvFrom, + 46: SendMsg, + 47: RecvMsg, + 48: Shutdown, + 49: Bind, + 50: Listen, + 51: GetSockName, + 52: GetPeerName, + 53: SocketPair, + 54: SetSockOpt, + 55: GetSockOpt, + 56: Clone, + 57: Fork, + 58: Vfork, + 59: Execve, + 60: Exit, + 61: Wait4, + 62: Kill, + 63: Uname, + 64: Semget, + 65: Semop, + 66: Semctl, + // 67: Shmdt, TODO + // 68: Msgget, TODO + // 69: Msgsnd, TODO + // 70: Msgrcv, TODO + // 71: Msgctl, TODO + 72: Fcntl, + 73: Flock, + 74: Fsync, + 75: Fdatasync, + 76: Truncate, + 77: Ftruncate, + 78: Getdents, + 79: Getcwd, + 80: Chdir, + 81: Fchdir, + 82: Rename, + 83: Mkdir, + 84: Rmdir, + 85: Creat, + 86: Link, + 87: Unlink, + 88: Symlink, + 89: Readlink, + 90: Chmod, + 91: Fchmod, + 92: Chown, + 93: Fchown, + 94: Lchown, + 95: Umask, + 96: Gettimeofday, + 97: Getrlimit, + 98: Getrusage, + 99: Sysinfo, + 100: Times, + 101: Ptrace, + 102: Getuid, + 103: Syslog, + 104: Getgid, + 105: Setuid, + 106: Setgid, + 107: Geteuid, + 108: Getegid, + 109: Setpgid, + 110: Getppid, + 111: Getpgrp, + 112: Setsid, + 113: Setreuid, + 114: Setregid, + 115: Getgroups, + 116: Setgroups, + 117: Setresuid, + 118: Getresuid, + 119: Setresgid, + 120: Getresgid, + 121: Getpgid, + // 122: Setfsuid, TODO + // 123: Setfsgid, TODO + 124: Getsid, + 125: Capget, + 126: Capset, + 127: RtSigpending, + 128: RtSigtimedwait, + 129: RtSigqueueinfo, + 130: RtSigsuspend, + 131: Sigaltstack, + 132: Utime, + 133: Mknod, + 134: syscalls.Error(syscall.ENOSYS), // Uselib, obsolete + 135: syscalls.ErrorWithEvent(syscall.EINVAL), // SetPersonality, unable to change personality + 136: syscalls.ErrorWithEvent(syscall.ENOSYS), // Ustat, needs filesystem support + 137: Statfs, + 138: Fstatfs, + // 139: Sysfs, TODO + 140: Getpriority, + 141: Setpriority, + 142: syscalls.CapError(linux.CAP_SYS_NICE), // SchedSetparam, requires cap_sys_nice + 143: SchedGetparam, + 144: SchedSetscheduler, + 145: SchedGetscheduler, + 146: SchedGetPriorityMax, + 147: SchedGetPriorityMin, + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: syscalls.Error(nil), // Mlock, TODO + 150: syscalls.Error(nil), // Munlock, TODO + 151: syscalls.Error(nil), // Mlockall, TODO + 152: syscalls.Error(nil), // Munlockall, TODO + 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, + 154: syscalls.Error(syscall.EPERM), // ModifyLdt, + 155: syscalls.Error(syscall.EPERM), // PivotRoot, + 156: syscalls.Error(syscall.EPERM), // Sysctl, syscall is "worthless" + 157: Prctl, + 158: ArchPrctl, + 159: syscalls.CapError(linux.CAP_SYS_TIME), // Adjtimex, requires cap_sys_time + 160: Setrlimit, + 161: Chroot, + 162: Sync, + 163: syscalls.CapError(linux.CAP_SYS_PACCT), // Acct, requires cap_sys_pacct + 164: syscalls.CapError(linux.CAP_SYS_TIME), // Settimeofday, requires cap_sys_time + 165: Mount, + 166: Umount2, + 167: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapon, requires cap_sys_admin + 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // Swapoff, requires cap_sys_admin + 169: syscalls.CapError(linux.CAP_SYS_BOOT), // Reboot, requires cap_sys_boot + 170: Sethostname, + 171: Setdomainname, + 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // Iopl, requires cap_sys_rawio + 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // Ioperm, requires cap_sys_rawio + 174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module + 175: syscalls.CapError(linux.CAP_SYS_MODULE), // InitModule, requires cap_sys_module + 176: syscalls.CapError(linux.CAP_SYS_MODULE), // DeleteModule, requires cap_sys_module + 177: syscalls.Error(syscall.ENOSYS), // GetKernelSyms, not supported in > 2.6 + 178: syscalls.Error(syscall.ENOSYS), // QueryModule, not supported in > 2.6 + 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // Quotactl, requires cap_sys_admin (most operations) + 180: syscalls.Error(syscall.ENOSYS), // Nfsservctl, does not exist > 3.1 + 181: syscalls.Error(syscall.ENOSYS), // Getpmsg, not implemented in Linux + 182: syscalls.Error(syscall.ENOSYS), // Putpmsg, not implemented in Linux + 183: syscalls.Error(syscall.ENOSYS), // AfsSyscall, not implemented in Linux + 184: syscalls.Error(syscall.ENOSYS), // Tuxcall, not implemented in Linux + 185: syscalls.Error(syscall.ENOSYS), // Security, not implemented in Linux + 186: Gettid, + 187: nil, // Readahead, TODO + 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Setxattr, requires filesystem support + 189: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lsetxattr, requires filesystem support + 190: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fsetxattr, requires filesystem support + 191: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Getxattr, requires filesystem support + 192: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lgetxattr, requires filesystem support + 193: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fgetxattr, requires filesystem support + 194: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Listxattr, requires filesystem support + 195: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Llistxattr, requires filesystem support + 196: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Flistxattr, requires filesystem support + 197: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Removexattr, requires filesystem support + 198: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Lremovexattr, requires filesystem support + 199: syscalls.ErrorWithEvent(syscall.ENOTSUP), // Fremovexattr, requires filesystem support + 200: Tkill, + 201: Time, + 202: Futex, + 203: SchedSetaffinity, + 204: SchedGetaffinity, + 205: syscalls.Error(syscall.ENOSYS), // SetThreadArea, expected to return ENOSYS on 64-bit + 206: IoSetup, + 207: IoDestroy, + 208: IoGetevents, + 209: IoSubmit, + 210: IoCancel, + 211: syscalls.Error(syscall.ENOSYS), // GetThreadArea, expected to return ENOSYS on 64-bit + 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // LookupDcookie, requires cap_sys_admin + 213: EpollCreate, + 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollCtlOld, deprecated (afaik, unused) + 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // EpollWaitOld, deprecated (afaik, unused) + 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // RemapFilePages, deprecated since 3.16 + 217: Getdents64, + 218: SetTidAddress, + 219: RestartSyscall, + // 220: Semtimedop, TODO + 221: Fadvise64, + // 222: TimerCreate, TODO + // 223: TimerSettime, TODO + // 224: TimerGettime, TODO + // 225: TimerGetoverrun, TODO + // 226: TimerDelete, TODO + 227: ClockSettime, + 228: ClockGettime, + 229: ClockGetres, + 230: ClockNanosleep, + 231: ExitGroup, + 232: EpollWait, + 233: EpollCtl, + 234: Tgkill, + 235: Utimes, + 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux + 237: syscalls.CapError(linux.CAP_SYS_NICE), // Mbind, may require cap_sys_nice TODO + 238: SetMempolicy, + 239: GetMempolicy, + // 240: MqOpen, TODO + // 241: MqUnlink, TODO + // 242: MqTimedsend, TODO + // 243: MqTimedreceive, TODO + // 244: MqNotify, TODO + // 245: MqGetsetattr, TODO + 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot + 247: Waitid, + 248: syscalls.Error(syscall.EACCES), // AddKey, not available to user + 249: syscalls.Error(syscall.EACCES), // RequestKey, not available to user + 250: syscalls.Error(syscall.EACCES), // Keyctl, not available to user + 251: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioSet, requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // IoprioGet, requires cap_sys_nice or cap_sys_admin (depending) + 253: InotifyInit, + 254: InotifyAddWatch, + 255: InotifyRmWatch, + 256: syscalls.CapError(linux.CAP_SYS_NICE), // MigratePages, requires cap_sys_nice + 257: Openat, + 258: Mkdirat, + 259: Mknodat, + 260: Fchownat, + 261: Futimesat, + 262: Fstatat, + 263: Unlinkat, + 264: Renameat, + 265: Linkat, + 266: Symlinkat, + 267: Readlinkat, + 268: Fchmodat, + 269: Faccessat, + 270: Pselect, + 271: Ppoll, + 272: Unshare, + 273: syscalls.Error(syscall.ENOSYS), // SetRobustList, obsolete + 274: syscalls.Error(syscall.ENOSYS), // GetRobustList, obsolete + // 275: Splice, TODO + // 276: Tee, TODO + // 277: SyncFileRange, TODO + // 278: Vmsplice, TODO + 279: syscalls.CapError(linux.CAP_SYS_NICE), // MovePages, requires cap_sys_nice (mostly) + 280: Utimensat, + 281: EpollPwait, + // 282: Signalfd, TODO + 283: TimerfdCreate, + 284: Eventfd, + 285: Fallocate, + 286: TimerfdSettime, + 287: TimerfdGettime, + 288: Accept4, + // 289: Signalfd4, TODO + 290: Eventfd2, + 291: EpollCreate1, + 292: Dup3, + 293: Pipe2, + 294: InotifyInit1, + 295: Preadv, + 296: Pwritev, + 297: RtTgsigqueueinfo, + 298: syscalls.ErrorWithEvent(syscall.ENODEV), // PerfEventOpen, no support for perf counters + 299: RecvMMsg, + 300: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyInit, needs CONFIG_FANOTIFY + 301: syscalls.ErrorWithEvent(syscall.ENOSYS), // FanotifyMark, needs CONFIG_FANOTIFY + 302: Prlimit64, + 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // NameToHandleAt, needs filesystem support + 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), // OpenByHandleAt, needs filesystem support + 305: syscalls.CapError(linux.CAP_SYS_TIME), // ClockAdjtime, requires cap_sys_time + 306: Syncfs, + 307: SendMMsg, + // 308: Setns, TODO + 309: Getcpu, + // 310: ProcessVmReadv, TODO may require cap_sys_ptrace + // 311: ProcessVmWritev, TODO may require cap_sys_ptrace + 312: syscalls.CapError(linux.CAP_SYS_PTRACE), // Kcmp, requires cap_sys_ptrace + 313: syscalls.CapError(linux.CAP_SYS_MODULE), // FinitModule, requires cap_sys_module + // "Backports." + 318: GetRandom, + }, + + Emulate: map[usermem.Addr]uintptr{ + 0xffffffffff600000: 96, // vsyscall gettimeofday(2) + 0xffffffffff600400: 201, // vsyscall time(2) + 0xffffffffff600800: 309, // vsyscall getcpu(2) + }, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + syscalls.UnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go new file mode 100644 index 000000000..bfb541634 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and +// STOP are clear. +func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { + if size != linux.SignalSetSize { + return 0, syscall.EINVAL + } + b := t.CopyScratchBuffer(8) + if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { + return 0, err + } + mask := usermem.ByteOrder.Uint64(b[:]) + return linux.SignalSet(mask) &^ kernel.UnblockableSignals, nil +} + +// copyOutSigSet copies out a sigset_t. +func copyOutSigSet(t *kernel.Task, sigSetAddr usermem.Addr, mask linux.SignalSet) error { + b := t.CopyScratchBuffer(8) + usermem.ByteOrder.PutUint64(b, uint64(mask)) + _, err := t.CopyOutBytes(sigSetAddr, b) + return err +} + +// copyInSigSetWithSize copies in a structure as below +// +// struct { +// sigset_t* sigset_addr; +// size_t sizeof_sigset; +// }; +// +// and returns sigset_addr and size. +func copyInSigSetWithSize(t *kernel.Task, addr usermem.Addr) (usermem.Addr, uint, error) { + switch t.Arch().Width() { + case 8: + in := t.CopyScratchBuffer(16) + if _, err := t.CopyInBytes(addr, in); err != nil { + return 0, 0, err + } + maskAddr := usermem.Addr(usermem.ByteOrder.Uint64(in[0:])) + maskSize := uint(usermem.ByteOrder.Uint64(in[8:])) + return maskAddr, maskSize, nil + default: + return 0, 0, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go new file mode 100644 index 000000000..80407a082 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -0,0 +1,402 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "encoding/binary" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// I/O commands. +const ( + _IOCB_CMD_PREAD = 0 + _IOCB_CMD_PWRITE = 1 + _IOCB_CMD_FSYNC = 2 + _IOCB_CMD_FDSYNC = 3 + _IOCB_CMD_NOOP = 6 + _IOCB_CMD_PREADV = 7 + _IOCB_CMD_PWRITEV = 8 +) + +// I/O flags. +const ( + _IOCB_FLAG_RESFD = 1 +) + +// ioCallback describes an I/O request. +// +// The priority field is currently ignored in the implementation below. Also +// note that the IOCB_FLAG_RESFD feature is not supported. +type ioCallback struct { + Data uint64 + Key uint32 + Reserved1 uint32 + + OpCode uint16 + ReqPrio int16 + FD uint32 + + Buf uint64 + Bytes uint64 + Offset int64 + + Reserved2 uint64 + Flags uint32 + + // eventfd to signal if IOCB_FLAG_RESFD is set in flags. + ResFD uint32 +} + +// ioEvent describes an I/O result. +type ioEvent struct { + Data uint64 + Obj uint64 + Result int64 + Result2 int64 +} + +// ioEventSize is the size of an ioEvent encoded. +var ioEventSize = binary.Size(ioEvent{}) + +// IoSetup implements linux syscall io_setup(2). +func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nrEvents := args[0].Int() + idAddr := args[1].Pointer() + + // Linux uses the native long as the aio ID. + // + // The context pointer _must_ be zero initially. + var idIn uint64 + if _, err := t.CopyIn(idAddr, &idIn); err != nil { + return 0, nil, err + } + if idIn != 0 { + return 0, nil, syserror.EINVAL + } + + id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents)) + if err != nil { + return 0, nil, err + } + + // Copy out the new ID. + if _, err := t.CopyOut(idAddr, &id); err != nil { + t.MemoryManager().DestroyAIOContext(t, id) + return 0, nil, err + } + + return 0, nil, nil +} + +// IoDestroy implements linux syscall io_destroy(2). +func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + + // Destroy the given context. + if !t.MemoryManager().DestroyAIOContext(t, id) { + // Does not exist. + return 0, nil, syserror.EINVAL + } + // FIXME: Linux blocks until all AIO to the destroyed context is + // done. + return 0, nil, nil +} + +// IoGetevents implements linux syscall io_getevents(2). +func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + minEvents := args[1].Int() + events := args[2].Int() + eventsAddr := args[3].Pointer() + timespecAddr := args[4].Pointer() + + // Sanity check arguments. + if minEvents > events { + return 0, nil, syserror.EINVAL + } + + ctx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return 0, nil, syserror.EINVAL + } + + // Setup the timeout. + var haveDeadline bool + var deadline ktime.Time + if timespecAddr != 0 { + d, err := copyTimespecIn(t, timespecAddr) + if err != nil { + return 0, nil, err + } + if !d.Valid() { + return 0, nil, syserror.EINVAL + } + deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration()) + haveDeadline = true + } + + // Loop over all requests. + for count := int32(0); count < events; count++ { + // Get a request, per semantics. + var v interface{} + if count >= minEvents { + var ok bool + v, ok = ctx.PopRequest() + if !ok { + return uintptr(count), nil, nil + } + } else { + var err error + v, err = waitForRequest(ctx, t, haveDeadline, deadline) + if err != nil { + if count > 0 || err == syserror.ETIMEDOUT { + return uintptr(count), nil, nil + } + return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + } + } + + ev := v.(*ioEvent) + + // Copy out the result. + if _, err := t.CopyOut(eventsAddr, ev); err != nil { + if count > 0 { + return uintptr(count), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Keep rolling. + eventsAddr += usermem.Addr(ioEventSize) + } + + // Everything finished. + return uintptr(events), nil, nil +} + +func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) { + for { + if v, ok := ctx.PopRequest(); ok { + // Request was readly available. Just return it. + return v, nil + } + + // Need to wait for request completion. + done, active := ctx.WaitChannel() + if !active { + // Context has been destroyed. + return nil, syserror.EINVAL + } + if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil { + return nil, err + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. + switch cb.OpCode { + case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} + +func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) { + ev := &ioEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + // Construct a context.Context that will not be interrupted if t is + // interrupted. + c := t.AsyncContext() + + var err error + switch cb.OpCode { + case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV: + ev.Result, err = file.Preadv(c, ioseq, cb.Offset) + case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: + ev.Result, err = file.Pwritev(c, ioseq, cb.Offset) + case _IOCB_CMD_FSYNC: + err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll) + case _IOCB_CMD_FDSYNC: + err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData) + } + + // Update the result. + if err != nil { + err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) + ev.Result = -int64(t.ExtractErrno(err, 0)) + } + + file.DecRef() + + // Queue the result for delivery. + ctx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFile != nil { + eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) + eventFile.DecRef() + } +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { + file := t.FDMap().GetFile(kdefs.FD(cb.FD)) + if file == nil { + // File not found. + return syserror.EBADF + } + defer file.DecRef() + + // Was there an eventFD? Extract it. + var eventFile *fs.File + if cb.Flags&_IOCB_FLAG_RESFD != 0 { + eventFile := t.FDMap().GetFile(kdefs.FD(cb.ResFD)) + if eventFile == nil { + // Bad FD. + return syserror.EBADF + } + defer eventFile.DecRef() + + // Check that it is an eventfd. + if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { + // Not an event FD. + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Prepare the request. + ctx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := ctx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFile != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFile.IncRef() + } + + // Perform the request asynchronously. + file.IncRef() + fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) }) + + // All set. + return nil +} + +// IoSubmit implements linux syscall io_submit(2). +func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + for i := int32(0); i < nrEvents; i++ { + // Copy in the address. + cbAddrNative := t.Arch().Native(0) + if _, err := t.CopyIn(addr, cbAddrNative); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Copy in this callback. + var cb ioCallback + cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) + if _, err := t.CopyIn(cbAddr, &cb); err != nil { + + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. + addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// IoCancel implements linux syscall io_cancel(2). +// +// It is not presently supported (ENOSYS indicates no support on this +// architecture). +func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go new file mode 100644 index 000000000..89c81ac90 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -0,0 +1,149 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) { + if tid < 0 { + err = syserror.EINVAL + return + } + if tid > 0 { + t = t.PIDNamespace().TaskWithID(tid) + } + if t == nil { + err = syserror.ESRCH + return + } + creds := t.Credentials() + permitted, inheritable, effective = creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps + return +} + +// Capget implements Linux syscall capget. +func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + hdrAddr := args[0].Pointer() + dataAddr := args[1].Pointer() + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + // hdr.Pid doesn't need to be valid if this capget() is a "version probe" + // (hdr.Version is unrecognized and dataAddr is null), so we can't do the + // lookup yet. + switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + if dataAddr == 0 { + return 0, nil, nil + } + p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) + if err != nil { + return 0, nil, err + } + data := linux.CapUserData{ + Effective: uint32(e), + Permitted: uint32(p), + Inheritable: uint32(i), + } + _, err = t.CopyOut(dataAddr, &data) + return 0, nil, err + + case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: + if dataAddr == 0 { + return 0, nil, nil + } + p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) + if err != nil { + return 0, nil, err + } + data := [2]linux.CapUserData{ + { + Effective: uint32(e), + Permitted: uint32(p), + Inheritable: uint32(i), + }, + { + Effective: uint32(e >> 32), + Permitted: uint32(p >> 32), + Inheritable: uint32(i >> 32), + }, + } + _, err = t.CopyOut(dataAddr, &data) + return 0, nil, err + + default: + hdr.Version = linux.HighestCapabilityVersion + if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + if dataAddr != 0 { + return 0, nil, syserror.EINVAL + } + return 0, nil, nil + } +} + +// Capset implements Linux syscall capset. +func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + hdrAddr := args[0].Pointer() + dataAddr := args[1].Pointer() + + var hdr linux.CapUserHeader + if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + switch hdr.Version { + case linux.LINUX_CAPABILITY_VERSION_1: + if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { + return 0, nil, syserror.EPERM + } + var data linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return 0, nil, err + } + p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities + i := auth.CapabilitySet(data.Inheritable) & auth.AllCapabilities + e := auth.CapabilitySet(data.Effective) & auth.AllCapabilities + return 0, nil, t.SetCapabilitySets(p, i, e) + + case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: + if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { + return 0, nil, syserror.EPERM + } + var data [2]linux.CapUserData + if _, err := t.CopyIn(dataAddr, &data); err != nil { + return 0, nil, err + } + p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities + i := (auth.CapabilitySet(data[0].Inheritable) | (auth.CapabilitySet(data[1].Inheritable) << 32)) & auth.AllCapabilities + e := (auth.CapabilitySet(data[0].Effective) | (auth.CapabilitySet(data[1].Effective) << 32)) & auth.AllCapabilities + return 0, nil, t.SetCapabilitySets(p, i, e) + + default: + hdr.Version = linux.HighestCapabilityVersion + if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + return 0, nil, err + } + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go new file mode 100644 index 000000000..e69dfc77a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -0,0 +1,171 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// EpollCreate1 implements the epoll_create1(2) linux syscall. +func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags & ^syscall.EPOLL_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + closeOnExec := flags&syscall.EPOLL_CLOEXEC != 0 + fd, err := syscalls.CreateEpoll(t, closeOnExec) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// EpollCreate implements the epoll_create(2) linux syscall. +func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + + if size <= 0 { + return 0, nil, syserror.EINVAL + } + + fd, err := syscalls.CreateEpoll(t, false) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// EpollCtl implements the epoll_ctl(2) linux syscall. +func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := kdefs.FD(args[0].Int()) + op := args[1].Int() + fd := kdefs.FD(args[2].Int()) + eventAddr := args[3].Pointer() + + // Capture the event state if needed. + flags := epoll.EntryFlags(0) + mask := waiter.EventMask(0) + var data [2]int32 + if op != syscall.EPOLL_CTL_DEL { + var e syscall.EpollEvent + if _, err := t.CopyIn(eventAddr, &e); err != nil { + return 0, nil, err + } + + if e.Events&syscall.EPOLLONESHOT != 0 { + flags |= epoll.OneShot + } + + // syscall.EPOLLET is incorrectly generated as a negative number + // in Go, see https://github.com/golang/go/issues/5328 for + // details. + if e.Events&-syscall.EPOLLET != 0 { + flags |= epoll.EdgeTriggered + } + + mask = waiter.EventMask(e.Events) + data[0] = e.Fd + data[1] = e.Pad + } + + // Perform the requested operations. + switch op { + case syscall.EPOLL_CTL_ADD: + // See fs/eventpoll.c. + mask |= waiter.EventHUp | waiter.EventErr + return 0, nil, syscalls.AddEpoll(t, epfd, fd, flags, mask, data) + case syscall.EPOLL_CTL_DEL: + return 0, nil, syscalls.RemoveEpoll(t, epfd, fd) + case syscall.EPOLL_CTL_MOD: + // Same as EPOLL_CTL_ADD. + mask |= waiter.EventHUp | waiter.EventErr + return 0, nil, syscalls.UpdateEpoll(t, epfd, fd, flags, mask, data) + default: + return 0, nil, syserror.EINVAL + } +} + +// copyOutEvents copies epoll events from the kernel to user memory. +func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error { + const itemLen = 12 + if _, ok := addr.AddLength(uint64(len(e)) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := range e { + usermem.ByteOrder.PutUint32(b[0:], e[i].Events) + usermem.ByteOrder.PutUint32(b[4:], uint32(e[i].Data[0])) + usermem.ByteOrder.PutUint32(b[8:], uint32(e[i].Data[1])) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + return nil +} + +// EpollWait implements the epoll_wait(2) linux syscall. +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := kdefs.FD(args[0].Int()) + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeout := int(args[3].Int()) + + r, err := syscalls.WaitEpoll(t, epfd, maxEvents, timeout) + if err != nil { + return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) + } + + if len(r) != 0 { + if err := copyOutEvents(t, eventsAddr, r); err != nil { + return 0, nil, err + } + } + + return uintptr(len(r)), nil, nil +} + +// EpollPwait implements the epoll_pwait(2) linux syscall. +func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + + if maskAddr != 0 { + mask, err := copyInSigSet(t, maskAddr, maskSize) + if err != nil { + return 0, nil, err + } + + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + + return EpollWait(t, args) +} diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go new file mode 100644 index 000000000..60fe5a133 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -0,0 +1,65 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd" +) + +const ( + // EFD_SEMAPHORE is a flag used in syscall eventfd(2) and eventfd2(2). Please + // see its man page for more information. + EFD_SEMAPHORE = 1 + EFD_NONBLOCK = 0x800 + EFD_CLOEXEC = 0x80000 +) + +// Eventfd2 implements linux syscall eventfd2(2). +func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + initVal := args[0].Int() + flags := uint(args[1].Uint()) + allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC) + + if flags & ^allOps != 0 { + return 0, nil, syscall.EINVAL + } + + event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0) + event.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&EFD_NONBLOCK != 0, + }) + defer event.DecRef() + + fd, err := t.FDMap().NewFDFrom(0, event, kernel.FDFlags{ + CloseOnExec: flags&EFD_CLOEXEC != 0, + }, + t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// Eventfd implements linux syscall eventfd(2). +func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[1].Value = 0 + return Eventfd2(t, args) +} diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go new file mode 100644 index 000000000..a2dbba7e0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -0,0 +1,1942 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "io" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// fileOpAt performs an operation on the second last component in the path. +func fileOpAt(t *kernel.Task, dirFD kdefs.FD, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string) error) error { + // Extract the last component. + dir, name := fs.SplitLast(path) + if dir == "/" { + // Common case: we are accessing a file in the root. + root := t.FSContext().RootDirectory() + err := fn(root, root, name) + root.DecRef() + return err + } else if dir == "." && dirFD == linux.AT_FDCWD { + // Common case: we are accessing a file relative to the current + // working directory; skip the look-up. + wd := t.FSContext().WorkingDirectory() + root := t.FSContext().RootDirectory() + err := fn(root, wd, name) + wd.DecRef() + root.DecRef() + return err + } + + return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return fn(root, d, name) + }) +} + +// fileOpOn performs an operation on the last entry of the path. +func fileOpOn(t *kernel.Task, dirFD kdefs.FD, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { + var ( + d *fs.Dirent // The file. + wd *fs.Dirent // The working directory (if required.) + rel *fs.Dirent // The relative directory for search (if required.) + f *fs.File // The file corresponding to dirFD (if required.) + err error + ) + + // Extract the working directory (maybe). + if len(path) > 0 && path[0] == '/' { + // Absolute path; rel can be nil. + } else if dirFD == linux.AT_FDCWD { + // Need to reference the working directory. + wd = t.FSContext().WorkingDirectory() + rel = wd + } else { + // Need to extract the given FD. + f = t.FDMap().GetFile(dirFD) + if f == nil { + return syserror.EBADF + } + rel = f.Dirent + if !fs.IsDir(rel.Inode.StableAttr) { + return syserror.ENOTDIR + } + } + + // Grab the root (always required.) + root := t.FSContext().RootDirectory() + + // Lookup the node. + if resolve { + d, err = t.MountNamespace().FindInode(t, root, rel, path, linux.MaxSymlinkTraversals) + } else { + d, err = t.MountNamespace().FindLink(t, root, rel, path, linux.MaxSymlinkTraversals) + } + root.DecRef() + if wd != nil { + wd.DecRef() + } + if f != nil { + f.DecRef() + } + if err != nil { + return err + } + + err = fn(root, d) + d.DecRef() + return err +} + +// copyInPath copies a path in. +func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string, dirPath bool, err error) { + path, err = t.CopyInString(addr, syscall.PathMax) + if err != nil { + return "", false, err + } + if path == "" && !allowEmpty { + return "", false, syserror.ENOENT + } + + // If the path ends with a /, then checks must be enforced in various + // ways in the different callers. We pass this back to the caller. + path, dirPath = fs.TrimTrailingSlashes(path) + + return path, dirPath, nil +} + +func openAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint) (fd uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + + err = fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + // First check a few things about the filesystem before trying to get the file + // reference. + // + // It's required that Check does not try to open files not that aren't backed by + // this dirent (e.g. pipes and sockets) because this would result in opening these + // files an extra time just to check permissions. + if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + return err + } + + fileFlags := linuxToFlags(flags) + isDir := fs.IsDir(d.Inode.StableAttr) + + // If O_DIRECTORY is set, but the file is not a directory, then fail. + if fileFlags.Directory && !isDir { + return syserror.ENOTDIR + } + + // If it's a directory, then make sure. + if dirPath && !isDir { + return syserror.ENOTDIR + } + + // Don't allow directories to be opened writable. + if isDir && fileFlags.Write { + return syserror.EISDIR + } + + file, err := d.Inode.GetFile(t, d, fileFlags) + if err != nil { + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + defer file.DecRef() + + // Success. + fdFlags := kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0} + newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return err + } + + // Set return result in frame. + fd = uintptr(newFD) + + // Generate notification for opened file. + d.InotifyEvent(linux.IN_OPEN, 0) + + return nil + }) + return fd, err // Use result in frame. +} + +func mknodAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Do we have the appropriate permissions on the parent? + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Attempt a creation. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + + switch mode.FileType() { + case 0: + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + fallthrough + case linux.ModeRegular: + // We are not going to return the file, so the actual + // flags used don't matter, but they cannot be empty or + // Create will complain. + flags := fs.FileFlags{Read: true, Write: true} + file, err := d.Create(t, root, name, flags, perms) + if err != nil { + return err + } + file.DecRef() + return nil + + case linux.ModeNamedPipe: + return d.CreateFifo(t, root, name, perms) + + case linux.ModeSocket: + // While it is possible create a unix domain socket file on linux + // using mknod(2), in practice this is pretty useless from an + // application. Linux internally uses mknod() to create the socket + // node during bind(2), but we implement bind(2) independently. If + // an application explicitly creates a socket node using mknod(), + // you can't seem to bind() or connect() to the resulting socket. + // + // Instead of emulating this seemingly useless behaviour, we'll + // indicate that the filesystem doesn't support the creation of + // sockets. + return syserror.EOPNOTSUPP + + case linux.ModeCharacterDevice: + fallthrough + case linux.ModeBlockDevice: + // TODO: We don't support creating block or character + // devices at the moment. + // + // When we start supporting block and character devices, we'll + // need to check for CAP_MKNOD here. + return syserror.EPERM + + default: + // "EINVAL - mode requested creation of something other than a + // regular file, device special file, FIFO or socket." - mknod(2) + return syserror.EINVAL + } + }) +} + +// Mknod implements the linux syscall mknod(2). +func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + path := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + // We don't need this argument until we support creation of device nodes. + _ = args[2].Uint() // dev + + return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode) +} + +// Mknodat implements the linux syscall mknodat(2). +func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + path := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + // We don't need this argument until we support creation of device nodes. + _ = args[3].Uint() // dev + + return 0, nil, mknodAt(t, dirFD, path, mode) +} + +func createAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + if dirPath { + return 0, syserror.ENOENT + } + + err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does this file exist already? + targetDirent, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + var newFile *fs.File + switch err { + case nil: + // The file existed. + defer targetDirent.DecRef() + + // Check if we wanted to create. + if flags&syscall.O_EXCL != 0 { + return syserror.EEXIST + } + + // Like sys_open, check for a few things about the + // filesystem before trying to get a reference to the + // fs.File. The same constraints on Check apply. + if err := targetDirent.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil { + return err + } + + // Should we truncate the file? + if flags&syscall.O_TRUNC != 0 { + if err := targetDirent.Inode.Truncate(t, targetDirent, 0); err != nil { + return err + } + } + + // Create a new fs.File. + newFile, err = targetDirent.Inode.GetFile(t, targetDirent, linuxToFlags(flags)) + if err != nil { + return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + defer newFile.DecRef() + case syserror.EACCES: + // Permission denied while walking to the file. + return err + default: + // Do we have write permissions on the parent? + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Attempt a creation. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + newFile, err = d.Create(t, root, name, linuxToFlags(flags), perms) + if err != nil { + // No luck, bail. + return err + } + defer newFile.DecRef() + targetDirent = newFile.Dirent + } + + // Success. + fdFlags := kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0} + newFD, err := t.FDMap().NewFDFrom(0, newFile, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return err + } + + // Set result in frame. + fd = uintptr(newFD) + + // Queue the open inotify event. The creation event is + // automatically queued when the dirent is targetDirent. The + // open events are implemented at the syscall layer so we need + // to manually queue one here. + targetDirent.InotifyEvent(linux.IN_OPEN, 0) + + return nil + }) + return fd, err // Use result in frame. +} + +// Open implements linux syscall open(2). +func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := uint(args[1].Uint()) + if flags&syscall.O_CREAT != 0 { + mode := linux.FileMode(args[2].ModeT()) + n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode) + return n, nil, err + } + n, err := openAt(t, linux.AT_FDCWD, addr, flags) + return n, nil, err +} + +// Openat implements linux syscall openat(2). +func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + flags := uint(args[2].Uint()) + if flags&syscall.O_CREAT != 0 { + mode := linux.FileMode(args[3].ModeT()) + n, err := createAt(t, dirFD, addr, flags, mode) + return n, nil, err + } + n, err := openAt(t, dirFD, addr, flags) + return n, nil, err +} + +// Creat implements linux syscall creat(2). +func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + n, err := createAt(t, linux.AT_FDCWD, addr, syscall.O_WRONLY|syscall.O_TRUNC, mode) + return n, nil, err +} + +// accessContext is a context that overrides the credentials used, but +// otherwise carries the same values as the embedded context. +// +// accessContext should only be used for access(2). +type accessContext struct { + context.Context + creds auth.Credentials +} + +// Value implements context.Context. +func (ac accessContext) Value(key interface{}) interface{} { + switch key { + case auth.CtxCredentials: + return &ac.creds + default: + return ac.Context.Value(key) + } +} + +func accessAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, resolve bool, mode uint) error { + const rOK = 4 + const wOK = 2 + const xOK = 1 + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + // Sanity check the mode. + if mode&^(rOK|wOK|xOK) != 0 { + return syserror.EINVAL + } + + return fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { + // access(2) and faccessat(2) check permissions using real + // UID/GID, not effective UID/GID. + // + // "access() needs to use the real uid/gid, not the effective + // uid/gid. We do this by temporarily clearing all FS-related + // capabilities and switching the fsuid/fsgid around to the + // real ones." -fs/open.c:faccessat + creds := t.Credentials() + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID + if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { + creds.EffectiveCaps = creds.PermittedCaps + } else { + creds.EffectiveCaps = 0 + } + + ctx := &accessContext{ + Context: t, + creds: creds, + } + + if err := d.Inode.CheckPermission(ctx, fs.PermMask{ + Read: mode&rOK != 0, + Write: mode&wOK != 0, + Execute: mode&xOK != 0, + }); err != nil { + return err + } + return nil + }) +} + +// Access implements linux syscall access(2). +func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + + return 0, nil, accessAt(t, linux.AT_FDCWD, addr, true, mode) +} + +// Faccessat implements linux syscall faccessat(2). +func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + mode := args[2].ModeT() + flags := args[3].Int() + + return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode) +} + +// Ioctl implements linux syscall ioctl(2). +func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + request := int(args[1].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Shared flags between file and socket. + switch request { + case linux.FIONCLEX: + t.FDMap().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: false, + }) + return 0, nil, nil + case linux.FIOCLEX: + t.FDMap().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: true, + }) + return 0, nil, nil + + case linux.FIONBIO: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.Flags() + if set != 0 { + flags.NonBlocking = true + } else { + flags.NonBlocking = false + } + file.SetFlags(flags.Settable()) + return 0, nil, nil + + default: + ret, err := file.FileOperations.Ioctl(t, t.MemoryManager(), args) + if err != nil { + return 0, nil, err + } + + return ret, nil, nil + } +} + +// Getcwd implements the linux syscall getcwd(2). +func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + size := args[1].SizeT() + cwd := t.FSContext().WorkingDirectory() + defer cwd.DecRef() + root := t.FSContext().RootDirectory() + defer root.DecRef() + + // Get our fullname from the root and preprend unreachable if the root was + // unreachable from our current dirent this is the same behavior as on linux. + s, reachable := cwd.FullName(root) + if !reachable { + s = "(unreachable)" + s + } + + // Note this is >= because we need a terminator. + if uint(len(s)) >= size { + return 0, nil, syserror.ERANGE + } + + // Copy out the path name for the node. + bytes, err := t.CopyOutBytes(addr, []byte(s)) + if err != nil { + return 0, nil, err + } + + // Top it off with a terminator. + _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00")) + return uintptr(bytes + 1), nil, err +} + +// Chroot implements the linux syscall chroot(2). +func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + if !t.HasCapability(linux.CAP_SYS_CHROOT) { + return 0, nil, syserror.EPERM + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + // Is it a directory? + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does it have execute permissions? + if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { + return err + } + + t.FSContext().SetRootDirectory(d) + return nil + }) +} + +// Chdir implements the linux syscall chdir(2). +func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + // Is it a directory? + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does it have execute permissions? + if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { + return err + } + + t.FSContext().SetWorkingDirectory(d) + return nil + }) +} + +// Fchdir implements the linux syscall fchdir(2). +func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Is it a directory? + if !fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.ENOTDIR + } + + // Does it have execute permissions? + if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil { + return 0, nil, err + } + + t.FSContext().SetWorkingDirectory(file.Dirent) + return 0, nil, nil +} + +// Close implements linux syscall close(2). +func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file, ok := t.FDMap().Remove(fd) + if !ok { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Flush(t) + return 0, nil, handleIOError(t, false /* partial */, err, syscall.EINTR, "close", file) +} + +// Dup implements linux syscall dup(2). +func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newfd, err := t.FDMap().NewFDFrom(0, file, kernel.FDFlags{}, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, syserror.EMFILE + } + return uintptr(newfd), nil, nil +} + +// Dup2 implements linux syscall dup2(2). +func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := kdefs.FD(args[0].Int()) + newfd := kdefs.FD(args[1].Int()) + + // If oldfd is a valid file descriptor, and newfd has the same value as oldfd, + // then dup2() does nothing, and returns newfd. + if oldfd == newfd { + oldFile := t.FDMap().GetFile(oldfd) + if oldFile == nil { + return 0, nil, syserror.EBADF + } + defer oldFile.DecRef() + + return uintptr(newfd), nil, nil + } + + // Zero out flags arg to be used by Dup3. + args[2].Value = 0 + return Dup3(t, args) +} + +// Dup3 implements linux syscall dup3(2). +func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := kdefs.FD(args[0].Int()) + newfd := kdefs.FD(args[1].Int()) + flags := args[2].Uint() + + if oldfd == newfd { + return 0, nil, syserror.EINVAL + } + + oldFile := t.FDMap().GetFile(oldfd) + if oldFile == nil { + return 0, nil, syserror.EBADF + } + defer oldFile.DecRef() + + err := t.FDMap().NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&syscall.O_CLOEXEC != 0}, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(newfd), nil, nil +} + +// Fcntl implements linux syscall fcntl(2). +func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + cmd := args[1].Int() + + file, flags := t.FDMap().GetDescriptor(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch cmd { + case syscall.F_DUPFD, syscall.F_DUPFD_CLOEXEC: + from := kdefs.FD(args[2].Int()) + fdFlags := kernel.FDFlags{CloseOnExec: cmd == syscall.F_DUPFD_CLOEXEC} + fd, err := t.FDMap().NewFDFrom(from, file, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil + case syscall.F_GETFD: + return uintptr(fdFlagsToLinux(flags)), nil, nil + case syscall.F_SETFD: + flags := args[2].Uint() + t.FDMap().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: flags&syscall.FD_CLOEXEC != 0, + }) + case syscall.F_GETFL: + return uintptr(flagsToLinux(file.Flags())), nil, nil + case syscall.F_SETFL: + flags := uint(args[2].Uint()) + file.SetFlags(linuxToSettableFlags(flags)) + case syscall.F_SETLK, syscall.F_SETLKW: + // In Linux the file system can choose to provide lock operations for an inode. + // Normally pipe and socket types lack lock operations. We diverge and use a heavy + // hammer by only allowing locks on files and directories. + if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EBADF + } + + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock syscall.Flock_t + if _, err := t.CopyIn(flockAddr, &flock); err != nil { + return 0, nil, err + } + + // Compute the lock whence. + var sw fs.SeekWhence + switch flock.Whence { + case 0: + sw = fs.SeekSet + case 1: + sw = fs.SeekCurrent + case 2: + sw = fs.SeekEnd + default: + return 0, nil, syserror.EINVAL + } + + // Compute the lock offset. + var off int64 + switch sw { + case fs.SeekSet: + off = 0 + case fs.SeekCurrent: + // Note that Linux does not hold any mutexes while retrieving the file offset, + // see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + off = file.Offset() + case fs.SeekEnd: + uattr, err := file.Dirent.Inode.UnstableAttr(t) + if err != nil { + return 0, nil, err + } + off = uattr.Size + default: + return 0, nil, syserror.EINVAL + } + + // Compute the lock range. + rng, err := lock.ComputeRange(flock.Start, flock.Len, off) + if err != nil { + return 0, nil, err + } + + // The lock uid is that of the Task's FDMap. + lockUniqueID := lock.UniqueID(t.FDMap().ID()) + + // These locks don't block; execute the non-blocking operation using the inode's lock + // context directly. + switch flock.Type { + case syscall.F_RDLCK: + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + if cmd == syscall.F_SETLK { + // Non-blocking lock, provide a nil lock.Blocker. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + return 0, nil, syserror.EAGAIN + } + } else { + // Blocking lock, pass in the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + return 0, nil, nil + case syscall.F_WRLCK: + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + if cmd == syscall.F_SETLK { + // Non-blocking lock, provide a nil lock.Blocker. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + return 0, nil, syserror.EAGAIN + } + } else { + // Blocking lock, pass in the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + return 0, nil, nil + case syscall.F_UNLCK: + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) + return 0, nil, nil + default: + return 0, nil, syserror.EINVAL + } + default: + // Everything else is not yet supported. + return 0, nil, syserror.EINVAL + } + return 0, nil, nil +} + +const ( + _FADV_NORMAL = 0 + _FADV_RANDOM = 1 + _FADV_SEQUENTIAL = 2 + _FADV_WILLNEED = 3 + _FADV_DONTNEED = 4 + _FADV_NOREUSE = 5 +) + +// Fadvise64 implements linux syscall fadvise64(2). +// This implementation currently ignores the provided advice. +func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + offset := args[1].Int64() + length := args[2].Uint() + advice := args[3].Int() + + if offset < 0 || length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch advice { + case _FADV_NORMAL: + case _FADV_RANDOM: + case _FADV_SEQUENTIAL: + case _FADV_WILLNEED: + case _FADV_DONTNEED: + case _FADV_NOREUSE: + default: + return 0, nil, syserror.EINVAL + } + + // Sure, whatever. + return 0, nil, nil +} + +func mkdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Does this directory exist already? + f, err := t.MountNamespace().FindInode(t, root, d, name, linux.MaxSymlinkTraversals) + switch err { + case nil: + // The directory existed. + defer f.DecRef() + return syserror.EEXIST + case syserror.EACCES: + // Permission denied while walking to the directory. + return err + default: + // Do we have write permissions on the parent? + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + + // Create the directory. + perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask())) + return d.CreateDirectory(t, root, name, perms) + } + }) +} + +// Mkdir implements linux syscall mkdir(2). +func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + + return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode) +} + +// Mkdirat implements linux syscall mkdirat(2). +func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + + return 0, nil, mkdirAt(t, dirFD, addr, mode) +} + +func rmdirAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + // Special case: rmdir rejects anything with '.' as last component. + // This would be handled by the busy check for the current working + // directory, but this is how it's done. + if (len(path) == 1 && path == ".") || (len(path) > 1 && path[len(path)-2:] == "/.") { + return syserror.EINVAL + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + if err := fs.MayDelete(t, root, d, name); err != nil { + return err + } + + return d.RemoveDirectory(t, root, name) + }) +} + +// Rmdir implements linux syscall rmdir(2). +func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr) +} + +func symlinkAt(t *kernel.Task, dirFD kdefs.FD, newAddr usermem.Addr, oldAddr usermem.Addr) error { + newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + // The oldPath is copied in verbatim. This is because the symlink + // will include all details, including trailing slashes. + oldPath, err := t.CopyInString(oldAddr, syscall.PathMax) + if err != nil { + return err + } + if oldPath == "" { + return syserror.ENOENT + } + + return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Make sure we have write permissions on the parent directory. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + return d.CreateLink(t, root, oldPath, name) + }) +} + +// Symlink implements linux syscall symlink(2). +func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + newAddr := args[1].Pointer() + + return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr) +} + +// Symlinkat implements linux syscall symlinkat(2). +func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + dirFD := kdefs.FD(args[1].Int()) + newAddr := args[2].Pointer() + + return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr) +} + +// mayLinkAt determines whether t can create a hard link to target. +// +// This corresponds to Linux's fs/namei.c:may_linkat. +func mayLinkAt(t *kernel.Task, target *fs.Inode) error { + // Technically Linux is more restrictive in 3.11.10 (requires CAP_FOWNER in + // root user namespace); this is from the later f2ca379642d7 "namei: permit + // linking with CAP_FOWNER in userns". + if !target.CheckOwnership(t) { + return syserror.EPERM + } + + // Check that the target is not a directory and that permissions are okay. + if fs.IsDir(target.StableAttr) || target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil { + return syserror.EPERM + } + + return nil +} + +// linkAt creates a hard link to the target specified by oldDirFD and oldAddr, +// specified by newDirFD and newAddr. If resolve is true, then the symlinks +// will be followed when evaluating the target. +func linkAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr, resolve, allowEmpty bool) error { + oldPath, _, err := copyInPath(t, oldAddr, allowEmpty) + if err != nil { + return err + } + newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + if allowEmpty && oldPath == "" { + target := t.FDMap().GetFile(oldDirFD) + if target == nil { + return syserror.EBADF + } + defer target.DecRef() + if err := mayLinkAt(t, target.Dirent.Inode); err != nil { + return err + } + + // Resolve the target directory. + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { + if !fs.IsDir(newParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Make sure we have write permissions on the parent directory. + if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + return newParent.CreateHardLink(t, root, target.Dirent, newName) + }) + } + + // Resolve oldDirFD and oldAddr to a dirent. The "resolve" argument + // only applies to this name. + return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent) error { + if err := mayLinkAt(t, target.Inode); err != nil { + return err + } + + // Next resolve newDirFD and newAddr to the parent dirent and name. + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { + if !fs.IsDir(newParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Make sure we have write permissions on the parent directory. + if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil { + return err + } + return newParent.CreateHardLink(t, root, target, newName) + }) + }) +} + +// Link implements linux syscall link(2). +func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + newAddr := args[1].Pointer() + + // man link(2): + // POSIX.1-2001 says that link() should dereference oldpath if it is a + // symbolic link. However, since kernel 2.0, Linux does not do so: if + // oldpath is a symbolic link, then newpath is created as a (hard) link + // to the same symbolic link file (i.e., newpath becomes a symbolic + // link to the same file that oldpath refers to). + resolve := false + return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */) +} + +// Linkat implements linux syscall linkat(2). +func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldDirFD := kdefs.FD(args[0].Int()) + oldAddr := args[1].Pointer() + newDirFD := kdefs.FD(args[2].Int()) + newAddr := args[3].Pointer() + + // man linkat(2): + // By default, linkat(), does not dereference oldpath if it is a + // symbolic link (like link(2)). Since Linux 2.6.18, the flag + // AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be + // dereferenced if it is a symbolic link. + flags := args[4].Int() + resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW + allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH + + if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) { + return 0, nil, syserror.ENOENT + } + + return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) +} + +func readlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, err + } + if dirPath { + return 0, syserror.ENOENT + } + + err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + // Check for Read permission. + if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil { + return err + } + + s, err := d.Inode.Readlink(t) + if err == syserror.ENOLINK { + return syserror.EINVAL + } + if err != nil { + return err + } + + buffer := []byte(s) + if uint(len(buffer)) > size { + buffer = buffer[:size] + } + + n, err := t.CopyOutBytes(bufAddr, buffer) + + // Update frame return value. + copied = uintptr(n) + + return err + }) + return copied, err // Return frame value. +} + +// Readlink implements linux syscall readlink(2). +func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + bufAddr := args[1].Pointer() + size := args[2].SizeT() + + n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size) + return n, nil, err +} + +// Readlinkat implements linux syscall readlinkat(2). +func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + bufAddr := args[2].Pointer() + size := args[3].SizeT() + + n, err := readlinkAt(t, dirFD, addr, bufAddr, size) + return n, nil, err +} + +func unlinkAt(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr) error { + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + if dirPath { + return syserror.ENOENT + } + + return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string) error { + if !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + if err := fs.MayDelete(t, root, d, name); err != nil { + return err + } + + return d.Remove(t, root, name) + }) +} + +// Unlink implements linux syscall unlink(2). +func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr) +} + +// Unlinkat implements linux syscall unlinkat(2). +func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + flags := args[2].Uint() + if flags&linux.AT_REMOVEDIR != 0 { + return 0, nil, rmdirAt(t, dirFD, addr) + } + return 0, nil, unlinkAt(t, dirFD, addr) +} + +// Truncate implements linux syscall truncate(2). +func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + if dirPath { + return 0, nil, syserror.EINVAL + } + + if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(syscall.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if fs.IsDir(d.Inode.StableAttr) { + return syserror.EISDIR + } + if !fs.IsFile(d.Inode.StableAttr) { + return syserror.EINVAL + } + + // Reject truncation if the access permissions do not allow truncation. + // This is different from the behavior of sys_ftruncate, see below. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil { + return err + } + + if err := d.Inode.Truncate(t, d, length); err != nil { + return err + } + + // File length modified, generate notification. + d.InotifyEvent(linux.IN_MODIFY, 0) + + return nil + }) +} + +// Ftruncate implements linux syscall ftruncate(2). +func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + length := args[1].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Reject truncation if the file flags do not permit this operation. + // This is different from truncate(2) above. + if !file.Flags().Write { + return 0, nil, syserror.EINVAL + } + + // Note that this is different from truncate(2) above, where a + // directory returns EISDIR. + if !fs.IsFile(file.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(syscall.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil { + return 0, nil, err + } + + // File length modified, generate notification. + file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + + return 0, nil, nil +} + +// Umask implements linux syscall umask(2). +func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mask := args[0].ModeT() + mask = t.FSContext().SwapUmask(mask & 0777) + return uintptr(mask), nil, nil +} + +// Change ownership of a file. +// +// uid and gid may be -1, in which case they will not be changed. +func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { + owner := fs.FileOwner{ + UID: auth.NoID, + GID: auth.NoID, + } + + uattr, err := d.Inode.UnstableAttr(t) + if err != nil { + return err + } + c := t.Credentials() + hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN) + isOwner := uattr.Owner.UID == c.EffectiveKUID + if uid.Ok() { + kuid := c.UserNamespace.MapToKUID(uid) + // Valid UID must be supplied if UID is to be changed. + if !kuid.Ok() { + return syserror.EINVAL + } + + // "Only a privileged process (CAP_CHOWN) may change the owner + // of a file." -chown(2) + // + // Linux also allows chown if you own the file and are + // explicitly not changing its UID. + isNoop := uattr.Owner.UID == kuid + if !(hasCap || (isOwner && isNoop)) { + return syserror.EPERM + } + + owner.UID = kuid + } + if gid.Ok() { + kgid := c.UserNamespace.MapToKGID(gid) + // Valid GID must be supplied if GID is to be changed. + if !kgid.Ok() { + return syserror.EINVAL + } + + // "The owner of a file may change the group of the file to any + // group of which that owner is a member. A privileged process + // (CAP_CHOWN) may change the group arbitrarily." -chown(2) + isNoop := uattr.Owner.GID == kgid + isMemberGroup := c.InGroup(kgid) + if !(hasCap || (isOwner && (isNoop || isMemberGroup))) { + return syserror.EPERM + } + + owner.GID = kgid + } + + // FIXME: This is racy; the inode's owner may have changed in + // the meantime. (Linux holds i_mutex while calling + // fs/attr.c:notify_change() => inode_operations::setattr => + // inode_change_ok().) + if err := d.Inode.SetOwner(t, d, owner); err != nil { + return err + } + + // When the owner or group are changed by an unprivileged user, + // chown(2) also clears the set-user-ID and set-group-ID bits, but + // we do not support them. + return nil +} + +func chownAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error { + path, _, err := copyInPath(t, addr, allowEmpty) + if err != nil { + return err + } + + if path == "" { + // Annoying. What's wrong with fchown? + file := t.FDMap().GetFile(fd) + if file == nil { + return syserror.EBADF + } + defer file.DecRef() + + return chown(t, file.Dirent, uid, gid) + } + + return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { + return chown(t, d, uid, gid) + }) +} + +// Chown implements linux syscall chown(2). +func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid) +} + +// Lchown implements linux syscall lchown(2). +func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid) +} + +// Fchown implements linux syscall fchown(2). +func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + uid := auth.UID(args[1].Uint()) + gid := auth.GID(args[2].Uint()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, chown(t, file.Dirent, uid, gid) +} + +// Fchownat implements Linux syscall fchownat(2). +func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + uid := auth.UID(args[2].Uint()) + gid := auth.GID(args[3].Uint()) + flags := args[4].Int() + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid) +} + +func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error { + // Must own file to change mode. + if !d.Inode.CheckOwnership(t) { + return syserror.EPERM + } + + p := fs.FilePermsFromMode(mode) + if !d.Inode.SetPermissions(t, d, p) { + return syserror.EPERM + } + + // File attribute changed, generate notification. + d.InotifyEvent(linux.IN_ATTRIB, 0) + + return nil +} + +func chmodAt(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, mode linux.FileMode) error { + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return chmod(t, d, mode) + }) +} + +// Chmod implements linux syscall chmod(2). +func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := linux.FileMode(args[1].ModeT()) + + return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode) +} + +// Fchmod implements linux syscall fchmod(2). +func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + mode := linux.FileMode(args[1].ModeT()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, chmod(t, file.Dirent, mode) +} + +// Fchmodat implements linux syscall fchmodat(2). +func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + mode := linux.FileMode(args[2].ModeT()) + + return 0, nil, chmodAt(t, fd, addr, mode) +} + +// defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime +// to the system time. +func defaultSetToSystemTimeSpec() fs.TimeSpec { + return fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + } +} + +func utimes(t *kernel.Task, dirFD kdefs.FD, addr usermem.Addr, ts fs.TimeSpec, resolve bool) error { + setTimestamp := func(root *fs.Dirent, d *fs.Dirent) error { + // Does the task own the file? + if !d.Inode.CheckOwnership(t) { + // Trying to set a specific time? Must be owner. + if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) { + return syserror.EPERM + } + + // Trying to set to current system time? Must have write access. + if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil { + return err + } + } + + return d.Inode.SetTimestamps(t, d, ts) + } + + // From utimes.c: + // "If filename is NULL and dfd refers to an open file, then operate on + // the file. Otherwise look up filename, possibly using dfd as a + // starting point." + if addr == 0 && dirFD != linux.AT_FDCWD { + if !resolve { + // Linux returns EINVAL in this case. See utimes.c. + return syserror.EINVAL + } + f := t.FDMap().GetFile(dirFD) + if f == nil { + return syserror.EBADF + } + defer f.DecRef() + + root := t.FSContext().RootDirectory() + defer root.DecRef() + + return setTimestamp(root, f.Dirent) + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpOn(t, dirFD, path, resolve, setTimestamp) +} + +// Utime implements linux syscall utime(2). +func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times syscall.Utimbuf + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + ts = fs.TimeSpec{ + ATime: ktime.FromSeconds(times.Actime), + MTime: ktime.FromSeconds(times.Modtime), + } + } + return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true) +} + +// Utimes implements linux syscall utimes(2). +func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + ts = fs.TimeSpec{ + ATime: ktime.FromTimeval(times[0]), + MTime: ktime.FromTimeval(times[1]), + } + } + return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true) +} + +// timespecIsValid checks that the timespec is valid for use in utimensat. +func timespecIsValid(ts linux.Timespec) bool { + // Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9. + return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9 +} + +// Utimensat implements linux syscall utimensat(2). +func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + pathnameAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times [2]linux.Timespec + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) { + return 0, nil, syserror.EINVAL + } + + // If both are UTIME_OMIT, this is a noop. + if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT { + return 0, nil, nil + } + + ts = fs.TimeSpec{ + ATime: ktime.FromTimespec(times[0]), + ATimeOmit: times[0].Nsec == linux.UTIME_OMIT, + ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW, + MTime: ktime.FromTimespec(times[1]), + MTimeOmit: times[1].Nsec == linux.UTIME_OMIT, + MTimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW, + } + } + return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0) +} + +// Futimesat implements linux syscall futimesat(2). +func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirFD := kdefs.FD(args[0].Int()) + pathnameAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + + // No timesAddr argument will be interpreted as current system time. + ts := defaultSetToSystemTimeSpec() + if timesAddr != 0 { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + if times[0].Usec >= 1e6 || times[0].Usec < 0 || + times[1].Usec >= 1e6 || times[1].Usec < 0 { + return 0, nil, syserror.EINVAL + } + + ts = fs.TimeSpec{ + ATime: ktime.FromTimeval(times[0]), + MTime: ktime.FromTimeval(times[1]), + } + } + return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) +} + +func renameAt(t *kernel.Task, oldDirFD kdefs.FD, oldAddr usermem.Addr, newDirFD kdefs.FD, newAddr usermem.Addr) error { + newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) + if err != nil { + return err + } + oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */) + if err != nil { + return err + } + + return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string) error { + if !fs.IsDir(oldParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Root cannot be renamed to anything. + // + // TODO: This catches the case when the rename + // argument is exactly "/", but we should return EBUSY when + // renaming any mount point, or when the argument is not + // exactly "/" but still resolves to the root, like "/.." or + // "/bin/..". + if oldParent == root && oldName == "." { + return syscall.EBUSY + } + return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string) error { + if !fs.IsDir(newParent.Inode.StableAttr) { + return syserror.ENOTDIR + } + + // Nothing can be renamed to root. + // + // TODO: Same as above. + if newParent == root && newName == "." { + return syscall.EBUSY + } + return fs.Rename(t, root, oldParent, oldName, newParent, newName) + }) + }) +} + +// Rename implements linux syscall rename(2). +func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldPathAddr := args[0].Pointer() + newPathAddr := args[1].Pointer() + return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr) +} + +// Renameat implements linux syscall renameat(2). +func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldDirFD := kdefs.FD(args[0].Int()) + oldPathAddr := args[1].Pointer() + newDirFD := kdefs.FD(args[2].Int()) + newPathAddr := args[3].Pointer() + return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) +} + +// Fallocate implements linux system call fallocate(2). +// (well, not really, but at least we return the expected error codes) +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + offset := args[2].Int64() + length := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + return 0, nil, syserror.EOPNOTSUPP +} + +// Flock implements linux syscall flock(2). +func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + operation := args[1].Int() + + file := t.FDMap().GetFile(fd) + if file == nil { + // flock(2): EBADF fd is not an open file descriptor. + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + nonblocking := operation&linux.LOCK_NB != 0 + operation &^= linux.LOCK_NB + + // flock(2): + // Locks created by flock() are associated with an open file table entry. This means that + // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the + // same lock, and this lock may be modified or released using any of these descriptors. Furthermore, + // the lock is released either by an explicit LOCK_UN operation on any of these duplicate + // descriptors, or when all such descriptors have been closed. + // + // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file, + // these descriptors are treated independently by flock(). An attempt to lock the file using + // one of these file descriptors may be denied by a lock that the calling process has already placed via + // another descriptor. + // + // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2) + // and fork(2). + lockUniqueID := lock.UniqueID(file.UniqueID) + + // A BSD style lock spans the entire file. + rng := lock.LockRange{ + Start: 0, + End: lock.LockEOF, + } + + switch operation { + case linux.LOCK_EX: + if nonblocking { + // Since we're nonblocking we pass a nil lock.Blocker implementation. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + return 0, nil, syserror.EWOULDBLOCK + } + } else { + // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + case linux.LOCK_SH: + if nonblocking { + // Since we're nonblocking we pass a nil lock.Blocker implementation. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + return 0, nil, syserror.EWOULDBLOCK + } + } else { + // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + return 0, nil, syserror.EINTR + } + } + case linux.LOCK_UN: + file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng) + default: + // flock(2): EINVAL operation is invalid. + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} + +// Sendfile implements linux system call sendfile(2). +func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + outFD := kdefs.FD(args[0].Int()) + inFD := kdefs.FD(args[1].Int()) + offsetAddr := args[2].Pointer() + count := int64(args[3].SizeT()) + + // Don't send a negative number of bytes. + if count < 0 { + return 0, nil, syserror.EINVAL + } + + // Get files. + outFile := t.FDMap().GetFile(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + inFile := t.FDMap().GetFile(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + + // Verify that the outfile is writable. + outFlags := outFile.Flags() + if !outFlags.Write { + return 0, nil, syserror.EBADF + } + + // Verify that the outfile Append flag is not set. + if outFlags.Append { + return 0, nil, syserror.EINVAL + } + + // Verify that we have a regular infile. + // http://elixir.free-electrons.com/linux/latest/source/fs/splice.c#L933 + if !fs.IsRegular(inFile.Dirent.Inode.StableAttr) { + return 0, nil, syserror.EINVAL + } + + // Verify that the infile is readable. + if !inFile.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Setup for sending data. + var offset uint64 + var n int64 + var err error + w := &fs.FileWriter{t, outFile} + hasOffset := offsetAddr != 0 + // If we have a provided offset. + if hasOffset { + // Copy in the offset. + if _, err := t.CopyIn(offsetAddr, &offset); err != nil { + return 0, nil, err + } + // Send data using Preadv. + r := io.NewSectionReader(&fs.FileReader{t, inFile}, int64(offset), count) + n, err = io.Copy(w, r) + // Copy out the new offset. + if _, err := t.CopyOut(offsetAddr, n+int64(offset)); err != nil { + return 0, nil, err + } + // If we don't have a provided offset. + } else { + // Send data using readv. + r := &io.LimitedReader{R: &fs.FileReader{t, inFile}, N: count} + n, err = io.Copy(w, r) + } + + // We can only pass a single file to handleIOError, so pick inFile + // arbitrarily. + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) +} diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go new file mode 100644 index 000000000..57762d058 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -0,0 +1,319 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// futexChecker is a futex.Checker that uses a Task's MemoryManager. +type futexChecker struct { + t *kernel.Task +} + +// Check checks if the address contains the given value, and returns +// syserror.EAGAIN if it doesn't. See Checker interface in futex package +// for more information. +func (f futexChecker) Check(addr uintptr, val uint32) error { + in := f.t.CopyScratchBuffer(4) + _, err := f.t.CopyInBytes(usermem.Addr(addr), in) + if err != nil { + return err + } + nval := usermem.ByteOrder.Uint32(in) + if val != nval { + return syserror.EAGAIN + } + return nil +} + +func (f futexChecker) atomicOp(addr uintptr, op func(uint32) uint32) (uint32, error) { + in := f.t.CopyScratchBuffer(4) + _, err := f.t.CopyInBytes(usermem.Addr(addr), in) + if err != nil { + return 0, err + } + o := usermem.ByteOrder.Uint32(in) + mm := f.t.MemoryManager() + for { + n := op(o) + r, err := mm.CompareAndSwapUint32(f.t, usermem.Addr(addr), o, n, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + if r == o { + return o, nil + } + o = r + } +} + +// Op performs an operation on addr and returns a result based on the operation. +func (f futexChecker) Op(addr uintptr, opIn uint32) (bool, error) { + op := (opIn >> 28) & 0xf + cmp := (opIn >> 24) & 0xf + opArg := (opIn >> 12) & 0xfff + cmpArg := opIn & 0xfff + + if op&linux.FUTEX_OP_OPARG_SHIFT != 0 { + opArg = 1 << opArg + op &^= linux.FUTEX_OP_OPARG_SHIFT // clear flag + } + + var oldVal uint32 + var err error + switch op { + case linux.FUTEX_OP_SET: + oldVal, err = f.t.MemoryManager().SwapUint32(f.t, usermem.Addr(addr), opArg, usermem.IOOpts{ + AddressSpaceActive: true, + }) + case linux.FUTEX_OP_ADD: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a + opArg + }) + case linux.FUTEX_OP_OR: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a | opArg + }) + case linux.FUTEX_OP_ANDN: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a & ^opArg + }) + case linux.FUTEX_OP_XOR: + oldVal, err = f.atomicOp(addr, func(a uint32) uint32 { + return a ^ opArg + }) + default: + return false, syserror.ENOSYS + } + if err != nil { + return false, err + } + + switch cmp { + case linux.FUTEX_OP_CMP_EQ: + return oldVal == cmpArg, nil + case linux.FUTEX_OP_CMP_NE: + return oldVal != cmpArg, nil + case linux.FUTEX_OP_CMP_LT: + return oldVal < cmpArg, nil + case linux.FUTEX_OP_CMP_LE: + return oldVal <= cmpArg, nil + case linux.FUTEX_OP_CMP_GT: + return oldVal > cmpArg, nil + case linux.FUTEX_OP_CMP_GE: + return oldVal >= cmpArg, nil + default: + return false, syserror.ENOSYS + } +} + +// futexWaitRestartBlock encapsulates the state required to restart futex(2) +// via restart_syscall(2). +type futexWaitRestartBlock struct { + duration time.Duration + + // addr stored as uint64 since uintptr is not save-able. + addr uint64 + + val uint32 + mask uint32 +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return futexWaitDuration(t, f.duration, false, uintptr(f.addr), f.val, f.mask) +} + +// futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is +// complete. +// +// The wait blocks forever if forever is true, otherwise it blocks until ts. +// +// If blocking is interrupted, the syscall is restarted with the original +// arguments. +func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr uintptr, val, mask uint32) (uintptr, error) { + w := t.FutexWaiter() + err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask) + if err != nil { + return 0, err + } + + if forever { + err = t.Block(w.C) + } else if clockRealtime { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) + timer.Swap(ktime.Setting{ + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + err = t.BlockWithTimer(w.C, tchan) + timer.Destroy() + } else { + err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) + } + + t.Futex().WaitComplete(w) + return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is +// complete. +// +// The wait blocks forever if forever is true, otherwise is blocks for +// duration. +// +// If blocking is interrupted, forever determines how to restart the +// syscall. If forever is true, the syscall is restarted with the original +// arguments. If forever is false, duration is a relative timeout and the +// syscall is restarted with the remaining timeout. +func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr uintptr, val, mask uint32) (uintptr, error) { + w := t.FutexWaiter() + err := t.Futex().WaitPrepare(w, futexChecker{t}, addr, val, mask) + if err != nil { + return 0, err + } + + remaining, err := t.BlockWithTimeout(w.C, !forever, duration) + t.Futex().WaitComplete(w) + if err == nil { + return 0, nil + } + + // The wait was unsuccessful for some reason other than interruption. Simply + // forward the error. + if err != syserror.ErrInterrupted { + return 0, err + } + + // The wait was interrupted and we need to restart. Decide how. + + // The wait duration was absolute, restart with the original arguments. + if forever { + return 0, kernel.ERESTARTSYS + } + + // The wait duration was relative, restart with the remaining duration. + t.SetSyscallRestartBlock(&futexWaitRestartBlock{ + duration: remaining, + addr: uint64(addr), + val: val, + mask: mask, + }) + return 0, kernel.ERESTART_RESTARTBLOCK +} + +// Futex implements linux syscall futex(2). +// It provides a method for a program to wait for a value at a given address to +// change, and a method to wake up anyone waiting on a particular address. +func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + uaddr := args[0].Pointer() + futexOp := args[1].Int() + val := int(args[2].Int()) + nreq := int(args[3].Int()) + timeout := args[3].Pointer() + uaddr2 := args[4].Pointer() + val3 := args[5].Int() + + addr := uintptr(uaddr) + naddr := uintptr(uaddr2) + cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) + clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME + mask := uint32(val3) + + switch cmd { + case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: + // WAIT{_BITSET} wait forever if the timeout isn't passed. + forever := timeout == 0 + + var timespec linux.Timespec + if !forever { + var err error + timespec, err = copyTimespecIn(t, timeout) + if err != nil { + return 0, nil, err + } + } + + switch cmd { + case linux.FUTEX_WAIT: + // WAIT uses a relative timeout. + mask = ^uint32(0) + var timeoutDur time.Duration + if !forever { + timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond + } + n, err := futexWaitDuration(t, timeoutDur, forever, addr, uint32(val), mask) + return n, nil, err + + case linux.FUTEX_WAIT_BITSET: + // WAIT_BITSET uses an absolute timeout which is either + // CLOCK_MONOTONIC or CLOCK_REALTIME. + if mask == 0 { + return 0, nil, syserror.EINVAL + } + n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, uint32(val), mask) + return n, nil, err + default: + panic("unreachable") + } + + case linux.FUTEX_WAKE: + mask = ^uint32(0) + fallthrough + + case linux.FUTEX_WAKE_BITSET: + if mask == 0 { + return 0, nil, syserror.EINVAL + } + n, err := t.Futex().Wake(addr, mask, val) + return uintptr(n), nil, err + + case linux.FUTEX_REQUEUE: + n, err := t.Futex().Requeue(addr, naddr, val, nreq) + return uintptr(n), nil, err + + case linux.FUTEX_CMP_REQUEUE: + // 'val3' contains the value to be checked at 'addr' and + // 'val' is the number of waiters that should be woken up. + nval := uint32(val3) + n, err := t.Futex().RequeueCmp(futexChecker{t}, addr, nval, naddr, val, nreq) + return uintptr(n), nil, err + + case linux.FUTEX_WAKE_OP: + op := uint32(val3) + n, err := t.Futex().WakeOp(futexChecker{t}, addr, naddr, val, nreq, op) + return uintptr(n), nil, err + + case linux.FUTEX_LOCK_PI, linux.FUTEX_UNLOCK_PI, linux.FUTEX_TRYLOCK_PI, linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: + // We don't support any priority inversion futexes. + return 0, nil, syserror.ENOSYS + + default: + // We don't even know about this command. + return 0, nil, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go new file mode 100644 index 000000000..178714b07 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -0,0 +1,269 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + "io" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Getdents implements linux syscall getdents(2) for 64bit systems. +func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := int(args[2].Uint()) + + minSize := int(smallestDirent(t.Arch())) + if size < minSize { + // size is smaller than smallest possible dirent. + return 0, nil, syserror.EINVAL + } + + n, err := getdents(t, fd, addr, size, (*dirent).Serialize) + return n, nil, err +} + +// Getdents64 implements linux syscall getdents64(2). +func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := int(args[2].Uint()) + + minSize := int(smallestDirent64(t.Arch())) + if size < minSize { + // size is smaller than smallest possible dirent. + return 0, nil, syserror.EINVAL + } + + n, err := getdents(t, fd, addr, size, (*dirent).Serialize64) + return n, nil, err +} + +// getdents implements the core of getdents(2)/getdents64(2). +// f is the syscall implementation dirent serialization function. +func getdents(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, size int, f func(*dirent, io.Writer) (int, error)) (uintptr, error) { + dir := t.FDMap().GetFile(fd) + if dir == nil { + return 0, syserror.EBADF + } + defer dir.DecRef() + + w := &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: addr, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + + ds := newDirentSerializer(f, w, t.Arch(), size) + rerr := dir.Readdir(t, ds) + + switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err { + case nil: + dir.Dirent.InotifyEvent(syscall.IN_ACCESS, 0) + return uintptr(ds.Written()), nil + case io.EOF: + return 0, nil + default: + return 0, err + } +} + +// oldDirentHdr is a fixed sized header matching the fixed size +// fields found in the old linux dirent struct. +type oldDirentHdr struct { + Ino uint64 + Off uint64 + Reclen uint16 +} + +// direntHdr is a fixed sized header matching the fixed size +// fields found in the new linux dirent struct. +type direntHdr struct { + OldHdr oldDirentHdr + Typ uint8 +} + +// dirent contains the data pointed to by a new linux dirent struct. +type dirent struct { + Hdr direntHdr + Name []byte +} + +// newDirent returns a dirent from an fs.InodeOperationsInfo. +func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent { + d := &dirent{ + Hdr: direntHdr{ + OldHdr: oldDirentHdr{ + Ino: attr.InodeID, + Off: offset, + }, + Typ: toType(attr.Type), + }, + Name: []byte(name), + } + d.Hdr.OldHdr.Reclen = d.padRec(int(width)) + return d +} + +// smallestDirent returns the size of the smallest possible dirent using +// the old linux dirent format. +func smallestDirent(a arch.Context) uint { + d := dirent{} + return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1 +} + +// smallestDirent64 returns the size of the smallest possible dirent using +// the new linux dirent format. +func smallestDirent64(a arch.Context) uint { + d := dirent{} + return uint(binary.Size(d.Hdr)) + a.Width() +} + +// toType converts an fs.InodeOperationsInfo to a linux dirent typ field. +func toType(nodeType fs.InodeType) uint8 { + switch nodeType { + case fs.RegularFile, fs.SpecialFile: + return syscall.DT_REG + case fs.Symlink: + return syscall.DT_LNK + case fs.Directory: + return syscall.DT_DIR + case fs.Pipe: + return syscall.DT_FIFO + case fs.CharacterDevice: + return syscall.DT_CHR + case fs.BlockDevice: + return syscall.DT_BLK + case fs.Socket: + return syscall.DT_SOCK + default: + return syscall.DT_UNKNOWN + } +} + +// padRec pads the name field until the rec length is a multiple of the width, +// which must be a power of 2. It returns the padded rec length. +func (d *dirent) padRec(width int) uint16 { + a := int(binary.Size(d.Hdr)) + len(d.Name) + r := (a + width) &^ (width - 1) + padding := r - a + d.Name = append(d.Name, make([]byte, padding)...) + return uint16(r) +} + +// Serialize64 serializes a Dirent struct to a byte slice, keeping the new +// linux dirent format. Returns the number of bytes serialized or an error. +func (d *dirent) Serialize64(w io.Writer) (int, error) { + n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr)) + if err != nil { + return 0, err + } + n2, err := w.Write(d.Name) + if err != nil { + return 0, err + } + return n1 + n2, nil +} + +// Serialize serializes a Dirent struct to a byte slice, using the old linux +// dirent format. +// Returns the number of bytes serialized or an error. +func (d *dirent) Serialize(w io.Writer) (int, error) { + n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr)) + if err != nil { + return 0, err + } + n2, err := w.Write(d.Name) + if err != nil { + return 0, err + } + n3, err := w.Write([]byte{d.Hdr.Typ}) + if err != nil { + return 0, err + } + return n1 + n2 + n3, nil +} + +// direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an +// io.Writer. +type direntSerializer struct { + serialize func(*dirent, io.Writer) (int, error) + w io.Writer + // width is the arch native value width. + width uint + // offset is the current dirent offset. + offset uint64 + // written is the total bytes serialized. + written int + // size is the size of the buffer to serialize into. + size int +} + +func newDirentSerializer(f func(d *dirent, w io.Writer) (int, error), w io.Writer, ac arch.Context, size int) *direntSerializer { + return &direntSerializer{ + serialize: f, + w: w, + width: ac.Width(), + size: size, + } +} + +// CopyOut implements fs.InodeOperationsInfoSerializer.CopyOut. +// It serializes and writes the fs.DentAttr to the direntSerializer io.Writer. +func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error { + ds.offset++ + + d := newDirent(ds.width, name, attr, ds.offset) + + // Serialize dirent into a temp buffer. + var b bytes.Buffer + n, err := ds.serialize(d, &b) + if err != nil { + ds.offset-- + return err + } + + // Check that we have enough room remaining to write the dirent. + if n > (ds.size - ds.written) { + ds.offset-- + return io.EOF + } + + // Write out the temp buffer. + if _, err := b.WriteTo(ds.w); err != nil { + ds.offset-- + return err + } + + ds.written += n + return nil +} + +// Written returns the total number of bytes written. +func (ds *direntSerializer) Written() int { + return ds.written +} diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go new file mode 100644 index 000000000..4fd0ed794 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -0,0 +1,180 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + // As NGROUPS_MAX in include/uapi/linux/limits.h. + maxNGroups = 65536 +) + +// Getuid implements the Linux syscall getuid. +func Getuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() + return uintptr(ruid), nil, nil +} + +// Geteuid implements the Linux syscall geteuid. +func Geteuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() + return uintptr(euid), nil, nil +} + +// Getresuid implements the Linux syscall getresuid. +func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ruidAddr := args[0].Pointer() + euidAddr := args[1].Pointer() + suidAddr := args[2].Pointer() + c := t.Credentials() + ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() + euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() + suid := c.SavedKUID.In(c.UserNamespace).OrOverflow() + if _, err := t.CopyOut(ruidAddr, ruid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(euidAddr, euid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(suidAddr, suid); err != nil { + return 0, nil, err + } + return 0, nil, nil +} + +// Getgid implements the Linux syscall getgid. +func Getgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() + return uintptr(rgid), nil, nil +} + +// Getegid implements the Linux syscall getegid. +func Getegid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + c := t.Credentials() + egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() + return uintptr(egid), nil, nil +} + +// Getresgid implements the Linux syscall getresgid. +func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + rgidAddr := args[0].Pointer() + egidAddr := args[1].Pointer() + sgidAddr := args[2].Pointer() + c := t.Credentials() + rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() + egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() + sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow() + if _, err := t.CopyOut(rgidAddr, rgid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(egidAddr, egid); err != nil { + return 0, nil, err + } + if _, err := t.CopyOut(sgidAddr, sgid); err != nil { + return 0, nil, err + } + return 0, nil, nil +} + +// Setuid implements the Linux syscall setuid. +func Setuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + uid := auth.UID(args[0].Int()) + return 0, nil, t.SetUID(uid) +} + +// Setreuid implements the Linux syscall setreuid. +func Setreuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ruid := auth.UID(args[0].Int()) + euid := auth.UID(args[1].Int()) + return 0, nil, t.SetREUID(ruid, euid) +} + +// Setresuid implements the Linux syscall setreuid. +func Setresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ruid := auth.UID(args[0].Int()) + euid := auth.UID(args[1].Int()) + suid := auth.UID(args[2].Int()) + return 0, nil, t.SetRESUID(ruid, euid, suid) +} + +// Setgid implements the Linux syscall setgid. +func Setgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + gid := auth.GID(args[0].Int()) + return 0, nil, t.SetGID(gid) +} + +// Setregid implements the Linux syscall setregid. +func Setregid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + rgid := auth.GID(args[0].Int()) + egid := auth.GID(args[1].Int()) + return 0, nil, t.SetREGID(rgid, egid) +} + +// Setresgid implements the Linux syscall setregid. +func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + rgid := auth.GID(args[0].Int()) + egid := auth.GID(args[1].Int()) + sgid := auth.GID(args[2].Int()) + return 0, nil, t.SetRESGID(rgid, egid, sgid) +} + +// Getgroups implements the Linux syscall getgroups. +func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := int(args[0].Int()) + if size < 0 { + return 0, nil, syserror.EINVAL + } + kgids := t.Credentials().ExtraKGIDs + // "If size is zero, list is not modified, but the total number of + // supplementary group IDs for the process is returned." - getgroups(2) + if size == 0 { + return uintptr(len(kgids)), nil, nil + } + if size < len(kgids) { + return 0, nil, syserror.EINVAL + } + gids := make([]auth.GID, len(kgids)) + for i, kgid := range kgids { + gids[i] = kgid.In(t.UserNamespace()).OrOverflow() + } + if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil { + return 0, nil, err + } + return uintptr(len(gids)), nil, nil +} + +// Setgroups implements the Linux syscall setgroups. +func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + if size < 0 || size > maxNGroups { + return 0, nil, syserror.EINVAL + } + if size == 0 { + return 0, nil, t.SetExtraGIDs(nil) + } + gids := make([]auth.GID, size) + if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil { + return 0, nil, err + } + return 0, nil, t.SetExtraGIDs(gids) +} diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go new file mode 100644 index 000000000..725204dff --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" +) + +const allFlags = int(linux.IN_NONBLOCK | linux.IN_CLOEXEC) + +// InotifyInit1 implements the inotify_init1() syscalls. +func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + + if flags&^allFlags != 0 { + return 0, nil, syscall.EINVAL + } + + dirent := fs.NewDirent(anon.NewInode(t), "inotify") + fileFlags := fs.FileFlags{ + Read: true, + Write: true, + NonBlocking: flags&linux.IN_NONBLOCK != 0, + } + n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) + defer n.DecRef() + + fd, err := t.FDMap().NewFDFrom(0, n, kernel.FDFlags{ + CloseOnExec: flags&linux.IN_CLOEXEC != 0, + }, t.ThreadGroup().Limits()) + + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// InotifyInit implements the inotify_init() syscalls. +func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[0].Value = 0 + return InotifyInit1(t, args) +} + +// fdToInotify resolves an fd to an inotify object. If successful, the file will +// have an extra ref and the caller is responsible for releasing the ref. +func fdToInotify(t *kernel.Task, fd kdefs.FD) (*fs.Inotify, *fs.File, error) { + file := t.FDMap().GetFile(fd) + if file == nil { + // Invalid fd. + return nil, nil, syscall.EBADF + } + + ino, ok := file.FileOperations.(*fs.Inotify) + if !ok { + // Not an inotify fd. + file.DecRef() + return nil, nil, syscall.EINVAL + } + + return ino, file, nil +} + +// InotifyAddWatch implements the inotify_add_watch() syscall. +func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + mask := args[2].Uint() + + // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." + // -- inotify(7) + resolve := mask&linux.IN_DONT_FOLLOW == 0 + + // "EINVAL: The given event mask contains no valid events." + // -- inotify_add_watch(2) + if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + return 0, nil, syscall.EINVAL + } + + ino, file, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + err = fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, dirent *fs.Dirent) error { + // "IN_ONLYDIR: Only watch pathname if it is a directory." -- inotify(7) + if onlyDir := mask&linux.IN_ONLYDIR != 0; onlyDir && !fs.IsDir(dirent.Inode.StableAttr) { + return syscall.ENOTDIR + } + + // Copy out to the return frame. + fd = kdefs.FD(ino.AddWatch(dirent, mask)) + + return nil + }) + return uintptr(fd), nil, err // Return from the existing value. +} + +// InotifyRmWatch implements the inotify_rm_watch() syscall. +func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + wd := args[1].Int() + + ino, file, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + return 0, nil, ino.RmWatch(wd) +} diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go new file mode 100644 index 000000000..97b51ba7c --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Lseek implements linux syscall lseek(2). +func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + offset := args[1].Int64() + whence := args[2].Int() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var sw fs.SeekWhence + switch whence { + case 0: + sw = fs.SeekSet + case 1: + sw = fs.SeekCurrent + case 2: + sw = fs.SeekEnd + default: + return 0, nil, syserror.EINVAL + } + + offset, serr := file.Seek(t, sw, offset) + err := handleIOError(t, false /* partialResult */, serr, kernel.ERESTARTSYS, "lseek", file) + if err != nil { + return 0, nil, err + } + return uintptr(offset), nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go new file mode 100644 index 000000000..2c7d41de0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -0,0 +1,435 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "bytes" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Brk implements linux syscall brk(2). +func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr, _ := t.MemoryManager().Brk(t, args[0].Pointer()) + // "However, the actual Linux system call returns the new program break on + // success. On failure, the system call returns the current break." - + // brk(2) + return uintptr(addr), nil, nil +} + +// Mmap implements linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := kdefs.FD(args[4].Int()) + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. + if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + flags := file.Flags() + // mmap unconditionally requires that the FD is readable. + if !flags.Read { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !flags.Write { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} + +// Munmap implements linux syscall munmap(2). +func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) +} + +// Mremap implements linux syscall mremap(2). +func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldAddr := args[0].Pointer() + oldSize := args[1].Uint64() + newSize := args[2].Uint64() + flags := args[3].Uint64() + newAddr := args[4].Pointer() + + if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { + return 0, nil, syserror.EINVAL + } + mayMove := flags&linux.MREMAP_MAYMOVE != 0 + fixed := flags&linux.MREMAP_FIXED != 0 + var moveMode mm.MRemapMoveMode + switch { + case !mayMove && !fixed: + moveMode = mm.MRemapNoMove + case mayMove && !fixed: + moveMode = mm.MRemapMayMove + case mayMove && fixed: + moveMode = mm.MRemapMustMove + case !mayMove && fixed: + // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be + // specified." - mremap(2) + return 0, nil, syserror.EINVAL + } + + rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ + Move: moveMode, + NewAddr: newAddr, + }) + return uintptr(rv), nil, err +} + +// Mprotect implements linux syscall mprotect(2). +func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + length := args[1].Uint64() + prot := args[2].Int() + err := t.MemoryManager().MProtect(args[0].Pointer(), length, usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, linux.PROT_GROWSDOWN&prot != 0) + return 0, nil, err +} + +// Madvise implements linux syscall madvise(2). +func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := uint64(args[1].SizeT()) + adv := args[2].Int() + + // "The Linux implementation requires that the address addr be + // page-aligned, and allows length to be zero." - madvise(2) + if addr.RoundDown() != addr { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + // Not explicitly stated: length need not be page-aligned. + lenAddr, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.EINVAL + } + length = uint64(lenAddr) + + switch adv { + case linux.MADV_DONTNEED: + return 0, nil, t.MemoryManager().Decommit(addr, length) + case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: + fallthrough + case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: + fallthrough + case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: + // Do nothing, we totally ignore the suggestions above. + return 0, nil, nil + case linux.MADV_REMOVE, linux.MADV_DOFORK, linux.MADV_DONTFORK: + // These "suggestions" have application-visible side effects, so we + // have to indicate that we don't support them. + return 0, nil, syserror.ENOSYS + case linux.MADV_HWPOISON: + // Only privileged processes are allowed to poison pages. + return 0, nil, syserror.EPERM + default: + // If adv is not a valid value tell the caller. + return 0, nil, syserror.EINVAL + } +} + +func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) { + if ptr != 0 { + return t.CopyOut(ptr, val) + } + return 0, nil +} + +// GetMempolicy implements the syscall get_mempolicy(2). +func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mode := args[0].Pointer() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + addr := args[3].Pointer() + flags := args[4].Uint() + + memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 + nodeFlag := flags&linux.MPOL_F_NODE != 0 + addrFlag := flags&linux.MPOL_F_ADDR != 0 + + // TODO: Once sysfs is implemented, report a single numa node in + // /sys/devices/system/node. + if nodemask != 0 && maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + // 'addr' provided iff 'addrFlag' set. + if addrFlag == (addr == 0) { + return 0, nil, syserror.EINVAL + } + + // Default policy for the thread. + if flags == 0 { + policy, nodemaskVal := t.NumaPolicy() + if _, err := copyOutIfNotNull(t, mode, policy); err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + // Report all nodes available to caller. + if memsAllowed { + // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED. + if nodeFlag || addrFlag { + return 0, nil, syserror.EINVAL + } + + // Report a single numa node. + if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + if addrFlag { + if nodeFlag { + // Return the id for the node where 'addr' resides, via 'mode'. + // + // The real get_mempolicy(2) allocates the page referenced by 'addr' + // by simulating a read, if it is unallocated before the call. It + // then returns the node the page is allocated on through the mode + // pointer. + b := t.CopyScratchBuffer(1) + _, err := t.CopyInBytes(addr, b) + if err != nil { + return 0, nil, syserror.EFAULT + } + if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + } else { + storedPolicy, _ := t.NumaPolicy() + // Return the policy governing the memory referenced by 'addr'. + if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil { + return 0, nil, syserror.EFAULT + } + } + return 0, nil, nil + } + + storedPolicy, _ := t.NumaPolicy() + if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) { + // Policy for current thread is to interleave memory between + // nodes. Return the next node we'll allocate on. Since we only have a + // single node, this is always node 0. + if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { + return 0, nil, syserror.EFAULT + } + return 0, nil, nil + } + + return 0, nil, syserror.EINVAL +} + +func allowedNodesMask() uint32 { + const maxNodes = 1 + return ^uint32((1 << maxNodes) - 1) +} + +// SetMempolicy implements the syscall set_mempolicy(2). +func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + modeWithFlags := args[0].Int() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + + if maxnode < 1 { + return 0, nil, syserror.EINVAL + } + + if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS { + // Can't specify multiple modes simultaneously. Must also contain a + // valid mode, which we check below. + return 0, nil, syserror.EINVAL + } + + mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS + if mode < 0 || mode >= linux.MPOL_MAX { + return 0, nil, syserror.EINVAL + } + + var nodemaskVal uint32 + if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { + return 0, nil, syserror.EFAULT + } + + // When setting MPOL_INTERLEAVE, nodemask must not be empty. + if mode == linux.MPOL_INTERLEAVE && nodemaskVal == 0 { + return 0, nil, syserror.EINVAL + } + + if nodemaskVal&allowedNodesMask() != 0 { + // Invalid node specified. + return 0, nil, syserror.EINVAL + } + + t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal) + + return 0, nil, nil +} + +// Mincore implements the syscall mincore(2). +func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + vec := args[2].Pointer() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + // "The length argument need not be a multiple of the page size, but since + // residency information is returned for whole pages, length is effectively + // rounded up to the next multiple of the page size." - mincore(2) + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + + // Pretend that all mapped pages are "resident in core". + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + // "ENOMEM: addr to addr + length contained unmapped memory." + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize)) + _, err := t.CopyOut(vec, resident) + return 0, nil, err +} + +// Msync implements Linux syscall msync(2). +func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } + // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, + // and may additionally include the MS_INVALIDATE bit. ... However, Linux + // permits a call to msync() that specifies neither of these flags, with + // semantics that are (currently) equivalent to specifying MS_ASYNC." - + // msync(2) + if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { + return 0, nil, syserror.EINVAL + } + sync := flags&linux.MS_SYNC != 0 + if sync && flags&linux.MS_ASYNC != 0 { + return 0, nil, syserror.EINVAL + } + + // MS_INVALIDATE "asks to invalidate other mappings of the same file (so + // that they can be updated with the fresh values just written)". This is a + // no-op given that shared memory exists. However, MS_INVALIDATE can also + // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, + // and a memory lock exists for the specified address range." Given that + // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since + // some user program could be using it for synchronization. + if flags&linux.MS_INVALIDATE != 0 { + return 0, nil, syserror.EINVAL + } + // MS_SYNC "requests an update and waits for it to complete." + if sync { + err := t.MemoryManager().Sync(t, addr, uint64(la)) + // Sync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + // MS_ASYNC "specifies that an update be scheduled, but the call returns + // immediately". As long as dirty pages are tracked and eventually written + // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC + // is in fact a no-op, since the kernel properly tracks dirty pages and + // flushes them to storage as necessary.") + // + // However: "ENOMEM: The indicated memory (or part of it) was not mapped." + // This applies even for MS_ASYNC. + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM + } + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM + } + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go new file mode 100644 index 000000000..d70b79e4f --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -0,0 +1,140 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Mount implements Linux syscall mount(2). +func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sourceAddr := args[0].Pointer() + targetAddr := args[1].Pointer() + typeAddr := args[2].Pointer() + flags := args[3].Uint64() + dataAddr := args[4].Pointer() + + fsType, err := t.CopyInString(typeAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + sourcePath, _, err := copyInPath(t, sourceAddr, true /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + targetPath, _, err := copyInPath(t, targetAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err := t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + // Ignore magic value that was required before Linux 2.4. + if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { + flags = flags &^ linux.MS_MGC_MSK + } + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { + return 0, nil, syserror.EPERM + } + + const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | + linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | + linux.MS_UNBINDABLE | linux.MS_MOVE + + // Silently allow MS_NOSUID, since we don't implement set-id bits + // anyway. + const unsupportedFlags = linux.MS_NODEV | linux.MS_NOEXEC | + linux.MS_NODIRATIME | linux.MS_STRICTATIME + + // Linux just allows passing any flags to mount(2) - it won't fail when + // unknown or unsupported flags are passed. Since we don't implement + // everything, we fail explicitly on flags that are unimplemented. + if flags&(unsupportedOps|unsupportedFlags) != 0 { + return 0, nil, syserror.EINVAL + } + + rsys, ok := fs.FindFilesystem(fsType) + if !ok { + return 0, nil, syserror.ENODEV + } + if !rsys.AllowUserMount() { + return 0, nil, syserror.EPERM + } + + var superFlags fs.MountSourceFlags + if flags&linux.MS_NOATIME == linux.MS_NOATIME { + superFlags.NoAtime = true + } + if flags&linux.MS_RDONLY == linux.MS_RDONLY { + superFlags.ReadOnly = true + } + + rootInode, err := rsys.Mount(t, sourcePath, superFlags, data) + if err != nil { + return 0, nil, syserror.EINVAL + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return t.MountNamespace().Mount(t, d, rootInode) + }) +} + +// Umount2 implements Linux syscall umount2(2). +func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + + const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE + if flags&unsupported != 0 { + return 0, nil, syserror.EINVAL + } + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + // + // Currently, this is always the init task's user namespace. + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().UserNamespace()) { + return 0, nil, syserror.EPERM + } + + resolve := flags&linux.UMOUNT_NOFOLLOW != linux.UMOUNT_NOFOLLOW + detachOnly := flags&linux.MNT_DETACH == linux.MNT_DETACH + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolve, func(root *fs.Dirent, d *fs.Dirent) error { + return t.MountNamespace().Unmount(t, d, detachOnly) + }) +} diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go new file mode 100644 index 000000000..3efc06a27 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// pipe2 implements the actual system call with flags. +func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { + if flags&^(syscall.O_NONBLOCK|syscall.O_CLOEXEC) != 0 { + return 0, syscall.EINVAL + } + r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) + + r.SetFlags(linuxToFlags(flags).Settable()) + defer r.DecRef() + + w.SetFlags(linuxToFlags(flags).Settable()) + defer w.DecRef() + + rfd, err := t.FDMap().NewFDFrom(0, r, kernel.FDFlags{ + CloseOnExec: flags&syscall.O_CLOEXEC != 0}, + t.ThreadGroup().Limits()) + if err != nil { + return 0, err + } + + wfd, err := t.FDMap().NewFDFrom(0, w, kernel.FDFlags{ + CloseOnExec: flags&syscall.O_CLOEXEC != 0}, + t.ThreadGroup().Limits()) + if err != nil { + t.FDMap().Remove(rfd) + return 0, err + } + + if _, err := t.CopyOut(addr, []kdefs.FD{rfd, wfd}); err != nil { + t.FDMap().Remove(rfd) + t.FDMap().Remove(wfd) + return 0, syscall.EFAULT + } + return 0, nil +} + +// Pipe implements linux syscall pipe(2). +func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + n, err := pipe2(t, addr, 0) + return n, nil, err +} + +// Pipe2 implements linux syscall pipe2(2). +func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := uint(args[1].Uint()) + + n, err := pipe2(t, addr, flags) + return n, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go new file mode 100644 index 000000000..d4dbfd285 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -0,0 +1,429 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// fileCap is the maximum allowable files for poll & select. +const fileCap = 1024 * 1024 + +// Masks for "readable", "writable", and "exceptional" events as defined by +// select(2). +const ( + // selectReadEvents is analogous to the Linux kernel's + // fs/select.c:POLLIN_SET. + selectReadEvents = waiter.EventIn | waiter.EventHUp | waiter.EventErr + + // selectWriteEvents is analogous to the Linux kernel's + // fs/select.c:POLLOUT_SET. + selectWriteEvents = waiter.EventOut | waiter.EventErr + + // selectExceptEvents is analogous to the Linux kernel's + // fs/select.c:POLLEX_SET. + selectExceptEvents = waiter.EventPri +) + +func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return timeout, 0, syserror.EINVAL + } + + pfd := make([]syscalls.PollFD, nfds) + if nfds > 0 { + if _, err := t.CopyIn(pfdAddr, &pfd); err != nil { + return timeout, 0, err + } + } + + // Compatibility warning: Linux adds POLLHUP and POLLERR just before + // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after + // polling, changing event masks here is an application-visible difference. + // (Linux also doesn't copy out event masks at all, only revents.) + for i := range pfd { + pfd[i].Events |= waiter.EventHUp | waiter.EventErr + } + remainingTimeout, n, err := syscalls.Poll(t, pfd, timeout) + err = syserror.ConvertIntr(err, syserror.EINTR) + + // The poll entries are copied out regardless of whether + // any are set or not. This aligns with the Linux behavior. + if nfds > 0 && err == nil { + if _, err := t.CopyOut(pfdAddr, pfd); err != nil { + return remainingTimeout, 0, err + } + } + + return remainingTimeout, n, err +} + +func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { + if nfds < 0 || uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return 0, syserror.EINVAL + } + + // Capture all the provided input vectors. + // + // N.B. This only works on little-endian architectures. + byteCount := (nfds + 7) / 8 + bitsInLastPartialByte := uint(nfds % 8) + r := make([]byte, byteCount) + w := make([]byte, byteCount) + e := make([]byte, byteCount) + + if readFDs != 0 { + if _, err := t.CopyIn(readFDs, &r); err != nil { + return 0, err + } + // Mask out bits above nfds. + if bitsInLastPartialByte != 0 { + r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte + } + } + + if writeFDs != 0 { + if _, err := t.CopyIn(writeFDs, &w); err != nil { + return 0, err + } + if bitsInLastPartialByte != 0 { + w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte + } + } + + if exceptFDs != 0 { + if _, err := t.CopyIn(exceptFDs, &e); err != nil { + return 0, err + } + if bitsInLastPartialByte != 0 { + e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte + } + } + + // Count how many FDs are actually being requested so that we can build + // a PollFD array. + fdCount := 0 + for i := 0; i < byteCount; i++ { + v := r[i] | w[i] | e[i] + for v != 0 { + v &= (v - 1) + fdCount++ + } + } + + // Build the PollFD array. + pfd := make([]syscalls.PollFD, 0, fdCount) + fd := kdefs.FD(0) + for i := 0; i < byteCount; i++ { + rV, wV, eV := r[i], w[i], e[i] + v := rV | wV | eV + m := byte(1) + for j := 0; j < 8; j++ { + if (v & m) != 0 { + // Make sure the fd is valid and decrement the reference + // immediately to ensure we don't leak. Note, another thread + // might be about to close fd. This is racy, but that's + // OK. Linux is racy in the same way. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, syserror.EBADF + } + file.DecRef() + + mask := waiter.EventMask(0) + if (rV & m) != 0 { + mask |= selectReadEvents + } + + if (wV & m) != 0 { + mask |= selectWriteEvents + } + + if (eV & m) != 0 { + mask |= selectExceptEvents + } + + pfd = append(pfd, syscalls.PollFD{ + FD: fd, + Events: mask, + }) + } + + fd++ + m <<= 1 + } + } + + // Do the syscall, then count the number of bits set. + _, _, err := syscalls.Poll(t, pfd, timeout) + if err != nil { + return 0, syserror.ConvertIntr(err, syserror.EINTR) + } + + // r, w, and e are currently event mask bitsets; unset bits corresponding + // to events that *didn't* occur. + bitSetCount := uintptr(0) + for idx := range pfd { + events := pfd[idx].REvents + i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) + m := byte(1) << j + if r[i]&m != 0 { + if (events & selectReadEvents) != 0 { + bitSetCount++ + } else { + r[i] &^= m + } + } + if w[i]&m != 0 { + if (events & selectWriteEvents) != 0 { + bitSetCount++ + } else { + w[i] &^= m + } + } + if e[i]&m != 0 { + if (events & selectExceptEvents) != 0 { + bitSetCount++ + } else { + e[i] &^= m + } + } + } + + // Copy updated vectors back. + if readFDs != 0 { + if _, err := t.CopyOut(readFDs, r); err != nil { + return 0, err + } + } + + if writeFDs != 0 { + if _, err := t.CopyOut(writeFDs, w); err != nil { + return 0, err + } + } + + if exceptFDs != 0 { + if _, err := t.CopyOut(exceptFDs, e); err != nil { + return 0, err + } + } + + return bitSetCount, nil +} + +// timeoutRemaining returns the amount of time remaining for the specified +// timeout or 0 if it has elapsed. +// +// startNs must be from CLOCK_MONOTONIC. +func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { + now := t.Kernel().MonotonicClock().Now() + remaining := timeout - now.Sub(startNs) + if remaining < 0 { + remaining = 0 + } + return remaining +} + +// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) + return copyTimespecOut(t, timespecAddr, &tsRemaining) +} + +// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) + return copyTimevalOut(t, timevalAddr, &tvRemaining) +} + +// pollRestartBlock encapsulates the state required to restart poll(2) via +// restart_syscall(2). +type pollRestartBlock struct { + pfdAddr usermem.Addr + nfds uint + timeout time.Duration +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return poll(t, p.pfdAddr, p.nfds, p.timeout) +} + +func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) { + remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) + // On an interrupt poll(2) is restarted with the remaining timeout. + if err == syserror.EINTR { + t.SetSyscallRestartBlock(&pollRestartBlock{ + pfdAddr: pfdAddr, + nfds: nfds, + timeout: remainingTimeout, + }) + return 0, kernel.ERESTART_RESTARTBLOCK + } + return n, err +} + +// Poll implements linux syscall poll(2). +func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timeout := time.Duration(args[2].Int()) * time.Millisecond + n, err := poll(t, pfdAddr, nfds, timeout) + return n, nil, err +} + +// Ppoll implements linux syscall ppoll(2). +func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timespecAddr := args[2].Pointer() + maskAddr := args[3].Pointer() + maskSize := uint(args[4].Uint()) + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskAddr != 0 { + mask, err := copyInSigSet(t, maskAddr, maskSize) + if err != nil { + return 0, nil, err + } + + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + + _, n, err := doPoll(t, pfdAddr, nfds, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // doPoll returns EINTR if interrupted, but ppoll is normally restartable + // if interrupted by something other than a signal handled by the + // application (i.e. returns ERESTARTNOHAND). However, if + // copyOutTimespecRemaining failed, then the restarted ppoll would use the + // wrong timeout, so the error should be left as EINTR. + // + // Note that this means that if err is nil but copyErr is not, copyErr is + // ignored. This is consistent with Linux. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Select implements linux syscall select(2). +func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timevalAddr := args[4].Pointer() + + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timevalAddr != 0 { + timeval, err := copyTimevalIn(t, timevalAddr) + if err != nil { + return 0, nil, err + } + if timeval.Sec < 0 || timeval.Usec < 0 { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(timeval.ToNsecCapped()) + } + startNs := t.Kernel().MonotonicClock().Now() + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Pselect implements linux syscall pselect(2). +func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timespecAddr := args[4].Pointer() + maskWithSizeAddr := args[5].Pointer() + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskWithSizeAddr != 0 { + maskAddr, size, err := copyInSigSetWithSize(t, maskWithSizeAddr) + if err != nil { + return 0, nil, err + } + + if maskAddr != 0 { + mask, err := copyInSigSet(t, maskAddr, size) + if err != nil { + return 0, nil, err + } + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + } + } + + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go new file mode 100644 index 000000000..2ca7471cf --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -0,0 +1,188 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// userSockFprog is equivalent to Linux's struct sock_fprog on amd64. +type userSockFprog struct { + // Len is the length of the filter in BPF instructions. + Len uint16 + + _ [6]byte // padding for alignment + + // Filter is a user pointer to the struct sock_filter array that makes up + // the filter program. Filter is a uint64 rather than a usermem.Addr + // because usermem.Addr is actually uintptr, which is not a fixed-size + // type, and encoding/binary.Read objects to this. + Filter uint64 +} + +// Prctl implements linux syscall prctl(2). +// It has a list of subfunctions which operate on the process. The arguments are +// all based on each subfunction. +func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + option := args[0].Int() + + switch option { + case linux.PR_SET_PDEATHSIG: + sig := linux.Signal(args[1].Int()) + if sig != 0 && !sig.IsValid() { + return 0, nil, syscall.EINVAL + } + t.SetParentDeathSignal(sig) + return 0, nil, nil + + case linux.PR_GET_PDEATHSIG: + _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal())) + return 0, nil, err + + case linux.PR_GET_KEEPCAPS: + if t.Credentials().KeepCaps { + return 1, nil, nil + } + + return 0, nil, nil + + case linux.PR_SET_KEEPCAPS: + val := args[1].Int() + // prctl(2): arg2 must be either 0 (permitted capabilities are cleared) + // or 1 (permitted capabilities are kept). + if val == 0 { + t.SetKeepCaps(false) + } else if val == 1 { + t.SetKeepCaps(true) + } else { + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil + + case linux.PR_SET_NAME: + addr := args[1].Pointer() + name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) + if err != nil && err != syscall.ENAMETOOLONG { + return 0, nil, err + } + t.SetName(name) + + case linux.PR_GET_NAME: + addr := args[1].Pointer() + buf := make([]byte, linux.TASK_COMM_LEN) + len := copy(buf, t.Name()) + if len < linux.TASK_COMM_LEN { + buf[len] = 0 + len++ + } + _, err := t.CopyOut(addr, buf[:len]) + if err != nil { + return 0, nil, err + } + + case linux.PR_SET_MM: + switch args[1].Int() { + case linux.PR_SET_MM_EXE_FILE: + fd := kdefs.FD(args[2].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // They trying to set exe to a non-file? + if !fs.IsFile(file.Dirent.Inode.StableAttr) { + return 0, nil, syscall.EBADF + } + + // Set the underlying executable. + t.MemoryManager().SetExecutable(file.Dirent) + default: + return 0, nil, syscall.EINVAL + } + + case linux.PR_SET_NO_NEW_PRIVS: + if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { + return 0, nil, syscall.EINVAL + } + // no_new_privs is assumed to always be set. See + // auth.Credentials.UpdateForExec. + return 0, nil, nil + + case linux.PR_GET_NO_NEW_PRIVS: + if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { + return 0, nil, syscall.EINVAL + } + return 1, nil, nil + + case linux.PR_SET_SECCOMP: + if args[1].Int() != linux.SECCOMP_MODE_FILTER { + // Unsupported mode. + return 0, nil, syscall.EINVAL + } + var fprog userSockFprog + if _, err := t.CopyIn(args[2].Pointer(), &fprog); err != nil { + return 0, nil, err + } + filter := make([]linux.BPFInstruction, int(fprog.Len)) + if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { + return 0, nil, err + } + compiledFilter, err := bpf.Compile(filter) + if err != nil { + t.Debugf("Invalid seccomp-bpf filter: %v", err) + return 0, nil, syscall.EINVAL + } + return 0, nil, t.AppendSyscallFilter(compiledFilter) + + case linux.PR_GET_SECCOMP: + return uintptr(t.SeccompMode()), nil, nil + + case linux.PR_CAPBSET_READ: + cp := linux.Capability(args[1].Uint64()) + if !cp.Ok() { + return 0, nil, syscall.EINVAL + } + var rv uintptr + if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { + rv = 1 + } + return rv, nil, nil + + case linux.PR_CAPBSET_DROP: + cp := linux.Capability(args[1].Uint64()) + if !cp.Ok() { + return 0, nil, syscall.EINVAL + } + return 0, nil, t.DropBoundingCapability(cp) + + default: + t.Warningf("Unsupported prctl %d", option) + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go new file mode 100644 index 000000000..2dd59b1c3 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -0,0 +1,92 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "crypto/rand" + "io" + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + _GRND_NONBLOCK = 0x1 + _GRND_RANDOM = 0x2 +) + +// GetRandom implements the linux syscall getrandom(2). +// +// In a multi-tenant/shared environment, the only valid implementation is to +// fetch data from the urandom pool, otherwise starvation attacks become +// possible. The urandom pool is also expected to have plenty of entropy, thus +// the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as +// the pool will already be initialized. +func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + // Flags are checked for validity but otherwise ignored. See above. + if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 { + return 0, nil, syserror.EINVAL + } + + if length > math.MaxInt32 { + length = math.MaxInt32 + } + ar, ok := addr.ToRange(uint64(length)) + if !ok { + return 0, nil, syserror.EFAULT + } + + // "If the urandom source has been initialized, reads of up to 256 bytes + // will always return as many bytes as requested and will not be + // interrupted by signals. No such guarantees apply for larger buffer + // sizes." - getrandom(2) + min := int(length) + if min > 256 { + min = 256 + } + n, err := t.MemoryManager().CopyOutFrom(t, usermem.AddrRangeSeqOf(ar), safemem.FromIOReader{&randReader{-1, min}}, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if n >= int64(min) { + return uintptr(n), nil, nil + } + return 0, nil, err +} + +// randReader is a io.Reader that handles partial reads from rand.Reader. +type randReader struct { + done int + min int +} + +// Read implements io.Reader.Read. +func (r *randReader) Read(dst []byte) (int, error) { + if r.done >= r.min { + return rand.Reader.Read(dst) + } + min := r.min - r.done + if min > len(dst) { + min = len(dst) + } + return io.ReadAtLeast(rand.Reader, dst, min) +} diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go new file mode 100644 index 000000000..0be2d195a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -0,0 +1,274 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // EventMaskRead contains events that can be triggerd on reads. + EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr +) + +// Read implements linux syscall read(2). Note that we try to get a buffer that +// is exactly the size requested because some applications like qemu expect +// they can do large reads all at once. Bug for bug. Same for other read +// calls below. +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +// Pread64 implements linux syscall pread64(2). +func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) +} + +// Readv implements linux syscall readv(2). +func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := readv(t, file, dst) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) +} + +// Preadv implements linux syscall preadv(2). +func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is reading at an offset supported? + if !file.Flags().Pread { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is readable. + if !file.Flags().Read { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := preadv(t, file, dst, offset) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) +} + +func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { + n, err := f.Readv(t, dst) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst64(n) + + // Issue the request and break out if it completes with anything + // other than "would block". + n, err = f.Readv(t, dst) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + + return total, err +} + +func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + n, err := f.Preadv(t, dst, offset) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst64(n) + + // Issue the request and break out if it completes with anything + // other than "would block". + n, err = f.Preadv(t, dst, offset+total) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we read anything. + f.Dirent.InotifyEvent(linux.IN_ACCESS, 0) + } + + return total, err +} diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go new file mode 100644 index 000000000..481e79eaa --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -0,0 +1,217 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// rlimit describes an implementation of 'struct rlimit', which may vary from +// system-to-system. +type rlimit interface { + // toLimit converts an rlimit to a limits.Limit. + toLimit() *limits.Limit + + // fromLimit converts a limits.Limit to an rlimit. + fromLimit(lim limits.Limit) + + // copyIn copies an rlimit from the untrusted app to the kernel. + copyIn(t *kernel.Task, addr usermem.Addr) error + + // copyOut copies an rlimit from the kernel to the untrusted app. + copyOut(t *kernel.Task, addr usermem.Addr) error +} + +// newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system. +func newRlimit(t *kernel.Task) (rlimit, error) { + switch t.Arch().Width() { + case 8: + // On 64-bit system, struct rlimit and struct rlimit64 are identical. + return &rlimit64{}, nil + default: + return nil, syserror.ENOSYS + } +} + +type rlimit64 struct { + Cur uint64 + Max uint64 +} + +func (r *rlimit64) toLimit() *limits.Limit { + return &limits.Limit{ + Cur: limits.FromLinux(r.Cur), + Max: limits.FromLinux(r.Max), + } +} + +func (r *rlimit64) fromLimit(lim limits.Limit) { + *r = rlimit64{ + Cur: limits.ToLinux(lim.Cur), + Max: limits.ToLinux(lim.Max), + } +} + +func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error { + _, err := t.CopyIn(addr, r) + return err +} + +func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error { + _, err := t.CopyOut(addr, *r) + return err +} + +func makeRlimit64(lim limits.Limit) *rlimit64 { + return &rlimit64{Cur: lim.Cur, Max: lim.Max} +} + +// setableLimits is the set of supported setable limits. +var setableLimits = map[limits.LimitType]struct{}{ + limits.NumberOfFiles: {}, + limits.AS: {}, + limits.CPU: {}, + limits.Data: {}, + limits.FileSize: {}, + limits.Stack: {}, + // These are not enforced, but we include them here to avoid returning + // EPERM, since some apps expect them to succeed. + limits.Core: {}, + limits.ProcessCount: {}, +} + +func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) { + if newLim == nil { + return t.ThreadGroup().Limits().Get(resource), nil + } + + if _, ok := setableLimits[resource]; !ok { + return limits.Limit{}, syserror.EPERM + } + oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim) + if err != nil { + return limits.Limit{}, err + } + + if resource == limits.CPU { + t.ThreadGroup().SetCPUTimer(newLim) + } + return oldLim, nil +} + +// Getrlimit implements linux syscall getrlimit(2). +func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + resource, ok := limits.FromLinuxResource[int(args[0].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + addr := args[1].Pointer() + rlim, err := newRlimit(t) + if err != nil { + return 0, nil, err + } + lim, err := prlimit64(t, resource, nil) + if err != nil { + return 0, nil, err + } + rlim.fromLimit(lim) + return 0, nil, rlim.copyOut(t, addr) +} + +// Setrlimit implements linux syscall setrlimit(2). +func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + resource, ok := limits.FromLinuxResource[int(args[0].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + addr := args[1].Pointer() + rlim, err := newRlimit(t) + if err != nil { + return 0, nil, err + } + if err := rlim.copyIn(t, addr); err != nil { + return 0, nil, syserror.EFAULT + } + _, err = prlimit64(t, resource, rlim.toLimit()) + return 0, nil, err +} + +// Prlimit64 implements linux syscall prlimit64(2). +func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + resource, ok := limits.FromLinuxResource[int(args[1].Int())] + if !ok { + // Return err; unknown limit. + return 0, nil, syserror.EINVAL + } + newRlimAddr := args[2].Pointer() + oldRlimAddr := args[3].Pointer() + + var newLim *limits.Limit + if newRlimAddr != 0 { + var nrl rlimit64 + if err := nrl.copyIn(t, newRlimAddr); err != nil { + return 0, nil, syserror.EFAULT + } + newLim = nrl.toLimit() + } + + if tid < 0 { + return 0, nil, syserror.EINVAL + } + ot := t + if tid > 0 { + if ot = t.PIDNamespace().TaskWithID(tid); ot == nil { + return 0, nil, syserror.ESRCH + } + } + + // "To set or get the resources of a process other than itself, the caller + // must have the CAP_SYS_RESOURCE capability, or the real, effective, and + // saved set user IDs of the target process must match the real user ID of + // the caller and the real, effective, and saved set group IDs of the + // target process must match the real group ID of the caller." + if !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) { + cred, tcred := t.Credentials(), ot.Credentials() + if cred.RealKUID != tcred.RealKUID || + cred.RealKUID != tcred.EffectiveKUID || + cred.RealKUID != tcred.SavedKUID || + cred.RealKGID != tcred.RealKGID || + cred.RealKGID != tcred.EffectiveKGID || + cred.RealKGID != tcred.SavedKGID { + return 0, nil, syserror.EPERM + } + } + + oldLim, err := prlimit64(ot, resource, newLim) + if err != nil { + return 0, nil, err + } + + if oldRlimAddr != 0 { + if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil { + return 0, nil, syserror.EFAULT + } + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go new file mode 100644 index 000000000..82e42b589 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func getrusage(t *kernel.Task, which int32) linux.Rusage { + var cs usage.CPUStats + + switch which { + case linux.RUSAGE_SELF: + cs = t.ThreadGroup().CPUStats() + + case linux.RUSAGE_CHILDREN: + cs = t.ThreadGroup().JoinedChildCPUStats() + + case linux.RUSAGE_THREAD: + cs = t.CPUStats() + + case linux.RUSAGE_BOTH: + tg := t.ThreadGroup() + cs = tg.CPUStats() + cs.Accumulate(tg.JoinedChildCPUStats()) + } + + return linux.Rusage{ + UTime: linux.NsecToTimeval(cs.UserTime.Nanoseconds()), + STime: linux.NsecToTimeval(cs.SysTime.Nanoseconds()), + NVCSw: int64(cs.VoluntarySwitches), + MaxRSS: int64(t.MaxRSS(which) / 1024), + } +} + +// Getrusage implements linux syscall getrusage(2). +// marked "y" are supported now +// marked "*" are not used on Linux +// marked "p" are pending for support +// +// y struct timeval ru_utime; /* user CPU time used */ +// y struct timeval ru_stime; /* system CPU time used */ +// p long ru_maxrss; /* maximum resident set size */ +// * long ru_ixrss; /* integral shared memory size */ +// * long ru_idrss; /* integral unshared data size */ +// * long ru_isrss; /* integral unshared stack size */ +// p long ru_minflt; /* page reclaims (soft page faults) */ +// p long ru_majflt; /* page faults (hard page faults) */ +// * long ru_nswap; /* swaps */ +// p long ru_inblock; /* block input operations */ +// p long ru_oublock; /* block output operations */ +// * long ru_msgsnd; /* IPC messages sent */ +// * long ru_msgrcv; /* IPC messages received */ +// * long ru_nsignals; /* signals received */ +// y long ru_nvcsw; /* voluntary context switches */ +// y long ru_nivcsw; /* involuntary context switches */ +func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + addr := args[1].Pointer() + + if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD { + return 0, nil, syserror.EINVAL + } + + ru := getrusage(t, which) + _, err := t.CopyOut(addr, &ru) + return 0, nil, err +} + +// Times implements linux syscall times(2). +func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + // Calculate the ticks first, and figure out if any additional work is + // necessary. Linux allows for a NULL addr, in which case only the + // return value is meaningful. We don't need to do anything else. + ticks := uintptr(ktime.NowFromContext(t).Nanoseconds() / linux.ClockTick.Nanoseconds()) + if addr == 0 { + return ticks, nil, nil + } + + cs1 := t.ThreadGroup().CPUStats() + cs2 := t.ThreadGroup().JoinedChildCPUStats() + r := linux.Tms{ + UTime: linux.ClockTFromDuration(cs1.UserTime), + STime: linux.ClockTFromDuration(cs1.SysTime), + CUTime: linux.ClockTFromDuration(cs2.UserTime), + CSTime: linux.ClockTFromDuration(cs2.SysTime), + } + if _, err := t.CopyOut(addr, &r); err != nil { + return 0, nil, err + } + + return ticks, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go new file mode 100644 index 000000000..ff9e46077 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +const ( + onlyScheduler = linux.SCHED_NORMAL + onlyPriority = 0 +) + +// SchedParam replicates struct sched_param in sched.h. +type SchedParam struct { + schedPriority int64 +} + +// SchedGetparam implements linux syscall sched_getparam(2). +func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + param := args[1].Pointer() + if param == 0 { + return 0, nil, syscall.EINVAL + } + if pid < 0 { + return 0, nil, syscall.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syscall.ESRCH + } + r := SchedParam{schedPriority: onlyPriority} + if _, err := t.CopyOut(param, r); err != nil { + return 0, nil, err + } + + return 0, nil, nil +} + +// SchedGetscheduler implements linux syscall sched_getscheduler(2). +func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + if pid < 0 { + return 0, nil, syscall.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syscall.ESRCH + } + return onlyScheduler, nil, nil +} + +// SchedSetscheduler implements linux syscall sched_setscheduler(2). +func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := args[0].Int() + policy := args[1].Int() + param := args[2].Pointer() + if pid < 0 { + return 0, nil, syscall.EINVAL + } + if policy != onlyScheduler { + return 0, nil, syscall.EINVAL + } + if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { + return 0, nil, syscall.ESRCH + } + var r SchedParam + if _, err := t.CopyIn(param, &r); err != nil { + return 0, nil, syscall.EINVAL + } + if r.schedPriority != onlyPriority { + return 0, nil, syscall.EINVAL + } + return 0, nil, nil +} + +// SchedGetPriorityMax implements linux syscall sched_get_priority_max(2). +func SchedGetPriorityMax(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return onlyPriority, nil, nil +} + +// SchedGetPriorityMin implements linux syscall sched_get_priority_min(2). +func SchedGetPriorityMin(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return onlyPriority, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go new file mode 100644 index 000000000..a8983705b --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -0,0 +1,166 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const opsMax = 500 // SEMOPM + +// Semget handles: semget(key_t key, int nsems, int semflg) +func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + key := args[0].Int() + nsems := args[1].Int() + flag := args[2].Int() + + private := key == linux.IPC_PRIVATE + create := flag&linux.IPC_CREAT == linux.IPC_CREAT + exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL + mode := linux.FileMode(flag & 0777) + + r := t.IPCNamespace().SemaphoreRegistry() + set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive) + if err != nil { + return 0, nil, err + } + return uintptr(set.ID), nil, nil +} + +// Semop handles: semop(int semid, struct sembuf *sops, size_t nsops) +func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + sembufAddr := args[1].Pointer() + nsops := args[2].SizeT() + + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, nil, syserror.EINVAL + } + if nsops <= 0 { + return 0, nil, syserror.EINVAL + } + if nsops > opsMax { + return 0, nil, syserror.E2BIG + } + + ops := make([]linux.Sembuf, nsops) + if _, err := t.CopyIn(sembufAddr, ops); err != nil { + return 0, nil, err + } + + creds := auth.CredentialsFromContext(t) + for { + ch, num, err := set.ExecuteOps(t, ops, creds) + if ch == nil || err != nil { + // We're done (either on success or a failure). + return 0, nil, err + } + if err = t.Block(ch); err != nil { + set.AbortWait(num, ch) + return 0, nil, err + } + } +} + +// Semctl handles: semctl(int semid, int semnum, int cmd, ...) +func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Int() + num := args[1].Int() + cmd := args[2].Int() + + switch cmd { + case linux.SETVAL: + val := args[3].Int() + if val > math.MaxInt16 { + return 0, nil, syserror.ERANGE + } + return 0, nil, setVal(t, id, num, int16(val)) + + case linux.GETVAL: + v, err := getVal(t, id, num) + return uintptr(v), nil, err + + case linux.IPC_RMID: + return 0, nil, remove(t, id) + + case linux.IPC_SET: + arg := args[3].Pointer() + s := linux.SemidDS{} + if _, err := t.CopyIn(arg, &s); err != nil { + return 0, nil, err + } + + perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777)) + return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms) + + default: + return 0, nil, syserror.EINVAL + } +} + +func remove(t *kernel.Task, id int32) error { + r := t.IPCNamespace().SemaphoreRegistry() + creds := auth.CredentialsFromContext(t) + return r.RemoveID(id, creds) +} + +func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + + creds := auth.CredentialsFromContext(t) + kuid := creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + kgid := creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + owner := fs.FileOwner{UID: kuid, GID: kgid} + return set.Change(t, creds, owner, perms) +} + +func setVal(t *kernel.Task, id int32, num int32, val int16) error { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + return set.SetVal(t, num, val, creds) +} + +func getVal(t *kernel.Task, id int32, num int32) (int16, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return 0, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + return set.GetVal(num, creds) +} diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go new file mode 100644 index 000000000..93b3f531a --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -0,0 +1,553 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "math" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// "For a process to have permission to send a signal it must +// - either be privileged (CAP_KILL), or +// - the real or effective user ID of the sending process must be equal to the +// real or saved set-user-ID of the target process. +// +// In the case of SIGCONT it suffices when the sending and receiving processes +// belong to the same session." - kill(2) +// +// Equivalent to kernel/signal.c:check_kill_permission. +func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool { + // kernel/signal.c:check_kill_permission also allows a signal if the + // sending and receiving tasks share a thread group, which is not + // mentioned in kill(2) since kill does not allow task-level + // granularity in signal sending. + if t.ThreadGroup() == target.ThreadGroup() { + return true + } + + if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) { + return true + } + + creds := t.Credentials() + tcreds := target.Credentials() + if creds.EffectiveKUID == tcreds.SavedKUID || + creds.EffectiveKUID == tcreds.RealKUID || + creds.RealKUID == tcreds.SavedKUID || + creds.RealKUID == tcreds.RealKUID { + return true + } + + if sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session() { + return true + } + return false +} + +// Kill implements linux syscall kill(2). +func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + + switch { + case pid > 0: + // "If pid is positive, then signal sig is sent to the process with the + // ID specified by pid." - kill(2) + // This loops to handle races with execve where target dies between + // TaskWithID and SendGroupSignal. Compare Linux's + // kernel/signal.c:kill_pid_info(). + for { + target := t.PIDNamespace().TaskWithID(pid) + if target == nil { + return 0, nil, syserror.ESRCH + } + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(target.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) + if err := target.SendGroupSignal(info); err != syserror.ESRCH { + return 0, nil, err + } + } + case pid == -1: + // "If pid equals -1, then sig is sent to every process for which the + // calling process has permission to send signals, except for process 1 + // (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig) + // send sig to all processes that the calling process may send signals + // to, except possibly for some implementation-defined system + // processes. Linux allows a process to signal itself, but on Linux the + // call kill(-1,sig) does not signal the calling process." + var ( + lastErr error + delivered int + ) + for _, tg := range t.PIDNamespace().ThreadGroups() { + if tg == t.ThreadGroup() { + continue + } + if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID { + continue + } + + // If pid == -1, the returned error is the last non-EPERM error + // from any call to group_send_sig_info. + if !mayKill(t, tg.Leader(), sig) { + continue + } + // Here and below, whether or not kill returns an error may + // depend on the iteration order. We at least implement the + // semantics documented by the man page: "On success (at least + // one signal was sent), zero is returned." + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + err := tg.SendSignal(info) + if err == syserror.ESRCH { + // ESRCH is ignored because it means the task + // exited while we were iterating. This is a + // race which would not normally exist on + // Linux, so we suppress it. + continue + } + delivered++ + if err != nil { + lastErr = err + } + } + if delivered > 0 { + return 0, nil, lastErr + } + return 0, nil, syserror.ESRCH + default: + // "If pid equals 0, then sig is sent to every process in the process + // group of the calling process." + // + // "If pid is less than -1, then sig is sent to every process + // in the process group whose ID is -pid." + pgid := kernel.ProcessGroupID(-pid) + if pgid == 0 { + pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) + } + + // If pid != -1 (i.e. signalling a process group), the returned error + // is the last error from any call to group_send_sig_info. + lastErr := syserror.ESRCH + for _, tg := range t.PIDNamespace().ThreadGroups() { + if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { + if !mayKill(t, tg.Leader(), sig) { + lastErr = syserror.EPERM + continue + } + + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(tg.PIDNamespace().IDOfTask(t))) + info.SetUid(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) + // See note above regarding ESRCH race above. + if err := tg.SendSignal(info); err != syserror.ESRCH { + lastErr = err + } + } + } + + return 0, nil, lastErr + } +} + +func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoTkill, + } + info.SetPid(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) + info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + return info +} + +// Tkill implements linux syscall tkill(2). +func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tid <=0 by EINVAL. This isn't the same for all signal calls. + if tid <= 0 { + return 0, nil, syserror.EINVAL + } + + target := t.PIDNamespace().TaskWithID(tid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) +} + +// Tgkill implements linux syscall tgkill(2). +func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tgid := kernel.ThreadID(args[0].Int()) + tid := kernel.ThreadID(args[1].Int()) + sig := linux.Signal(args[2].Int()) + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. + if tgid <= 0 || tid <= 0 { + return 0, nil, syserror.EINVAL + } + + targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) + target := t.PIDNamespace().TaskWithID(tid) + if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { + return 0, nil, syserror.ESRCH + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) +} + +// RtSigaction implements linux syscall rt_sigaction(2). +func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sig := linux.Signal(args[0].Int()) + newactarg := args[1].Pointer() + oldactarg := args[2].Pointer() + + var newactptr *arch.SignalAct + if newactarg != 0 { + newact, err := t.CopyInSignalAct(newactarg) + if err != nil { + return 0, nil, err + } + newactptr = &newact + } + oldact, err := t.ThreadGroup().SetSignalAct(sig, newactptr) + if err != nil { + return 0, nil, err + } + if oldactarg != 0 { + if err := t.CopyOutSignalAct(oldactarg, &oldact); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// Sigreturn implements linux syscall sigreturn(2). +func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ctrl, err := t.SignalReturn(false) + return 0, ctrl, err +} + +// RtSigreturn implements linux syscall rt_sigreturn(2). +func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + ctrl, err := t.SignalReturn(true) + return 0, ctrl, err +} + +// RtSigprocmask implements linux syscall rt_sigprocmask(2). +func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + how := args[0].Int() + setaddr := args[1].Pointer() + oldaddr := args[2].Pointer() + sigsetsize := args[3].SizeT() + + if sigsetsize != linux.SignalSetSize { + return 0, nil, syserror.EINVAL + } + oldmask := t.SignalMask() + if setaddr != 0 { + mask, err := copyInSigSet(t, setaddr, sigsetsize) + if err != nil { + return 0, nil, err + } + + switch how { + case linux.SIG_BLOCK: + t.SetSignalMask(oldmask | mask) + case linux.SIG_UNBLOCK: + t.SetSignalMask(oldmask &^ mask) + case linux.SIG_SETMASK: + t.SetSignalMask(mask) + default: + return 0, nil, syserror.EINVAL + } + } + if oldaddr != 0 { + return 0, nil, copyOutSigSet(t, oldaddr, oldmask) + } + + return 0, nil, nil +} + +// Sigaltstack implements linux syscall sigaltstack(2). +func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + setaddr := args[0].Pointer() + oldaddr := args[1].Pointer() + + if oldaddr != 0 { + alt := t.SignalStack() + if t.OnSignalStack(alt) { + alt.Flags |= arch.SignalStackFlagOnStack + } + if err := t.CopyOutSignalStack(oldaddr, &alt); err != nil { + return 0, nil, err + } + } + if setaddr != 0 { + if t.OnSignalStack(t.SignalStack()) { + return 0, nil, syserror.EPERM + } + alt, err := t.CopyInSignalStack(setaddr) + if err != nil { + return 0, nil, err + } + if err := t.SetSignalStack(alt); err != nil { + return 0, nil, err + } + } + + return 0, nil, nil +} + +// Pause implements linux syscall pause(2). +func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) +} + +func sigtimedwait(t *kernel.Task, mask linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { + // Is it already pending? + if info := t.TakeSignal(^mask); info != nil { + return info, nil + } + + // No signals available immediately and asked not to wait. + if timeout == 0 { + return nil, syserror.EAGAIN + } + + // No signals available yet. Temporarily unblock the ones we are interested + // in then wait for either a timeout or a new signal. + oldmask := t.SignalMask() + t.SetSignalMask(oldmask &^ mask) + _, err := t.BlockWithTimeout(nil, true, timeout) + t.SetSignalMask(oldmask) + + // How did the wait go? + switch err { + case syserror.ErrInterrupted: + if info := t.TakeSignal(^mask); info != nil { + // Got one of the signals we were waiting for. + return info, nil + } + // Got a signal we weren't waiting for. + return nil, syserror.EINTR + case syserror.ETIMEDOUT: + // Timed out and still no signals. + return nil, syserror.EAGAIN + default: + // Some other error? Shouldn't be possible. The event channel + // passed to BlockWithTimeout was nil, so the only two ways the + // block could've ended are a timeout or an interrupt. + panic("unreachable") + } +} + +// RtSigpending implements linux syscall rt_sigpending(2). +func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + pending := t.PendingSignals() + _, err := t.CopyOut(addr, pending) + return 0, nil, err +} + +// RtSigtimedwait implements linux syscall rt_sigtimedwait(2). +func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sigset := args[0].Pointer() + siginfo := args[1].Pointer() + timespec := args[2].Pointer() + sigsetsize := args[3].SizeT() + + mask, err := copyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + var timeout time.Duration + if timespec != 0 { + d, err := copyTimespecIn(t, timespec) + if err != nil { + return 0, nil, err + } + if !d.Valid() { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(d.ToNsecCapped()) + } else { + timeout = time.Duration(math.MaxInt64) + } + + si, err := sigtimedwait(t, mask, timeout) + if err != nil { + return 0, nil, err + } + + if si != nil { + if siginfo != 0 { + si.FixSignalCodeForUser() + if _, err := t.CopyOut(siginfo, si); err != nil { + return 0, nil, err + } + } + return uintptr(si.Signo), nil, nil + } + + // sigtimedwait's not supposed to return nil si and err... + return 0, nil, nil +} + +// RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2). +func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := kernel.ThreadID(args[0].Int()) + sig := linux.Signal(args[1].Int()) + infoAddr := args[2].Pointer() + + // Copy in the info. + // + // We must ensure that the Signo is set (Linux overrides this in the + // same way), and that the code is in the allowed set. This same logic + // appears below in RtSigtgqueueinfo and should be kept in sync. + var info arch.SignalInfo + if _, err := t.CopyIn(infoAddr, &info); err != nil { + return 0, nil, err + } + info.Signo = int32(sig) + + // This must loop to handle the race with execve described in Kill. + for { + // Deliver to the given task's thread group. + target := t.PIDNamespace().TaskWithID(pid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + // If the sender is not the receiver, it can't use si_codes used by the + // kernel or SI_TKILL. + if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + return 0, nil, syserror.EPERM + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + + if err := target.SendGroupSignal(&info); err != syserror.ESRCH { + return 0, nil, err + } + } +} + +// RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2). +func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tgid := kernel.ThreadID(args[0].Int()) + tid := kernel.ThreadID(args[1].Int()) + sig := linux.Signal(args[2].Int()) + infoAddr := args[3].Pointer() + + // N.B. Inconsistent with man page, linux actually rejects calls with + // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. + if tgid <= 0 || tid <= 0 { + return 0, nil, syserror.EINVAL + } + + // Copy in the info. See RtSigqueueinfo above. + var info arch.SignalInfo + if _, err := t.CopyIn(infoAddr, &info); err != nil { + return 0, nil, err + } + info.Signo = int32(sig) + + // Deliver to the given task. + targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) + target := t.PIDNamespace().TaskWithID(tid) + if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { + return 0, nil, syserror.ESRCH + } + + // If the sender is not the receiver, it can't use si_codes used by the + // kernel or SI_TKILL. + if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + return 0, nil, syserror.EPERM + } + + if !mayKill(t, target, sig) { + return 0, nil, syserror.EPERM + } + return 0, nil, target.SendSignal(&info) +} + +// RtSigsuspend implements linux syscall rt_sigsuspend(2). +func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sigset := args[0].Pointer() + + // Copy in the signal mask. + var mask linux.SignalSet + if _, err := t.CopyIn(sigset, &mask); err != nil { + return 0, nil, err + } + mask &^= kernel.UnblockableSignals + + // Swap the mask. + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + + // Perform the wait. + return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) +} + +// RestartSyscall implements the linux syscall restart_syscall(2). +func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if r := t.SyscallRestartBlock(); r != nil { + n, err := r.Restart(t) + return n, nil, err + } + // The restart block should never be nil here, but it's possible + // ERESTART_RESTARTBLOCK was set by ptrace without the current syscall + // setting up a restart block. If ptrace didn't manipulate the return value, + // finding a nil restart block is a bug. Linux ensures that the restart + // function is never null by (re)initializing it with one that translates + // the restart into EINTR. We'll emulate that behaviour. + t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") + return 0, nil, syserror.EINTR +} diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go new file mode 100644 index 000000000..3797c0a5d --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -0,0 +1,1059 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// minListenBacklog is the minimum reasonable backlog for listening sockets. +const minListenBacklog = 8 + +// maxListenBacklog is the maximum allowed backlog for listening sockets. +const maxListenBacklog = 1024 + +// maxAddrLen is the maximum socket address length we're willing to accept. +const maxAddrLen = 200 + +// maxOptLen is the maximum sockopt parameter length we're willing to accept. +const maxOptLen = 1024 + +// maxControlLen is the maximum length of the msghdr.msg_control buffer we're +// willing to accept. Note that this limit is smaller than Linux, which allows +// buffers upto INT_MAX. +const maxControlLen = 10 * 1024 * 1024 + +// nameLenOffset is the offset from the start of the MessageHeader64 struct to +// the NameLen field. +const nameLenOffset = 8 + +// controlLenOffset is the offset form the start of the MessageHeader64 struct +// to the ControlLen field. +const controlLenOffset = 40 + +// messageHeader64Len is the length of a MessageHeader64 struct. +var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) + +// multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. +var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) + +// MessageHeader64 is the 64-bit representation of the msghdr struct used in +// the recvmsg and sendmsg syscalls. +type MessageHeader64 struct { + // Name is the optional pointer to a network address buffer. + Name uint64 + + // NameLen is the length of the buffer pointed to by Name. + NameLen uint32 + _ uint32 + + // Iov is a pointer to an array of io vectors that describe the memory + // locations involved in the io operation. + Iov uint64 + + // IovLen is the length of the array pointed to by Iov. + IovLen uint64 + + // Control is the optional pointer to ancillary control data. + Control uint64 + + // ControlLen is the length of the data pointed to by Control. + ControlLen uint64 + + // Flags on the sent/received message. + Flags int32 + _ int32 +} + +// multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in +// the recvmmsg and sendmmsg syscalls. +type multipleMessageHeader64 struct { + msgHdr MessageHeader64 + msgLen uint32 + _ int32 +} + +// CopyInMessageHeader64 copies a message header from user to kernel memory. +func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { + b := t.CopyScratchBuffer(52) + if _, err := t.CopyInBytes(addr, b); err != nil { + return err + } + + msg.Name = usermem.ByteOrder.Uint64(b[0:]) + msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) + msg.Iov = usermem.ByteOrder.Uint64(b[16:]) + msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) + msg.Control = usermem.ByteOrder.Uint64(b[32:]) + msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) + msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) + + return nil +} + +// CaptureAddress allocates memory for and copies a socket address structure +// from the untrusted address space range. +func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { + if addrlen > maxAddrLen { + return nil, syscall.EINVAL + } + + addrBuf := make([]byte, addrlen) + if _, err := t.CopyInBytes(addr, addrBuf); err != nil { + return nil, err + } + + return addrBuf, nil +} + +// writeAddress writes a sockaddr structure and its length to an output buffer +// in the unstrusted address space range. If the address is bigger than the +// buffer, it is truncated. +func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { + // Get the buffer length. + var bufLen uint32 + if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + return err + } + + if int32(bufLen) < 0 { + return syscall.EINVAL + } + + // Write the length unconditionally. + if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + return err + } + + if addr == nil { + return nil + } + + if bufLen > addrLen { + bufLen = addrLen + } + + // Copy as much of the address as will fit in the buffer. + encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + if bufLen > uint32(len(encodedAddr)) { + bufLen = uint32(len(encodedAddr)) + } + _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)]) + return err +} + +// Socket implements the linux syscall socket(2). +func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syscall.EINVAL + } + + // Create the new socket. + s, e := socket.New(t, domain, unix.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + s.SetFlags(fs.SettableFileFlags{ + NonBlocking: stype&linux.SOCK_NONBLOCK != 0, + }) + defer s.DecRef() + + fd, err := t.FDMap().NewFDFrom(0, s, kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + }, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// SocketPair implements the linux syscall socketpair(2). +func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + domain := int(args[0].Int()) + stype := args[1].Int() + protocol := int(args[2].Int()) + socks := args[3].Pointer() + + // Check and initialize the flags. + if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, nil, syscall.EINVAL + } + + fileFlags := fs.SettableFileFlags{ + NonBlocking: stype&linux.SOCK_NONBLOCK != 0, + } + fdFlags := kernel.FDFlags{ + CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, + } + + // Create the socket pair. + s1, s2, e := socket.Pair(t, domain, unix.SockType(stype&0xf), protocol) + if e != nil { + return 0, nil, e.ToError() + } + s1.SetFlags(fileFlags) + s2.SetFlags(fileFlags) + defer s1.DecRef() + defer s2.DecRef() + + // Create the FDs for the sockets. + fd1, err := t.FDMap().NewFDFrom(0, s1, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + fd2, err := t.FDMap().NewFDFrom(0, s2, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + t.FDMap().Remove(fd1) + return 0, nil, err + } + + // Copy the file descriptors out. + if _, err := t.CopyOut(socks, []int32{int32(fd1), int32(fd2)}); err != nil { + t.FDMap().Remove(fd1) + t.FDMap().Remove(fd2) + return 0, nil, err + } + + return 0, nil, nil +} + +// Connect implements the linux syscall connect(2). +func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + blocking := !file.Flags().NonBlocking + return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS) +} + +// accept is the implementation of the accept syscall. It is called by accept +// and accept4 syscall handlers. +func accept(t *kernel.Task, fd kdefs.FD, addr usermem.Addr, addrLen usermem.Addr, flags int) (uintptr, error) { + // Check that no unsupported flags are passed in. + if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { + return 0, syscall.EINVAL + } + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, syscall.ENOTSOCK + } + + // Call the syscall implementation for this socket, then copy the + // output address if one is specified. + blocking := !file.Flags().NonBlocking + + peerRequested := addrLen != 0 + nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + if peerRequested { + // NOTE: Linux does not give you an error if it can't + // write the data back out so neither do we. + if err := writeAddress(t, peer, peerLen, addr, addrLen); err == syscall.EINVAL { + return 0, err + } + } + return uintptr(nfd), nil +} + +// Accept4 implements the linux syscall accept4(2). +func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + addrlen := args[2].Pointer() + flags := int(args[3].Int()) + + n, err := accept(t, fd, addr, addrlen, flags) + return n, nil, err +} + +// Accept implements the linux syscall accept(2). +func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + n, err := accept(t, fd, addr, addrlen, 0) + return n, nil, err +} + +// Bind implements the linux syscall bind(2). +func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + addrlen := args[2].Uint() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Capture address and call syscall implementation. + a, err := CaptureAddress(t, addr, addrlen) + if err != nil { + return 0, nil, err + } + + return 0, nil, s.Bind(t, a).ToError() +} + +// Listen implements the linux syscall listen(2). +func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + backlog := args[1].Int() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Per Linux, the backlog is silently capped to reasonable values. + if backlog <= 0 { + backlog = minListenBacklog + } + if backlog > maxListenBacklog { + backlog = maxListenBacklog + } + + return 0, nil, s.Listen(t, int(backlog)).ToError() +} + +// Shutdown implements the linux syscall shutdown(2). +func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + how := args[1].Int() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Validate how, then call syscall implementation. + switch how { + case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: + default: + return 0, nil, syscall.EINVAL + } + + return 0, nil, s.Shutdown(t, int(how)).ToError() +} + +// GetSockOpt implements the linux syscall getsockopt(2). +func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLenAddr := args[4].Pointer() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Read the length if present. Reject negative values. + optLen := int32(0) + if optLenAddr != 0 { + if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + return 0, nil, err + } + + if optLen < 0 { + return 0, nil, syscall.EINVAL + } + } + + // Call syscall implementation then copy both value and value len out. + v, e := s.GetSockOpt(t, int(level), int(name), int(optLen)) + if e != nil { + return 0, nil, e.ToError() + } + + if optLenAddr != 0 { + vLen := int32(binary.Size(v)) + if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + return 0, nil, err + } + } + + if v != nil { + if _, err := t.CopyOut(optValAddr, v); err != nil { + return 0, nil, err + } + } + + return 0, nil, nil +} + +// SetSockOpt implements the linux syscall setsockopt(2). +// +// Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. +func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + level := args[1].Int() + name := args[2].Int() + optValAddr := args[3].Pointer() + optLen := args[4].Int() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + if optLen <= 0 { + return 0, nil, syscall.EINVAL + } + if optLen > maxOptLen { + return 0, nil, syscall.EINVAL + } + buf := make([]byte, optLen) + if _, err := t.CopyIn(optValAddr, &buf); err != nil { + return 0, nil, err + } + + // Call syscall implementation. + if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, nil +} + +// GetSockName implements the linux syscall getsockname(2). +func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Get the socket name and copy it to the caller. + v, vl, err := s.GetSockName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// GetPeerName implements the linux syscall getpeername(2). +func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + addrlen := args[2].Pointer() + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Get the socket peer name and copy it to the caller. + v, vl, err := s.GetPeerName(t) + if err != nil { + return 0, nil, err.ToError() + } + + return 0, nil, writeAddress(t, v, vl, addr, addrlen) +} + +// RecvMsg implements the linux syscall recvmsg(2). +func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syscall.EINVAL + } + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syscall.EINVAL + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + n, err := recvSingleMsg(t, s, msgPtr, flags, false, ktime.Time{}) + return n, nil, err +} + +// RecvMMsg implements the linux syscall recvmmsg(2). +func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + toPtr := args[4].Pointer() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syscall.EINVAL + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_TRUNC|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { + return 0, nil, syscall.EINVAL + } + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + var haveDeadline bool + var deadline ktime.Time + if toPtr != 0 { + ts, err := copyTimespecIn(t, toPtr) + if err != nil { + return 0, nil, err + } + if !ts.Valid() { + return 0, nil, syscall.EINVAL + } + deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) + haveDeadline = true + } + + if !haveDeadline { + dl := s.RecvTimeout() + if dl != 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syscall.EFAULT + } + var n uintptr + if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { + break + } + + // Copy the received length to the caller. + lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syscall.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { + // Capture the message header and io vectors. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syscall.EMSGSIZE + } + dst, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + // FIXME: Pretend we have an empty error queue. + if flags&linux.MSG_ERRQUEUE != 0 { + return 0, syscall.EAGAIN + } + + // Fast path when no control message nor name buffers are provided. + if msg.ControlLen == 0 && msg.NameLen == 0 { + n, _, _, _, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) + if err != nil { + return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) + } + return uintptr(n), nil + } + + if msg.ControlLen > maxControlLen { + return 0, syscall.ENOBUFS + } + n, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + defer cms.Release() + + controlData := make([]byte, 0, msg.ControlLen) + + if cr, ok := s.(unix.Credentialer); ok && cr.Passcred() { + creds, _ := cms.Credentials.(control.SCMCredentials) + controlData = control.PackCredentials(t, creds, controlData) + } + + if cms.Rights != nil { + controlData = control.PackRights(t, cms.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData) + } + + // Copy the address to the caller. + if msg.NameLen != 0 { + if err := writeAddress(t, sender, senderLen, usermem.Addr(msg.Name), usermem.Addr(msgPtr+nameLenOffset)); err != nil { + return 0, err + } + } + + // Copy the control data to the caller. + if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + return 0, err + } + if len(controlData) > 0 { + if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + return 0, err + } + } + + return uintptr(n), nil +} + +// recvFrom is the implementation of the recvfrom syscall. It is called by +// recvfrom and recv syscall handlers. +func recvFrom(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLenPtr usermem.Addr) (uintptr, error) { + if int(bufLen) < 0 { + return 0, syscall.EINVAL + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_NOSIGNAL|linux.MSG_PEEK|linux.MSG_TRUNC) != 0 { + return 0, syscall.EINVAL + } + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, syscall.ENOTSOCK + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + var haveDeadline bool + var deadline ktime.Time + + if dl := s.RecvTimeout(); dl != 0 { + deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) + haveDeadline = true + } + + n, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) + cm.Release() + if e != nil { + return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + } + + // Copy the address to the caller. + if nameLenPtr != 0 { + if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil { + return 0, err + } + } + + return uintptr(n), nil +} + +// RecvFrom implements the linux syscall recvfrom(2). +func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLenPtr := args[5].Pointer() + + n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr) + return n, nil, err +} + +// SendMsg implements the linux syscall sendmsg(2). +func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + msgPtr := args[1].Pointer() + flags := args[2].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syscall.EINVAL + } + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syscall.EINVAL + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + n, err := sendSingleMsg(t, s, file, msgPtr, flags) + return n, nil, err +} + +// SendMMsg implements the linux syscall sendmmsg(2). +func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + msgPtr := args[1].Pointer() + vlen := args[2].Uint() + flags := args[3].Int() + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. + return 0, nil, syscall.EINVAL + } + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, nil, syscall.ENOTSOCK + } + + // Reject flags that we don't handle yet. + if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { + return 0, nil, syscall.EINVAL + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + var count uint32 + var err error + for i := uint64(0); i < uint64(vlen); i++ { + mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) + if !ok { + return 0, nil, syscall.EFAULT + } + var n uintptr + if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { + break + } + + // Copy the received length to the caller. + lp, ok := mp.AddLength(messageHeader64Len) + if !ok { + return 0, nil, syscall.EFAULT + } + if _, err = t.CopyOut(lp, uint32(n)); err != nil { + break + } + count++ + } + + if count == 0 { + return 0, nil, err + } + return uintptr(count), nil, nil +} + +func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) { + // Capture the message header. + var msg MessageHeader64 + if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + return 0, err + } + + var controlData []byte + if msg.ControlLen > 0 { + // Put an upper bound to prevent large allocations. + if msg.ControlLen > maxControlLen { + return 0, syscall.ENOBUFS + } + controlData = make([]byte, msg.ControlLen) + if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + return 0, err + } + } + + // Read the destination address if one is specified. + var to []byte + if msg.NameLen != 0 { + var err error + to, err = CaptureAddress(t, usermem.Addr(msg.Name), msg.NameLen) + if err != nil { + return 0, err + } + } + + // Read data then call the sendmsg implementation. + if msg.IovLen > linux.UIO_MAXIOV { + return 0, syscall.EMSGSIZE + } + src, err := t.IovecsIOSequence(usermem.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + controlMessages, err := control.Parse(t, s, controlData) + if err != nil { + return 0, err + } + + // Call the syscall implementation. + n, e := s.SendMsg(t, src, to, int(flags), controlMessages) + err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) + if err != nil { + controlMessages.Release() + } + return uintptr(n), err +} + +// sendTo is the implementation of the sendto syscall. It is called by sendto +// and send syscall handlers. +func sendTo(t *kernel.Task, fd kdefs.FD, bufPtr usermem.Addr, bufLen uint64, flags int32, namePtr usermem.Addr, nameLen uint32) (uintptr, error) { + bl := int(bufLen) + if bl < 0 { + return 0, syscall.EINVAL + } + + // Get socket from the file descriptor. + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, syscall.EBADF + } + defer file.DecRef() + + // Extract the socket. + s, ok := file.FileOperations.(socket.Socket) + if !ok { + return 0, syscall.ENOTSOCK + } + + if file.Flags().NonBlocking { + flags |= linux.MSG_DONTWAIT + } + + // Read the destination address if one is specified. + var to []byte + var err error + if namePtr != 0 { + to, err = CaptureAddress(t, namePtr, nameLen) + if err != nil { + return 0, err + } + } + + src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + // Call the syscall implementation. + n, e := s.SendMsg(t, src, to, int(flags), control.New(t, s, nil)) + return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) +} + +// SendTo implements the linux syscall sendto(2). +func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + bufPtr := args[1].Pointer() + bufLen := args[2].Uint64() + flags := args[3].Int() + namePtr := args[4].Pointer() + nameLen := args[5].Uint() + + n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) + return n, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go new file mode 100644 index 000000000..6e21b34fd --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -0,0 +1,209 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Stat implements linux syscall stat(2). +func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + statAddr := args[1].Pointer() + + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return stat(t, d, dirPath, statAddr) + }) +} + +// Fstatat implements linux syscall newfstatat, i.e. fstatat(2). +func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + statAddr := args[2].Pointer() + flags := args[3].Int() + + path, dirPath, err := copyInPath(t, addr, flags&linux.AT_EMPTY_PATH != 0) + if err != nil { + return 0, nil, err + } + + if path == "" { + // Annoying. What's wrong with fstat? + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, stat(t, file.Dirent, false, statAddr) + } + + return 0, nil, fileOpOn(t, fd, path, flags&linux.AT_SYMLINK_NOFOLLOW == 0, func(root *fs.Dirent, d *fs.Dirent) error { + return stat(t, d, dirPath, statAddr) + }) +} + +// Lstat implements linux syscall lstat(2). +func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + statAddr := args[1].Pointer() + + path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return stat(t, d, dirPath, statAddr) + }) +} + +// Fstat implements linux syscall fstat(2). +func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + statAddr := args[1].Pointer() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, stat(t, file.Dirent, false /* dirPath */, statAddr) +} + +// stat implements stat from the given *fs.Dirent. +func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(t) + if err != nil { + return err + } + + var mode uint32 + switch d.Inode.StableAttr.Type { + case fs.RegularFile, fs.SpecialFile: + mode |= linux.ModeRegular + case fs.Symlink: + mode |= linux.ModeSymlink + case fs.Directory, fs.SpecialDirectory: + mode |= linux.ModeDirectory + case fs.Pipe: + mode |= linux.ModeNamedPipe + case fs.CharacterDevice: + mode |= linux.ModeCharacterDevice + case fs.BlockDevice: + mode |= linux.ModeBlockDevice + case fs.Socket: + mode |= linux.ModeSocket + } + + _, err = t.CopyOut(statAddr, linux.Stat{ + Dev: uint64(d.Inode.StableAttr.DeviceID), + Rdev: uint64(linux.MakeDeviceID(d.Inode.StableAttr.DeviceFileMajor, d.Inode.StableAttr.DeviceFileMinor)), + Ino: uint64(d.Inode.StableAttr.InodeID), + Nlink: uattr.Links, + Mode: mode | uint32(uattr.Perms.LinuxMode()), + UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), + GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), + Size: uattr.Size, + Blksize: d.Inode.StableAttr.BlockSize, + Blocks: uattr.Usage / 512, + ATime: uattr.AccessTime.Timespec(), + MTime: uattr.ModificationTime.Timespec(), + CTime: uattr.StatusChangeTime.Timespec(), + }) + return err +} + +// Statfs implements linux syscall statfs(2). +func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + statfsAddr := args[1].Pointer() + + path, _, err := copyInPath(t, addr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + return statfsImpl(t, d, statfsAddr) + }) +} + +// Fstatfs implements linux syscall fstatfs(2). +func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + statfsAddr := args[1].Pointer() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, statfsImpl(t, file.Dirent, statfsAddr) +} + +// statfsImpl implements the linux syscall statfs and fstatfs based on a Dirent, +// copying the statfs structure out to addr on success, otherwise an error is +// returned. +func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { + info, err := d.Inode.StatFS(t) + if err != nil { + return err + } + // Construct the statfs structure and copy it out. + statfs := linux.Statfs{ + Type: info.Type, + // Treat block size and fragment size as the same, as + // most consumers of this structure will expect one + // or the other to be filled in. + BlockSize: d.Inode.StableAttr.BlockSize, + Blocks: info.TotalBlocks, + // We don't have the concept of reserved blocks, so + // report blocks free the same as available blocks. + // This is a normal thing for filesystems, to do, see + // udf, hugetlbfs, tmpfs, among others. + BlocksFree: info.FreeBlocks, + BlocksAvailable: info.FreeBlocks, + Files: info.TotalFiles, + FilesFree: info.FreeFiles, + // Same as Linux for simple_statfs, see fs/libfs.c. + NameLength: syscall.PathMax, + FragmentSize: d.Inode.StableAttr.BlockSize, + // Leave other fields 0 like simple_statfs does. + } + if _, err := t.CopyOut(addr, &statfs); err != nil { + return err + } + return nil +} diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go new file mode 100644 index 000000000..902d210db --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Sync implements linux system call sync(2). +func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.MountNamespace().SyncAll(t) + // Sync is always successful. + return 0, nil, nil +} + +// Syncfs implements linux system call syncfs(2). +func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Use "sync-the-world" for now, it's guaranteed that fd is at least + // on the root filesystem. + return Sync(t, args) +} + +// Fsync implements linux syscall fsync(2). +func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Fdatasync implements linux syscall fdatasync(2). +// +// At the moment, it just calls Fsync, which is a big hammer, but correct. +func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go new file mode 100644 index 000000000..bd0ffcd5c --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -0,0 +1,42 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo. +func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + mem := t.Kernel().Platform.Memory() + mem.UpdateUsage() + _, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + + // Only a subset of the fields in sysinfo_t make sense to return. + si := linux.Sysinfo{ + Procs: uint16(len(t.PIDNamespace().Tasks())), + Uptime: t.Kernel().MonotonicClock().Now().Seconds(), + TotalRAM: totalSize, + FreeRAM: totalSize - totalUsage, + } + _, err := t.CopyOut(addr, si) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_syslog.go b/pkg/sentry/syscalls/linux/sys_syslog.go new file mode 100644 index 000000000..792040c81 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_syslog.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + _SYSLOG_ACTION_READ_ALL = 3 + _SYSLOG_ACTION_SIZE_BUFFER = 10 +) + +// logBufLen is the default syslog buffer size on Linux. +const logBufLen = 1 << 17 + +// Syslog implements part of Linux syscall syslog. +// +// Only the unpriviledged commands are implemented, allowing applications to +// read a fun dmesg. +func Syslog(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + command := args[0].Int() + buf := args[1].Pointer() + size := int(args[2].Int()) + + switch command { + case _SYSLOG_ACTION_READ_ALL: + if size < 0 { + return 0, nil, syserror.EINVAL + } + if size > logBufLen { + size = logBufLen + } + + log := t.Kernel().Syslog().Log() + if len(log) > size { + log = log[:size] + } + + n, err := t.CopyOutBytes(buf, log) + return uintptr(n), nil, err + case _SYSLOG_ACTION_SIZE_BUFFER: + return logBufLen, nil, nil + default: + return 0, nil, syserror.ENOSYS + } +} diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go new file mode 100644 index 000000000..0adbf160f --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -0,0 +1,704 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + // ExecMaxTotalSize is the maximum length of all argv and envv entries. + // + // N.B. The behavior here is different than Linux. Linux provides a limit on + // individual arguments of 32 pages, and an aggregate limit of at least 32 pages + // but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement + // any behavior based on the stack size, and instead provide a fixed hard-limit of + // 2 MB (which should work well given that 8 MB stack limits are common). + ExecMaxTotalSize = 2 * 1024 * 1024 + + // ExecMaxElemSize is the maximum length of a single argv or envv entry. + ExecMaxElemSize = 32 * usermem.PageSize + + // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. + exitSignalMask = 0xff +) + +// Possible values for the idtype argument to waitid(2), defined in Linux's +// include/uapi/linux/wait.h. +const ( + _P_ALL = 0 + _P_PID = 1 + _P_PGID = 2 +) + +// Getppid implements linux syscall getppid(2). +func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + parent := t.Parent() + if parent == nil { + return 0, nil, nil + } + return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil +} + +// Getpid implements linux syscall getpid(2). +func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.ThreadGroup().ID()), nil, nil +} + +// Gettid implements linux syscall gettid(2). +func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.ThreadID()), nil, nil +} + +// Execve implements linux syscall execve(2). +func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + filenameAddr := args[0].Pointer() + argvAddr := args[1].Pointer() + envvAddr := args[2].Pointer() + + // Extract our arguments. + filename, err := t.CopyInString(filenameAddr, syscall.PathMax) + if err != nil { + return 0, nil, err + } + + var argv, envv []string + if argvAddr != 0 { + var err error + argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + if envvAddr != 0 { + var err error + envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + + root := t.FSContext().RootDirectory() + defer root.DecRef() + wd := t.FSContext().WorkingDirectory() + defer wd.DecRef() + + // Load the new TaskContext. + tc, err := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, linux.MaxSymlinkTraversals, filename, argv, envv, t.Arch().FeatureSet()) + if err != nil { + return 0, nil, err + } + + ctrl, err := t.Execve(tc) + return 0, ctrl, err +} + +// Exit implements linux syscall exit(2). +func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + status := int(args[0].Int()) + t.PrepareExit(kernel.ExitStatus{Code: status}) + return 0, kernel.CtrlDoExit, nil +} + +// ExitGroup implements linux syscall exit_group(2). +func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + status := int(args[0].Int()) + t.PrepareGroupExit(kernel.ExitStatus{Code: status}) + return 0, kernel.CtrlDoExit, nil +} + +// clone is used by Clone, Fork, and VFork. +func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr, childTID usermem.Addr, tls usermem.Addr) (uintptr, *kernel.SyscallControl, error) { + opts := kernel.CloneOptions{ + SharingOptions: kernel.SharingOptions{ + NewAddressSpace: flags&syscall.CLONE_VM == 0, + NewSignalHandlers: flags&syscall.CLONE_SIGHAND == 0, + NewThreadGroup: flags&syscall.CLONE_THREAD == 0, + TerminationSignal: linux.Signal(flags & exitSignalMask), + NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, + NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, + NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, + NewFiles: flags&syscall.CLONE_FILES == 0, + NewFSContext: flags&syscall.CLONE_FS == 0, + NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, + NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + }, + Stack: stack, + SetTLS: flags&syscall.CLONE_SETTLS == syscall.CLONE_SETTLS, + TLS: tls, + ChildClearTID: flags&syscall.CLONE_CHILD_CLEARTID == syscall.CLONE_CHILD_CLEARTID, + ChildSetTID: flags&syscall.CLONE_CHILD_SETTID == syscall.CLONE_CHILD_SETTID, + ChildTID: childTID, + ParentSetTID: flags&syscall.CLONE_PARENT_SETTID == syscall.CLONE_PARENT_SETTID, + ParentTID: parentTID, + Vfork: flags&syscall.CLONE_VFORK == syscall.CLONE_VFORK, + Untraced: flags&syscall.CLONE_UNTRACED == syscall.CLONE_UNTRACED, + InheritTracer: flags&syscall.CLONE_PTRACE == syscall.CLONE_PTRACE, + } + ntid, ctrl, err := t.Clone(&opts) + return uintptr(ntid), ctrl, err +} + +// Clone implements linux syscall clone(2). +// sys_clone has so many flavors. We implement the default one in the +// current linux 3.11 x86_64: +// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val) +func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := int(args[0].Int()) + stack := args[1].Pointer() + parentTID := args[2].Pointer() + childTID := args[3].Pointer() + tls := args[4].Pointer() + return clone(t, flags, stack, parentTID, childTID, tls) +} + +// Fork implements Linux syscall fork(2). +func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // "A call to fork() is equivalent to a call to clone(2) specifying flags + // as just SIGCHLD." - fork(2) + return clone(t, int(syscall.SIGCHLD), 0, 0, 0, 0) +} + +// Vfork implements Linux syscall vfork(2). +func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // """ + // A call to vfork() is equivalent to calling clone(2) with flags specified as: + // + // CLONE_VM | CLONE_VFORK | SIGCHLD + // """ - vfork(2) + return clone(t, syscall.CLONE_VM|syscall.CLONE_VFORK|int(syscall.SIGCHLD), 0, 0, 0, 0) +} + +// wait4 waits for the given child process to exit. +func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusageAddr usermem.Addr) (uintptr, error) { + if options&^(syscall.WNOHANG|syscall.WUNTRACED|syscall.WCONTINUED|syscall.WALL|syscall.WCLONE) != 0 { + return 0, syscall.EINVAL + } + wopts := kernel.WaitOptions{ + Events: kernel.EventExit | kernel.EventTraceeStop, + ConsumeEvent: true, + } + // There are four cases to consider: + // + // pid < -1 any child process whose process group ID is equal to the absolute value of pid + // pid == -1 any child process + // pid == 0 any child process whose process group ID is equal to that of the calling process + // pid > 0 the child whose process ID is equal to the value of pid + switch { + case pid < -1: + wopts.SpecificPGID = kernel.ProcessGroupID(-pid) + case pid == -1: + // Any process is the default. + case pid == 0: + wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) + default: + wopts.SpecificTID = kernel.ThreadID(pid) + } + + switch options & (syscall.WCLONE | syscall.WALL) { + case 0: + wopts.NonCloneTasks = true + case syscall.WCLONE: + wopts.CloneTasks = true + case syscall.WALL: + wopts.NonCloneTasks = true + wopts.CloneTasks = true + default: + return 0, syscall.EINVAL + } + if options&syscall.WUNTRACED != 0 { + wopts.Events |= kernel.EventChildGroupStop + } + if options&syscall.WCONTINUED != 0 { + wopts.Events |= kernel.EventGroupContinue + } + if options&syscall.WNOHANG == 0 { + wopts.BlockInterruptErr = kernel.ERESTARTSYS + } + + wr, err := t.Wait(&wopts) + if err != nil { + if err == kernel.ErrNoWaitableEvent { + return 0, nil + } + return 0, err + } + if statusAddr != 0 { + if _, err := t.CopyOut(statusAddr, wr.Status); err != nil { + return 0, err + } + } + if rusageAddr != 0 { + ru := getrusage(wr.Task, linux.RUSAGE_BOTH) + if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + return 0, err + } + } + return uintptr(wr.TID), nil +} + +// Wait4 implements linux syscall wait4(2). +func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := int(args[0].Int()) + statusAddr := args[1].Pointer() + options := int(args[2].Uint()) + rusageAddr := args[3].Pointer() + + n, err := wait4(t, pid, statusAddr, options, rusageAddr) + return n, nil, err +} + +// WaitPid implements linux syscall waitpid(2). +func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pid := int(args[0].Int()) + statusAddr := args[1].Pointer() + options := int(args[2].Uint()) + + n, err := wait4(t, pid, statusAddr, options, 0) + return n, nil, err +} + +// Waitid implements linux syscall waitid(2). +func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + idtype := args[0].Int() + id := args[1].Int() + infop := args[2].Pointer() + options := int(args[3].Uint()) + rusageAddr := args[4].Pointer() + + if options&^(syscall.WNOHANG|syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED|syscall.WNOWAIT) != 0 { + return 0, nil, syscall.EINVAL + } + if options&(syscall.WEXITED|syscall.WSTOPPED|syscall.WCONTINUED) == 0 { + return 0, nil, syscall.EINVAL + } + wopts := kernel.WaitOptions{ + NonCloneTasks: true, + Events: kernel.EventTraceeStop, + ConsumeEvent: options&syscall.WNOWAIT == 0, + } + switch idtype { + case _P_ALL: + case _P_PID: + wopts.SpecificTID = kernel.ThreadID(id) + case _P_PGID: + wopts.SpecificPGID = kernel.ProcessGroupID(id) + default: + return 0, nil, syscall.EINVAL + } + if options&syscall.WEXITED != 0 { + wopts.Events |= kernel.EventExit + } + if options&syscall.WSTOPPED != 0 { + wopts.Events |= kernel.EventChildGroupStop + } + if options&syscall.WCONTINUED != 0 { + wopts.Events |= kernel.EventGroupContinue + } + if options&syscall.WNOHANG == 0 { + wopts.BlockInterruptErr = kernel.ERESTARTSYS + } + + wr, err := t.Wait(&wopts) + if err != nil { + if err == kernel.ErrNoWaitableEvent { + err = nil + // "If WNOHANG was specified in options and there were no children + // in a waitable state, then waitid() returns 0 immediately and the + // state of the siginfo_t structure pointed to by infop is + // unspecified." - waitid(2). But Linux's waitid actually zeroes + // out the fields it would set for a successful waitid in this case + // as well. + if infop != 0 { + var si arch.SignalInfo + _, err = t.CopyOut(infop, &si) + } + } + return 0, nil, err + } + if rusageAddr != 0 { + ru := getrusage(wr.Task, linux.RUSAGE_BOTH) + if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + return 0, nil, err + } + } + if infop == 0 { + return 0, nil, nil + } + si := arch.SignalInfo{ + Signo: int32(syscall.SIGCHLD), + } + si.SetPid(int32(wr.TID)) + si.SetUid(int32(wr.UID)) + // TODO: convert kernel.ExitStatus to functions and make + // WaitResult.Status a linux.WaitStatus + s := syscall.WaitStatus(wr.Status) + switch { + case s.Exited(): + si.Code = arch.CLD_EXITED + si.SetStatus(int32(s.ExitStatus())) + case s.Signaled(): + si.Code = arch.CLD_KILLED + si.SetStatus(int32(s.Signal())) + case s.CoreDump(): + si.Code = arch.CLD_DUMPED + si.SetStatus(int32(s.Signal())) + case s.Stopped(): + if wr.Event == kernel.EventTraceeStop { + si.Code = arch.CLD_TRAPPED + si.SetStatus(int32(s.TrapCause())) + } else { + si.Code = arch.CLD_STOPPED + si.SetStatus(int32(s.StopSignal())) + } + case s.Continued(): + si.Code = arch.CLD_CONTINUED + si.SetStatus(int32(syscall.SIGCONT)) + default: + t.Warningf("waitid got incomprehensible wait status %d", s) + } + _, err = t.CopyOut(infop, &si) + return 0, nil, err +} + +// SetTidAddress implements linux syscall set_tid_address(2). +func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + // Always succeed, return caller's tid. + t.SetClearTID(addr) + return uintptr(t.ThreadID()), nil, nil +} + +// Unshare implements linux syscall unshare(2). +func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + opts := kernel.SharingOptions{ + NewAddressSpace: flags&syscall.CLONE_VM == syscall.CLONE_VM, + NewSignalHandlers: flags&syscall.CLONE_SIGHAND == syscall.CLONE_SIGHAND, + NewThreadGroup: flags&syscall.CLONE_THREAD == syscall.CLONE_THREAD, + NewPIDNamespace: flags&syscall.CLONE_NEWPID == syscall.CLONE_NEWPID, + NewUserNamespace: flags&syscall.CLONE_NEWUSER == syscall.CLONE_NEWUSER, + NewNetworkNamespace: flags&syscall.CLONE_NEWNET == syscall.CLONE_NEWNET, + NewFiles: flags&syscall.CLONE_FILES == syscall.CLONE_FILES, + NewFSContext: flags&syscall.CLONE_FS == syscall.CLONE_FS, + NewUTSNamespace: flags&syscall.CLONE_NEWUTS == syscall.CLONE_NEWUTS, + NewIPCNamespace: flags&syscall.CLONE_NEWIPC == syscall.CLONE_NEWIPC, + } + // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) + if opts.NewPIDNamespace { + opts.NewThreadGroup = true + } + // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since + // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." + if opts.NewUserNamespace { + opts.NewThreadGroup = true + opts.NewFSContext = true + } + return 0, nil, t.Unshare(&opts) +} + +// SchedYield implements linux syscall sched_yield(2). +func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.Yield() + return 0, nil, nil +} + +// SchedSetaffinity implements linux syscall sched_setaffinity(2). +func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := args[0].Int() + size := args[1].SizeT() + maskAddr := args[2].Pointer() + + var task *kernel.Task + if tid == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return 0, nil, syserror.ESRCH + } + } + + mask := sched.NewCPUSet(t.Kernel().ApplicationCores()) + if size > mask.Size() { + size = mask.Size() + } + if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil { + return 0, nil, err + } + return 0, nil, task.SetCPUMask(mask) +} + +// SchedGetaffinity implements linux syscall sched_getaffinity(2). +func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := args[0].Int() + size := args[1].SizeT() + maskAddr := args[2].Pointer() + + // This limitation is because linux stores the cpumask + // in an array of "unsigned long" so the buffer needs to + // be a multiple of the word size. + if size&(t.Arch().Width()-1) > 0 { + return 0, nil, syserror.EINVAL + } + + var task *kernel.Task + if tid == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return 0, nil, syserror.ESRCH + } + } + + mask := task.CPUMask() + // The buffer needs to be big enough to hold a cpumask with + // all possible cpus. + if size < mask.Size() { + return 0, nil, syserror.EINVAL + } + _, err := t.CopyOutBytes(maskAddr, mask) + + // NOTE: The syscall interface is slightly different than the glibc + // interface. The raw sched_getaffinity syscall returns the number of + // bytes used to represent a cpu mask. + return uintptr(mask.Size()), nil, err +} + +// Getcpu implements linux syscall getcpu(2). +func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + cpu := args[0].Pointer() + node := args[1].Pointer() + // third argument to this system call is nowadays unused. + + if cpu != 0 { + buf := t.CopyScratchBuffer(4) + usermem.ByteOrder.PutUint32(buf, uint32(t.CPU())) + if _, err := t.CopyOutBytes(cpu, buf); err != nil { + return 0, nil, err + } + } + // We always return node 0. + if node != 0 { + if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// Setpgid implements the linux syscall setpgid(2). +func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Note that throughout this function, pgid is interpreted with respect + // to t's namespace, not with respect to the selected ThreadGroup's + // namespace (which may be different). + pid := kernel.ThreadID(args[0].Int()) + pgid := kernel.ProcessGroupID(args[1].Int()) + + // "If pid is zero, then the process ID of the calling process is used." + tg := t.ThreadGroup() + if pid != 0 { + ot := t.PIDNamespace().TaskWithID(pid) + if ot == nil { + return 0, nil, syserror.ESRCH + } + tg = ot.ThreadGroup() + if tg.Leader() != ot { + return 0, nil, syserror.EINVAL + } + + // Setpgid only operates on child threadgroups. + if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) { + return 0, nil, syserror.ESRCH + } + } + + // "If pgid is zero, then the PGID of the process specified by pid is made + // the same as its process ID." + defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg)) + if pgid == 0 { + pgid = defaultPGID + } else if pgid < 0 { + return 0, nil, syserror.EINVAL + } + + // If the pgid is the same as the group, then create a new one. Otherwise, + // we attempt to join an existing process group. + if pgid == defaultPGID { + // For convenience, errors line up with Linux syscall API. + if err := tg.CreateProcessGroup(); err != nil { + // Is the process group already as expected? If so, + // just return success. This is the same behavior as + // Linux. + if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID { + return 0, nil, nil + } + return 0, nil, err + } + } else { + // Same as CreateProcessGroup, above. + if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil { + // See above. + if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { + return 0, nil, nil + } + return 0, nil, err + } + } + + // Success. + return 0, nil, nil +} + +// Getpgrp implements the linux syscall getpgrp(2). +func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil +} + +// Getpgid implements the linux syscall getpgid(2). +func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + if tid == 0 { + return Getpgrp(t, args) + } + + target := t.PIDNamespace().TaskWithID(tid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil +} + +// Setsid implements the linux syscall setsid(2). +func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.ThreadGroup().CreateSession() +} + +// Getsid implements the linux syscall getsid(2). +func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tid := kernel.ThreadID(args[0].Int()) + if tid == 0 { + return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil + } + + target := t.PIDNamespace().TaskWithID(tid) + if target == nil { + return 0, nil, syserror.ESRCH + } + + return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil +} + +// Getpriority pretends to implement the linux syscall getpriority(2). +// +// This is a stub; real priorities require a full scheduler. +func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + who := kernel.ThreadID(args[1].Int()) + + switch which { + case syscall.PRIO_PROCESS: + // Look for who, return ESRCH if not found. + var task *kernel.Task + if who == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(who) + } + + if task == nil { + return 0, nil, syscall.ESRCH + } + + // From kernel/sys.c:getpriority: + // "To avoid negative return values, 'getpriority()' + // will not return the normal nice-value, but a negated + // value that has been offset by 20" + return uintptr(20 - task.Niceness()), nil, nil + case syscall.PRIO_USER: + fallthrough + case syscall.PRIO_PGRP: + // PRIO_USER and PRIO_PGRP have no further implementation yet. + return 0, nil, nil + default: + return 0, nil, syscall.EINVAL + } +} + +// Setpriority pretends to implement the linux syscall setpriority(2). +// +// This is a stub; real priorities require a full scheduler. +func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + which := args[0].Int() + who := kernel.ThreadID(args[1].Int()) + niceval := int(args[2].Int()) + + // In the kernel's implementation, values outside the range + // of [-20, 19] are truncated to these minimum and maximum + // values. + if niceval < -20 /* min niceval */ { + niceval = -20 + } else if niceval > 19 /* max niceval */ { + niceval = 19 + } + + switch which { + case syscall.PRIO_PROCESS: + // Look for who, return ESRCH if not found. + var task *kernel.Task + if who == 0 { + task = t + } else { + task = t.PIDNamespace().TaskWithID(who) + } + + if task == nil { + return 0, nil, syscall.ESRCH + } + + task.SetNiceness(niceval) + case syscall.PRIO_USER: + fallthrough + case syscall.PRIO_PGRP: + // PRIO_USER and PRIO_PGRP have no further implementation yet. + return 0, nil, nil + default: + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil +} + +// Ptrace implements linux system call ptrace(2). +func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + req := args[0].Int64() + pid := kernel.ThreadID(args[1].Int()) + addr := args[2].Pointer() + data := args[3].Pointer() + + return 0, nil, t.Ptrace(req, pid, addr, data) +} diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go new file mode 100644 index 000000000..dcee694b2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -0,0 +1,338 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// The most significant 29 bits hold either a pid or a file descriptor. +func pidOfClockID(c int32) kernel.ThreadID { + return kernel.ThreadID(^(c >> 3)) +} + +// whichCPUClock returns one of CPUCLOCK_PERF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or +// CLOCK_FD. +func whichCPUClock(c int32) int32 { + return c & linux.CPUCLOCK_CLOCK_MASK +} + +// isCPUClockPerThread returns true if the CPUCLOCK_PERTHREAD bit is set in the +// clock id. +func isCPUClockPerThread(c int32) bool { + return c&linux.CPUCLOCK_PERTHREAD_MASK != 0 +} + +// isValidCPUClock returns checks that the cpu clock id is valid. +func isValidCPUClock(c int32) bool { + // Bits 0, 1, and 2 cannot all be set. + if c&7 == 7 { + return false + } + if whichCPUClock(c) >= linux.CPUCLOCK_MAX { + return false + } + return true +} + +// targetTask returns the kernel.Task for the given clock id. +func targetTask(t *kernel.Task, c int32) *kernel.Task { + pid := pidOfClockID(c) + if pid == 0 { + return t + } + return t.PIDNamespace().TaskWithID(pid) +} + +// ClockGetres implements linux syscall clock_getres(2). +func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := int32(args[0].Int()) + addr := args[1].Pointer() + r := linux.Timespec{ + Sec: 0, + Nsec: 1, + } + + if _, err := getClock(t, clockID); err != nil { + return 0, nil, syserror.EINVAL + } + + if addr == 0 { + // Don't need to copy out. + return 0, nil, nil + } + + return 0, nil, copyTimespecOut(t, addr, &r) +} + +type cpuClocker interface { + UserCPUClock() ktime.Clock + CPUClock() ktime.Clock +} + +func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { + if clockID < 0 { + if !isValidCPUClock(clockID) { + return nil, syserror.EINVAL + } + + targetTask := targetTask(t, clockID) + if targetTask == nil { + return nil, syserror.EINVAL + } + + var target cpuClocker + if isCPUClockPerThread(clockID) { + target = targetTask + } else { + target = targetTask.ThreadGroup() + } + + switch whichCPUClock(clockID) { + case linux.CPUCLOCK_VIRT: + return target.UserCPUClock(), nil + case linux.CPUCLOCK_PROF, linux.CPUCLOCK_SCHED: + // CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF. + return target.CPUClock(), nil + default: + return nil, syserror.EINVAL + } + } + + switch clockID { + case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE: + return t.Kernel().RealtimeClock(), nil + case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE, linux.CLOCK_MONOTONIC_RAW: + // CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW. + return t.Kernel().MonotonicClock(), nil + case linux.CLOCK_PROCESS_CPUTIME_ID: + return t.ThreadGroup().CPUClock(), nil + case linux.CLOCK_THREAD_CPUTIME_ID: + return t.CPUClock(), nil + default: + return nil, syserror.EINVAL + } +} + +// ClockGettime implements linux syscall clock_gettime(2). +func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := int32(args[0].Int()) + addr := args[1].Pointer() + + c, err := getClock(t, clockID) + if err != nil { + return 0, nil, err + } + ts := c.Now().Timespec() + return 0, nil, copyTimespecOut(t, addr, &ts) +} + +// ClockSettime implements linux syscall clock_settime(2). +func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.EPERM +} + +// Time implements linux syscall time(2). +func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + r := t.Kernel().RealtimeClock().Now().TimeT() + if addr == usermem.Addr(0) { + return uintptr(r), nil, nil + } + + if _, err := t.CopyOut(addr, r); err != nil { + return 0, nil, err + } + return uintptr(r), nil, nil +} + +// clockNanosleepRestartBlock encapsulates the state required to restart +// clock_nanosleep(2) via restart_syscall(2). +type clockNanosleepRestartBlock struct { + c ktime.Clock + duration time.Duration + rem usermem.Addr +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return 0, clockNanosleepFor(t, n.c, n.duration, n.rem) +} + +// clockNanosleepUntil blocks until a specified time. +// +// If blocking is interrupted, the syscall is restarted with the original +// arguments. +func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error { + notifier, tchan := ktime.NewChannelNotifier() + timer := ktime.NewTimer(c, notifier) + + // Turn on the timer. + timer.Swap(ktime.Setting{ + Period: 0, + Enabled: true, + Next: ktime.FromTimespec(ts), + }) + + err := t.BlockWithTimer(nil, tchan) + + timer.Destroy() + + // Did we just block until the timeout happened? + if err == syserror.ETIMEDOUT { + return nil + } + + return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND) +} + +// clockNanosleepFor blocks for a specified duration. +// +// If blocking is interrupted, the syscall is restarted with the remaining +// duration timeout. +func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem usermem.Addr) error { + timer, start, tchan := ktime.After(c, dur) + + err := t.BlockWithTimer(nil, tchan) + + after := c.Now() + + timer.Destroy() + + var remaining time.Duration + // Did we just block for the entire duration? + if err == syserror.ETIMEDOUT { + remaining = 0 + } else { + remaining = dur - after.Sub(start) + if remaining < 0 { + remaining = time.Duration(0) + } + } + + // Copy out remaining time. + if err != nil && rem != usermem.Addr(0) { + timeleft := linux.NsecToTimespec(remaining.Nanoseconds()) + if err := copyTimespecOut(t, rem, &timeleft); err != nil { + return err + } + } + + // Did we just block for the entire duration? + if err == syserror.ETIMEDOUT { + return nil + } + + // If interrupted, arrange for a restart with the remaining duration. + if err == syserror.ErrInterrupted { + t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ + c: c, + duration: remaining, + rem: rem, + }) + return kernel.ERESTART_RESTARTBLOCK + } + + return err +} + +// Nanosleep implements linux syscall Nanosleep(2). +func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + rem := args[1].Pointer() + + ts, err := copyTimespecIn(t, addr) + if err != nil { + return 0, nil, err + } + + if !ts.Valid() { + return 0, nil, syserror.EINVAL + } + + // Just like linux, we cap the timeout with the max number that int64 can + // represent which is roughly 292 years. + dur := time.Duration(ts.ToNsecCapped()) * time.Nanosecond + return 0, nil, clockNanosleepFor(t, t.Kernel().MonotonicClock(), dur, rem) +} + +// ClockNanosleep implements linux syscall clock_nanosleep(2). +func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := int32(args[0].Int()) + flags := args[1].Int() + addr := args[2].Pointer() + rem := args[3].Pointer() + + req, err := copyTimespecIn(t, addr) + if err != nil { + return 0, nil, err + } + + if !req.Valid() { + return 0, nil, syserror.EINVAL + } + + // Only allow clock constants also allowed by Linux. + if clockID > 0 { + if clockID != linux.CLOCK_REALTIME && + clockID != linux.CLOCK_MONOTONIC && + clockID != linux.CLOCK_PROCESS_CPUTIME_ID { + return 0, nil, syserror.EINVAL + } + } + + c, err := getClock(t, clockID) + if err != nil { + return 0, nil, err + } + + if flags&linux.TIMER_ABSTIME != 0 { + return 0, nil, clockNanosleepUntil(t, c, req) + } + + dur := time.Duration(req.ToNsecCapped()) * time.Nanosecond + return 0, nil, clockNanosleepFor(t, c, dur, rem) +} + +// Gettimeofday implements linux syscall gettimeofday(2). +func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + tv := args[0].Pointer() + tz := args[1].Pointer() + + if tv != usermem.Addr(0) { + nowTv := t.Kernel().RealtimeClock().Now().Timeval() + if err := copyTimevalOut(t, tv, &nowTv); err != nil { + return 0, nil, err + } + } + + if tz != usermem.Addr(0) { + // Ask the time package for the timezone. + _, offset := time.Now().Zone() + // This int32 array mimics linux's struct timezone. + timezone := [2]int32{-int32(offset) / 60, 0} + _, err := t.CopyOut(tz, timezone) + return 0, nil, err + } + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go new file mode 100644 index 000000000..4ed077626 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -0,0 +1,168 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ItimerType denotes the type of interval timer. +type ItimerType int + +// Interval timer types from <sys/time.h>. +const ( + // ItimerReal equals to ITIMER_REAL. + ItimerReal ItimerType = iota + // ItimerVirtual equals to ITIMER_VIRTUAL. + ItimerVirtual + // ItimerProf equals to ITIMER_PROF. + ItimerProf +) + +const nsecPerSec = int64(time.Second) + +// copyItimerValIn copies an ItimerVal from the untrusted app range to the +// kernel. The ItimerVal may be either 32 or 64 bits. +// A NULL address is allowed because because Linux allows +// setitimer(which, NULL, &old_value) which disables the timer. +// There is a KERN_WARN message saying this misfeature will be removed. +// However, that hasn't happened as of 3.19, so we continue to support it. +func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) { + if addr == usermem.Addr(0) { + return linux.ItimerVal{}, nil + } + + switch t.Arch().Width() { + case 8: + // Native size, just copy directly. + var itv linux.ItimerVal + if _, err := t.CopyIn(addr, &itv); err != nil { + return linux.ItimerVal{}, err + } + + return itv, nil + default: + return linux.ItimerVal{}, syscall.ENOSYS + } +} + +// copyItimerValOut copies an ItimerVal to the untrusted app range. +// The ItimerVal may be either 32 or 64 bits. +// A NULL address is allowed, in which case no copy takes place +func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error { + if addr == usermem.Addr(0) { + return nil + } + + switch t.Arch().Width() { + case 8: + // Native size, just copy directly. + _, err := t.CopyOut(addr, itv) + return err + default: + return syscall.ENOSYS + } +} + +func findTimer(t *kernel.Task, w ItimerType) (*ktime.Timer, error) { + switch w { + case ItimerReal: + return t.ThreadGroup().Timer().RealTimer, nil + case ItimerVirtual: + return t.ThreadGroup().Timer().VirtualTimer, nil + case ItimerProf: + return t.ThreadGroup().Timer().ProfTimer, nil + default: + return nil, syscall.EINVAL + } +} + +// Getitimer implements linux syscall getitimer(2). +func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := ItimerType(args[0].Int()) + val := args[1].Pointer() + + timer, err := findTimer(t, timerID) + if err != nil { + return 0, nil, err + } + value, interval := ktime.SpecFromSetting(timer.Get()) + olditv := linux.ItimerVal{ + Value: linux.DurationToTimeval(value), + Interval: linux.DurationToTimeval(interval), + } + + return 0, nil, copyItimerValOut(t, val, &olditv) +} + +// Setitimer implements linux syscall setitimer(2). +func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + timerID := ItimerType(args[0].Int()) + newVal := args[1].Pointer() + oldVal := args[2].Pointer() + + timer, err := findTimer(t, timerID) + if err != nil { + return 0, nil, err + } + + itv, err := copyItimerValIn(t, newVal) + if err != nil { + return 0, nil, err + } + // Just like linux, we cap the timer value and interval with the max + // number that int64 can represent which is roughly 292 years. + s, err := ktime.SettingFromSpec(itv.Value.ToDuration(), + itv.Interval.ToDuration(), timer.Clock()) + if err != nil { + return 0, nil, err + } + + valueNS, intervalNS := ktime.SpecFromSetting(timer.Swap(s)) + olditv := linux.ItimerVal{ + Value: linux.DurationToTimeval(valueNS), + Interval: linux.DurationToTimeval(intervalNS), + } + + return 0, nil, copyItimerValOut(t, oldVal, &olditv) +} + +// Alarm implements linux syscall alarm(2). +func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + duration := time.Duration(args[0].Uint()) * time.Second + + timer := t.ThreadGroup().Timer().RealTimer + s, err := ktime.SettingFromSpec(duration, 0, timer.Clock()) + if err != nil { + return 0, nil, err + } + + value, _ := ktime.SpecFromSetting(timer.Swap(s)) + sec := int64(value) / nsecPerSec + nsec := int64(value) % nsecPerSec + // We can't return 0 if we have an alarm pending ... + if (sec == 0 && nsec > 0) || nsec >= nsecPerSec/2 { + sec++ + } + + return uintptr(sec), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go new file mode 100644 index 000000000..cb81d42b9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TimerfdCreate implements Linux syscall timerfd_create(2). +func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + flags := args[1].Int() + + if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { + return 0, nil, syserror.EINVAL + } + + var c ktime.Clock + switch clockID { + case linux.CLOCK_REALTIME: + c = t.Kernel().RealtimeClock() + case linux.CLOCK_MONOTONIC: + c = t.Kernel().MonotonicClock() + default: + return 0, nil, syserror.EINVAL + } + f := timerfd.NewFile(t, c) + defer f.DecRef() + f.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags&linux.TFD_NONBLOCK != 0, + }) + + fd, err := t.FDMap().NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&linux.TFD_CLOEXEC != 0, + }, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// TimerfdSettime implements Linux syscall timerfd_settime(2). +func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { + return 0, nil, syserror.EINVAL + } + + f := t.FDMap().GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + tf, ok := f.FileOperations.(*timerfd.TimerOperations) + if !ok { + return 0, nil, syserror.EINVAL + } + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + var s ktime.Setting + var err error + if flags&linux.TFD_TIMER_ABSTIME != 0 { + s, err = ktime.SettingFromAbsSpec(ktime.FromTimespec(newVal.Value), + newVal.Interval.ToDuration()) + } else { + s, err = ktime.SettingFromSpec(newVal.Value.ToDuration(), + newVal.Interval.ToDuration(), tf.Clock()) + } + if err != nil { + return 0, nil, err + } + valueNS, intervalNS := ktime.SpecFromSetting(tf.SetTime(s)) + if oldValAddr == 0 { + return 0, nil, nil + } + oldVal := linux.Itimerspec{ + Interval: linux.DurationToTimespec(intervalNS), + Value: linux.DurationToTimespec(valueNS), + } + _, err = t.CopyOut(oldValAddr, &oldVal) + return 0, nil, err +} + +// TimerfdGettime implements Linux syscall timerfd_gettime(2). +func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + curValAddr := args[1].Pointer() + + f := t.FDMap().GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + tf, ok := f.FileOperations.(*timerfd.TimerOperations) + if !ok { + return 0, nil, syserror.EINVAL + } + + valueNS, intervalNS := ktime.SpecFromSetting(tf.GetTime()) + curVal := linux.Itimerspec{ + Interval: linux.DurationToTimespec(intervalNS), + Value: linux.DurationToTimespec(valueNS), + } + _, err := t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} diff --git a/pkg/sentry/syscalls/linux/sys_tls.go b/pkg/sentry/syscalls/linux/sys_tls.go new file mode 100644 index 000000000..1047364b3 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_tls.go @@ -0,0 +1,48 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//+build amd64 + +package linux + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// ArchPrctl implements linux syscall arch_prctl(2). +// It sets architecture-specific process or thread state for t. +func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + switch args[0].Int() { + case linux.ARCH_GET_FS: + addr := args[1].Pointer() + _, err := t.CopyOut(addr, &t.Arch().StateData().Regs.Fs_base) + if err != nil { + return 0, nil, err + } + + case linux.ARCH_SET_FS: + regs := &t.Arch().StateData().Regs + regs.Fs = 0 + regs.Fs_base = args[1].Uint64() + + default: + return 0, nil, syscall.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go new file mode 100644 index 000000000..899116374 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -0,0 +1,89 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Uname implements linux syscall uname. +func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + version := t.SyscallTable().Version + + uts := t.UTSNamespace() + + // Fill in structure fields. + var u linux.UtsName + copy(u.Sysname[:], version.Sysname) + copy(u.Nodename[:], uts.HostName()) + copy(u.Release[:], version.Release) + copy(u.Version[:], version.Version) + copy(u.Machine[:], "x86_64") // +build tag above. + copy(u.Domainname[:], uts.DomainName()) + + // Copy out the result. + va := args[0].Pointer() + _, err := t.CopyOut(va, u) + return 0, nil, err +} + +// Setdomainname implements Linux syscall setdomainname. +func Setdomainname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + size := args[1].Int() + + utsns := t.UTSNamespace() + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { + return 0, nil, syserror.EPERM + } + if size < 0 || size > linux.UTSLen { + return 0, nil, syserror.EINVAL + } + + name, err := t.CopyInString(nameAddr, int(size)) + if err != nil { + return 0, nil, err + } + + utsns.SetDomainName(name) + return 0, nil, nil +} + +// Sethostname implements Linux syscall sethostname. +func Sethostname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + size := args[1].Int() + + utsns := t.UTSNamespace() + if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { + return 0, nil, syserror.EPERM + } + if size < 0 || size > linux.UTSLen { + return 0, nil, syserror.EINVAL + } + + name, err := t.CopyInString(nameAddr, int(size)) + if err != nil { + return 0, nil, err + } + + utsns.SetHostName(name) + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go new file mode 100644 index 000000000..caa7b01ea --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -0,0 +1,274 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // EventMaskWrite contains events that can be triggered on writes. + // + // Note that EventHUp is not going to happen for pipes but may for + // implementations of poll on some sockets, see net/core/datagram.c. + EventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr +) + +// Write implements linux syscall write(2). +func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "write", file) +} + +// Pwrite64 implements linux syscall pwrite64(2). +func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) +} + +// Writev implements linux syscall writev(2). +func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := writev(t, file, src) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) +} + +// Pwritev implements linux syscall pwritev(2). +func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := kdefs.FD(args[0].Int()) + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.FDMap().GetFile(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Is writing at an offset supported? + if !file.Flags().Pwrite { + return 0, nil, syserror.ESPIPE + } + + // Check that the file is writable. + if !file.Flags().Write { + return 0, nil, syserror.EBADF + } + + // Read the iovecs that specify the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwritev(t, file, src, offset) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) +} + +func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { + n, err := f.Writev(t, src) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + // Issue the request and break out if it completes with + // anything other than "would block". + n, err = f.Writev(t, src) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + + return total, err +} + +func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + n, err := f.Pwritev(t, src, offset) + if err != syserror.ErrWouldBlock || f.Flags().NonBlocking { + if n > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + f.EventRegister(&w, EventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst64(n) + + // Issue the request and break out if it completes with + // anything other than "would block". + n, err = f.Pwritev(t, src, offset+total) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + if err = t.Block(ch); err != nil { + break + } + } + + f.EventUnregister(&w) + + if total > 0 { + // Queue notification if we wrote anything. + f.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } + + return total, err +} diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go new file mode 100644 index 000000000..e865c6fc0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// copyTimespecIn copies a Timespec from the untrusted app range to the kernel. +func copyTimespecIn(t *kernel.Task, addr usermem.Addr) (linux.Timespec, error) { + switch t.Arch().Width() { + case 8: + ts := linux.Timespec{} + in := t.CopyScratchBuffer(16) + _, err := t.CopyInBytes(addr, in) + if err != nil { + return ts, err + } + ts.Sec = int64(usermem.ByteOrder.Uint64(in[0:])) + ts.Nsec = int64(usermem.ByteOrder.Uint64(in[8:])) + return ts, nil + default: + return linux.Timespec{}, syserror.ENOSYS + } +} + +// copyTimespecOut copies a Timespec to the untrusted app range. +func copyTimespecOut(t *kernel.Task, addr usermem.Addr, ts *linux.Timespec) error { + switch t.Arch().Width() { + case 8: + out := t.CopyScratchBuffer(16) + usermem.ByteOrder.PutUint64(out[0:], uint64(ts.Sec)) + usermem.ByteOrder.PutUint64(out[8:], uint64(ts.Nsec)) + _, err := t.CopyOutBytes(addr, out) + return err + default: + return syserror.ENOSYS + } +} + +// copyTimevalIn copies a Timeval from the untrusted app range to the kernel. +func copyTimevalIn(t *kernel.Task, addr usermem.Addr) (linux.Timeval, error) { + switch t.Arch().Width() { + case 8: + tv := linux.Timeval{} + in := t.CopyScratchBuffer(16) + _, err := t.CopyInBytes(addr, in) + if err != nil { + return tv, err + } + tv.Sec = int64(usermem.ByteOrder.Uint64(in[0:])) + tv.Usec = int64(usermem.ByteOrder.Uint64(in[8:])) + return tv, nil + default: + return linux.Timeval{}, syscall.ENOSYS + } +} + +// copyTimevalOut copies a Timeval to the untrusted app range. +func copyTimevalOut(t *kernel.Task, addr usermem.Addr, tv *linux.Timeval) error { + switch t.Arch().Width() { + case 8: + out := t.CopyScratchBuffer(16) + usermem.ByteOrder.PutUint64(out[0:], uint64(tv.Sec)) + usermem.ByteOrder.PutUint64(out[8:], uint64(tv.Usec)) + _, err := t.CopyOutBytes(addr, out) + return err + default: + return syscall.ENOSYS + } +} + +// copyTimespecInToDuration copies a Timespec from the untrusted app range, +// validates it and converts it to a Duration. +// +// If the Timespec is larger than what can be represented in a Duration, the +// returned value is the maximum that Duration will allow. +// +// If timespecAddr is NULL, the returned value is negative. +func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) { + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timespecAddr != 0 { + timespec, err := copyTimespecIn(t, timespecAddr) + if err != nil { + return 0, err + } + if !timespec.Valid() { + return 0, syscall.EINVAL + } + timeout = time.Duration(timespec.ToNsecCapped()) + } + return timeout, nil +} diff --git a/pkg/sentry/syscalls/polling.go b/pkg/sentry/syscalls/polling.go new file mode 100644 index 000000000..fd90184ef --- /dev/null +++ b/pkg/sentry/syscalls/polling.go @@ -0,0 +1,137 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package syscalls + +import ( + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// PollFD describes a pollable FD. +type PollFD struct { + FD kdefs.FD + Events waiter.EventMask + REvents waiter.EventMask +} + +// pollState tracks the associated file descriptor and waiter of a PollFD. +type pollState struct { + file *fs.File + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". +func (pfd *PollFD) initReadiness(t *kernel.Task, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.FDMap().GetFile(pfd.FD) + if file == nil { + pfd.REvents = waiter.EventNVal + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, pfd.Events) + } + + pfd.REvents = file.Readiness(pfd.Events) & pfd.Events +} + +// releaseState releases all the pollState in "state". +func releaseState(state []pollState) { + for i := range state { + if state[i].file != nil { + state[i].file.EventUnregister(&state[i].waiter) + state[i].file.DecRef() + } + } +} + +// Poll polls the PollFDs in "pfd" with a bounded time specified in "timeout" +// when "timeout" is greater than zero. +// +// Poll returns the remaining timeout, which is always 0 on a timeout; and 0 or +// positive if interrupted by a signal. +func Poll(t *kernel.Task, pfd []PollFD, timeout time.Duration) (time.Duration, uintptr, error) { + var ch chan struct{} + if timeout != 0 { + ch = make(chan struct{}, 1) + } + + // Register for event notification in the files involved if we may + // block (timeout not zero). Once we find a file that has a non-zero + // result, we stop registering for events but still go through all files + // to get their ready masks. + state := make([]pollState, len(pfd)) + defer releaseState(state) + n := uintptr(0) + for i := range pfd { + pfd[i].initReadiness(t, &state[i], ch) + if pfd[i].REvents != 0 { + n++ + ch = nil + } + } + + if timeout == 0 { + return timeout, n, nil + } + + forever := timeout < 0 + + for n == 0 { + var err error + // Wait for a notification. + timeout, err = t.BlockWithTimeout(ch, !forever, timeout) + if err != nil { + if err == syscall.ETIMEDOUT { + err = nil + } + return timeout, 0, err + } + + // We got notified, count how many files are ready. If none, + // then this was a spurious notification, and we just go back + // to sleep with the remaining timeout. + for i := range state { + if state[i].file == nil { + continue + } + + ready := state[i].file.Readiness(pfd[i].Events) & pfd[i].Events + if ready != 0 { + pfd[i].REvents = ready + n++ + } + } + } + + return timeout, n, nil +} diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go new file mode 100644 index 000000000..1176f858d --- /dev/null +++ b/pkg/sentry/syscalls/syscalls.go @@ -0,0 +1,72 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package syscalls is the interface from the application to the kernel. +// Traditionally, syscalls is the interface that is used by applications to +// request services from the kernel of a operating system. We provide a +// user-mode kernel that needs to handle those requests coming from unmodified +// applications. Therefore, we still use the term "syscalls" to denote this +// interface. +// +// Note that the stubs in this package may merely provide the interface, not +// the actual implementation. It just makes writing syscall stubs +// straightforward. +package syscalls + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + uspb "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/unimplemented_syscall_go_proto" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Error returns a syscall handler that will always give the passed error. +func Error(err error) kernel.SyscallFn { + return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, err + } +} + +// ErrorWithEvent gives a syscall function that sends an unimplemented +// syscall event via the event channel and returns the passed error. +func ErrorWithEvent(err error) kernel.SyscallFn { + return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + UnimplementedEvent(t) + return 0, nil, err + } +} + +// CapError gives a syscall function that checks for capability c. If the task +// has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged +// tasks, it will seem like there is an implementation. +func CapError(c linux.Capability) kernel.SyscallFn { + return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if !t.HasCapability(c) { + return 0, nil, syserror.EPERM + } + UnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } +} + +// UnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. +func UnimplementedEvent(t *kernel.Task) { + eventchannel.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} diff --git a/pkg/sentry/syscalls/unimplemented_syscall.proto b/pkg/sentry/syscalls/unimplemented_syscall.proto new file mode 100644 index 000000000..d6febf5b1 --- /dev/null +++ b/pkg/sentry/syscalls/unimplemented_syscall.proto @@ -0,0 +1,27 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +import "pkg/sentry/arch/registers.proto"; + +message UnimplementedSyscall { + // Task ID. + int32 tid = 1; + + // Registers at the time of the call. + Registers registers = 2; +} |