summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/BUILD234
-rw-r--r--pkg/sentry/kernel/README.md106
-rw-r--r--pkg/sentry/kernel/abstract_socket_namespace.go108
-rw-r--r--pkg/sentry/kernel/auth/BUILD73
-rw-r--r--pkg/sentry/kernel/auth/auth.go22
-rw-r--r--pkg/sentry/kernel/auth/capability_set.go61
-rw-r--r--pkg/sentry/kernel/auth/context.go36
-rw-r--r--pkg/sentry/kernel/auth/credentials.go227
-rw-r--r--pkg/sentry/kernel/auth/id.go121
-rw-r--r--pkg/sentry/kernel/auth/id_map.go283
-rw-r--r--pkg/sentry/kernel/auth/id_map_functions.go45
-rw-r--r--pkg/sentry/kernel/auth/user_namespace.go130
-rw-r--r--pkg/sentry/kernel/context.go135
-rw-r--r--pkg/sentry/kernel/epoll/BUILD52
-rw-r--r--pkg/sentry/kernel/epoll/epoll.go466
-rw-r--r--pkg/sentry/kernel/epoll/epoll_state.go51
-rw-r--r--pkg/sentry/kernel/epoll/epoll_test.go54
-rw-r--r--pkg/sentry/kernel/eventfd/BUILD46
-rw-r--r--pkg/sentry/kernel/eventfd/eventfd.go172
-rw-r--r--pkg/sentry/kernel/eventfd/eventfd_test.go78
-rw-r--r--pkg/sentry/kernel/fd_map.go340
-rw-r--r--pkg/sentry/kernel/fd_map_test.go134
-rw-r--r--pkg/sentry/kernel/fs_context.go172
-rw-r--r--pkg/sentry/kernel/futex/BUILD48
-rw-r--r--pkg/sentry/kernel/futex/futex.go405
-rw-r--r--pkg/sentry/kernel/futex/futex_test.go500
-rw-r--r--pkg/sentry/kernel/g3doc/run_states.dot99
-rw-r--r--pkg/sentry/kernel/ipc_namespace.go43
-rw-r--r--pkg/sentry/kernel/kdefs/BUILD10
-rw-r--r--pkg/sentry/kernel/kdefs/kdefs.go20
-rw-r--r--pkg/sentry/kernel/kernel.go957
-rw-r--r--pkg/sentry/kernel/memevent/BUILD31
-rw-r--r--pkg/sentry/kernel/memevent/memory_events.go98
-rw-r--r--pkg/sentry/kernel/memevent/memory_events.proto25
-rw-r--r--pkg/sentry/kernel/pending_signals.go126
-rw-r--r--pkg/sentry/kernel/pipe/BUILD68
-rw-r--r--pkg/sentry/kernel/pipe/buffers.go50
-rw-r--r--pkg/sentry/kernel/pipe/device.go20
-rw-r--r--pkg/sentry/kernel/pipe/node.go175
-rw-r--r--pkg/sentry/kernel/pipe/node_test.go308
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go335
-rw-r--r--pkg/sentry/kernel/pipe/pipe_test.go138
-rw-r--r--pkg/sentry/kernel/pipe/reader.go37
-rw-r--r--pkg/sentry/kernel/pipe/reader_writer.go91
-rw-r--r--pkg/sentry/kernel/pipe/writer.go37
-rw-r--r--pkg/sentry/kernel/ptrace.go1054
-rw-r--r--pkg/sentry/kernel/rseq.go118
-rw-r--r--pkg/sentry/kernel/sched/BUILD20
-rw-r--r--pkg/sentry/kernel/sched/cpuset.go105
-rw-r--r--pkg/sentry/kernel/sched/cpuset_test.go44
-rw-r--r--pkg/sentry/kernel/sched/sched.go16
-rw-r--r--pkg/sentry/kernel/seccomp.go205
-rw-r--r--pkg/sentry/kernel/semaphore/BUILD62
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore.go473
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore_test.go172
-rw-r--r--pkg/sentry/kernel/sessions.go462
-rw-r--r--pkg/sentry/kernel/signal.go69
-rw-r--r--pkg/sentry/kernel/signal_handlers.go79
-rw-r--r--pkg/sentry/kernel/syscalls.go305
-rw-r--r--pkg/sentry/kernel/syscalls_state.go29
-rw-r--r--pkg/sentry/kernel/syslog.go100
-rw-r--r--pkg/sentry/kernel/table_test.go108
-rw-r--r--pkg/sentry/kernel/task.go606
-rw-r--r--pkg/sentry/kernel/task_acct.go111
-rw-r--r--pkg/sentry/kernel/task_block.go207
-rw-r--r--pkg/sentry/kernel/task_clone.go475
-rw-r--r--pkg/sentry/kernel/task_context.go179
-rw-r--r--pkg/sentry/kernel/task_exec.go240
-rw-r--r--pkg/sentry/kernel/task_exit.go1139
-rw-r--r--pkg/sentry/kernel/task_identity.go557
-rw-r--r--pkg/sentry/kernel/task_log.go137
-rw-r--r--pkg/sentry/kernel/task_net.go35
-rw-r--r--pkg/sentry/kernel/task_resources.go126
-rw-r--r--pkg/sentry/kernel/task_run.go346
-rw-r--r--pkg/sentry/kernel/task_sched.go329
-rw-r--r--pkg/sentry/kernel/task_signals.go1056
-rw-r--r--pkg/sentry/kernel/task_start.go252
-rw-r--r--pkg/sentry/kernel/task_stop.go226
-rw-r--r--pkg/sentry/kernel/task_syscall.go434
-rw-r--r--pkg/sentry/kernel/task_test.go69
-rw-r--r--pkg/sentry/kernel/task_usermem.go298
-rw-r--r--pkg/sentry/kernel/thread_group.go269
-rw-r--r--pkg/sentry/kernel/threads.go443
-rw-r--r--pkg/sentry/kernel/time/BUILD32
-rw-r--r--pkg/sentry/kernel/time/context.go44
-rw-r--r--pkg/sentry/kernel/time/time.go649
-rw-r--r--pkg/sentry/kernel/timekeeper.go270
-rw-r--r--pkg/sentry/kernel/timekeeper_state.go41
-rw-r--r--pkg/sentry/kernel/timekeeper_test.go156
-rw-r--r--pkg/sentry/kernel/timer.go282
-rw-r--r--pkg/sentry/kernel/uts_namespace.go100
-rw-r--r--pkg/sentry/kernel/vdso.go145
-rw-r--r--pkg/sentry/kernel/version.go33
93 files changed, 19474 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
new file mode 100644
index 000000000..62794cff5
--- /dev/null
+++ b/pkg/sentry/kernel/BUILD
@@ -0,0 +1,234 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+ name = "kernel_state",
+ srcs = [
+ "abstract_socket_namespace.go",
+ "fd_map.go",
+ "fs_context.go",
+ "ipc_namespace.go",
+ "kernel.go",
+ "pending_signals.go",
+ "pending_signals_list.go",
+ "process_group_list.go",
+ "ptrace.go",
+ "rseq.go",
+ "session_list.go",
+ "sessions.go",
+ "signal.go",
+ "signal_handlers.go",
+ "syscalls.go",
+ "syscalls_state.go",
+ "syslog.go",
+ "task.go",
+ "task_clone.go",
+ "task_context.go",
+ "task_exec.go",
+ "task_exit.go",
+ "task_list.go",
+ "task_resources.go",
+ "task_run.go",
+ "task_sched.go",
+ "task_signals.go",
+ "task_start.go",
+ "task_syscall.go",
+ "thread_group.go",
+ "threads.go",
+ "timekeeper.go",
+ "timekeeper_state.go",
+ "timer.go",
+ "uts_namespace.go",
+ "vdso.go",
+ "version.go",
+ ],
+ out = "kernel_state.go",
+ imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"],
+ package = "kernel",
+)
+
+go_template_instance(
+ name = "pending_signals_list",
+ out = "pending_signals_list.go",
+ package = "kernel",
+ prefix = "pendingSignal",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Linker": "*pendingSignal",
+ },
+)
+
+go_template_instance(
+ name = "process_group_list",
+ out = "process_group_list.go",
+ package = "kernel",
+ prefix = "processGroup",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Linker": "*ProcessGroup",
+ },
+)
+
+go_template_instance(
+ name = "seqatomic_taskgoroutineschedinfo",
+ out = "seqatomic_taskgoroutineschedinfo.go",
+ package = "kernel",
+ suffix = "TaskGoroutineSchedInfo",
+ template = "//pkg/sync:generic_seqatomic",
+ types = {
+ "Value": "TaskGoroutineSchedInfo",
+ },
+)
+
+go_template_instance(
+ name = "session_list",
+ out = "session_list.go",
+ package = "kernel",
+ prefix = "session",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Linker": "*Session",
+ },
+)
+
+go_template_instance(
+ name = "task_list",
+ out = "task_list.go",
+ package = "kernel",
+ prefix = "task",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Linker": "*Task",
+ },
+)
+
+go_library(
+ name = "kernel",
+ srcs = [
+ "abstract_socket_namespace.go",
+ "context.go",
+ "fd_map.go",
+ "fs_context.go",
+ "ipc_namespace.go",
+ "kernel.go",
+ "kernel_state.go",
+ "pending_signals.go",
+ "pending_signals_list.go",
+ "process_group_list.go",
+ "ptrace.go",
+ "rseq.go",
+ "seccomp.go",
+ "seqatomic_taskgoroutineschedinfo.go",
+ "session_list.go",
+ "sessions.go",
+ "signal.go",
+ "signal_handlers.go",
+ "syscalls.go",
+ "syscalls_state.go",
+ "syslog.go",
+ "task.go",
+ "task_acct.go",
+ "task_block.go",
+ "task_clone.go",
+ "task_context.go",
+ "task_exec.go",
+ "task_exit.go",
+ "task_identity.go",
+ "task_list.go",
+ "task_log.go",
+ "task_net.go",
+ "task_resources.go",
+ "task_run.go",
+ "task_sched.go",
+ "task_signals.go",
+ "task_start.go",
+ "task_stop.go",
+ "task_syscall.go",
+ "task_usermem.go",
+ "thread_group.go",
+ "threads.go",
+ "timekeeper.go",
+ "timekeeper_state.go",
+ "timer.go",
+ "uts_namespace.go",
+ "vdso.go",
+ "version.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel",
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/abi",
+ "//pkg/abi/linux",
+ "//pkg/amutex",
+ "//pkg/binary",
+ "//pkg/bits",
+ "//pkg/bpf",
+ "//pkg/cpuid",
+ "//pkg/eventchannel",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/secio",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/lock",
+ "//pkg/sentry/fs/timerfd",
+ "//pkg/sentry/hostcpu",
+ "//pkg/sentry/inet",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/epoll",
+ "//pkg/sentry/kernel/futex",
+ "//pkg/sentry/kernel/kdefs",
+ "//pkg/sentry/kernel/sched",
+ "//pkg/sentry/kernel/semaphore",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/loader",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/mm",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/socket/netlink/port",
+ "//pkg/sentry/time",
+ "//pkg/sentry/uniqueid",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/state/statefile",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/tcpip",
+ "//pkg/tcpip/stack",
+ "//pkg/tcpip/transport/unix",
+ "//pkg/waiter",
+ ],
+)
+
+go_test(
+ name = "kernel_test",
+ size = "small",
+ srcs = [
+ "fd_map_test.go",
+ "table_test.go",
+ "task_test.go",
+ "timekeeper_test.go",
+ ],
+ embed = [":kernel"],
+ deps = [
+ "//pkg/abi",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs/filetest",
+ "//pkg/sentry/kernel/kdefs",
+ "//pkg/sentry/kernel/sched",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/time",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md
new file mode 100644
index 000000000..3306780d6
--- /dev/null
+++ b/pkg/sentry/kernel/README.md
@@ -0,0 +1,106 @@
+This package contains:
+
+- A (partial) emulation of the "core Linux kernel", which governs task
+ execution and scheduling, system call dispatch, and signal handling. See
+ below for details.
+
+- The top-level interface for the sentry's Linux kernel emulation in general,
+ used by the `main` function of all versions of the sentry. This interface
+ revolves around the `Env` type (defined in `kernel.go`).
+
+# Background
+
+In Linux, each schedulable context is referred to interchangeably as a "task" or
+"thread". Tasks can be divided into userspace and kernel tasks. In the sentry,
+scheduling is managed by the Go runtime, so each schedulable context is a
+goroutine; only "userspace" (application) contexts are referred to as tasks, and
+represented by Task objects. (From this point forward, "task" refers to the
+sentry's notion of a task unless otherwise specified.)
+
+At a high level, Linux application threads can be thought of as repeating a "run
+loop":
+
+- Some amount of application code is executed in userspace.
+
+- A trap (explicit syscall invocation, hardware interrupt or exception, etc.)
+ causes control flow to switch to the kernel.
+
+- Some amount of kernel code is executed in kernelspace, e.g. to handle the
+ cause of the trap.
+
+- The kernel "returns from the trap" into application code.
+
+Analogously, each task in the sentry is associated with a *task goroutine* that
+executes that task's run loop (`Task.run` in `task_run.go`). However, the
+sentry's task run loop differs in structure in order to support saving execution
+state to, and resuming execution from, checkpoints.
+
+While in kernelspace, a Linux thread can be descheduled (cease execution) in a
+variety of ways:
+
+- It can yield or be preempted, becoming temporarily descheduled but still
+ runnable. At present, the sentry delegates scheduling of runnable threads to
+ the Go runtime.
+
+- It can exit, becoming permanently descheduled. The sentry's equivalent is
+ returning from `Task.run`, terminating the task goroutine.
+
+- It can enter interruptible sleep, a state in which it can be woken by a
+ caller-defined wakeup or the receipt of a signal. In the sentry, interruptible
+ sleep (which is ambiguously referred to as *blocking*) is implemented by
+ communicating all events that can end blocking (including signal
+ notifications) via Go channels and using `select` to multiplex wakeup
+ sources; see `task_block.go`.
+
+- It can enter uninterruptible sleep, a state in which it can only be woken by a
+ caller-defined wakeup. Killable sleep is a closely related variant in which
+ the task can also be woken by SIGKILL. (These definitions also include Linux's
+ "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" (`TASK_TRACED`) states.)
+
+To maximize compatibility with Linux, sentry checkpointing appears as a spurious
+signal-delivery interrupt on all tasks; interrupted system calls return `EINTR`
+or are automatically restarted as usual. However, these semantics require that
+uninterruptible and killable sleeps do not appear to be interrupted. In other
+words, the state of the task, including its progress through the interrupted
+operation, must be preserved by checkpointing. For many such sleeps, the wakeup
+condition is application-controlled, making it infeasible to wait for the sleep
+to end before checkpointing. Instead, we must support checkpointing progress
+through sleeping operations.
+
+# Implementation
+
+We break the task's control flow graph into *states*, delimited by:
+
+1. Points where uninterruptible and killable sleeps may occur. For example,
+there exists a state boundary between signal dequeueing and signal delivery
+because there may be an intervening ptrace signal-delivery-stop.
+
+2. Points where sleep-induced branches may "rejoin" normal execution. For
+example, the syscall exit state exists because it can be reached immediately
+following a synchronous syscall, or after a task that is sleeping in `execve()`
+or `vfork()` resumes execution.
+
+3. Points containing large branches. This is strictly for organizational
+purposes. For example, the state that processes interrupt-signaled conditions is
+kept separate from the main "app" state to reduce the size of the latter.
+
+4. `SyscallReinvoke`, which does not correspond to anything in Linux, and exists
+solely to serve the autosave feature.
+
+![dot -Tsvg -Goverlap=false -orun_states.svg run_states.dot](g3doc/run_states.dot "Task control flow graph")
+
+States before which a stop may occur are represented as implementations of the
+`taskRunState` interface named `run(state)`, allowing them to be saved and
+restored. States that cannot be immediately preceded by a stop are simply `Task`
+methods named `do(state)`.
+
+Conditions that can require task goroutines to cease execution for unknown
+lengths of time are called *stops*. Stops are divided into *internal stops*,
+which are stops whose start and end conditions are implemented within the
+sentry, and *external stops*, which are stops whose start and end conditions are
+not known to the sentry. Hence all uninterruptible and killable sleeps are
+internal stops, and the existence of a pending checkpoint operation is an
+external stop. Internal stops are reified into instances of the `TaskStop` type,
+while external stops are merely counted. The task run loop alternates between
+checking for stops and advancing the task's state. This allows checkpointing to
+hold tasks in a stopped state while waiting for all tasks in the system to stop.
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
new file mode 100644
index 000000000..014c4a3bf
--- /dev/null
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -0,0 +1,108 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+)
+
+// abstractEndpoint is the value stored in AbstractSocketNamespace.endpoints
+// for one bound name. It is also passed to refs.NewWeakRef as the user of wr
+// (see WeakRefGone), which lets the entry be removed from the namespace.
+type abstractEndpoint struct {
+ ep unix.BoundEndpoint // endpoint registered by Bind under name
+ wr *refs.WeakRef // weak reference created in Bind from the caller's rc
+ name string // key of this entry in ns.endpoints
+ ns *AbstractSocketNamespace // namespace whose endpoints map holds this entry
+}
+
+// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
+//
+// It removes e's entry from the owning namespace, holding the namespace mutex
+// while it inspects and modifies the endpoints map.
+func (e *abstractEndpoint) WeakRefGone() {
+ e.ns.mu.Lock()
+ // Delete the entry only if the name still maps to this endpoint; Bind may
+ // have stored a different endpoint under the same name in the meantime.
+ if e.ns.endpoints[e.name].ep == e.ep {
+ delete(e.ns.endpoints, e.name)
+ }
+ e.ns.mu.Unlock()
+}
+
+// AbstractSocketNamespace is used to implement the Linux abstract socket functionality.
+type AbstractSocketNamespace struct {
+ // mu is held by Bind, BoundEndpoint and abstractEndpoint.WeakRefGone while
+ // they read or modify endpoints.
+ mu sync.Mutex `state:"nosave"`
+
+ // endpoints maps each bound name to its abstractEndpoint entry.
+ endpoints map[string]abstractEndpoint
+}
+
+// NewAbstractSocketNamespace allocates an AbstractSocketNamespace whose
+// endpoints map is initialized and empty.
+func NewAbstractSocketNamespace() *AbstractSocketNamespace {
+	ns := new(AbstractSocketNamespace)
+	ns.endpoints = make(map[string]abstractEndpoint)
+	return ns
+}
+
+// A boundEndpoint wraps a unix.BoundEndpoint to maintain a reference on its
+// backing object.
+type boundEndpoint struct {
+ unix.BoundEndpoint
+ // rc is the reference obtained from the namespace entry's weak reference
+ // in AbstractSocketNamespace.BoundEndpoint; Release drops it.
+ rc refs.RefCounter
+}
+
+// Release implements unix.BoundEndpoint.Release.
+//
+// It drops the reference held in rc and then releases the wrapped
+// unix.BoundEndpoint.
+func (e *boundEndpoint) Release() {
+ e.rc.DecRef()
+ e.BoundEndpoint.Release()
+}
+
+// BoundEndpoint retrieves the endpoint bound to the given name. The return
+// value is nil if no endpoint was bound.
+//
+// A non-nil result is a wrapper that carries the reference returned by the
+// entry's weak reference; calling Release on it drops that reference.
+func (a *AbstractSocketNamespace) BoundEndpoint(name string) unix.BoundEndpoint {
+ a.mu.Lock()
+ defer a.mu.Unlock()
+
+ ep, ok := a.endpoints[name]
+ if !ok {
+ return nil
+ }
+
+ // A nil result means no reference could be obtained through the weak
+ // reference (presumably the backing object is gone -- confirm against
+ // refs.WeakRef.Get), so the stale entry is dropped and treated as unbound.
+ rc := ep.wr.Get()
+ if rc == nil {
+ delete(a.endpoints, name)
+ return nil
+ }
+
+ // Hand rc to the wrapper so that its Release method can DecRef it.
+ return &boundEndpoint{ep.ep, rc}
+}
+
+// Bind registers ep in the namespace under name.
+//
+// If name already maps to an entry whose weak reference still yields a
+// reference, that reference is dropped again and EADDRINUSE is returned.
+// Otherwise a new entry is stored, replacing any stale one. When the last
+// reference managed by rc is dropped, ep may be removed from the namespace.
+func (a *AbstractSocketNamespace) Bind(name string, ep unix.BoundEndpoint, rc refs.RefCounter) error {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	if existing, found := a.endpoints[name]; found {
+		if live := existing.wr.Get(); live != nil {
+			// Only needed to probe liveness; give the reference back.
+			live.DecRef()
+			return syscall.EADDRINUSE
+		}
+	}
+
+	entry := abstractEndpoint{name: name, ns: a, ep: ep}
+	entry.wr = refs.NewWeakRef(rc, &entry)
+	a.endpoints[name] = entry
+	return nil
+}
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
new file mode 100644
index 000000000..7f0680b88
--- /dev/null
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -0,0 +1,73 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+ name = "auth_state",
+ srcs = [
+ "credentials.go",
+ "id.go",
+ "id_map_range.go",
+ "id_map_set.go",
+ "user_namespace.go",
+ ],
+ out = "auth_state.go",
+ package = "auth",
+)
+
+go_template_instance(
+ name = "id_map_range",
+ out = "id_map_range.go",
+ package = "auth",
+ prefix = "idMap",
+ template = "//pkg/segment:generic_range",
+ types = {
+ "T": "uint32",
+ },
+)
+
+go_template_instance(
+ name = "id_map_set",
+ out = "id_map_set.go",
+ consts = {
+ "minDegree": "3",
+ },
+ package = "auth",
+ prefix = "idMap",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint32",
+ "Range": "idMapRange",
+ "Value": "uint32",
+ "Functions": "idMapFunctions",
+ },
+)
+
+go_library(
+ name = "auth",
+ srcs = [
+ "auth.go",
+ "auth_state.go",
+ "capability_set.go",
+ "context.go",
+ "credentials.go",
+ "id.go",
+ "id_map.go",
+ "id_map_functions.go",
+ "id_map_range.go",
+ "id_map_set.go",
+ "user_namespace.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/bits",
+ "//pkg/log",
+ "//pkg/sentry/context",
+ "//pkg/state",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go
new file mode 100644
index 000000000..c49a6b852
--- /dev/null
+++ b/pkg/sentry/kernel/auth/auth.go
@@ -0,0 +1,22 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package auth implements an access control model that is a subset of Linux's.
+//
+// The auth package supports two kinds of access controls: user/group IDs and
+// capabilities. Each resource in the security model is associated with a user
+// namespace; "privileged" operations check that the operator's credentials
+// have the required user/group IDs or capabilities within the user namespace
+// of accessed resources.
+package auth
diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go
new file mode 100644
index 000000000..5b8164c49
--- /dev/null
+++ b/pkg/sentry/kernel/auth/capability_set.go
@@ -0,0 +1,61 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+)
+
+// A CapabilitySet is a set of capabilities implemented as a bitset. The zero
+// value of CapabilitySet is a set containing no capabilities.
+type CapabilitySet uint64
+
+// AllCapabilities is a CapabilitySet containing all valid capabilities.
+//
+// It is computed as the set for capability MaxCapability+1, minus one;
+// assuming bits.MaskOf64(i) is the single-bit mask 1<<i (TODO confirm), this
+// sets every bit from 0 through MaxCapability.
+var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1
+
+// CapabilitySetOf builds the CapabilitySet whose only member is cp, using the
+// 64-bit mask that bits.MaskOf64 yields for cp's numeric value.
+func CapabilitySetOf(cp linux.Capability) CapabilitySet {
+	mask := bits.MaskOf64(int(cp))
+	return CapabilitySet(mask)
+}
+
+// CapabilitySetOfMany builds the CapabilitySet that is the union of the masks
+// of every capability in cps. An empty or nil cps yields the empty set.
+func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet {
+	var set CapabilitySet
+	for i := range cps {
+		set |= CapabilitySet(bits.MaskOf64(int(cps[i])))
+	}
+	return set
+}
+
+// TaskCapabilities represents all the capability sets for a task. Each of these
+// sets is explained in greater detail in capabilities(7).
+type TaskCapabilities struct {
+ // PermittedCaps is a limiting superset for the effective capabilities that
+ // the thread may assume.
+ PermittedCaps CapabilitySet
+ // InheritableCaps is a set of capabilities preserved across an execve(2).
+ InheritableCaps CapabilitySet
+ // EffectiveCaps is the set of capabilities used by the kernel to perform
+ // permission checks for the thread.
+ EffectiveCaps CapabilitySet
+ // BoundingCaps is a limiting superset for the capabilities that a thread
+ // can add to its inheritable set using capset(2).
+ BoundingCaps CapabilitySet
+ // AmbientCaps is a set of capabilities that are preserved across an
+ // execve(2) of a program that is not privileged.
+ AmbientCaps CapabilitySet
+}
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
new file mode 100644
index 000000000..914589b28
--- /dev/null
+++ b/pkg/sentry/kernel/auth/context.go
@@ -0,0 +1,36 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the auth package's type for context.Context.Value keys.
+type contextID int
+
+const (
+ // CtxCredentials is a Context.Value key for Credentials.
+ CtxCredentials contextID = iota
+)
+
+// CredentialsFromContext returns the Credentials stored in ctx under
+// CtxCredentials, or a set of Credentials with no capabilities if ctx does not
+// have Credentials.
+//
+// NOTE(review): the value obtained from ctx is returned as-is; this function
+// itself does not copy it. If callers rely on receiving a copy, that copy must
+// be made by the Context's Value implementation -- confirm.
+func CredentialsFromContext(ctx context.Context) *Credentials {
+ if v := ctx.Value(CtxCredentials); v != nil {
+ return v.(*Credentials)
+ }
+ return NewAnonymousCredentials()
+}
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
new file mode 100644
index 000000000..b832b28fe
--- /dev/null
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -0,0 +1,227 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Credentials contains information required to authorize privileged operations
+// in a user namespace.
+type Credentials struct {
+ // Real/effective/saved user/group IDs in the root user namespace. None of
+ // these should ever be NoID.
+ RealKUID KUID
+ EffectiveKUID KUID
+ SavedKUID KUID
+ RealKGID KGID
+ EffectiveKGID KGID
+ SavedKGID KGID
+
+ // Filesystem user/group IDs are not implemented. "... setfsuid() is
+ // nowadays unneeded and should be avoided in new applications (likewise
+ // for setfsgid(2))." - setfsuid(2)
+
+ // Supplementary groups used by set/getgroups.
+ //
+ // ExtraKGIDs slices are immutable, allowing multiple Credentials with the
+ // same ExtraKGIDs to share the same slice.
+ ExtraKGIDs []KGID
+
+ // The capability sets applicable to this set of credentials.
+ PermittedCaps CapabilitySet
+ InheritableCaps CapabilitySet
+ EffectiveCaps CapabilitySet
+ BoundingCaps CapabilitySet
+ // Ambient capabilities are not introduced until Linux 4.3.
+
+ // KeepCaps is the flag for PR_SET_KEEPCAPS, which allows capabilities to
+ // be maintained after a switch from root user to non-root user via setuid().
+ KeepCaps bool
+
+ // The user namespace associated with the owner of the credentials.
+ UserNamespace *UserNamespace
+}
+
+// NewAnonymousCredentials builds credentials that hold no capabilities in any
+// user namespace: every user/group ID is the "nobody" ID and the credentials
+// live in a freshly created root user namespace.
+func NewAnonymousCredentials() *Credentials {
+	// The fresh root namespace is owned by KUID 0, whereas these credentials
+	// use non-zero KUID/KGID, so they carry no capabilities inside it. The
+	// namespace is also detached from every existing user namespace
+	// hierarchy, so no capabilities exist in any other namespace either.
+	anon := &Credentials{UserNamespace: NewRootUserNamespace()}
+	anon.RealKUID, anon.EffectiveKUID, anon.SavedKUID = NobodyKUID, NobodyKUID, NobodyKUID
+	anon.RealKGID, anon.EffectiveKGID, anon.SavedKGID = NobodyKGID, NobodyKGID, NobodyKGID
+	return anon
+}
+
+// NewRootCredentials builds credentials whose real, effective and saved
+// KUID/KGID are all 0 (i.e. global root) in user namespace ns, with full
+// permitted, effective and bounding capability sets.
+func NewRootCredentials(ns *UserNamespace) *Credentials {
+	// The inheritable capability set is deliberately left empty. No
+	// documentation for this could be found, but it is the correct initial
+	// state (the capabilities test checks for this property).
+	root := &Credentials{UserNamespace: ns}
+	root.RealKUID, root.EffectiveKUID, root.SavedKUID = RootKUID, RootKUID, RootKUID
+	root.RealKGID, root.EffectiveKGID, root.SavedKGID = RootKGID, RootKGID, RootKGID
+	root.PermittedCaps = AllCapabilities
+	root.EffectiveCaps = AllCapabilities
+	root.BoundingCaps = AllCapabilities
+	return root
+}
+
+// NewUserCredentials returns a set of credentials based on the given UID, GIDs,
+// and capabilities in a given namespace. If all arguments are their zero
+// values, this returns the same credentials as NewRootCredentials.
+//
+// kuid and kgid become the real, effective and saved user/group IDs, and
+// extraKGIDs is appended to the supplementary group list. If capabilities is
+// nil, the capability sets granted by NewRootCredentials are kept; otherwise
+// the permitted, effective, bounding and inheritable sets are taken from
+// capabilities (its ambient set is not used).
+func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials {
+	// Start from root credentials, which already carry the default (full)
+	// permitted, effective and bounding capability sets.
+	creds := NewRootCredentials(ns)
+
+	// Set the UID.
+	creds.RealKUID = kuid
+	creds.EffectiveKUID = kuid
+	creds.SavedKUID = kuid
+
+	// Set the GID.
+	creds.RealKGID = kgid
+	creds.EffectiveKGID = kgid
+	creds.SavedKGID = kgid
+
+	// Set additional GIDs.
+	creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...)
+
+	// Override the capabilities only when they are specified; otherwise the
+	// sets granted by NewRootCredentials above remain in effect.
+	if capabilities != nil {
+		creds.PermittedCaps = capabilities.PermittedCaps
+		creds.EffectiveCaps = capabilities.EffectiveCaps
+		creds.BoundingCaps = capabilities.BoundingCaps
+		creds.InheritableCaps = capabilities.InheritableCaps
+		// TODO: Support ambient capabilities.
+	}
+
+	return creds
+}
+
+// Fork returns a pointer to a new Credentials that is a field-by-field copy
+// of c.
+func (c *Credentials) Fork() *Credentials {
+	// A plain value copy is legal for all fields.
+	dup := *c
+	return &dup
+}
+
+// InGroup reports whether kgid is c's effective group ID or one of its
+// supplementary groups. Compare Linux's kernel/groups.c:in_group_p().
+func (c *Credentials) InGroup(kgid KGID) bool {
+	if kgid == c.EffectiveKGID {
+		return true
+	}
+	for i := range c.ExtraKGIDs {
+		if c.ExtraKGIDs[i] == kgid {
+			return true
+		}
+	}
+	return false
+}
+
+// HasCapabilityIn returns true if c has capability cp in ns.
+//
+// The check starts at ns and walks up through ns.parent. It stops when it
+// reaches c's own namespace (the result is whether cp is in c.EffectiveCaps),
+// when it reaches a namespace that is owned by c's effective KUID and whose
+// parent is c's namespace (true), or when it runs out of parents (false).
+// ns.parent and ns.owner are read without a nil check, so callers are
+// expected to pass a non-nil ns.
+func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool {
+ for {
+ // "1. A process has a capability inside a user namespace if it is a member
+ // of that namespace and it has the capability in its effective capability
+ // set." - user_namespaces(7)
+ if c.UserNamespace == ns {
+ return CapabilitySetOf(cp)&c.EffectiveCaps != 0
+ }
+ // "3. ... A process that resides in the parent of the user namespace and
+ // whose effective user ID matches the owner of the namespace has all
+ // capabilities in the namespace."
+ if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner {
+ return true
+ }
+ // "2. If a process has a capability in a user namespace, then it has that
+ // capability in all child (and further removed descendant) namespaces as
+ // well."
+ if ns.parent == nil {
+ return false
+ }
+ ns = ns.parent
+ }
+}
+
+// HasCapability returns true if c has capability cp in its own user namespace
+// (c.UserNamespace); it is shorthand for c.HasCapabilityIn(cp, c.UserNamespace).
+func (c *Credentials) HasCapability(cp linux.Capability) bool {
+ return c.HasCapabilityIn(cp, c.UserNamespace)
+}
+
+// UseUID validates that c is allowed to use uid within its user namespace and
+// returns uid translated to the root user namespace.
+//
+// It returns EINVAL if uid is not mapped in c's user namespace, and EPERM if
+// c neither has CAP_SETUID nor already holds the ID as its real, effective or
+// saved set-user-ID. In both error cases the returned ID is NoID.
+//
+// The checks UseUID does are common, but you should verify that it's doing
+// exactly what you want.
+func (c *Credentials) UseUID(uid UID) (KUID, error) {
+	kuid := c.UserNamespace.MapToKUID(uid)
+	switch {
+	case !kuid.Ok():
+		// uid must be mapped.
+		return NoID, syserror.EINVAL
+	case c.HasCapability(linux.CAP_SETUID):
+		// With CAP_SETUID, c can use any UID in its user namespace.
+		return kuid, nil
+	case kuid == c.RealKUID, kuid == c.EffectiveKUID, kuid == c.SavedKUID:
+		// Otherwise, c must already hold the UID as its real, effective, or
+		// saved set-user-ID.
+		return kuid, nil
+	default:
+		return NoID, syserror.EPERM
+	}
+}
+
+// UseGID checks that c can use gid in its user namespace, then translates it
+// to the root user namespace.
+//
+// It returns EINVAL if gid is not mapped in c's user namespace, and EPERM if
+// c neither has CAP_SETGID nor already holds the ID as its real, effective or
+// saved set-group-ID.
+func (c *Credentials) UseGID(gid GID) (KGID, error) {
+ // gid must be mapped.
+ kgid := c.UserNamespace.MapToKGID(gid)
+ if !kgid.Ok() {
+ return NoID, syserror.EINVAL
+ }
+ // If c has CAP_SETGID, then it can use any GID in its user namespace.
+ if c.HasCapability(linux.CAP_SETGID) {
+ return kgid, nil
+ }
+ // Otherwise, c must already have the GID as its real, effective, or saved
+ // set-group-ID.
+ if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID {
+ return kgid, nil
+ }
+ return NoID, syserror.EPERM
+}
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
new file mode 100644
index 000000000..37522b018
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id.go
@@ -0,0 +1,121 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "math"
+)
+
+ // UID is a user ID in an unspecified user namespace. A UID is only meaningful
+ // together with the namespace it belongs to; UserNamespace.MapToKUID
+ // translates one into the root user namespace.
+ type UID uint32
+
+ // GID is a group ID in an unspecified user namespace. A GID is only
+ // meaningful together with the namespace it belongs to;
+ // UserNamespace.MapToKGID translates one into the root user namespace.
+ type GID uint32
+
+// In the root user namespace, user/group IDs have a 1-to-1 relationship with
+// the users/groups they represent. In other user namespaces, this is not the
+// case; for example, two different unmapped users may both "have" the overflow
+// UID. This means that it is generally only valid to compare user and group
+// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such
+// IDs to emphasize this distinction. ("k" is for "key", as in "unique key".
+// Linux also uses the prefix "k", but I think they mean "kernel".)
+
+ // KUID is a user ID in the root user namespace. UserNamespace.MapFromKUID (or
+ // KUID.In) translates one into another user namespace.
+ type KUID uint32
+
+ // KGID is a group ID in the root user namespace. UserNamespace.MapFromKGID (or
+ // KGID.In) translates one into another user namespace.
+ type KGID uint32
+
+ const (
+ // NoID is uint32(-1). -1 is consistently used as a special value, in Linux
+ // and by extension in the auth package, to mean "no ID":
+ //
+ // - ID mapping returns -1 if the ID is not mapped.
+ //
+ // - Most set*id() syscalls accept -1 to mean "do not change this ID".
+ //
+ // NoID is an untyped constant, so it can be compared with and returned as a
+ // UID, GID, KUID, KGID, or plain uint32 without conversion.
+ NoID = math.MaxUint32
+
+ // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The
+ // "overflow UID" is usually [1] used when translating a user ID between
+ // namespaces fails because the ID is not mapped. (We don't implement this
+ // file, so the overflow UID is constant.)
+ //
+ // [1] "There is one notable case where unmapped user and group IDs are not
+ // converted to the corresponding overflow ID value. When viewing a uid_map
+ // or gid_map file in which there is no mapping for the second field, that
+ // field is displayed as 4294967295 (-1 as an unsigned integer);" -
+ // user_namespaces(7)
+ OverflowUID = UID(65534)
+ // OverflowGID is the group ID counterpart of OverflowUID; GID.OrOverflow
+ // substitutes it for an invalid GID.
+ OverflowGID = GID(65534)
+
+ // NobodyKUID is the user ID usually reserved for the least privileged user
+ // "nobody".
+ NobodyKUID = KUID(65534)
+ // NobodyKGID is the group ID counterpart of NobodyKUID.
+ NobodyKGID = KGID(65534)
+
+ // RootKUID is the user ID usually used for the most privileged user "root".
+ RootKUID = KUID(0)
+ // RootKGID is the group ID counterpart of RootKUID.
+ RootKGID = KGID(0)
+ // RootUID is ID 0 typed as a UID, i.e. in an unspecified user namespace
+ // rather than the root user namespace.
+ RootUID = UID(0)
+ // RootGID is ID 0 typed as a GID, i.e. in an unspecified user namespace
+ // rather than the root user namespace.
+ RootGID = GID(0)
+ )
+
+// Ok returns true if uid is not -1.
+func (uid UID) Ok() bool {
+ return uid != NoID
+}
+
+// Ok returns true if gid is not -1.
+func (gid GID) Ok() bool {
+ return gid != NoID
+}
+
+// Ok returns true if kuid is not -1.
+func (kuid KUID) Ok() bool {
+ return kuid != NoID
+}
+
+// Ok returns true if kgid is not -1.
+func (kgid KGID) Ok() bool {
+ return kgid != NoID
+}
+
+ // OrOverflow substitutes OverflowUID for an invalid uid and returns a valid
+ // uid unchanged.
+ func (uid UID) OrOverflow() UID {
+ 	if !uid.Ok() {
+ 		return OverflowUID
+ 	}
+ 	return uid
+ }
+
+ // OrOverflow substitutes OverflowGID for an invalid gid and returns a valid
+ // gid unchanged.
+ func (gid GID) OrOverflow() GID {
+ 	if !gid.Ok() {
+ 		return OverflowGID
+ 	}
+ 	return gid
+ }
+
+ // In returns kuid as seen from user namespace ns, or NoID when ns has no
+ // mapping for kuid. It is shorthand for ns.MapFromKUID(kuid).
+ func (kuid KUID) In(ns *UserNamespace) UID {
+ 	local := ns.MapFromKUID(kuid)
+ 	return local
+ }
+
+ // In returns kgid as seen from user namespace ns, or NoID when ns has no
+ // mapping for kgid. It is shorthand for ns.MapFromKGID(kgid).
+ func (kgid KGID) In(ns *UserNamespace) GID {
+ 	local := ns.MapFromKGID(kgid)
+ 	return local
+ }
diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go
new file mode 100644
index 000000000..6adb33530
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map.go
@@ -0,0 +1,283 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+ // MapFromKUID converts kuid, a UID in the root namespace, into the matching
+ // UID in ns. In the root namespace the value is returned as is; otherwise
+ // kuid is first translated into the parent namespace and then through
+ // uidMapFromParent. The result is NoID if any step has no mapping.
+ func (ns *UserNamespace) MapFromKUID(kuid KUID) UID {
+ 	parent := ns.parent
+ 	if parent == nil {
+ 		return UID(kuid)
+ 	}
+ 	inParent := parent.MapFromKUID(kuid)
+ 	return UID(ns.mapID(&ns.uidMapFromParent, uint32(inParent)))
+ }
+
+ // MapFromKGID converts kgid, a GID in the root namespace, into the matching
+ // GID in ns. In the root namespace the value is returned as is; otherwise
+ // kgid is first translated into the parent namespace and then through
+ // gidMapFromParent. The result is NoID if any step has no mapping.
+ func (ns *UserNamespace) MapFromKGID(kgid KGID) GID {
+ 	parent := ns.parent
+ 	if parent == nil {
+ 		return GID(kgid)
+ 	}
+ 	inParent := parent.MapFromKGID(kgid)
+ 	return GID(ns.mapID(&ns.gidMapFromParent, uint32(inParent)))
+ }
+
+ // MapToKUID converts uid, a UID in ns, into the matching UID in the root
+ // namespace. In the root namespace the value is returned as is; otherwise
+ // uid is translated through uidMapToParent and the parent continues the
+ // translation. The result is NoID if any step has no mapping.
+ func (ns *UserNamespace) MapToKUID(uid UID) KUID {
+ 	parent := ns.parent
+ 	if parent == nil {
+ 		return KUID(uid)
+ 	}
+ 	inParent := UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))
+ 	return parent.MapToKUID(inParent)
+ }
+
+ // MapToKGID converts gid, a GID in ns, into the matching GID in the root
+ // namespace. In the root namespace the value is returned as is; otherwise
+ // gid is translated through gidMapToParent and the parent continues the
+ // translation. The result is NoID if any step has no mapping.
+ func (ns *UserNamespace) MapToKGID(gid GID) KGID {
+ 	parent := ns.parent
+ 	if parent == nil {
+ 		return KGID(gid)
+ 	}
+ 	inParent := GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))
+ 	return parent.MapToKGID(inParent)
+ }
+
+ // mapID looks id up in m, one of ns's ID maps, while holding ns.mu. NoID is
+ // passed through untouched, and an id that falls in no segment of m yields
+ // NoID. Otherwise the result is the segment's value plus id's offset from
+ // the segment's start.
+ func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 {
+ 	if id == NoID {
+ 		return NoID
+ 	}
+ 	ns.mu.Lock()
+ 	defer ns.mu.Unlock()
+ 	seg := m.FindSegment(id)
+ 	if !seg.Ok() {
+ 		return NoID
+ 	}
+ 	base := seg.Value()
+ 	return base + (id - seg.Start())
+ }
+
+ // allIDsMapped reports whether every ID in [start, end) has a mapping in m.
+ // It holds ns.mu while inspecting m.
+ //
+ // Preconditions: end >= start.
+ func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool {
+ 	ns.mu.Lock()
+ 	defer ns.mu.Unlock()
+ 	want := idMapRange{start, end}
+ 	covered := m.SpanRange(want)
+ 	return covered == end-start
+ }
+
+ // An IDMapEntry represents a mapping from a range of contiguous IDs in a user
+ // namespace to an equally-sized range of contiguous IDs in the namespace's
+ // parent.
+ //
+ // SetUIDMap and SetGIDMap reject entries with a zero Length and entries for
+ // which FirstID+Length or FirstParentID+Length wraps around uint32.
+ type IDMapEntry struct {
+ // FirstID is the first ID in the range in the namespace.
+ FirstID uint32
+
+ // FirstParentID is the first ID in the range in the parent namespace.
+ FirstParentID uint32
+
+ // Length is the number of IDs in the range.
+ Length uint32
+ }
+
+ // SetUIDMap instructs ns to translate UIDs as specified by entries.
+ //
+ // The caller's credentials are taken from ctx. SetUIDMap returns EPERM if ns
+ // already has a UID map or if one of the permission checks below fails, and
+ // EINVAL if entries is empty. Errors from trySetUIDMap are returned
+ // unchanged; in that case any mappings it had already inserted are removed
+ // again, so ns is left without a UID map.
+ //
+ // ns.mu is held for the whole call.
+ //
+ // Note: SetUIDMap does not place an upper bound on the number of entries, but
+ // Linux does. This restriction is implemented in SetUIDMap's caller, the
+ // implementation of /proc/[pid]/uid_map.
+ func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error {
+ c := CredentialsFromContext(ctx)
+
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ // "After the creation of a new user namespace, the uid_map file of *one*
+ // of the processes in the namespace may be written to *once* to define the
+ // mapping of user IDs in the new user namespace. An attempt to write more
+ // than once to a uid_map file in a user namespace fails with the error
+ // EPERM. Similar rules apply for gid_map files." - user_namespaces(7)
+ if !ns.uidMapFromParent.IsEmpty() {
+ return syserror.EPERM
+ }
+ // "At least one line must be written to the file."
+ if len(entries) == 0 {
+ return syserror.EINVAL
+ }
+ // """
+ // In order for a process to write to the /proc/[pid]/uid_map
+ // (/proc/[pid]/gid_map) file, all of the following requirements must be
+ // met:
+ //
+ // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability
+ // in the user namespace of the process pid.
+ // """
+ if !c.HasCapabilityIn(linux.CAP_SETUID, ns) {
+ return syserror.EPERM
+ }
+ // "2. The writing process must either be in the user namespace of the process
+ // pid or be in the parent user namespace of the process pid."
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent {
+ return syserror.EPERM
+ }
+ // """
+ // 3. (see trySetUIDMap)
+ //
+ // 4. One of the following two cases applies:
+ //
+ // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability
+ // in the parent user namespace.
+ // """
+ if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) {
+ // """
+ // * Or otherwise all of the following restrictions apply:
+ //
+ // + The data written to uid_map (gid_map) must consist of a single line
+ // that maps the writing process' effective user ID (group ID) in the
+ // parent user namespace to a user ID (group ID) in the user namespace.
+ // """
+ if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 {
+ return syserror.EPERM
+ }
+ // """
+ // + The writing process must have the same effective user ID as the
+ // process that created the user namespace.
+ // """
+ if c.EffectiveKUID != ns.owner {
+ return syserror.EPERM
+ }
+ }
+ // trySetUIDMap leaves data in maps if it fails.
+ if err := ns.trySetUIDMap(entries); err != nil {
+ // Roll back both directions so that a later write starts from empty maps.
+ ns.uidMapFromParent.RemoveAll()
+ ns.uidMapToParent.RemoveAll()
+ return err
+ }
+ return nil
+ }
+
+ // trySetUIDMap validates entries one at a time and inserts each into both
+ // ns.uidMapFromParent and ns.uidMapToParent.
+ //
+ // It returns EINVAL for an entry whose Length is zero, whose end wraps around
+ // uint32, or that overlaps an earlier entry, and EPERM for an entry whose
+ // parent IDs are not all mapped in ns.parent. Entries inserted before a
+ // failure are not rolled back here; SetUIDMap clears the maps in that case.
+ //
+ // trySetUIDMap does not lock ns.mu itself; SetUIDMap calls it with ns.mu held.
+ func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error {
+ for _, e := range entries {
+ // Determine upper bounds and check for overflow. This implicitly
+ // checks for NoID.
+ lastID := e.FirstID + e.Length
+ if lastID <= e.FirstID {
+ return syserror.EINVAL
+ }
+ lastParentID := e.FirstParentID + e.Length
+ if lastParentID <= e.FirstParentID {
+ return syserror.EINVAL
+ }
+ // "3. The mapped user IDs (group IDs) must in turn have a mapping in
+ // the parent user namespace."
+ // Only the root namespace has a nil parent, and root is assigned
+ // mappings when it's created, so SetUIDMap would have returned EPERM
+ // without reaching this point if ns is root.
+ if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) {
+ return syserror.EPERM
+ }
+ // If either of these Adds fail, we have an overlapping range.
+ if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) {
+ return syserror.EINVAL
+ }
+ if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) {
+ return syserror.EINVAL
+ }
+ }
+ return nil
+ }
+
+ // SetGIDMap instructs ns to translate GIDs as specified by entries.
+ //
+ // The checks mirror those in SetUIDMap, using CAP_SETGID and the caller's
+ // effective GID. The caller's credentials are taken from ctx. SetGIDMap
+ // returns EPERM if ns already has a GID map or if a permission check fails,
+ // and EINVAL if entries is empty. Errors from trySetGIDMap are returned
+ // unchanged; in that case any mappings it had already inserted are removed
+ // again.
+ //
+ // ns.mu is held for the whole call.
+ func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error {
+ c := CredentialsFromContext(ctx)
+
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ // The GID map may only be written once.
+ if !ns.gidMapFromParent.IsEmpty() {
+ return syserror.EPERM
+ }
+ // At least one entry is required.
+ if len(entries) == 0 {
+ return syserror.EINVAL
+ }
+ // The writer needs CAP_SETGID in ns.
+ if !c.HasCapabilityIn(linux.CAP_SETGID, ns) {
+ return syserror.EPERM
+ }
+ // The writer must be in ns or in ns's parent.
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent {
+ return syserror.EPERM
+ }
+ // Without CAP_SETGID in the parent, only a single-ID mapping of the
+ // writer's own effective GID is allowed, and only for the namespace owner.
+ if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) {
+ if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 {
+ return syserror.EPERM
+ }
+ // It's correct for this to still be UID.
+ if c.EffectiveKUID != ns.owner {
+ return syserror.EPERM
+ }
+ // "In the case of gid_map, use of the setgroups(2) system call must
+ // first be denied by writing "deny" to the /proc/[pid]/setgroups file
+ // (see below) before writing to gid_map." (This file isn't implemented
+ // in the version of Linux we're emulating; see comment in
+ // UserNamespace.)
+ }
+ // trySetGIDMap may leave partial data in the maps on failure; clear it.
+ if err := ns.trySetGIDMap(entries); err != nil {
+ ns.gidMapFromParent.RemoveAll()
+ ns.gidMapToParent.RemoveAll()
+ return err
+ }
+ return nil
+ }
+
+ // trySetGIDMap validates entries one at a time and inserts each into both
+ // ns.gidMapFromParent and ns.gidMapToParent.
+ //
+ // It returns EINVAL for an entry whose Length is zero, whose end wraps around
+ // uint32, or whose insertion into either map fails, and EPERM for an entry
+ // whose parent IDs are not all mapped in ns.parent. Entries inserted before a
+ // failure are not rolled back here; SetGIDMap clears the maps in that case.
+ //
+ // trySetGIDMap does not lock ns.mu itself; SetGIDMap calls it with ns.mu held.
+ func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error {
+ for _, e := range entries {
+ // Compute the exclusive upper bounds; a result that is not greater than
+ // the start means the length was zero or the addition wrapped.
+ lastID := e.FirstID + e.Length
+ if lastID <= e.FirstID {
+ return syserror.EINVAL
+ }
+ lastParentID := e.FirstParentID + e.Length
+ if lastParentID <= e.FirstParentID {
+ return syserror.EINVAL
+ }
+ // Every parent-side GID must itself be mapped in the parent namespace.
+ if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) {
+ return syserror.EPERM
+ }
+ // Record the mapping in both directions.
+ if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) {
+ return syserror.EINVAL
+ }
+ if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) {
+ return syserror.EINVAL
+ }
+ }
+ return nil
+ }
+
+ // UIDMap lists the user ID mappings configured for ns, read from the
+ // namespace-to-parent map. It returns nil when no mappings have been
+ // configured.
+ func (ns *UserNamespace) UIDMap() []IDMapEntry {
+ 	toParent := &ns.uidMapToParent
+ 	return ns.getIDMap(toParent)
+ }
+
+ // GIDMap lists the group ID mappings configured for ns, read from the
+ // namespace-to-parent map. It returns nil when no mappings have been
+ // configured.
+ func (ns *UserNamespace) GIDMap() []IDMapEntry {
+ 	toParent := &ns.gidMapToParent
+ 	return ns.getIDMap(toParent)
+ }
+
+ // getIDMap converts every segment of m into an IDMapEntry, in segment order,
+ // while holding ns.mu. The segment's start becomes FirstID and its value
+ // becomes FirstParentID. The result is nil if m has no segments.
+ func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry {
+ 	ns.mu.Lock()
+ 	defer ns.mu.Unlock()
+ 	var out []IDMapEntry
+ 	seg := m.FirstSegment()
+ 	for seg.Ok() {
+ 		entry := IDMapEntry{
+ 			FirstID:       seg.Start(),
+ 			FirstParentID: seg.Value(),
+ 			Length:        seg.Range().Length(),
+ 		}
+ 		out = append(out, entry)
+ 		seg = seg.NextSegment()
+ 	}
+ 	return out
+ }
diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go
new file mode 100644
index 000000000..889291d96
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map_functions.go
@@ -0,0 +1,45 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+ // idMapFunctions "implements" generic interface segment.Functions for
+ // idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one
+ // user namespace to non-overlapping ranges of contiguous IDs in another user
+ // namespace. Each such ID mapping is implemented as a range-to-value mapping
+ // in the set such that [range.Start(), range.End()) => [value, value +
+ // range.Length()).
+ //
+ // The type carries no state; its methods only define the key bounds and how
+ // values behave when segments are merged or split.
+ type idMapFunctions struct{}
+
+func (idMapFunctions) MinKey() uint32 {
+ return 0
+}
+
+func (idMapFunctions) MaxKey() uint32 {
+ return NoID
+}
+
+ // ClearValue is a no-op: segment values are plain uint32 IDs, so there is
+ // nothing to release.
+ func (idMapFunctions) ClearValue(*uint32) {}
+
+ // Merge decides whether two segments may be combined into one. That is only
+ // possible when the second segment's value directly continues the first
+ // segment's target range, i.e. val2 == val1 + r1.Length(). On success the
+ // merged segment keeps val1; otherwise Merge returns (0, false).
+ func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) {
+ 	// Mapped ranges have to be contiguous.
+ 	if val1+r1.Length() == val2 {
+ 		return val1, true
+ 	}
+ 	return 0, false
+ }
+
+ // Split returns the values for the two halves produced by cutting the
+ // segment (r, val) at key split. The first half keeps val; the second half's
+ // value is advanced by the distance from r.Start to split.
+ func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) {
+ 	offset := split - r.Start
+ 	return val, val + offset
+ }
diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go
new file mode 100644
index 000000000..0980aeadf
--- /dev/null
+++ b/pkg/sentry/kernel/auth/user_namespace.go
@@ -0,0 +1,130 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "math"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+ // A UserNamespace represents a user namespace. See user_namespaces(7) for
+ // details.
+ //
+ // Root namespaces are created with NewRootUserNamespace and children with
+ // Credentials.NewChildUserNamespace.
+ type UserNamespace struct {
+ // parent is this namespace's parent. If this is the root namespace, parent
+ // is nil. The parent pointer is immutable.
+ parent *UserNamespace
+
+ // owner is the effective UID of the namespace's creator in the root
+ // namespace. owner is immutable.
+ owner KUID
+
+ // mu protects the following fields.
+ //
+ // If mu will be locked in multiple UserNamespaces, it must be locked in
+ // descendant namespaces before ancestors.
+ mu sync.Mutex `state:"nosave"`
+
+ // Mappings of user/group IDs between this namespace and its parent.
+ //
+ // All ID maps, once set, cannot be changed. This means that successful
+ // UID/GID translations cannot be racy.
+ //
+ // The *FromParent maps are keyed by IDs in the parent namespace and yield
+ // IDs in this namespace; the *ToParent maps are keyed by IDs in this
+ // namespace and yield IDs in the parent.
+ uidMapFromParent idMapSet
+ uidMapToParent idMapSet
+ gidMapFromParent idMapSet
+ gidMapToParent idMapSet
+
+ // TODO: Consider supporting disabling setgroups(2), which "was
+ // added in Linux 3.19, but was backported to many earlier stable kernel
+ // series, because it addresses a security issue" - user_namespaces(7). (It
+ // was not backported to 3.11.10, which we are currently imitating.)
+ }
+
+// NewRootUserNamespace returns a UserNamespace that is appropriate for a
+// system's root user namespace.
+func NewRootUserNamespace() *UserNamespace {
+ var ns UserNamespace
+ // """
+ // The initial user namespace has no parent namespace, but, for
+ // consistency, the kernel provides dummy user and group ID mapping files
+ // for this namespace. Looking at the uid_map file (gid_map is the same)
+ // from a shell in the initial namespace shows:
+ //
+ // $ cat /proc/$$/uid_map
+ // 0 0 4294967295
+ // """ - user_namespaces(7)
+ for _, m := range []*idMapSet{
+ &ns.uidMapFromParent,
+ &ns.uidMapToParent,
+ &ns.gidMapFromParent,
+ &ns.gidMapToParent,
+ } {
+ if !m.Add(idMapRange{0, math.MaxUint32}, 0) {
+ panic("Failed to insert into empty ID map")
+ }
+ }
+ return &ns
+}
+
+ // Root walks parent pointers upward from ns and returns the namespace that
+ // has no parent, i.e. the root of the tree containing ns.
+ func (ns *UserNamespace) Root() *UserNamespace {
+ 	cur := ns
+ 	for cur.parent != nil {
+ 		cur = cur.parent
+ 	}
+ 	return cur
+ }
+
+ // "The kernel imposes (since version 3.11) a limit of 32 nested levels of user
+ // namespaces." - user_namespaces(7)
+ //
+ // NewChildUserNamespace fails with EUSERS once the creator's namespace
+ // depth() has reached this value.
+ const maxUserNamespaceDepth = 32
+
+func (ns *UserNamespace) depth() int {
+ var i int
+ for ns != nil {
+ i++
+ ns = ns.parent
+ }
+ return i
+}
+
+ // NewChildUserNamespace creates a user namespace whose parent is c's user
+ // namespace and whose owner is c's effective KUID. The new namespace has no
+ // ID mappings yet.
+ //
+ // It returns EUSERS if the nesting limit would be exceeded, and EPERM if c's
+ // effective UID or GID is not mapped in c's user namespace.
+ func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) {
+ 	parent := c.UserNamespace
+ 	if parent.depth() >= maxUserNamespaceDepth {
+ 		// "... Calls to unshare(2) or clone(2) that would cause this limit to
+ 		// be exceeded fail with the error EUSERS." - user_namespaces(7)
+ 		return nil, syserror.EUSERS
+ 	}
+ 	// "EPERM: CLONE_NEWUSER was specified in flags, but either the effective
+ 	// user ID or the effective group ID of the caller does not have a mapping
+ 	// in the parent namespace (see user_namespaces(7))." - clone(2)
+ 	// "CLONE_NEWUSER requires that the user ID and group ID of the calling
+ 	// process are mapped to user IDs and group IDs in the user namespace of
+ 	// the calling process at the time of the call." - unshare(2)
+ 	if uid := c.EffectiveKUID.In(parent); !uid.Ok() {
+ 		return nil, syserror.EPERM
+ 	}
+ 	if gid := c.EffectiveKGID.In(parent); !gid.Ok() {
+ 		return nil, syserror.EPERM
+ 	}
+ 	// "When a user namespace is created, it starts without a mapping of
+ 	// user IDs (group IDs) to the parent user namespace." -
+ 	// user_namespaces(7)
+ 	child := &UserNamespace{
+ 		parent: parent,
+ 		owner:  c.EffectiveKUID,
+ 	}
+ 	return child, nil
+ }
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
new file mode 100644
index 000000000..261ca6f7a
--- /dev/null
+++ b/pkg/sentry/kernel/context.go
@@ -0,0 +1,135 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+ // contextID is the kernel package's type for context.Context.Value keys.
+ // Because the type is unexported, a key of this type never compares equal to
+ // a key defined by another package, even if the integer values coincide.
+ type contextID int
+
+ // Context.Value keys defined by the kernel package. Each has a matching
+ // accessor function in this file.
+ const (
+ // CtxCanTrace is a Context.Value key for a function with the same
+ // signature and semantics as kernel.Task.CanTrace.
+ CtxCanTrace contextID = iota
+
+ // CtxKernel is a Context.Value key for a Kernel.
+ CtxKernel
+
+ // CtxPIDNamespace is a Context.Value key for a PIDNamespace.
+ CtxPIDNamespace
+
+ // CtxTask is a Context.Value key for a Task.
+ CtxTask
+
+ // CtxUTSNamespace is a Context.Value key for a UTSNamespace.
+ CtxUTSNamespace
+
+ // CtxIPCNamespace is a Context.Value key for an IPCNamespace.
+ CtxIPCNamespace
+ )
+
+ // ContextCanTrace reports whether ctx is permitted to trace t, in the same
+ // sense as kernel.Task.CanTrace. It looks up the function stored under
+ // CtxCanTrace and calls it with t and attach; if ctx has no such value the
+ // answer is false.
+ func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool {
+ 	v := ctx.Value(CtxCanTrace)
+ 	if v == nil {
+ 		return false
+ 	}
+ 	canTrace := v.(func(*Task, bool) bool)
+ 	return canTrace(t, attach)
+ }
+
+ // KernelFromContext returns the Kernel stored in ctx under CtxKernel, i.e.
+ // the Kernel in which ctx is executing, or nil if ctx has none.
+ func KernelFromContext(ctx context.Context) *Kernel {
+ 	v := ctx.Value(CtxKernel)
+ 	if v == nil {
+ 		return nil
+ 	}
+ 	return v.(*Kernel)
+ }
+
+ // PIDNamespaceFromContext returns the PIDNamespace stored in ctx under
+ // CtxPIDNamespace, i.e. the PID namespace in which ctx is executing, or nil
+ // if ctx has none.
+ func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace {
+ 	v := ctx.Value(CtxPIDNamespace)
+ 	if v == nil {
+ 		return nil
+ 	}
+ 	return v.(*PIDNamespace)
+ }
+
+ // UTSNamespaceFromContext returns the UTSNamespace stored in ctx under
+ // CtxUTSNamespace, i.e. the UTS namespace in which ctx is executing, or nil
+ // if ctx has none.
+ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
+ 	v := ctx.Value(CtxUTSNamespace)
+ 	if v == nil {
+ 		return nil
+ 	}
+ 	return v.(*UTSNamespace)
+ }
+
+ // IPCNamespaceFromContext returns the IPCNamespace stored in ctx under
+ // CtxIPCNamespace, i.e. the IPC namespace in which ctx is executing, or nil
+ // if ctx has none.
+ func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
+ 	v := ctx.Value(CtxIPCNamespace)
+ 	if v == nil {
+ 		return nil
+ 	}
+ 	return v.(*IPCNamespace)
+ }
+
+ // TaskFromContext returns the Task stored in ctx under CtxTask, or nil if
+ // ctx has no associated Task.
+ func TaskFromContext(ctx context.Context) *Task {
+ 	v := ctx.Value(CtxTask)
+ 	if v == nil {
+ 		return nil
+ 	}
+ 	return v.(*Task)
+ }
+
+// AsyncContext returns a context.Context that may be used by goroutines that
+// do work on behalf of t and therefore share its contextual values, but are
+// not t's task goroutine (e.g. asynchronous I/O).
+func (t *Task) AsyncContext() context.Context {
+ return taskAsyncContext{t: t}
+}
+
+ // taskAsyncContext is the context.Context returned by Task.AsyncContext. Its
+ // logging and Value methods forward to the wrapped Task, while the sleeper
+ // methods come from the embedded context.NoopSleeper.
+ type taskAsyncContext struct {
+ context.NoopSleeper
+ // t is the Task on whose behalf the context is used.
+ t *Task
+ }
+
+ // Debugf implements log.Logger.Debugf by forwarding to the wrapped Task.
+ func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
+ 	task := ctx.t
+ 	task.Debugf(format, v...)
+ }
+
+ // Infof implements log.Logger.Infof by forwarding to the wrapped Task.
+ func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
+ 	task := ctx.t
+ 	task.Infof(format, v...)
+ }
+
+ // Warningf implements log.Logger.Warningf by forwarding to the wrapped Task.
+ func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
+ 	task := ctx.t
+ 	task.Warningf(format, v...)
+ }
+
+ // IsLogging implements log.Logger.IsLogging by asking the wrapped Task.
+ func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
+ 	task := ctx.t
+ 	return task.IsLogging(level)
+ }
+
+ // Value implements context.Context.Value by looking key up in the wrapped
+ // Task.
+ func (ctx taskAsyncContext) Value(key interface{}) interface{} {
+ 	task := ctx.t
+ 	return task.Value(key)
+ }
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
new file mode 100644
index 000000000..04651d961
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -0,0 +1,52 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+ # Runs go_stateify over the hand-written epoll sources and writes the result
+ # to epoll_autogen_state.go in package epoll.
+ go_stateify(
+ name = "epoll_autogen_state",
+ srcs = [
+ "epoll.go",
+ "epoll_state.go",
+ ],
+ out = "epoll_autogen_state.go",
+ package = "epoll",
+ )
+
+ # The epoll library: the hand-written sources plus the generated
+ # epoll_autogen_state.go. Visible only to //pkg/sentry:internal.
+ go_library(
+ name = "epoll",
+ srcs = [
+ "epoll.go",
+ "epoll_autogen_state.go",
+ "epoll_state.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/ilist",
+ "//pkg/refs",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/anon",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/kernel/kdefs",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/waiter",
+ ],
+ )
+
+ # Small unit test for the epoll package. It embeds :epoll, so the test source
+ # is compiled together with the library's sources.
+ go_test(
+ name = "epoll_test",
+ size = "small",
+ srcs = [
+ "epoll_test.go",
+ ],
+ embed = [":epoll"],
+ deps = [
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs/filetest",
+ "//pkg/waiter",
+ ],
+ )
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
new file mode 100644
index 000000000..b572fcd7e
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -0,0 +1,466 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package epoll provides an implementation of Linux's IO event notification
+// facility. See epoll(7) for more details.
+package epoll
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/ilist"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Event describes the event mask that was observed and the user data to be
+// returned when one of the events occurs. It has this format to match the linux
+// format to avoid extra copying/allocation when writing events to userspace.
+//
+// Because the layout is meant to mirror the Linux structure, the order and
+// types of the fields must not be changed.
+type Event struct {
+ // Events is the event mask containing the set of events that have been
+ // observed on an entry.
+ Events uint32
+
+ // Data is an opaque 64-bit value provided by the caller when adding the
+ // entry, and returned to the caller when the entry reports an event.
+ // It is stored as two 32-bit halves rather than a single 64-bit integer.
+ // NOTE(review): presumably this avoids alignment padding after Events;
+ // confirm against the Linux epoll_event definition.
+ Data [2]int32
+}
+
+// EntryFlags is a bitmask that holds an entry's flags.
+type EntryFlags int
+
+// Valid entry flags.
+const (
+ // OneShot marks an entry that is disabled after one of its events has
+ // been delivered by ReadEvents; such an entry is moved to the disabled
+ // list.
+ OneShot EntryFlags = 1 << iota
+
+ // EdgeTriggered marks an entry that goes back to the waiting list after
+ // delivery, so it is only reported again once its ready callback fires.
+ EdgeTriggered
+)
+
+// FileIdentifier identifies a file. We cannot use just the FD because it could
+// potentially be reassigned. We also cannot use just the file pointer because
+// it is possible to have multiple entries for the same file object as long as
+// they are created with different FDs (i.e., the FDs point to the same file).
+//
+// FileIdentifier is used as the key of EventPoll.files, so two identifiers
+// are the same entry only when both File and Fd are equal.
+type FileIdentifier struct {
+ // File is the file object being observed.
+ File *fs.File
+
+ // Fd is the descriptor number through which File was added.
+ Fd kdefs.FD
+}
+
+// pollEntry holds all the state associated with an event poll entry, that is,
+// a file being observed by an event poll object.
+type pollEntry struct {
+ // Entry links the pollEntry into one of the owning EventPoll's lists.
+ ilist.Entry
+
+ // file is a weak reference to the observed file. It is created with the
+ // pollEntry as its user, so WeakRefGone runs when the file is destroyed.
+ file *refs.WeakRef `state:"manual"`
+
+ // id is the key under which this entry is stored in EventPoll.files.
+ id FileIdentifier `state:"wait"`
+
+ // userData is the caller-supplied value copied into Event.Data.
+ userData [2]int32
+
+ // waiter is the wait-queue entry registered on the observed file; its
+ // Context points back at this pollEntry.
+ waiter waiter.Entry `state:"manual"`
+
+ // mask is the set of events the entry is interested in.
+ mask waiter.EventMask
+
+ // flags holds the OneShot/EdgeTriggered behavior of the entry.
+ flags EntryFlags
+
+ // epoll is the event poll object that owns this entry.
+ epoll *EventPoll
+
+ // We cannot save the current list pointer as it points into EventPoll
+ // struct, while state framework currently does not support such
+ // in-struct pointers. Instead, EventPoll will properly set this field
+ // in its loading logic.
+ //
+ // curList is read and written with epoll.listsMu held.
+ curList *ilist.List `state:"nosave"`
+}
+
+// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
+// WeakRefGone is called when the file in the weak reference is destroyed.
+// The poll entry is removed in response to this. The error returned by
+// RemoveEntry is not inspected.
+func (p *pollEntry) WeakRefGone() {
+ p.epoll.RemoveEntry(p.id)
+}
+
+// EventPoll holds all the state associated with an event poll object, that is,
+// collection of files to observe and their current state.
+type EventPoll struct {
+ // Embedded fsutil helpers.
+ // NOTE(review): presumably these supply the fs.FileOperations methods
+ // that are not defined on EventPoll itself; confirm in fsutil.
+ fsutil.PipeSeek `state:"zerovalue"`
+ fsutil.NotDirReaddir `state:"zerovalue"`
+ fsutil.NoFsync `state:"zerovalue"`
+ fsutil.NoopFlush `state:"zerovalue"`
+ fsutil.NoMMap `state:"zerovalue"`
+ fsutil.NoIoctl `state:"zerovalue"`
+
+ // Wait queue is used to notify interested parties when the event poll
+ // object itself becomes readable or writable.
+ waiter.Queue
+
+ // files is the map of all the files currently being observed, it is
+ // protected by mu.
+ mu sync.Mutex `state:"nosave"`
+ files map[FileIdentifier]*pollEntry
+
+ // listsMu protects manipulation of the lists below. It needs to be a
+ // different lock to avoid circular lock acquisition order involving
+ // the wait queue mutexes and mu. The full order is mu, observed file
+ // wait queue mutex, then listsMu; this allows listsMu to be acquired
+ // when readyCallback is called.
+ //
+ // An entry is always in one of the following lists:
+ // readyList -- when there's a chance that it's ready to have
+ // events delivered to epoll waiters. Given that being
+ // ready is a transient state, the Readiness() and
+ // readEvents() functions always call the entry's file
+ // Readiness() function to confirm it's ready.
+ // waitingList -- when there's no chance that the entry is ready,
+ // so it's waiting for the readyCallback to be called
+ // on it before it gets moved to the readyList.
+ // disabledList -- when the entry is disabled. This happens when
+ // a one-shot entry gets delivered via readEvents().
+ //
+ // Each entry records the list it is currently on in its curList field.
+ listsMu sync.Mutex `state:"nosave"`
+ readyList ilist.List
+ waitingList ilist.List
+ disabledList ilist.List
+}
+
+// cycleMu is used to serialize all the cycle checks. This is only used when
+// an event poll file is added as an entry to another event poll. Such checks
+// are serialized to avoid lock acquisition order inversion: if a thread is
+// adding A to B, and another thread is adding B to A, each would acquire A's
+// and B's mutexes in reverse order, and could cause deadlocks. Having this
+// lock prevents this by allowing only one check at a time to happen.
+//
+// We do the cycle check to prevent callers from introducing potentially
+// infinite recursions. If a caller were to add A to B and then B to A, for
+// event poll A to know if it's readable, it would need to check event poll B,
+// which in turn would need event poll A and so on indefinitely.
+//
+// AddEntry acquires cycleMu before the event poll's own mu, and only when the
+// file being added is itself an event poll.
+var cycleMu sync.Mutex
+
+// NewEventPoll returns a new file, backed by an anonymous inode, whose file
+// operations are provided by a freshly initialized EventPoll that observes no
+// files yet.
+func NewEventPoll(ctx context.Context) *fs.File {
+	ops := &EventPoll{
+		files: make(map[FileIdentifier]*pollEntry),
+	}
+
+	// The dirent name matches fs/eventpoll.c:epoll_create1.
+	dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
+	return fs.NewFile(ctx, dirent, fs.FileFlags{}, ops)
+}
+
+// Release implements fs.FileOperations.Release. For every observed file it
+// unregisters the entry's waiter and drops the entry's weak reference.
+func (e *EventPoll) Release() {
+	// mu is held for the whole teardown because files that are being
+	// destroyed may be trying to remove their entries in parallel.
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	for _, pe := range e.files {
+		observed := pe.id.File
+		observed.EventUnregister(&pe.waiter)
+		pe.file.Drop()
+	}
+}
+
+// Read implements fs.FileOperations.Read. Reading from an event poll file is
+// not supported: every call returns 0 bytes and ENOSYS.
+func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syscall.ENOSYS
+}
+
+// Write implements fs.FileOperations.Write. Writing to an event poll file is
+// not supported: every call returns 0 bytes and ENOSYS.
+func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syscall.ENOSYS
+}
+
+// eventsAvailable reports whether 'e' has at least one entry that is ready
+// for delivery. Entries found on the ready list that turn out not to be ready
+// are demoted to the waiting list along the way.
+func (e *EventPoll) eventsAvailable() bool {
+	e.listsMu.Lock()
+	defer e.listsMu.Unlock()
+
+	next := e.readyList.Front()
+	for next != nil {
+		entry := next.(*pollEntry)
+		// Advance before the entry is possibly unlinked from readyList.
+		next = next.Next()
+
+		// Being on the ready list is only a hint, so confirm with the
+		// file itself. One confirmed entry is enough.
+		if entry.id.File.Readiness(entry.mask) != 0 {
+			return true
+		}
+
+		// Not actually ready: park the entry on the waiting list.
+		e.readyList.Remove(entry)
+		e.waitingList.PushBack(entry)
+		entry.curList = &e.waitingList
+	}
+
+	return false
+}
+
+// Readiness reports whether the event poll object is currently readable,
+// i.e. whether it has pending events to deliver. Only waiter.EventIn is ever
+// reported, and only when the caller asked for it in mask.
+func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// Skip scanning the entries if the caller is not interested in
+	// readability.
+	if mask&waiter.EventIn == 0 {
+		return 0
+	}
+	if !e.eventsAvailable() {
+		return 0
+	}
+	return waiter.EventIn
+}
+
+// ReadEvents collects and returns up to max events from the entries that are
+// currently ready. A nil slice is returned when nothing is ready.
+func (e *EventPoll) ReadEvents(max int) []Event {
+	var (
+		events  []Event
+		requeue ilist.List
+	)
+
+	e.listsMu.Lock()
+	defer e.listsMu.Unlock()
+
+	// Walk the entries we believe may be ready.
+	for cur := e.readyList.Front(); cur != nil && len(events) < max; {
+		entry := cur.(*pollEntry)
+		cur = cur.Next()
+
+		// Whatever the outcome below, the entry leaves its current
+		// position in the ready list.
+		ready := entry.id.File.Readiness(entry.mask) & entry.mask
+		e.readyList.Remove(entry)
+
+		// If it's not really ready, put it back in the waiting list
+		// and move on to the next entry.
+		if ready == 0 {
+			e.waitingList.PushBack(entry)
+			entry.curList = &e.waitingList
+			continue
+		}
+
+		// Record the event that will be returned to the caller.
+		events = append(events, Event{
+			Events: uint32(ready),
+			Data:   entry.userData,
+		})
+
+		// The entry has been consumed. One-shot entries become
+		// disabled and edge-triggered entries wait for the next
+		// callback. Anything else stays ready, but is re-queued at
+		// the end of the ready list so that other entries get a turn.
+		switch {
+		case entry.flags&OneShot != 0:
+			e.disabledList.PushBack(entry)
+			entry.curList = &e.disabledList
+		case entry.flags&EdgeTriggered != 0:
+			e.waitingList.PushBack(entry)
+			entry.curList = &e.waitingList
+		default:
+			requeue.PushBack(entry)
+		}
+	}
+
+	e.readyList.PushBackList(&requeue)
+
+	return events
+}
+
+// readyCallback is the waiter callback installed on every observed file. When
+// a file becomes ready, the corresponding entry is promoted from the waiting
+// list to the ready list.
+type readyCallback struct{}
+
+// Callback implements waiter.EntryCallback.Callback.
+func (*readyCallback) Callback(w *waiter.Entry) {
+	entry := w.Context.(*pollEntry)
+	ep := entry.epoll
+
+	ep.listsMu.Lock()
+	defer ep.listsMu.Unlock()
+
+	// Only entries that are currently waiting need to be moved.
+	if entry.curList != &ep.waitingList {
+		return
+	}
+
+	ep.waitingList.Remove(entry)
+	ep.readyList.PushBack(entry)
+	entry.curList = &ep.readyList
+
+	// Let waiters on the event poll object know it may be readable.
+	ep.Notify(waiter.EventIn)
+}
+
+// initEntryReadiness sets up the readiness state of entry: it is queued on
+// the waiting list, registered for notifications on its file, and promoted
+// right away if the file is already ready.
+func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
+	// Every entry starts its life on the waiting list.
+	e.listsMu.Lock()
+	e.waitingList.PushBack(entry)
+	entry.curList = &e.waitingList
+	e.listsMu.Unlock()
+
+	// Subscribe to the file's notifications.
+	file := entry.id.File
+	file.EventRegister(&entry.waiter, entry.mask)
+
+	// The file may have been ready before we registered, in which case no
+	// notification will arrive; run the callback by hand.
+	if file.Readiness(entry.mask)&entry.mask == 0 {
+		return
+	}
+	(*readyCallback).Callback(nil, &entry.waiter)
+}
+
+// observes reports whether event poll object e observes event poll object ep,
+// either directly or through a chain of other event poll objects. The search
+// is a recursive depth-first walk limited to depthLeft levels.
+func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool {
+	// Running out of depth counts as a match: chains that are too long are
+	// not allowed.
+	if depthLeft <= 0 {
+		return true
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// Look at every observed file that is itself an event poll object.
+	for id := range e.files {
+		child, isPoll := id.File.FileOperations.(*EventPoll)
+		if isPoll && (child == ep || child.observes(ep, depthLeft-1)) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// AddEntry starts observing the file identified by id. It returns EEXIST if
+// id already has an entry, EINVAL if an event poll is added to itself, and
+// ELOOP if adding the file would create a cycle or too long a chain of event
+// poll objects.
+func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
+	// Cycle checks are serialized, so take the cycle check lock when the
+	// file being added is another event poll.
+	target, isPoll := id.File.FileOperations.(*EventPoll)
+	if isPoll {
+		cycleMu.Lock()
+		defer cycleMu.Unlock()
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// Each identifier may only be added once.
+	if _, exists := e.files[id]; exists {
+		return syscall.EEXIST
+	}
+
+	// Reject cycles. The depth limit of 4 is the value used by linux,
+	// which we want to emulate.
+	if target != nil {
+		switch {
+		case target == e:
+			return syscall.EINVAL
+		case target.observes(e, 4):
+			return syscall.ELOOP
+		}
+	}
+
+	// Build the entry and publish it in the map.
+	//
+	// N.B. Creating the weak reference here cannot trigger a callback,
+	// because a reference to the file is held for the duration of this
+	// function.
+	entry := &pollEntry{
+		id:       id,
+		userData: data,
+		epoll:    e,
+		flags:    flags,
+		waiter:   waiter.Entry{Callback: &readyCallback{}},
+		mask:     mask,
+	}
+	entry.waiter.Context = entry
+	e.files[id] = entry
+	entry.file = refs.NewWeakRef(id.File, entry)
+
+	// Queue the entry and register it for notifications.
+	e.initEntryReadiness(entry)
+
+	return nil
+}
+
+// UpdateEntry replaces the flags, mask and user data of a file that is
+// already being observed. It returns ENOENT if id has no entry.
+func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	entry, found := e.files[id]
+	if !found {
+		return syscall.ENOENT
+	}
+
+	// Stop notifications for the old mask first, so readyCallback is
+	// guaranteed not to be called on this entry anymore.
+	entry.id.File.EventUnregister(&entry.waiter)
+
+	// Unlink the entry from whichever list holds it. After this the only
+	// way left to find the entry is via e.files, and we hold e.mu, so no
+	// other thread can reach it.
+	e.listsMu.Lock()
+	entry.curList.Remove(entry)
+	e.listsMu.Unlock()
+
+	// Install the new settings and start over as if newly added.
+	entry.userData = data
+	entry.mask = mask
+	entry.flags = flags
+	e.initEntryReadiness(entry)
+
+	return nil
+}
+
+// RemoveEntry removes a file from the collection of observed files. It
+// returns ENOENT if id has no entry.
+func (e *EventPoll) RemoveEntry(id FileIdentifier) error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	entry, found := e.files[id]
+	if !found {
+		return syscall.ENOENT
+	}
+
+	// Unregister from the file first so that no concurrent attempts will
+	// be made to manipulate the entry through its waiter.
+	entry.id.File.EventUnregister(&entry.waiter)
+
+	// Unlink the entry from whichever list currently holds it.
+	e.listsMu.Lock()
+	entry.curList.Remove(entry)
+	entry.curList = nil
+	e.listsMu.Unlock()
+
+	// Forget the entry and drop its weak reference to the file.
+	delete(e.files, id)
+	entry.file.Drop()
+
+	return nil
+}
+
+// UnregisterEpollWaiters unregisters the waiter of every entry from the wait
+// queue of the file it observes. Unlike Release(), the weak references to the
+// files are kept.
+func (e *EventPoll) UnregisterEpollWaiters() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	for _, pe := range e.files {
+		observed := pe.id.File
+		observed.EventUnregister(&pe.waiter)
+	}
+}
diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go
new file mode 100644
index 000000000..dabb32f49
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll_state.go
@@ -0,0 +1,51 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package epoll
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/ilist"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// afterLoad is invoked by stateify. It rebuilds the parts of the entry that
+// are not saved: the waiter, the weak reference to the observed file, and the
+// waiter's registration on that file.
+func (p *pollEntry) afterLoad() {
+	p.waiter = waiter.Entry{
+		Callback: &readyCallback{},
+		Context:  p,
+	}
+
+	observed := p.id.File
+	p.file = refs.NewWeakRef(observed, p)
+	observed.EventRegister(&p.waiter, p.mask)
+}
+
+// afterLoad is invoked by stateify. It restores the curList pointer of every
+// entry, which is not saved, and then moves each waiting entry whose file is
+// already ready onto the ready list, notifying waiters of the event poll.
+func (e *EventPoll) afterLoad() {
+	e.listsMu.Lock()
+	defer e.listsMu.Unlock()
+
+	// Point every entry back at the list it is linked into.
+	for _, ls := range []*ilist.List{&e.waitingList, &e.readyList, &e.disabledList} {
+		for it := ls.Front(); it != nil; it = it.Next() {
+			it.(*pollEntry).curList = ls
+		}
+	}
+
+	for it := e.waitingList.Front(); it != nil; {
+		p := it.(*pollEntry)
+		// Advance before p is possibly moved. Once p has been pushed
+		// onto readyList its links belong to that list, so calling
+		// Next() afterwards would leave the waiting list and the
+		// remaining waiting entries would never be examined.
+		it = it.Next()
+
+		if p.id.File.Readiness(p.mask) != 0 {
+			e.waitingList.Remove(p)
+			e.readyList.PushBack(p)
+			p.curList = &e.readyList
+			e.Notify(waiter.EventIn)
+		}
+	}
+}
diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go
new file mode 100644
index 000000000..bc869fc13
--- /dev/null
+++ b/pkg/sentry/kernel/epoll/epoll_test.go
@@ -0,0 +1,54 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package epoll
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// TestFileDestroyed checks that an entry keeps reporting events while its
+// file is alive and stops reporting them once the file has been destroyed.
+func TestFileDestroyed(t *testing.T) {
+	f := filetest.NewTestFile(t)
+	id := FileIdentifier{File: f, Fd: 12}
+
+	efile := NewEventPoll(contexttest.Context(t))
+	e := efile.FileOperations.(*EventPoll)
+	if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil {
+		t.Fatalf("addEntry failed: %v", err)
+	}
+
+	// expectEvents reads at most one event and fails the test unless
+	// exactly want events are returned.
+	expectEvents := func(want int) {
+		if got := len(e.ReadEvents(1)); got != want {
+			t.Fatalf("Unexpected number of ready events: want %v, got %v", want, got)
+		}
+	}
+
+	// Check that we get an event reported twice in a row.
+	expectEvents(1)
+	expectEvents(1)
+
+	// Destroy the file. Check that we get no more events.
+	f.DecRef()
+	expectEvents(0)
+}
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
new file mode 100644
index 000000000..2d5a3c693
--- /dev/null
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -0,0 +1,46 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+# Runs the go_stateify rule over eventfd.go; its output file,
+# eventfd_state.go, is listed in the srcs of the library below.
+go_stateify(
+ name = "eventfd_state",
+ srcs = [
+ "eventfd.go",
+ ],
+ out = "eventfd_state.go",
+ package = "eventfd",
+)
+
+# The eventfd library: the hand-written source plus the go_stateify output.
+# Visibility is restricted to the sentry-internal package group.
+go_library(
+ name = "eventfd",
+ srcs = [
+ "eventfd.go",
+ "eventfd_state.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/refs",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/anon",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
+
+# Unit tests for the package. The library is embedded so the test file, which
+# declares package eventfd, is built as part of the same package.
+go_test(
+ name = "eventfd_test",
+ size = "small",
+ srcs = ["eventfd_test.go"],
+ embed = [":eventfd"],
+ deps = [
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/usermem",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
new file mode 100644
index 000000000..c9333719e
--- /dev/null
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -0,0 +1,172 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package eventfd provides an implementation of Linux's file-based event
+// notification.
+package eventfd
+
+import (
+ "math"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// EventOperations represents an event with the semantics of Linux's file-based event
+// notification (eventfd).
+type EventOperations struct {
+ // Embedded fsutil helpers.
+ // NOTE(review): presumably these supply the fs.FileOperations methods
+ // that are not defined on EventOperations itself; confirm in fsutil.
+ fsutil.NoopRelease `state:"nosave"`
+ fsutil.PipeSeek `state:"nosave"`
+ fsutil.NotDirReaddir `state:"nosave"`
+ fsutil.NoFsync `state:"nosave"`
+ fsutil.NoopFlush `state:"nosave"`
+ fsutil.NoMMap `state:"nosave"`
+ fsutil.NoIoctl `state:"nosave"`
+
+ // Mutex that protects accesses to the fields of this event.
+ mu sync.Mutex `state:"nosave"`
+
+ // Queue is used to notify interested parties when the event object
+ // becomes readable or writable.
+ waiter.Queue `state:"nosave"`
+
+ // val is the current value of the event counter. Signal refuses any
+ // addition that would take it above math.MaxUint64-1.
+ val uint64
+
+ // semMode specifies whether the event is in "semaphore" mode. In this
+ // mode a read returns 1 and decrements val; otherwise a read returns
+ // val and resets it to zero.
+ semMode bool
+}
+
+// New returns a readable and writable file, backed by an anonymous inode,
+// whose operations are an event object starting at initVal and operating in
+// semaphore mode if semMode is set.
+func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
+	ops := &EventOperations{
+		val:     initVal,
+		semMode: semMode,
+	}
+	flags := fs.FileFlags{Read: true, Write: true}
+
+	// The dirent name matches fs/eventfd.c:eventfd_file_create.
+	dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]")
+	return fs.NewFile(ctx, dirent, flags, ops)
+}
+
+// Read implements fs.FileOperations.Read.
+func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+ if dst.NumBytes() < 8 {
+ return 0, syscall.EINVAL
+ }
+ if err := e.read(ctx, dst); err != nil {
+ return 0, err
+ }
+ return 8, nil
+}
+
+// Write implements fs.FileOperations.Write.
+func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+ if src.NumBytes() < 8 {
+ return 0, syscall.EINVAL
+ }
+ if err := e.write(ctx, src); err != nil {
+ return 0, err
+ }
+ return 8, nil
+}
+
+// read consumes the counter according to the event's mode and copies the
+// value that was read out to dst. It returns syserror.ErrWouldBlock when the
+// counter is zero, or the error from the copy-out.
+func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error {
+	e.mu.Lock()
+
+	// A zero counter means there is nothing to read yet.
+	if e.val == 0 {
+		e.mu.Unlock()
+		return syserror.ErrWouldBlock
+	}
+
+	// Decide what the reader gets and what remains in the counter.
+	var out uint64
+	switch {
+	case e.semMode:
+		out = 1
+		// Consistent with Linux, this is done even if writing to memory fails.
+		e.val--
+	default:
+		out = e.val
+		e.val = 0
+	}
+
+	e.mu.Unlock()
+
+	// Wake writers even if the event was already writable: one of them
+	// may be waiting to write the maximum value to the event.
+	e.Notify(waiter.EventOut)
+
+	var raw [8]byte
+	usermem.ByteOrder.PutUint64(raw[:], out)
+	_, err := dst.CopyOut(ctx, raw[:])
+	return err
+}
+
+// write copies an 8-byte value in from src and adds it to the counter via
+// Signal. It returns the copy-in error, if any, or else Signal's result.
+func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error {
+	var raw [8]byte
+	if _, err := src.CopyIn(ctx, raw[:]); err != nil {
+		return err
+	}
+	return e.Signal(usermem.ByteOrder.Uint64(raw[:]))
+}
+
+// Signal is an internal function to signal the event fd: it adds val to the
+// counter and notifies readers. It returns EINVAL if val is the maximum
+// uint64, and syserror.ErrWouldBlock if the addition would push the counter
+// past the maximum uint64 minus 1.
+func (e *EventOperations) Signal(val uint64) error {
+	if val == math.MaxUint64 {
+		return syscall.EINVAL
+	}
+
+	e.mu.Lock()
+
+	// room is how much can still be added before the counter would exceed
+	// the max uint64 minus 1.
+	room := math.MaxUint64 - 1 - e.val
+	if val > room {
+		e.mu.Unlock()
+		return syserror.ErrWouldBlock
+	}
+
+	e.val += val
+	e.mu.Unlock()
+
+	// Readers are notified unconditionally.
+	e.Notify(waiter.EventIn)
+
+	return nil
+}
+
+// Readiness returns the ready events for the event fd.
+func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ready := waiter.EventMask(0)
+
+ e.mu.Lock()
+ if e.val > 0 {
+ ready |= waiter.EventIn
+ }
+
+ if e.val < math.MaxUint64-1 {
+ ready |= waiter.EventOut
+ }
+ e.mu.Unlock()
+
+ return mask & ready
+}
diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go
new file mode 100644
index 000000000..71326b62f
--- /dev/null
+++ b/pkg/sentry/kernel/eventfd/eventfd_test.go
@@ -0,0 +1,78 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package eventfd
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// TestEventfd checks that writing to an eventfd notifies waiters registered
+// for EventIn, whether or not the counter was already non-zero.
+func TestEventfd(t *testing.T) {
+	tests := []struct {
+		name    string
+		initVal uint64
+	}{
+		{name: "ZeroInitialValue", initVal: 0},
+		// Using a non-zero initial value verifies that writing to an
+		// eventfd signals when the eventfd's counter was already
+		// non-zero.
+		{name: "NonZeroInitialValue", initVal: 343},
+	}
+
+	for _, tc := range tests {
+		// Each case runs in its own subtest so that the deferred
+		// unregistration happens at the end of the case rather than
+		// at the end of the whole test.
+		t.Run(tc.name, func(t *testing.T) {
+			ctx := contexttest.Context(t)
+
+			// Make a new event that is writable.
+			event := New(ctx, tc.initVal, false)
+
+			// Register a channel-backed waiter for EventIn.
+			w, ch := waiter.NewChannelEntry(nil)
+			event.EventRegister(&w, waiter.EventIn)
+			defer event.EventUnregister(&w)
+
+			data := []byte("00000124")
+			// Create and submit a write request.
+			n, err := event.Writev(ctx, usermem.BytesIOSequence(data))
+			if err != nil {
+				t.Fatal(err)
+			}
+			if n != 8 {
+				t.Errorf("eventfd.write wrote %d bytes, not full int64", n)
+			}
+
+			// Check if the callback fired due to the write event.
+			select {
+			case <-ch:
+			default:
+				t.Errorf("Didn't get notified of EventIn after write")
+			}
+		})
+	}
+}
+
+// TestEventfdStat checks that the unstable attributes of an eventfd's inode
+// can be fetched and report a size of zero.
+func TestEventfdStat(t *testing.T) {
+	ctx := contexttest.Context(t)
+
+	// Make a new event that is writable.
+	event := New(ctx, 0, false)
+
+	// Create and submit a stat request.
+	inode := event.Dirent.Inode
+	uattr, err := inode.UnstableAttr(ctx)
+	if err != nil {
+		t.Fatalf("eventfd stat request failed: %v", err)
+	}
+
+	if size := uattr.Size; size != 0 {
+		t.Fatal("EventFD size should be 0")
+	}
+}
diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go
new file mode 100644
index 000000000..ef73125fd
--- /dev/null
+++ b/pkg/sentry/kernel/fd_map.go
@@ -0,0 +1,340 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+// FDs is a slice of FDs that can be sorted into a stable order.
+type FDs []kdefs.FD
+
+func (f FDs) Len() int {
+ return len(f)
+}
+
+func (f FDs) Swap(i, j int) {
+ f[i], f[j] = f[j], f[i]
+}
+
+func (f FDs) Less(i, j int) bool {
+ return f[i] < f[j]
+}
+
+// FDFlags define flags for an individual descriptor.
+type FDFlags struct {
+ // CloseOnExec indicates the descriptor should be closed on exec.
+ CloseOnExec bool
+}
+
+// descriptor holds the details about a file descriptor, namely a pointer to
+// the file itself and the descriptor flags.
+type descriptor struct {
+ file *fs.File
+ flags FDFlags
+}
+
+// FDMap is used to manage File references and flags.
+type FDMap struct {
+ refs.AtomicRefCount
+ k *Kernel
+ files map[kdefs.FD]descriptor
+ mu sync.RWMutex `state:"nosave"`
+ uid uint64
+}
+
+// ID returns a unique identifier for this FDMap.
+func (f *FDMap) ID() uint64 {
+ return f.uid
+}
+
+// NewFDMap allocates a new FDMap that may be used by tasks in k.
+func (k *Kernel) NewFDMap() *FDMap {
+ return &FDMap{
+ k: k,
+ files: make(map[kdefs.FD]descriptor),
+ uid: atomic.AddUint64(&k.fdMapUids, 1),
+ }
+}
+
+// destroy removes all of the file descriptors from the map.
+func (f *FDMap) destroy() {
+ f.RemoveIf(func(*fs.File, FDFlags) bool {
+ return true
+ })
+}
+
+// DecRef implements RefCounter.DecRef with destructor f.destroy.
+func (f *FDMap) DecRef() {
+ f.DecRefWithDestructor(f.destroy)
+}
+
+// Size returns the number of file descriptor slots currently allocated.
+func (f *FDMap) Size() int {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ return len(f.files)
+}
+
+// String is a stringer for FDMap.
+func (f *FDMap) String() string {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ var b bytes.Buffer
+ for k, v := range f.files {
+ n, _ := v.file.Dirent.FullName(nil /* root */)
+ b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n))
+ }
+ return b.String()
+}
+
+// NewFDFrom allocates a new FD guaranteed to be the lowest number available
+// greater than or equal to from. This property is important as Unix programs
+// tend to count on this allocation order.
+func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return 0, syscall.EINVAL
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // Find the lowest fd not in the files map.
+ lim := limitSet.Get(limits.NumberOfFiles)
+ for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ {
+ if _, ok := f.files[i]; !ok {
+ file.IncRef()
+ f.files[i] = descriptor{file, flags}
+ return i, nil
+ }
+ }
+
+ return -1, syscall.EMFILE
+}
+
+// NewFDAt sets the file reference for the given FD. If there is an
+// active reference for that FD, the ref count for that existing reference
+// is decremented.
+func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return syscall.EBADF
+ }
+
+ // In this one case we do not do a defer of the Unlock. The
+ // reason is that we must have done all the work needed for
+ // discarding any old open file before we return to the
+ // caller. In other words, the DecRef(), below, must have
+ // completed by the time we return to the caller to ensure
+ // side effects are, in fact, effected. A classic example is
+ // dup2(fd1, fd2); if fd2 was already open, it must be closed,
+ // and we don't want to resume the caller until it is; we have
+ // to block on the DecRef(). Hence we can not just do a 'go
+ // oldfile.DecRef()', since there would be no guarantee that
+ // it would be done before the caller resumed. Since we
+ // must wait for the DecRef() to finish, and that could take
+ // time, it's best to first call f.mu.Unlock() so we are
+ // not blocking other uses of this FDMap on the DecRef() call.
+ f.mu.Lock()
+ oldDesc, oldExists := f.files[fd]
+ lim := limitSet.Get(limits.NumberOfFiles).Cur
+ // If we're replacing an existing descriptor then the effective limit is
+ // one more than the actual limit.
+ if oldExists && lim != limits.Infinity {
+ lim++
+ }
+ if lim != limits.Infinity && fd >= kdefs.FD(lim) {
+ f.mu.Unlock()
+ return syscall.EMFILE
+ }
+
+ file.IncRef()
+ f.files[fd] = descriptor{file, flags}
+ f.mu.Unlock()
+
+ if oldExists {
+ oldDesc.file.DecRef()
+ }
+ return nil
+}
+
+// SetFlags sets the flags for the given file descriptor, if it is valid.
+func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ desc, ok := f.files[fd]
+ if !ok {
+ return
+ }
+
+ f.files[fd] = descriptor{desc.file, flags}
+}
+
+// GetDescriptor returns a reference to the file and the flags for the FD. It
+// bumps its reference count as well. It returns nil if there is no File
+// for the FD, i.e. if the FD is invalid. The caller must use DecRef
+// when they are done.
+func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ if desc, ok := f.files[fd]; ok {
+ desc.file.IncRef()
+ return desc.file, desc.flags
+ }
+ return nil, FDFlags{}
+}
+
+// GetFile returns a reference to the File for the FD and bumps
+// its reference count as well. It returns nil if there is no File
+// for the FD, i.e. if the FD is invalid. The caller must use DecRef
+// when they are done.
+func (f *FDMap) GetFile(fd kdefs.FD) *fs.File {
+ f.mu.RLock()
+ if desc, ok := f.files[fd]; ok {
+ desc.file.IncRef()
+ f.mu.RUnlock()
+ return desc.file
+ }
+ f.mu.RUnlock()
+ return nil
+}
+
+// fds returns an ordering of FDs.
+func (f *FDMap) fds() FDs {
+ fds := make(FDs, 0, len(f.files))
+ for fd := range f.files {
+ fds = append(fds, fd)
+ }
+ sort.Sort(fds)
+ return fds
+}
+
+// GetFDs returns a list of valid fds.
+func (f *FDMap) GetFDs() FDs {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+ return f.fds()
+}
+
+// GetRefs returns a stable slice of references to all files and bumps the
+// reference count on each. The caller must use DecRef on each reference when
+// they're done using the slice.
+func (f *FDMap) GetRefs() []*fs.File {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ fds := f.fds()
+ fs := make([]*fs.File, 0, len(fds))
+ for _, fd := range fds {
+ desc := f.files[fd]
+ desc.file.IncRef()
+ fs = append(fs, desc.file)
+ }
+ return fs
+}
+
+// Fork returns an independent FDMap pointing to the same descriptors.
+func (f *FDMap) Fork() *FDMap {
+ f.mu.RLock()
+ defer f.mu.RUnlock()
+
+ clone := f.k.NewFDMap()
+
+ // Grab an extra reference for every file.
+ for fd, desc := range f.files {
+ desc.file.IncRef()
+ clone.files[fd] = desc
+ }
+
+ // That's it!
+ return clone
+}
+
+// unlock releases all file locks held by this FDMap's uid. Must only be
+// called on a non-nil *fs.File.
+func (f *FDMap) unlock(file *fs.File) {
+ id := lock.UniqueID(f.ID())
+ file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF})
+}
+
+// inotifyFileClose generates the appropriate inotify events for f being closed.
+func inotifyFileClose(f *fs.File) {
+ var ev uint32
+ d := f.Dirent
+
+ if fs.IsDir(d.Inode.StableAttr) {
+ ev |= linux.IN_ISDIR
+ }
+
+ if f.Flags().Write {
+ ev |= linux.IN_CLOSE_WRITE
+ } else {
+ ev |= linux.IN_CLOSE_NOWRITE
+ }
+
+ d.InotifyEvent(ev, 0)
+}
+
+// Remove removes an FD from the FDMap, and returns (File, true) if one was
+// found. Callers are expected to decrement the reference count on the File.
+// Otherwise returns (nil, false).
+func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) {
+ f.mu.Lock()
+ desc := f.files[fd]
+ delete(f.files, fd)
+ f.mu.Unlock()
+ if desc.file != nil {
+ f.unlock(desc.file)
+ inotifyFileClose(desc.file)
+ return desc.file, true
+ }
+ return nil, false
+}
+
+// RemoveIf removes all FDs where cond is true.
+func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) {
+ var removed []*fs.File
+ f.mu.Lock()
+ for fd, desc := range f.files {
+ if desc.file != nil && cond(desc.file, desc.flags) {
+ delete(f.files, fd)
+ removed = append(removed, desc.file)
+ }
+ }
+ f.mu.Unlock()
+
+ for _, file := range removed {
+ f.unlock(file)
+ inotifyFileClose(file)
+ file.DecRef()
+ }
+}
diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go
new file mode 100644
index 000000000..e1ac900e8
--- /dev/null
+++ b/pkg/sentry/kernel/fd_map_test.go
@@ -0,0 +1,134 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+)
+
+const (
+ // maxFD is the maximum FD to try to create in the map.
+ // This number of open files has been seen in the wild.
+ maxFD = 2 * 1024
+)
+
+func newTestFDMap() *FDMap {
+ return &FDMap{
+ files: make(map[kdefs.FD]descriptor),
+ }
+}
+
+// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap, checks that
+// allocating one more via NewFDFrom fails, then makes sure that NewFDAt
+// still works on an FD that is already in use.
+func TestFDMapMany(t *testing.T) {
+ file := filetest.NewTestFile(t)
+ limitSet := limits.NewLimitSet()
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD})
+
+ f := newTestFDMap()
+ for i := 0; i < maxFD; i++ {
+ if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
+ t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD)
+ }
+ }
+
+ if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil {
+ t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error")
+ }
+
+ if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil {
+ t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
+ }
+}
+
+// TestFDMap does a set of simple tests to make sure simple adds,
+// removes, GetRefs, and DecRefs work. The ordering is just weird
+// enough that a table-driven approach seemed clumsy.
+func TestFDMap(t *testing.T) {
+ file := filetest.NewTestFile(t)
+ limitSet := limits.NewLimitSet()
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD})
+
+ f := newTestFDMap()
+ if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
+ t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err)
+ }
+
+ if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil {
+ t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error")
+ }
+
+ largeLimit := limits.Limit{maxFD, maxFD}
+ limitSet.Set(limits.NumberOfFiles, largeLimit)
+
+ if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil {
+ t.Fatalf("Adding an FD to a resized map: got %v, want nil", err)
+ } else if fd != kdefs.FD(1) {
+ t.Fatalf("Added an FD to a resized map: got %v, want 1", fd)
+ }
+
+ if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil {
+ t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err)
+ }
+
+ if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil {
+ t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1)
+ }
+
+ if ref := f.GetFile(1); ref == nil {
+ t.Fatalf("f.GetFile(1): got nil, wanted %v", file)
+ }
+
+ if ref := f.GetFile(2); ref != nil {
+ t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref)
+ }
+
+ ref, ok := f.Remove(1)
+ if !ok {
+ t.Fatalf("f.Remove(1) for an existing FD: failed, want success")
+ }
+ ref.DecRef()
+
+ if ref, ok := f.Remove(1); ok {
+ ref.DecRef()
+ t.Fatalf("r.Remove(1) for a removed FD: got success, want failure")
+ }
+
+}
+
+func TestDescriptorFlags(t *testing.T) {
+ file := filetest.NewTestFile(t)
+ f := newTestFDMap()
+ limitSet := limits.NewLimitSet()
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD})
+
+ if err := f.NewFDAt(2, file, FDFlags{CloseOnExec: true}, limitSet); err != nil {
+ t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err)
+ }
+
+ newFile, flags := f.GetDescriptor(2)
+ if newFile == nil {
+ t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile)
+ }
+
+ if !flags.CloseOnExec {
+ t.Fatalf("new File flags %d don't match original %d\n", flags, 0)
+ }
+}
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
new file mode 100644
index 000000000..9aa6fa951
--- /dev/null
+++ b/pkg/sentry/kernel/fs_context.go
@@ -0,0 +1,172 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// FSContext contains filesystem context.
+//
+// This includes umask and working directory.
+type FSContext struct {
+ refs.AtomicRefCount
+
+ // mu protects below.
+ mu sync.Mutex `state:"nosave"`
+
+ // root is the filesystem root. Will be nil iff the FSContext has been
+ // destroyed.
+ root *fs.Dirent
+
+ // cwd is the current working directory. Will be nil iff the FSContext
+ // has been destroyed.
+ cwd *fs.Dirent
+
+ // umask is the current file mode creation mask. When a thread using this
+ // context invokes a syscall that creates a file, bits set in umask are
+ // removed from the permissions that the file is created with.
+ umask uint
+}
+
+// newFSContext returns a new filesystem context.
+func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
+ root.IncRef()
+ cwd.IncRef()
+ return &FSContext{
+ root: root,
+ cwd: cwd,
+ umask: umask,
+ }
+}
+
+// destroy is the destructor for an FSContext.
+//
+// This will call DecRef on both the root and cwd Dirents and then set both
+// fields to nil. destroy has no return value, so nothing is reported back
+// to the caller.
+//
+// Note that there may still be calls to WorkingDirectory() or RootDirectory()
+// (that return nil). This is because valid references may still be held via
+// proc files or other mechanisms.
+func (f *FSContext) destroy() {
+ f.root.DecRef()
+ f.root = nil
+
+ f.cwd.DecRef()
+ f.cwd = nil
+}
+
+// DecRef implements RefCounter.DecRef with destructor f.destroy.
+func (f *FSContext) DecRef() {
+ f.DecRefWithDestructor(f.destroy)
+}
+
+// Fork forks this FSContext.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) Fork() *FSContext {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ f.cwd.IncRef()
+ f.root.IncRef()
+ return &FSContext{
+ cwd: f.cwd,
+ root: f.root,
+ umask: f.umask,
+ }
+}
+
+// WorkingDirectory returns the current working directory.
+// You should call DecRef on the returned Dirent when finished.
+//
+// This will return nil if called after destroy().
+func (f *FSContext) WorkingDirectory() *fs.Dirent {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ if f.cwd != nil {
+ f.cwd.IncRef()
+ }
+ return f.cwd
+}
+
+// SetWorkingDirectory sets the current working directory.
+// This will take an extra reference on the Dirent.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
+ if d == nil {
+ panic("FSContext.SetWorkingDirectory called with nil dirent")
+ }
+ if f.cwd == nil {
+ panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d))
+ }
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ old := f.cwd
+ f.cwd = d
+ d.IncRef()
+ old.DecRef()
+}
+
+// RootDirectory returns the current filesystem root.
+// You should call DecRef on the returned Dirent when finished.
+//
+// This will return nil if called after destroy().
+func (f *FSContext) RootDirectory() *fs.Dirent {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ f.root.IncRef()
+ return f.root
+}
+
+// SetRootDirectory sets the root directory.
+// This will take an extra reference on the Dirent.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
+ if d == nil {
+ panic("FSContext.SetRootDirectory called with nil dirent")
+ }
+ if f.root == nil {
+ panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d))
+ }
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ old := f.root
+ f.root = d
+ d.IncRef()
+ old.DecRef()
+}
+
+// Umask returns the current umask.
+func (f *FSContext) Umask() uint {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ return f.umask
+}
+
+// SwapUmask atomically sets the current umask and returns the old umask.
+func (f *FSContext) SwapUmask(mask uint) uint {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ old := f.umask
+ f.umask = mask
+ return old
+}
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
new file mode 100644
index 000000000..de9897c58
--- /dev/null
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -0,0 +1,48 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_template_instance(
+ name = "waiter_list",
+ out = "waiter_list.go",
+ package = "futex",
+ prefix = "waiter",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Linker": "*Waiter",
+ },
+)
+
+go_stateify(
+ name = "futex_state",
+ srcs = [
+ "futex.go",
+ "waiter_list.go",
+ ],
+ out = "futex_state.go",
+ package = "futex",
+)
+
+go_library(
+ name = "futex",
+ srcs = [
+ "futex.go",
+ "futex_state.go",
+ "waiter_list.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/state",
+ "//pkg/syserror",
+ ],
+)
+
+go_test(
+ name = "futex_test",
+ size = "small",
+ srcs = ["futex_test.go"],
+ embed = [":futex"],
+)
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
new file mode 100644
index 000000000..b3ba57a2c
--- /dev/null
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -0,0 +1,405 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package futex provides an implementation of the futex interface as found in
+// the Linux kernel. It allows one to easily transform Wait() calls into waits
+// on a channel, which is useful in a Go-based kernel, for example.
+package futex
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Checker abstracts memory accesses. This is useful because the "addresses"
+// used in this package may not be real addresses (they could be indices of an
+// array, for example), or they could be mapped via some special mechanism.
+//
+// TODO: Replace this with usermem.IO.
+type Checker interface {
+ // Check should validate that the given address contains the given value.
+ // If it does not contain the value, syserror.EAGAIN must be returned.
+ // Any other error may be returned, which will be propagated.
+ Check(addr uintptr, val uint32) error
+
+ // Op should atomically perform the operation encoded in op on the data
+ // pointed to by addr, then apply the comparison encoded in op to the
+ // original value at addr, returning the result.
+ // Note that op is an opaque operation whose behaviour is defined
+ // outside of the futex manager.
+ Op(addr uintptr, op uint32) (bool, error)
+}
+
+// Waiter is the struct which gets enqueued into buckets for wake up routines
+// and requeue routines to scan and notify. Once a Waiter has been enqueued by
+// WaitPrepare(), callers may listen on C for wake up events.
+type Waiter struct {
+ // Synchronization:
+ //
+ // - A Waiter that is not enqueued in a bucket is exclusively owned (no
+ // synchronization applies).
+ //
+ // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this,
+ // waiterEntry, complete, and addr are protected by the bucket.mu ("bucket
+ // lock") of the containing bucket, and bitmask is immutable. complete and
+ // addr are additionally mutated using atomic memory operations, ensuring
+ // that they can be read using atomic memory operations without holding the
+ // bucket lock.
+ //
+ // - A Waiter is only guaranteed to be no longer queued after calling
+ // WaitComplete().
+
+ // waiterEntry links Waiter into bucket.waiters.
+ waiterEntry
+
+ // complete is 1 if the Waiter was removed from its bucket by a wakeup and
+ // 0 otherwise.
+ complete int32
+
+ // C is sent to when the Waiter is woken.
+ C chan struct{}
+
+ // addr is the address being waited on.
+ addr uintptr
+
+ // The bitmask we're waiting on.
+ // This is used in the case of a FUTEX_WAKE_BITSET.
+ bitmask uint32
+}
+
+// NewWaiter returns a new unqueued Waiter.
+func NewWaiter() *Waiter {
+ return &Waiter{
+ C: make(chan struct{}, 1),
+ }
+}
+
+// bucket holds a list of waiters for a given address hash.
+type bucket struct {
+ // mu protects waiters and contained Waiter state. See comment in Waiter.
+ mu sync.Mutex `state:"nosave"`
+
+ waiters waiterList `state:"zerovalue"`
+}
+
+// wakeLocked wakes up to n waiters matching the bitmask at the addr for this
+// bucket and returns the number of waiters woken.
+//
+// Preconditions: b.mu must be locked.
+func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int {
+ done := 0
+ for w := b.waiters.Front(); done < n && w != nil; {
+ if w.addr != addr || w.bitmask&bitmask == 0 {
+ // Not matching.
+ w = w.Next()
+ continue
+ }
+
+ // Remove from the bucket and wake the waiter.
+ woke := w
+ w = w.Next() // Next iteration.
+ b.waiters.Remove(woke)
+ woke.C <- struct{}{}
+
+ // NOTE: The above channel write establishes a write barrier
+ // according to the memory model, so nothing may be ordered
+ // around it. Since we've dequeued w and will never touch it
+ // again, we can safely store 1 to w.complete here and allow
+ // the WaitComplete() to short-circuit grabbing the bucket
+ // lock. If they somehow miss the w.complete, we are still
+ // holding the lock, so we can know that they won't dequeue w,
+ // assume it's free and have the below operation afterwards.
+ atomic.StoreInt32(&woke.complete, 1)
+ done++
+ }
+ return done
+}
+
+// requeueLocked takes up to n waiters on addr from the bucket, moves them to
+// naddr on the bucket "to", and returns the number of waiters moved.
+//
+// Preconditions: b and to must be locked.
+func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int {
+ done := 0
+ for w := b.waiters.Front(); done < n && w != nil; {
+ if w.addr != addr {
+ // Not matching.
+ w = w.Next()
+ continue
+ }
+
+ requeued := w
+ w = w.Next() // Next iteration.
+ b.waiters.Remove(requeued)
+ atomic.StoreUintptr(&requeued.addr, naddr)
+ to.waiters.PushBack(requeued)
+ done++
+ }
+ return done
+}
+
+const (
+ // bucketCount is the number of buckets per Manager. By having many of
+ // these we reduce contention when concurrent yet unrelated calls are made.
+ bucketCount = 1 << bucketCountBits
+ bucketCountBits = 10
+)
+
+func checkAddr(addr uintptr) error {
+ // Ensure the address is aligned.
+ // It must be a DWORD boundary.
+ if addr&0x3 != 0 {
+ return syserror.EINVAL
+ }
+
+ return nil
+}
+
+// bucketIndexForAddr returns the index into Manager.buckets for addr.
+func bucketIndexForAddr(addr uintptr) uintptr {
+ // - The bottom 2 bits of addr must be 0, per checkAddr.
+ //
+ // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47
+ // for a canonical address, and (on all existing platforms) bit 47 must be
+ // 0 for an application address.
+ //
+ // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful"
+ // bits. We choose one of the simplest possible hash functions that at
+ // least uses all 45 useful bits in the output, given that bucketCountBits
+ // == 10. This hash function also has the property that it will usually map
+ // adjacent addresses to adjacent buckets, slightly improving memory
+ // locality when an application synchronization structure uses multiple
+ // nearby futexes.
+ //
+ // Note that despite the large number of arithmetic operations in the
+ // function, many components can be computed in parallel, such that the
+ // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This
+ // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... +
+ // (addr >> 42)" without any additional grouping, the compiler puts all 4
+ // additions in the critical path.
+ h1 := (addr >> 2) + (addr >> 12) + (addr >> 22)
+ h2 := (addr >> 32) + (addr >> 42)
+ return (h1 + h2) % bucketCount
+}
+
+// Manager holds futex state for a single virtual address space.
+type Manager struct {
+ buckets [bucketCount]bucket
+}
+
+// NewManager returns an initialized futex manager.
+// N.B. we use virtual addresses to tag futexes, so this only works for
+// private (within a single process) futexes.
+func NewManager() *Manager {
+ return &Manager{}
+}
+
+// lockBucket returns a locked bucket for the given addr.
+//
+// Preconditions: checkAddr(addr) == nil.
+func (m *Manager) lockBucket(addr uintptr) *bucket {
+ b := &m.buckets[bucketIndexForAddr(addr)]
+ b.mu.Lock()
+ return b
+}
+
+// lockBuckets returns locked buckets for the given addrs.
+//
+// Preconditions: checkAddr(addr1) == checkAddr(addr2) == nil.
+func (m *Manager) lockBuckets(addr1 uintptr, addr2 uintptr) (*bucket, *bucket) {
+ i1 := bucketIndexForAddr(addr1)
+ i2 := bucketIndexForAddr(addr2)
+ b1 := &m.buckets[i1]
+ b2 := &m.buckets[i2]
+
+ // Ensure that buckets are locked in a consistent order (lowest index
+ // first) to avoid circular locking.
+ switch {
+ case i1 < i2:
+ b1.mu.Lock()
+ b2.mu.Lock()
+ case i2 < i1:
+ b2.mu.Lock()
+ b1.mu.Lock()
+ default:
+ b1.mu.Lock()
+ }
+
+ return b1, b2
+}
+
+// Wake wakes up to n waiters matching the bitmask on the given addr.
+// The number of waiters woken is returned.
+func (m *Manager) Wake(addr uintptr, bitmask uint32, n int) (int, error) {
+ if err := checkAddr(addr); err != nil {
+ return 0, err
+ }
+
+ b := m.lockBucket(addr)
+ // This function is very hot; avoid defer.
+ r := b.wakeLocked(addr, bitmask, n)
+ b.mu.Unlock()
+ return r, nil
+}
+
+func (m *Manager) doRequeue(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) {
+ if err := checkAddr(addr); err != nil {
+ return 0, err
+ }
+ if err := checkAddr(naddr); err != nil {
+ return 0, err
+ }
+
+ b1, b2 := m.lockBuckets(addr, naddr)
+ defer b1.mu.Unlock()
+ if b2 != b1 {
+ defer b2.mu.Unlock()
+ }
+
+ // Check our value.
+ // This only applies to RequeueCmp().
+ if c != nil {
+ if err := c.Check(addr, val); err != nil {
+ return 0, err
+ }
+ }
+
+ // Wake the number required.
+ done := b1.wakeLocked(addr, ^uint32(0), nwake)
+
+ // Requeue the number required.
+ b1.requeueLocked(b2, addr, naddr, nreq)
+
+ return done, nil
+}
+
+// Requeue wakes up to nwake waiters on the given addr, and unconditionally
+// requeues up to nreq waiters on naddr.
+func (m *Manager) Requeue(addr uintptr, naddr uintptr, nwake int, nreq int) (int, error) {
+ return m.doRequeue(nil, addr, 0, naddr, nwake, nreq)
+}
+
+// RequeueCmp atomically checks that the addr contains val (via the Checker),
+// wakes up to nwake waiters on addr and then unconditionally requeues up to
+// nreq waiters on naddr.
+func (m *Manager) RequeueCmp(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) {
+ return m.doRequeue(c, addr, val, naddr, nwake, nreq)
+}
+
+// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1
+// waiters unconditionally from addr1, and, based on the original value at addr2
+// and a comparison encoded in op, wakes up to nwake2 waiters from addr2.
+// It returns the total number of waiters woken.
+func (m *Manager) WakeOp(c Checker, addr1 uintptr, addr2 uintptr, nwake1 int, nwake2 int, op uint32) (int, error) {
+ if err := checkAddr(addr1); err != nil {
+ return 0, err
+ }
+ if err := checkAddr(addr2); err != nil {
+ return 0, err
+ }
+
+ b1, b2 := m.lockBuckets(addr1, addr2)
+
+ done := 0
+ cond, err := c.Op(addr2, op)
+ if err == nil {
+ // Wake up up to nwake1 entries from the first bucket.
+ done = b1.wakeLocked(addr1, ^uint32(0), nwake1)
+
+ // Wake up up to nwake2 entries from the second bucket if the
+ // operation yielded true.
+ if cond {
+ done += b2.wakeLocked(addr2, ^uint32(0), nwake2)
+ }
+ }
+
+ b1.mu.Unlock()
+ if b2 != b1 {
+ b2.mu.Unlock()
+ }
+ return done, err
+}
+
+// WaitPrepare atomically checks that addr contains val (via the Checker), then
+// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
+// Waiter must be subsequently removed by calling WaitComplete, whether or not
+// a wakeup is received on w.C.
+func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bitmask uint32) error {
+ if err := checkAddr(addr); err != nil {
+ return err
+ }
+
+ // Prepare the Waiter before taking the bucket lock.
+ w.complete = 0
+ select {
+ case <-w.C:
+ default:
+ }
+ w.addr = addr
+ w.bitmask = bitmask
+
+ b := m.lockBucket(addr)
+ // This function is very hot; avoid defer.
+
+ // Perform our atomic check.
+ if err := c.Check(addr, val); err != nil {
+ b.mu.Unlock()
+ return err
+ }
+
+ // Add the waiter to the bucket.
+ b.waiters.PushBack(w)
+
+ b.mu.Unlock()
+ return nil
+}
+
+// WaitComplete must be called when a Waiter previously added by WaitPrepare is
+// no longer eligible to be woken.
+func (m *Manager) WaitComplete(w *Waiter) {
+ // Can we short-circuit acquiring the lock?
+ // This is the happy path where a notification
+ // was received and we don't need to dequeue this
+ // waiter from any list (or take any locks).
+ if atomic.LoadInt32(&w.complete) != 0 {
+ return
+ }
+
+ // Take the bucket lock. Note that without holding the bucket lock, the
+ // waiter is not guaranteed to stay in that bucket, so after we take the
+ // bucket lock, we must ensure that the bucket hasn't changed: if it
+ // happens to have changed, we release the old bucket lock and try again
+ // with the new bucket; if it hasn't changed, we know it won't change now
+ // because we hold the lock.
+ var b *bucket
+ for {
+ addr := atomic.LoadUintptr(&w.addr)
+ b = m.lockBucket(addr)
+ // We still have to use an atomic load here, because if w was racily
+ // requeued then w.addr is not protected by b.mu.
+ if addr == atomic.LoadUintptr(&w.addr) {
+ break
+ }
+ b.mu.Unlock()
+ }
+
+ // Remove waiter from the bucket. w.complete can only be stored with b.mu
+ // locked, so this load doesn't need to use sync/atomic.
+ if w.complete == 0 {
+ b.waiters.Remove(w)
+ }
+ b.mu.Unlock()
+}
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
new file mode 100644
index 000000000..7b81358ec
--- /dev/null
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -0,0 +1,500 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package futex
+
+import (
+ "math"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+ "testing"
+ "unsafe"
+)
+
+const (
+ // testMutexSize is the byte size handed to newTestData for one test
+ // mutex, and the stride used between test futex addresses.
+ testMutexSize = 4
+ // testMutexLocked is the value testMutex.Lock swaps into the futex word.
+ testMutexLocked uint32 = 1
+ // testMutexUnlocked is the value testMutex.Unlock stores into the futex
+ // word; it is also the zero value of freshly allocated test data.
+ testMutexUnlocked uint32 = 0
+)
+
+// testData implements the Checker interface, and allows us to
+// treat the address passed for futex operations as an index in
+// a byte slice for testing simplicity.
+type testData []byte
+
+// newTestData returns a testData backed by a newly made (zero-filled) byte
+// slice of the given size.
+func newTestData(size uint) testData {
+ return make([]byte, size)
+}
+
+// Check atomically loads the 32-bit word that starts at byte index addr of t
+// and compares it with val. It returns syscall.EAGAIN when they differ and
+// nil when they match.
+func (t testData) Check(addr uintptr, val uint32) error {
+	word := (*uint32)(unsafe.Pointer(&t[addr]))
+	if atomic.LoadUint32(word) == val {
+		return nil
+	}
+	return syscall.EAGAIN
+}
+
+// Op is a stub operation for tests: it ignores addr and the underlying data,
+// reports true exactly when val is zero, and never returns an error.
+func (t testData) Op(addr uintptr, val uint32) (bool, error) {
+	if val != 0 {
+		return false, nil
+	}
+	return true, nil
+}
+
+// testMutex ties together a testData slice, an address, and a
+// futex manager in order to implement the sync.Locker interface.
+// Beyond being used as a Locker, this is a simple mechanism for
+// changing the underlying values for simpler tests.
+type testMutex struct {
+ // a is the futex address, used as a byte index into d.
+ a uintptr
+ // d holds the futex word and serves as the Checker passed to WaitPrepare.
+ d testData
+ // m is the futex manager used to wait on and wake address a.
+ m *Manager
+}
+
+// newTestMutex returns a testMutex for address addr within d, managed by m.
+func newTestMutex(addr uintptr, d testData, m *Manager) *testMutex {
+ return &testMutex{a: addr, d: d, m: m}
+}
+
+// Lock acquires the testMutex.
+//
+// It repeatedly tries to swap the futex word from testMutexUnlocked to
+// testMutexLocked. After each failed attempt it waits via the futex manager
+// until woken, unless the word no longer reads testMutexLocked (EAGAIN), in
+// which case it retries at once. Any other WaitPrepare error panics.
+func (t *testMutex) Lock() {
+	word := (*uint32)(unsafe.Pointer(&t.d[t.a]))
+	for !atomic.CompareAndSwapUint32(word, testMutexUnlocked, testMutexLocked) {
+		// The lock is held by someone else: wait for it to be "not locked".
+		w := NewWaiter()
+		switch err := t.m.WaitPrepare(w, t.d, t.a, testMutexLocked, ^uint32(0)); err {
+		case nil:
+			// Queued successfully; block until woken, then retire the waiter.
+			<-w.C
+			t.m.WaitComplete(w)
+		case syscall.EAGAIN:
+			// The word changed before we could queue; just try again.
+		default:
+			// Should never happen.
+			panic("WaitPrepare returned unexpected error: " + err.Error())
+		}
+	}
+	// Lock held.
+}
+
+// Unlock releases the testMutex.
+// This will notify any waiters via the futex manager.
+func (t *testMutex) Unlock() {
+ // Unlock.
+ atomic.StoreUint32(((*uint32)(unsafe.Pointer(&t.d[t.a]))), testMutexUnlocked)
+
+ // Notify all waiters.
+ t.m.Wake(t.a, ^uint32(0), math.MaxInt32)
+}
+
+// TestFutexWake queues a single waiter on address 0 and checks that one Wake
+// call on that address notifies it.
+func TestFutexWake(t *testing.T) {
+	m := NewManager()
+	d := newTestData(testMutexSize)
+
+	// Wait for it to be locked.
+	// (This won't trigger the wake in testMutex)
+	w := NewWaiter()
+	if err := m.WaitPrepare(w, d, 0, testMutexUnlocked, ^uint32(0)); err != nil {
+		// Without a queued waiter the receive below would block forever.
+		t.Fatalf("WaitPrepare failed: %v", err)
+	}
+
+	// Wake the single thread. Fail fast on error: the receive below would
+	// otherwise hang instead of reporting the failure.
+	if _, err := m.Wake(0, ^uint32(0), 1); err != nil {
+		t.Fatal("wake error:", err)
+	}
+
+	<-w.C
+	m.WaitComplete(w)
+}
+
+// TestFutexWakeBitmask queues a waiter with bitmask 0x0000ffff and checks that
+// a Wake with a disjoint bitmask leaves it asleep while a Wake with an
+// overlapping bitmask notifies it.
+func TestFutexWakeBitmask(t *testing.T) {
+	m := NewManager()
+	d := newTestData(testMutexSize)
+
+	// Wait for it to be locked.
+	// (This won't trigger the wake in testMutex)
+	w := NewWaiter()
+	if err := m.WaitPrepare(w, d, 0, testMutexUnlocked, 0x0000ffff); err != nil {
+		// Without a queued waiter the final receive would block forever.
+		t.Fatalf("WaitPrepare failed: %v", err)
+	}
+
+	// Wake the single thread, not using the bitmask.
+	if _, err := m.Wake(0, 0xffff0000, 1); err != nil {
+		t.Error("wake non-matching bitmask error:", err)
+	}
+
+	// Non-blocking probe: nothing should have been delivered yet.
+	select {
+	case <-w.C:
+		t.Error("w is alive?")
+	default:
+	}
+
+	// Now use a matching bitmask. Fail fast on error: the receive below
+	// would otherwise hang instead of reporting the failure.
+	if _, err := m.Wake(0, 0x00000001, 1); err != nil {
+		t.Fatal("wake matching bitmask error:", err)
+	}
+
+	<-w.C
+	m.WaitComplete(w)
+}
+
+// TestFutexWakeTwo queues three waiters on address 0, wakes at most two, and
+// checks that exactly two of them were notified.
+func TestFutexWakeTwo(t *testing.T) {
+	m := NewManager()
+	d := newTestData(testMutexSize)
+
+	// Wait for it to be locked.
+	// (This won't trigger the wake in testMutex)
+	w1 := NewWaiter()
+	w2 := NewWaiter()
+	w3 := NewWaiter()
+	for _, w := range []*Waiter{w1, w2, w3} {
+		if err := m.WaitPrepare(w, d, 0, testMutexUnlocked, ^uint32(0)); err != nil {
+			// A waiter that was never queued would skew the count below.
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+	}
+
+	// Wake exactly two threads.
+	if _, err := m.Wake(0, ^uint32(0), 2); err != nil {
+		t.Error("wake error:", err)
+	}
+
+	// Ensure exactly two are alive.
+	// We don't get guarantees about exactly which two,
+	// (although we expect them to be w1 and w2).
+	awake := 0
+	for {
+		select {
+		case <-w1.C:
+			awake++
+		case <-w2.C:
+			awake++
+		case <-w3.C:
+			awake++
+		default:
+			// No notification is pending any more; evaluate the tally.
+			if awake != 2 {
+				t.Errorf("awake != 2? got %d", awake)
+			}
+
+			// Success.
+			return
+		}
+	}
+}
+
+// TestFutexWakeUnrelated queues one waiter on each of two distinct addresses
+// and checks that waking the second address notifies only the second waiter.
+func TestFutexWakeUnrelated(t *testing.T) {
+	m := NewManager()
+	d := newTestData(2 * testMutexSize)
+
+	// Wait for it to be locked.
+	w1 := NewWaiter()
+	w2 := NewWaiter()
+	if err := m.WaitPrepare(w1, d, 0*testMutexSize, testMutexUnlocked, ^uint32(0)); err != nil {
+		t.Fatalf("WaitPrepare failed: %v", err)
+	}
+	if err := m.WaitPrepare(w2, d, 1*testMutexSize, testMutexUnlocked, ^uint32(0)); err != nil {
+		// Without w2 queued the final receive would block forever.
+		t.Fatalf("WaitPrepare failed: %v", err)
+	}
+
+	// Wake only the second one. Fail fast on error: the receive on w2.C
+	// below would otherwise hang instead of reporting the failure.
+	if _, err := m.Wake(1*testMutexSize, ^uint32(0), 2); err != nil {
+		t.Fatal("wake error:", err)
+	}
+
+	// Ensure only w2 is alive.
+	select {
+	case <-w1.C:
+		t.Error("w1 is alive?")
+	default:
+	}
+	<-w2.C
+}
+
+// HammerMutex locks and unlocks l the given number of times, yielding the
+// processor while the lock is held, and then sends true on cdone.
+//
+// This function was shamelessly stolen from mutex_test.go.
+func HammerMutex(l sync.Locker, loops int, cdone chan bool) {
+	for remaining := loops; remaining > 0; remaining-- {
+		l.Lock()
+		// Yield while holding the lock to encourage contention.
+		runtime.Gosched()
+		l.Unlock()
+	}
+	cdone <- true
+}
+
+func TestFutexStress(t *testing.T) {
+ m := NewManager()
+ d := newTestData(testMutexSize)
+ tm := newTestMutex(0*testMutexSize, d, m)
+ c := make(chan bool)
+
+ for i := 0; i < 10; i++ {
+ go HammerMutex(tm, 1000, c)
+ }
+
+ for i := 0; i < 10; i++ {
+ <-c
+ }
+}
+
+// TestWakeOpEmpty calls WakeOp with no waiters queued on either address and
+// expects it to succeed and report zero wakes.
+func TestWakeOpEmpty(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	n, err := m.WakeOp(d, 0, 4, 10, 10, 0)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 0:
+		t.Fatalf("Invalid number of wakes: want 0, got %d", n)
+	}
+}
+
+// TestWakeOpFirstNonEmpty queues two waiters on address 0 and none on address
+// 4, then expects WakeOp over both addresses to report two wakes.
+func TestWakeOpFirstNonEmpty(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	// Add two waiters on address 0; each is completed when the test returns.
+	for i := 0; i < 2; i++ {
+		w := NewWaiter()
+		if err := m.WaitPrepare(w, d, 0, 0, ^uint32(0)); err != nil {
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+		defer m.WaitComplete(w)
+	}
+
+	// Wake up all waiters on address 0.
+	n, err := m.WakeOp(d, 0, 4, 10, 10, 0)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 2:
+		t.Fatalf("Invalid number of wakes: want 2, got %d", n)
+	}
+}
+
+// TestWakeOpSecondNonEmpty queues two waiters on address 4 and none on address
+// 0, then expects WakeOp over both addresses to report two wakes.
+func TestWakeOpSecondNonEmpty(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	// Add two waiters on address 4; each is completed when the test returns.
+	for i := 0; i < 2; i++ {
+		w := NewWaiter()
+		if err := m.WaitPrepare(w, d, 4, 0, ^uint32(0)); err != nil {
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+		defer m.WaitComplete(w)
+	}
+
+	// Wake up all waiters on address 4.
+	n, err := m.WakeOp(d, 0, 4, 10, 10, 0)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 2:
+		t.Fatalf("Invalid number of wakes: want 2, got %d", n)
+	}
+}
+
+// TestWakeOpSecondNonEmptyFailingOp queues two waiters on address 4 only and
+// calls WakeOp with a final argument of 1, for which testData.Op reports
+// false. The test expects zero wakes.
+func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	// Add two waiters on address 4; each is completed when the test returns.
+	for i := 0; i < 2; i++ {
+		w := NewWaiter()
+		if err := m.WaitPrepare(w, d, 4, 0, ^uint32(0)); err != nil {
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+		defer m.WaitComplete(w)
+	}
+
+	// Attempt the wake with an op value that testData.Op rejects.
+	n, err := m.WakeOp(d, 0, 4, 10, 10, 1)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 0:
+		t.Fatalf("Invalid number of wakes: want 0, got %d", n)
+	}
+}
+
+// TestWakeOpAllNonEmpty queues two waiters on address 0 and two on address 4,
+// then expects WakeOp over both addresses to report four wakes.
+func TestWakeOpAllNonEmpty(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	// Add two waiters on address 0, then two on address 4; each is completed
+	// when the test returns.
+	for _, addr := range []uintptr{0, 0, 4, 4} {
+		w := NewWaiter()
+		if err := m.WaitPrepare(w, d, addr, 0, ^uint32(0)); err != nil {
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+		defer m.WaitComplete(w)
+	}
+
+	// Wake up all waiters on both addresses.
+	n, err := m.WakeOp(d, 0, 4, 10, 10, 0)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 4:
+		t.Fatalf("Invalid number of wakes: want 4, got %d", n)
+	}
+}
+
+// TestWakeOpAllNonEmptyFailingOp queues two waiters on address 0 and two on
+// address 4, then calls WakeOp with a final argument of 1, for which
+// testData.Op reports false. The test expects two wakes.
+func TestWakeOpAllNonEmptyFailingOp(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	// Add two waiters on address 0, then two on address 4; each is completed
+	// when the test returns.
+	for _, addr := range []uintptr{0, 0, 4, 4} {
+		w := NewWaiter()
+		if err := m.WaitPrepare(w, d, addr, 0, ^uint32(0)); err != nil {
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+		defer m.WaitComplete(w)
+	}
+
+	// Attempt to wake both addresses with an op value that testData.Op rejects.
+	n, err := m.WakeOp(d, 0, 4, 10, 10, 1)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 2:
+		t.Fatalf("Invalid number of wakes: want 2, got %d", n)
+	}
+}
+
+// TestWakeOpSameAddress queues four waiters on address 0 and calls WakeOp
+// with address 0 in both positions and a limit of one wake for each. The test
+// expects two wakes.
+func TestWakeOpSameAddress(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	// Add four waiters on address 0; each is completed when the test returns.
+	for i := 0; i < 4; i++ {
+		w := NewWaiter()
+		if err := m.WaitPrepare(w, d, 0, 0, ^uint32(0)); err != nil {
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+		defer m.WaitComplete(w)
+	}
+
+	// Use the same address twice, with at most one waiter woken for each.
+	n, err := m.WakeOp(d, 0, 0, 1, 1, 0)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 2:
+		t.Fatalf("Invalid number of wakes: want 2, got %d", n)
+	}
+}
+
+// TestWakeOpSameAddressFailingOp queues four waiters on address 0 and calls
+// WakeOp with address 0 in both positions, a limit of one wake for each, and
+// a final argument of 1, for which testData.Op reports false. The test
+// expects a single wake.
+func TestWakeOpSameAddressFailingOp(t *testing.T) {
+	m := NewManager()
+	d := newTestData(8)
+
+	// Add four waiters on address 0; each is completed when the test returns.
+	for i := 0; i < 4; i++ {
+		w := NewWaiter()
+		if err := m.WaitPrepare(w, d, 0, 0, ^uint32(0)); err != nil {
+			t.Fatalf("WaitPrepare failed: %v", err)
+		}
+		defer m.WaitComplete(w)
+	}
+
+	// Use the same address twice, with at most one waiter woken for each.
+	n, err := m.WakeOp(d, 0, 0, 1, 1, 1)
+	switch {
+	case err != nil:
+		t.Fatalf("WakeOp failed: %v", err)
+	case n != 1:
+		t.Fatalf("Invalid number of wakes: want 1, got %d", n)
+	}
+}
diff --git a/pkg/sentry/kernel/g3doc/run_states.dot b/pkg/sentry/kernel/g3doc/run_states.dot
new file mode 100644
index 000000000..7861fe1f5
--- /dev/null
+++ b/pkg/sentry/kernel/g3doc/run_states.dot
@@ -0,0 +1,99 @@
+digraph {
+ subgraph {
+ App;
+ }
+ subgraph {
+ Interrupt;
+ InterruptAfterSignalDeliveryStop;
+ }
+ subgraph {
+ Syscall;
+ SyscallAfterPtraceEventSeccomp;
+ SyscallEnter;
+ SyscallAfterSyscallEnterStop;
+ SyscallAfterSysemuStop;
+ SyscallInvoke;
+ SyscallAfterPtraceEventClone;
+ SyscallAfterExecStop;
+ SyscallAfterVforkStop;
+ SyscallReinvoke;
+ SyscallExit;
+ }
+ subgraph {
+ Vsyscall;
+ VsyscallAfterPtraceEventSeccomp;
+ VsyscallInvoke;
+ }
+ subgraph {
+ Exit;
+ ExitMain; // leave thread group, release resources, reparent children, kill PID namespace and wait if TGID 1
+ ExitNotify; // signal parent/tracer, become waitable
+ ExitDone; // represented by t.runState == nil
+ }
+
+ // Task exit
+ Exit -> ExitMain;
+ ExitMain -> ExitNotify;
+ ExitNotify -> ExitDone;
+
+ // Execution of untrusted application code
+ App -> App;
+
+ // Interrupts (usually signal delivery)
+ App -> Interrupt;
+ Interrupt -> Interrupt; // if other interrupt conditions may still apply
+ Interrupt -> Exit; // if killed
+
+ // Syscalls
+ App -> Syscall;
+ Syscall -> SyscallEnter;
+ SyscallEnter -> SyscallInvoke;
+ SyscallInvoke -> SyscallExit;
+ SyscallExit -> App;
+
+ // exit, exit_group
+ SyscallInvoke -> Exit;
+
+ // execve
+ SyscallInvoke -> SyscallAfterExecStop;
+ SyscallAfterExecStop -> SyscallExit;
+ SyscallAfterExecStop -> App; // fatal signal pending
+
+ // vfork
+ SyscallInvoke -> SyscallAfterVforkStop;
+ SyscallAfterVforkStop -> SyscallExit;
+
+ // Vsyscalls
+ App -> Vsyscall;
+ Vsyscall -> VsyscallInvoke;
+ Vsyscall -> App; // fault while reading return address from stack
+ VsyscallInvoke -> App;
+
+ // ptrace-specific branches
+ Interrupt -> InterruptAfterSignalDeliveryStop;
+ InterruptAfterSignalDeliveryStop -> Interrupt;
+ SyscallEnter -> SyscallAfterSyscallEnterStop;
+ SyscallAfterSyscallEnterStop -> SyscallInvoke;
+ SyscallAfterSyscallEnterStop -> SyscallExit; // skipped by tracer
+ SyscallAfterSyscallEnterStop -> App; // fatal signal pending
+ SyscallEnter -> SyscallAfterSysemuStop;
+ SyscallAfterSysemuStop -> SyscallExit;
+ SyscallAfterSysemuStop -> App; // fatal signal pending
+ SyscallInvoke -> SyscallAfterPtraceEventClone;
+ SyscallAfterPtraceEventClone -> SyscallExit;
+ SyscallAfterPtraceEventClone -> SyscallAfterVforkStop;
+
+ // seccomp
+ Syscall -> App; // SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_KILL, SECCOMP_RET_TRACE without tracer
+ Syscall -> SyscallAfterPtraceEventSeccomp; // SECCOMP_RET_TRACE
+ SyscallAfterPtraceEventSeccomp -> SyscallEnter;
+ SyscallAfterPtraceEventSeccomp -> SyscallExit; // skipped by tracer
+ SyscallAfterPtraceEventSeccomp -> App; // fatal signal pending
+ Vsyscall -> VsyscallAfterPtraceEventSeccomp;
+ VsyscallAfterPtraceEventSeccomp -> VsyscallInvoke;
+ VsyscallAfterPtraceEventSeccomp -> App;
+
+ // Autosave
+ SyscallInvoke -> SyscallReinvoke;
+ SyscallReinvoke -> SyscallInvoke;
+}
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
new file mode 100644
index 000000000..78737f58f
--- /dev/null
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -0,0 +1,43 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore"
+)
+
+// IPCNamespace represents an IPC namespace.
+type IPCNamespace struct {
+	// semaphores is the semaphore set registry owned by this namespace.
+	semaphores *semaphore.Registry
+}
+
+// NewIPCNamespace creates a new IPC namespace with a freshly created
+// semaphore registry.
+func NewIPCNamespace() *IPCNamespace {
+	ns := new(IPCNamespace)
+	ns.semaphores = semaphore.NewRegistry()
+	return ns
+}
+
+// SemaphoreRegistry returns the semaphore set registry for this namespace.
+func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry {
+	return i.semaphores
+}
+
+// IPCNamespace returns the task's IPC namespace, reading it while holding
+// t.mu.
+func (t *Task) IPCNamespace() *IPCNamespace {
+	t.mu.Lock()
+	ns := t.ipcns
+	t.mu.Unlock()
+	return ns
+}
diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD
new file mode 100644
index 000000000..b6c00042a
--- /dev/null
+++ b/pkg/sentry/kernel/kdefs/BUILD
@@ -0,0 +1,10 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "kdefs",
+ srcs = ["kdefs.go"],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs",
+ visibility = ["//:sandbox"],
+)
diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go
new file mode 100644
index 000000000..bbb476544
--- /dev/null
+++ b/pkg/sentry/kernel/kdefs/kdefs.go
@@ -0,0 +1,20 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kdefs defines common kernel definitions.
+//
+package kdefs
+
+// FD is a file descriptor number, represented as a signed 32-bit integer.
+type FD int32
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
new file mode 100644
index 000000000..0932965e0
--- /dev/null
+++ b/pkg/sentry/kernel/kernel.go
@@ -0,0 +1,957 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernel provides an emulation of the Linux kernel.
+//
+// See README.md for a detailed overview.
+//
+// Lock order (outermost locks must be taken first):
+//
+// Kernel.extMu
+// TaskSet.mu
+// SignalHandlers.mu
+// Task.mu
+//
+// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
+// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
+// time requires locking all of their signal mutexes first.
+package kernel
+
+import (
+ "fmt"
+ "io"
+ "path/filepath"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port"
+ sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+// Kernel represents an emulated Linux kernel. It must be initialized by calling
+// Init() or LoadFrom().
+type Kernel struct {
+ // extMu serializes external changes to the Kernel with calls to
+ // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
+ // remains frozen for the duration of the call; it requires that the Kernel
+ // is paused as a precondition, which ensures that none of the tasks
+ // running within the Kernel can affect its state, but extMu is required to
+ // ensure that concurrent users of the Kernel *outside* the Kernel's
+ // control cannot affect its state by calling e.g.
+ // Kernel.SendExternalSignal.)
+ extMu sync.Mutex `state:"nosave"`
+
+ // started is true if Start has been called. Unless otherwise specified,
+ // all Kernel fields become immutable once started becomes true.
+ started bool `state:"nosave"`
+
+ // All of the following fields are immutable unless otherwise specified.
+
+ // Platform is the platform that is used to execute tasks in the
+ // created Kernel. It is embedded so that Kernel can directly serve as
+ // Platform in mm logic and also serve as platform.MemoryProvider in
+ // filemem S/R logic.
+ platform.Platform `state:"nosave"`
+
+ // See InitKernelArgs for the meaning of these fields.
+ featureSet *cpuid.FeatureSet
+ timekeeper *Timekeeper
+ tasks *TaskSet
+ rootUserNamespace *auth.UserNamespace
+ networkStack inet.Stack `state:"nosave"`
+ applicationCores uint
+ useHostCores bool
+ extraAuxv []arch.AuxEntry
+ vdso *loader.VDSO
+ rootUTSNamespace *UTSNamespace
+ rootIPCNamespace *IPCNamespace
+
+ // mounts holds the state of the virtual filesystem. mounts is initially
+ // nil, and must be set by calling Kernel.SetRootMountNamespace before
+ // Kernel.CreateProcess can succeed.
+ mounts *fs.MountNamespace
+
+ // globalInit is the thread group whose leader has ID 1 in the root PID
+ // namespace. globalInit is stored separately so that it is accessible even
+ // after all tasks in the thread group have exited, such that ID 1 is no
+ // longer mapped.
+ //
+ // globalInit is mutable until it is assigned by the first successful call
+ // to CreateProcess, and is protected by extMu.
+ globalInit *ThreadGroup
+
+ // realtimeClock is a ktime.Clock based on timekeeper's Realtime.
+ realtimeClock *timekeeperClock
+
+ // monotonicClock is a ktime.Clock based on timekeeper's Monotonic.
+ monotonicClock *timekeeperClock
+
+ // syslog is the kernel log.
+ syslog syslog
+
+ // cpuClock is incremented every linux.ClockTick. cpuClock is used to
+ // measure task CPU usage, since sampling monotonicClock twice on every
+ // syscall turns out to be unreasonably expensive. This is similar to how
+ // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
+ // although Linux also uses scheduler timing information to improve
+ // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
+ // since "preemptive" scheduling is managed by the Go runtime, which
+ // doesn't provide this information.
+ //
+ // cpuClock is mutable, and is accessed using atomic memory operations.
+ cpuClock uint64
+
+ // cpuClockTicker increments cpuClock.
+ cpuClockTicker *ktime.Timer `state:"nosave"`
+
+ // fdMapUids is an ever-increasing counter for generating FDMap uids.
+ //
+ // fdMapUids is mutable, and is accessed using atomic memory operations.
+ fdMapUids uint64
+
+ // uniqueID is used to generate unique identifiers.
+ //
+ // uniqueID is mutable, and is accessed using atomic memory operations.
+ uniqueID uint64
+
+ // nextInotifyCookie is a monotonically increasing counter used for
+ // generating unique inotify event cookies.
+ //
+ // nextInotifyCookie is mutable, and is accessed using atomic memory
+ // operations.
+ nextInotifyCookie uint32
+
+ // netlinkPorts manages allocation of netlink socket port IDs.
+ netlinkPorts *port.Manager
+
+ // exitErr is the error causing the sandbox to exit, if any. It is
+ // protected by extMu.
+ exitErr error
+}
+
+// InitKernelArgs holds arguments to Init.
+type InitKernelArgs struct {
+ // FeatureSet is the emulated CPU feature set.
+ FeatureSet *cpuid.FeatureSet
+
+ // Timekeeper manages time for all tasks in the system.
+ Timekeeper *Timekeeper
+
+ // RootUserNamespace is the root user namespace.
+ RootUserNamespace *auth.UserNamespace
+
+ // NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
+ NetworkStack inet.Stack
+
+ // ApplicationCores is the number of logical CPUs visible to sandboxed
+ // applications. The set of logical CPU IDs is [0, ApplicationCores); thus
+ // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
+ // most significant bit in cpu_possible_mask + 1.
+ ApplicationCores uint
+
+ // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
+ // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
+ // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
+ // will be overridden.
+ UseHostCores bool
+
+ // ExtraAuxv contains additional auxiliary vector entries that are added to
+ // each process by the ELF loader.
+ ExtraAuxv []arch.AuxEntry
+
+ // Vdso holds the VDSO and its parameter page.
+ Vdso *loader.VDSO
+
+ // RootUTSNamespace is the root UTS namespace.
+ RootUTSNamespace *UTSNamespace
+
+ // RootIPCNamespace is the root IPC namespace.
+ RootIPCNamespace *IPCNamespace
+}
+
+// Init initializes the Kernel with no tasks.
+//
+// It returns an error if args.FeatureSet, args.Timekeeper or
+// args.RootUserNamespace is nil, if args.ApplicationCores is zero, or if
+// args.UseHostCores is set and the maximum host CPU number cannot be
+// determined.
+//
+// Callers must manually set Kernel.Platform before calling Init.
+func (k *Kernel) Init(args InitKernelArgs) error {
+	// Validate the mandatory arguments, in declaration order.
+	switch {
+	case args.FeatureSet == nil:
+		return fmt.Errorf("FeatureSet is nil")
+	case args.Timekeeper == nil:
+		return fmt.Errorf("Timekeeper is nil")
+	case args.RootUserNamespace == nil:
+		return fmt.Errorf("RootUserNamespace is nil")
+	case args.ApplicationCores == 0:
+		return fmt.Errorf("ApplicationCores is 0")
+	}
+
+	k.featureSet = args.FeatureSet
+	k.timekeeper = args.Timekeeper
+	k.tasks = newTaskSet()
+	k.rootUserNamespace = args.RootUserNamespace
+	k.rootUTSNamespace = args.RootUTSNamespace
+	k.rootIPCNamespace = args.RootIPCNamespace
+	k.networkStack = args.NetworkStack
+	k.applicationCores = args.ApplicationCores
+
+	if args.UseHostCores {
+		k.useHostCores = true
+		maxCPU, err := hostcpu.MaxPossibleCPU()
+		if err != nil {
+			return fmt.Errorf("Failed to get maximum CPU number: %v", err)
+		}
+		// Host CPU numbers are exposed directly, so the application must see
+		// at least maxCPU+1 cores.
+		if required := uint(maxCPU) + 1; k.applicationCores < required {
+			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, required)
+			k.applicationCores = required
+		}
+	}
+
+	k.extraAuxv = args.ExtraAuxv
+	k.vdso = args.Vdso
+
+	// Both clocks share the same timekeeper and differ only in clock ID.
+	tk := args.Timekeeper
+	k.realtimeClock = &timekeeperClock{tk: tk, c: sentrytime.Realtime}
+	k.monotonicClock = &timekeeperClock{tk: tk, c: sentrytime.Monotonic}
+	k.netlinkPorts = port.New()
+
+	return nil
+}
+
+// SaveTo saves the state of k to w.
+//
+// The kernel state is written first, followed by the platform memory state.
+// The first error encountered aborts the save and is returned.
+//
+// Preconditions: The kernel must be paused throughout the call to SaveTo.
+func (k *Kernel) SaveTo(w io.Writer) error {
+ saveStart := time.Now()
+ ctx := k.SupervisorContext()
+
+ // Do not allow other Kernel methods to affect it while it's being saved.
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+
+ // Stop time.
+ k.pauseTimeLocked()
+ defer k.resumeTimeLocked()
+
+ // Flush write operations on open files so data reaches backing storage.
+ if err := k.tasks.flushWritesToFiles(ctx); err != nil {
+ return err
+ }
+
+ // Remove all epoll waiter objects from underlying wait queues.
+ // NOTE: for programs to resume execution in future snapshot scenarios,
+ // we will need to re-establish these waiter objects after saving.
+ k.tasks.unregisterEpollWaiters()
+
+ // Clear the dirent cache before saving because Dirents must be Loaded in a
+ // particular order (parents before children), and Loading dirents from a cache
+ // breaks that order.
+ k.mounts.FlushMountSourceRefs()
+
+ // Ensure that all pending asynchronous work is complete:
+ // - inode and mount release
+ // - asynchronous IO
+ fs.AsyncBarrier()
+
+ // Once all fs work has completed (flushed references have all been released),
+ // reset mount mappings. This allows individual mounts to save how inodes map
+ // to filesystem resources. Without this, fs.Inodes cannot be restored.
+ fs.SaveInodeMappings()
+
+ // Discard unsavable mappings, such as those for host file descriptors.
+ // This must be done after waiting for "asynchronous fs work", which
+ // includes async I/O that may touch application memory.
+ if err := k.invalidateUnsavableMappings(ctx); err != nil {
+ return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+ }
+
+ // Save the kernel state.
+ kernelStart := time.Now()
+ var stats state.Stats
+ if err := state.Save(w, k, &stats); err != nil {
+ return err
+ }
+ log.Infof("Kernel save stats: %s", &stats)
+ log.Infof("Kernel save took [%s].", time.Since(kernelStart))
+
+ // Save the memory state.
+ //
+ // FIXME: In the future, this should not be dispatched via
+ // an abstract memory type. This should be dispatched to a single
+ // memory implementation that belongs to the kernel. (There is
+ // currently a single implementation anyways, it just needs to be
+ // "unabstracted" and reparented appropriately.)
+ memoryStart := time.Now()
+ if err := k.Platform.Memory().SaveTo(w); err != nil {
+ return err
+ }
+ log.Infof("Memory save took [%s].", time.Since(memoryStart))
+
+ log.Infof("Overall save took [%s].", time.Since(saveStart))
+
+ return nil
+}
+
+// flushWritesToFiles fsyncs every writable regular file and directory found
+// in the FD maps of the tasks in the root PID namespace, holding ts.mu for
+// reading throughout. It returns an error naming the first file for which
+// fs.SaveFileFsyncError reports the sync result as an error.
+func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
+	ts.mu.RLock()
+	defer ts.mu.RUnlock()
+
+	for t := range ts.Root.tids {
+		fdmap := t.FDMap()
+		if fdmap == nil {
+			continue
+		}
+		for _, desc := range fdmap.files {
+			// Files not opened for writing have nothing to flush.
+			if !desc.file.Flags().Write {
+				continue
+			}
+			// Only regular files and directories are synced.
+			sattr := desc.file.Dirent.Inode.StableAttr
+			if !(fs.IsFile(sattr) || fs.IsDir(sattr)) {
+				continue
+			}
+			// Here we need all metadata synced.
+			syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
+			err := fs.SaveFileFsyncError(syncErr)
+			if err == nil {
+				continue
+			}
+			name, _ := desc.file.Dirent.FullName(nil /* root */)
+			return fmt.Errorf("%q was not sufficiently synced: %v", name, err)
+		}
+	}
+	return nil
+}
+
+// invalidateUnsavableMappings calls InvalidateUnsavable on the MemoryManager
+// of every task in the root PID namespace, visiting each distinct manager
+// reached through t.tc only once. For a task whose run state is
+// runSyscallAfterExecStop, the MemoryManager held by that run state is
+// invalidated as well. The first error is returned.
+//
+// Preconditions: The kernel must be paused.
+func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
+	seen := make(map[*mm.MemoryManager]struct{})
+
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	for t := range k.tasks.Root.tids {
+		// We can skip locking Task.mu here since the kernel is paused.
+		if mgr := t.tc.MemoryManager; mgr != nil {
+			if _, done := seen[mgr]; !done {
+				if err := mgr.InvalidateUnsavable(ctx); err != nil {
+					return err
+				}
+				seen[mgr] = struct{}{}
+			}
+		}
+		// I really wish we just had a sync.Map of all MMs...
+		execStop, ok := t.runState.(*runSyscallAfterExecStop)
+		if !ok {
+			continue
+		}
+		if err := execStop.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// unregisterEpollWaiters walks the FD maps of all tasks in the root PID
+// namespace, holding ts.mu for reading, and calls UnregisterEpollWaiters on
+// every open file whose FileOperations is an *epoll.EventPoll.
+func (ts *TaskSet) unregisterEpollWaiters() {
+	ts.mu.RLock()
+	defer ts.mu.RUnlock()
+
+	for t := range ts.Root.tids {
+		fdmap := t.FDMap()
+		if fdmap == nil {
+			continue
+		}
+		for _, desc := range fdmap.files {
+			if desc.file == nil {
+				continue
+			}
+			if ep, isEpoll := desc.file.FileOperations.(*epoll.EventPoll); isEpoll {
+				ep.UnregisterEpollWaiters()
+			}
+		}
+	}
+}
+
+// LoadFrom loads the state of k from r, installing p as the platform and net
+// as the network stack. The kernel state is read first, followed by the
+// platform memory state.
+//
+// It returns an error if p is nil, if either load fails, or if UseHostCores
+// is enabled and the pre-load ApplicationCores exceeds the loaded value.
+func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error {
+ loadStart := time.Now()
+ if p == nil {
+ return fmt.Errorf("Platform is nil")
+ }
+
+ k.Platform = p
+ k.networkStack = net
+
+ // Remember the value k.applicationCores had before loading so it can be
+ // compared against the value in effect after state.Load below.
+ initAppCores := k.applicationCores
+
+ // Load the kernel state.
+ kernelStart := time.Now()
+ var stats state.Stats
+ if err := state.Load(r, k, &stats); err != nil {
+ return err
+ }
+ log.Infof("Kernel load stats: %s", &stats)
+ log.Infof("Kernel load took [%s].", time.Since(kernelStart))
+
+ // Load the memory state.
+ //
+ // See the note in SaveTo.
+ memoryStart := time.Now()
+ if err := k.Platform.Memory().LoadFrom(r); err != nil {
+ return err
+ }
+ log.Infof("Memory load took [%s].", time.Since(memoryStart))
+
+ // Ensure that all pending asynchronous work is complete:
+ // - namedpipe opening
+ // - inode file opening
+ fs.AsyncBarrier()
+
+ log.Infof("Overall load took [%s]", time.Since(loadStart))
+
+ // Applications may size per-cpu structures based on k.applicationCores, so
+ // it can't change across save/restore. When we are virtualizing CPU
+ // numbers, this isn't a problem. However, when we are exposing host CPU
+ // assignments, we can't tolerate an increase in the number of host CPUs,
+ // which could result in getcpu(2) returning CPUs that applications expect
+ // not to exist.
+ if k.useHostCores && initAppCores > k.applicationCores {
+ return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
+ }
+
+ return nil
+}
+
+// Destroy drops k's reference on its mount namespace, if it has one, and
+// clears the field so that a repeated call is a no-op.
+//
+// Preconditions: There must be no task goroutines running in k.
+func (k *Kernel) Destroy() {
+	if k.mounts == nil {
+		return
+	}
+	k.mounts.DecRef()
+	k.mounts = nil
+}
+
+// UniqueID atomically increments k.uniqueID and returns the new value. Zero is
+// never returned: if the counter wraps around to zero, UniqueID panics.
+func (k *Kernel) UniqueID() uint64 {
+	if next := atomic.AddUint64(&k.uniqueID, 1); next != 0 {
+		return next
+	}
+	panic("unique identifier generator wrapped around")
+}
+
+// CreateProcessArgs holds arguments to kernel.CreateProcess.
+type CreateProcessArgs struct {
+ // Filename is the filename to load.
+ //
+ // If this is provided as "", then the file will be guessed via Argv[0],
+ // which must then be an absolute path.
+ Filename string
+
+ // Argv is a list of arguments.
+ Argv []string
+
+ // Envv is a list of environment variables.
+ Envv []string
+
+ // WorkingDirectory is the initial working directory.
+ //
+ // This defaults to the root if empty.
+ WorkingDirectory string
+
+ // Credentials is the initial credentials.
+ Credentials *auth.Credentials
+
+ // FDMap is the initial set of file descriptors. If CreateProcess succeeds,
+ // it takes a reference on FDMap.
+ FDMap *FDMap
+
+ // Umask is the initial umask.
+ Umask uint
+
+ // Limits is the initial resource limits.
+ Limits *limits.LimitSet
+
+ // MaxSymlinkTraversals is the maximum number of symlinks to follow
+ // during resolution.
+ MaxSymlinkTraversals uint
+
+ // UTSNamespace is the initial UTS namespace.
+ UTSNamespace *UTSNamespace
+
+ // IPCNamespace is the initial IPC namespace.
+ IPCNamespace *IPCNamespace
+}
+
+// NewContext returns a context.Context that represents the task that will be
+// created by k.CreateProcess(args). The returned context logs through the
+// global logger and resolves values from k and args.
+func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
+ return &createProcessContext{
+ Logger: log.Log(),
+ k: k,
+ args: args,
+ }
+}
+
+// createProcessContext is a context.Context that represents the context
+// associated with a task that is being created.
+type createProcessContext struct {
+ context.NoopSleeper
+ log.Logger
+ // k is the kernel the task is being created in.
+ k *Kernel
+ // args holds the creation arguments that Value reads namespaces,
+ // credentials and limits from.
+ args *CreateProcessArgs
+}
+
+// Value implements context.Context.Value.
+//
+// Per-task values (UTS and IPC namespaces, credentials, limits) come from the
+// creation arguments; everything else comes from the kernel. Unknown keys
+// yield nil.
+func (ctx *createProcessContext) Value(key interface{}) interface{} {
+	k, args := ctx.k, ctx.args
+	switch key {
+	case CtxKernel:
+		return k
+	case CtxPIDNamespace:
+		// "The new task ... is in the root PID namespace." -
+		// Kernel.CreateProcess
+		return k.tasks.Root
+	case CtxUTSNamespace:
+		return args.UTSNamespace
+	case CtxIPCNamespace:
+		return args.IPCNamespace
+	case auth.CtxCredentials:
+		return args.Credentials
+	case fs.CtxRoot:
+		// There is no root until a mount namespace has been installed.
+		if mounts := k.mounts; mounts != nil {
+			return mounts.Root()
+		}
+		return nil
+	case ktime.CtxRealtimeClock:
+		return k.RealtimeClock()
+	case limits.CtxLimits:
+		return args.Limits
+	case platform.CtxPlatform:
+		return k
+	case uniqueid.CtxGlobalUniqueID:
+		return k.UniqueID()
+	case uniqueid.CtxInotifyCookie:
+		return k.GenerateInotifyCookie()
+	}
+	return nil
+}
+
+// CreateProcess creates a new task in a new thread group with the given
+// options. The new task has no parent and is in the root PID namespace.
+//
+// If k.Start() has already been called, the created task will begin running
+// immediately. Otherwise, it will be started when k.Start() is called, and
+// the first thread group created this way is recorded as k.globalInit.
+//
+// CreateProcess has no analogue in Linux; it is used to create the initial
+// application task, as well as processes started by the control server.
+//
+// It returns an error if k has no MountNamespace, if args.WorkingDirectory
+// cannot be resolved, if neither args.Filename nor args.Argv[0] is provided,
+// if args.Argv[0] is used as the filename but is not an absolute path, or if
+// loading the task image or creating the task fails.
+func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ log.Infof("EXEC: %v", args.Argv)
+
+ if k.mounts == nil {
+ return nil, fmt.Errorf("no kernel MountNamespace")
+ }
+
+ tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+ ctx := args.NewContext(k)
+
+ // Grab the root directory. The reference is dropped when CreateProcess
+ // returns.
+ root := fs.RootFromContext(ctx)
+ defer root.DecRef()
+
+ // Grab the working directory.
+ wd := root // Default.
+ if args.WorkingDirectory != "" {
+ var err error
+ wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals)
+ if err != nil {
+ return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+ }
+ // Only a working directory looked up here carries its own reference;
+ // the default shares root's, which is released above.
+ defer wd.DecRef()
+ }
+
+ if args.Filename == "" {
+ // Was anything provided?
+ if len(args.Argv) == 0 {
+ return nil, fmt.Errorf("no filename or command provided")
+ }
+ if !filepath.IsAbs(args.Argv[0]) {
+ return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
+ }
+ // args is passed by value, so this does not modify the caller's copy.
+ args.Filename = args.Argv[0]
+ }
+
+ // Create a fresh task context.
+ tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet)
+ if err != nil {
+ return nil, err
+ }
+ tr := newTaskResources(args.FDMap, newFSContext(root, wd, args.Umask))
+ // NewTask unconditionally takes ownership of tr, so we never have to call
+ // tr.release.
+
+ // Create the task.
+ config := &TaskConfig{
+ Kernel: k,
+ ThreadGroup: tg,
+ TaskContext: tc,
+ TaskResources: tr,
+ Credentials: args.Credentials,
+ UTSNamespace: args.UTSNamespace,
+ IPCNamespace: args.IPCNamespace,
+ AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
+ }
+ t, err := k.tasks.NewTask(config)
+ if err != nil {
+ return nil, err
+ }
+
+ // Success. A task created after Start runs right away; before Start, the
+ // first thread group created becomes the global init.
+ if k.started {
+ tid := k.tasks.Root.IDOfTask(t)
+ t.Start(tid)
+ } else if k.globalInit == nil {
+ k.globalInit = tg
+ }
+ return tg, nil
+}
+
+// Start starts execution of all tasks in k. It installs the CPU clock ticker,
+// resumes timers, and then starts every task in the root PID namespace.
+//
+// Start returns an error if k contains no tasks or has already been started.
+//
+// Preconditions: Start may be called exactly once.
+func (k *Kernel) Start() error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+
+	switch {
+	case k.globalInit == nil:
+		return fmt.Errorf("kernel contains no tasks")
+	case k.started:
+		return fmt.Errorf("kernel already started")
+	}
+	k.started = true
+
+	// Drive k.cpuClock from a periodic timer on the monotonic clock.
+	tick := ktime.Setting{
+		Enabled: true,
+		Period:  linux.ClockTick,
+	}
+	k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k})
+	k.cpuClockTicker.Swap(tick)
+
+	// If k was created by LoadKernelFrom, timers were stopped during
+	// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
+	// this is a no-op.
+	k.resumeTimeLocked()
+
+	// Start task goroutines.
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+	for task, tid := range k.tasks.Root.tids {
+		task.Start(tid)
+	}
+	return nil
+}
+
+// pauseTimeLocked pauses the CPU clock ticker (if any), the timers of every
+// thread group, every timerfd reachable through a task's FDMap, and finally
+// Timekeeper updates.
+//
+// Preconditions: Any task goroutines running in k must be stopped. k.extMu
+// must be locked.
+func (k *Kernel) pauseTimeLocked() {
+	// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
+	// Kernel.Start().
+	if ticker := k.cpuClockTicker; ticker != nil {
+		ticker.Pause()
+	}
+
+	// By precondition, nothing else can be interacting with PIDNamespace.tids
+	// or FDMap.files, so we can iterate them without synchronization. (We
+	// can't hold the TaskSet mutex when pausing thread group timers because
+	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
+	// mutex, while holding the Timer mutex.)
+	for t := range k.tasks.Root.tids {
+		// Pause each thread group's timers once, via its leader.
+		if t.tg.leader == t {
+			t.tg.tm.pause()
+		}
+		fdm := t.tr.FDMap
+		if fdm == nil {
+			continue
+		}
+		// FDMaps shared by multiple tasks are visited repeatedly, but
+		// ktime.Timer.Pause is idempotent so this is harmless.
+		for _, desc := range fdm.files {
+			tfd, isTimerFD := desc.file.FileOperations.(*timerfd.TimerOperations)
+			if !isTimerFD {
+				continue
+			}
+			tfd.PauseTimer()
+		}
+	}
+	k.timekeeper.PauseUpdates()
+}
+
+// resumeTimeLocked undoes pauseTimeLocked: it resumes the CPU clock ticker (if
+// any), Timekeeper updates, the timers of every thread group, and every
+// timerfd reachable through a task's FDMap. If pauseTimeLocked has not been
+// previously called, resumeTimeLocked has no effect.
+//
+// Preconditions: Any task goroutines running in k must be stopped. k.extMu
+// must be locked.
+func (k *Kernel) resumeTimeLocked() {
+	if ticker := k.cpuClockTicker; ticker != nil {
+		ticker.Resume()
+	}
+
+	k.timekeeper.ResumeUpdates()
+	for t := range k.tasks.Root.tids {
+		// Resume each thread group's timers once, via its leader.
+		if t.tg.leader == t {
+			t.tg.tm.resume()
+		}
+		fdm := t.tr.FDMap
+		if fdm == nil {
+			continue
+		}
+		for _, desc := range fdm.files {
+			tfd, isTimerFD := desc.file.FileOperations.(*timerfd.TimerOperations)
+			if !isTimerFD {
+				continue
+			}
+			tfd.ResumeTimer()
+		}
+	}
+}
+
+// WaitExited blocks until all tasks in k have exited, by waiting on
+// k.tasks.liveGoroutines.
+func (k *Kernel) WaitExited() {
+ k.tasks.liveGoroutines.Wait()
+}
+
+// Kill requests that all tasks in k immediately exit as if group exiting with
+// status es. Kill does not wait for tasks to exit. It holds k.extMu while
+// delegating to k.tasks.Kill.
+func (k *Kernel) Kill(es ExitStatus) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.Kill(es)
+}
+
+// Pause requests that all tasks in k temporarily stop executing, and blocks
+// until all tasks in k have stopped. Multiple calls to Pause nest and require
+// an equal number of calls to Unpause to resume execution.
+func (k *Kernel) Pause() {
+ k.extMu.Lock()
+ k.tasks.BeginExternalStop()
+ // k.extMu is released before waiting, so it is not held while blocked on
+ // the running task goroutines.
+ k.extMu.Unlock()
+ k.tasks.runningGoroutines.Wait()
+}
+
+// Unpause ends the effect of a previous call to Pause. If Unpause is called
+// without a matching preceding call to Pause, Unpause may panic. It holds
+// k.extMu while ending the external stop.
+func (k *Kernel) Unpause() {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.EndExternalStop()
+}
+
+// SendExternalSignal injects a signal into the kernel. It is the locking
+// wrapper around sendExternalSignal: it holds k.extMu for the duration of the
+// call.
+//
+// context is used only for debugging to describe how the signal was received.
+//
+// Returns false if signal could not be sent because the Kernel is not fully
+// initialized yet.
+func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.sendExternalSignal(info, context)
+}
+
+// FeatureSet returns k's cpuid.FeatureSet, the same feature set that
+// CreateProcess passes to LoadTaskImage.
+func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
+ return k.featureSet
+}
+
+// Timekeeper returns k's Timekeeper, whose updates are paused and resumed by
+// pauseTimeLocked and resumeTimeLocked.
+func (k *Kernel) Timekeeper() *Timekeeper {
+ return k.timekeeper
+}
+
+// TaskSet returns the TaskSet holding the tasks of k.
+func (k *Kernel) TaskSet() *TaskSet {
+ return k.tasks
+}
+
+// RootUserNamespace returns the root UserNamespace. This is the namespace in
+// which the supervisor context's root credentials are created.
+func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
+ return k.rootUserNamespace
+}
+
+// RootUTSNamespace returns the root UTSNamespace, which is also the UTS
+// namespace reported by the supervisor context.
+func (k *Kernel) RootUTSNamespace() *UTSNamespace {
+ return k.rootUTSNamespace
+}
+
+// RootIPCNamespace returns the root IPCNamespace, which is also the IPC
+// namespace reported by the supervisor context.
+func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+ return k.rootIPCNamespace
+}
+
+// RootMountNamespace returns the MountNamespace, or nil if none has been set
+// or it has been released by Destroy. The read is performed under k.extMu. No
+// additional reference is taken on the returned namespace here.
+func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.mounts
+}
+
+// SetRootMountNamespace sets the MountNamespace. The write is performed under
+// k.extMu. Any previously set namespace is overwritten without being
+// released here.
+func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.mounts = mounts
+}
+
+// NetworkStack returns the network stack. NetworkStack may return nil if no
+// network stack is available. After LoadFrom, this is the stack that was
+// passed to it.
+func (k *Kernel) NetworkStack() inet.Stack {
+ return k.networkStack
+}
+
+// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
+// nil if no such thread group exists. GlobalInit may return a thread group
+// containing no tasks if the thread group has already exited. The read is
+// performed under k.extMu.
+func (k *Kernel) GlobalInit() *ThreadGroup {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.globalInit
+}
+
+// ApplicationCores returns the number of CPUs visible to sandboxed
+// applications. The same value sizes the allowed CPU mask of tasks created by
+// CreateProcess.
+func (k *Kernel) ApplicationCores() uint {
+ return k.applicationCores
+}
+
+// RealtimeClock returns the application CLOCK_REALTIME clock. It is the clock
+// handed out for ktime.CtxRealtimeClock by the contexts in this file.
+func (k *Kernel) RealtimeClock() ktime.Clock {
+ return k.realtimeClock
+}
+
+// MonotonicClock returns the application CLOCK_MONOTONIC clock. The CPU clock
+// ticker created by Start runs on this clock.
+func (k *Kernel) MonotonicClock() ktime.Clock {
+ return k.monotonicClock
+}
+
+// CPUClockNow returns the current value of k.cpuClock, read atomically. The
+// counter is advanced by kernelCPUClockListener.Notify.
+func (k *Kernel) CPUClockNow() uint64 {
+ return atomic.LoadUint64(&k.cpuClock)
+}
+
+// Syslog returns a pointer to the syslog embedded in k, so callers operate on
+// the kernel's own instance rather than a copy.
+func (k *Kernel) Syslog() *syslog {
+ return &k.syslog
+}
+
+// GenerateInotifyCookie generates a unique inotify event cookie by atomically
+// incrementing k.nextInotifyCookie.
+//
+// Returned values may overlap with previously returned values if the value
+// space is exhausted. 0 is not a valid cookie value, all other values
+// representable in a uint32 are allowed.
+func (k *Kernel) GenerateInotifyCookie() uint32 {
+	if cookie := atomic.AddUint32(&k.nextInotifyCookie, 1); cookie != 0 {
+		return cookie
+	}
+	// The counter wrapped around to the invalid value 0. Wrap-around is
+	// explicitly allowed for inotify event cookies, so step past it.
+	return atomic.AddUint32(&k.nextInotifyCookie, 1)
+}
+
+// NetlinkPorts returns the netlink port manager owned by k.
+func (k *Kernel) NetlinkPorts() *port.Manager {
+ return k.netlinkPorts
+}
+
+// ExitError returns the sandbox error that caused the kernel to exit, or nil
+// if none has been recorded with SetExitError. The read is performed under
+// k.extMu.
+func (k *Kernel) ExitError() error {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return k.exitErr
+}
+
+// SetExitError records err as the sandbox error that caused the kernel to
+// exit. Only the first non-nil error is kept; once one is recorded, later
+// calls leave it untouched.
+func (k *Kernel) SetExitError(err error) {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	if k.exitErr != nil {
+		return
+	}
+	k.exitErr = err
+}
+
+// SupervisorContext returns a Context with maximum privileges in k. It should
+// only be used by goroutines outside the control of the emulated kernel
+// defined by k.
+//
+// Callers are responsible for ensuring that the returned Context is not used
+// concurrently with changes to the Kernel.
+func (k *Kernel) SupervisorContext() context.Context {
+ return supervisorContext{
+ Logger: log.Log(),
+ k: k,
+ }
+}
+
+// supervisorContext is the context.Context returned by
+// Kernel.SupervisorContext. It resolves values from the kernel's root
+// namespaces and grants global root credentials.
+type supervisorContext struct {
+ context.NoopSleeper
+ log.Logger
+ // k is the kernel that values are resolved against.
+ k *Kernel
+}
+
+// Value implements context.Context.Value.
+//
+// The supervisor context resolves namespaces to the kernel's root namespaces,
+// reports global root credentials and an unrestricted limit set, and may
+// trace any task. Unknown keys yield nil.
+func (ctx supervisorContext) Value(key interface{}) interface{} {
+	switch key {
+	case CtxCanTrace:
+		// The supervisor context can trace anything. (None of
+		// supervisorContext's users are expected to invoke ptrace, but ptrace
+		// permissions are required for certain file accesses.)
+		return func(*Task, bool) bool { return true }
+	case CtxKernel:
+		return ctx.k
+	case CtxPIDNamespace:
+		return ctx.k.tasks.Root
+	case CtxUTSNamespace:
+		return ctx.k.rootUTSNamespace
+	case CtxIPCNamespace:
+		return ctx.k.rootIPCNamespace
+	case auth.CtxCredentials:
+		// The supervisor context is global root.
+		return auth.NewRootCredentials(ctx.k.rootUserNamespace)
+	case fs.CtxRoot:
+		// The kernel has no mount namespace before SetRootMountNamespace is
+		// called or after Destroy. Report "no root" in that case, as
+		// createProcessContext.Value does, instead of dereferencing nil.
+		if ctx.k.mounts == nil {
+			return nil
+		}
+		return ctx.k.mounts.Root()
+	case ktime.CtxRealtimeClock:
+		return ctx.k.RealtimeClock()
+	case limits.CtxLimits:
+		// No limits apply.
+		return limits.NewLimitSet()
+	case platform.CtxPlatform:
+		return ctx.k
+	case uniqueid.CtxGlobalUniqueID:
+		return ctx.k.UniqueID()
+	case uniqueid.CtxInotifyCookie:
+		return ctx.k.GenerateInotifyCookie()
+	default:
+		return nil
+	}
+}
+
+// kernelCPUClockListener is the ktime.TimerListener that Kernel.Start attaches
+// to the CPU clock ticker; it advances k.cpuClock on each notification.
+type kernelCPUClockListener struct {
+ k *Kernel
+}
+
+// Notify implements ktime.TimerListener.Notify. It atomically adds exp to the
+// kernel's CPU clock counter.
+func (l kernelCPUClockListener) Notify(exp uint64) {
+ atomic.AddUint64(&l.k.cpuClock, exp)
+}
+
+// Destroy implements ktime.TimerListener.Destroy. The listener holds no
+// resources of its own, so there is nothing to release.
+func (l kernelCPUClockListener) Destroy() {
+}
diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD
new file mode 100644
index 000000000..c7779e1d5
--- /dev/null
+++ b/pkg/sentry/kernel/memevent/BUILD
@@ -0,0 +1,31 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "memevent",
+ srcs = ["memory_events.go"],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent",
+ visibility = ["//:sandbox"],
+ deps = [
+ ":memory_events_go_proto",
+ "//pkg/eventchannel",
+ "//pkg/log",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/usage",
+ ],
+)
+
+proto_library(
+ name = "memory_events_proto",
+ srcs = ["memory_events.proto"],
+ visibility = ["//visibility:public"],
+)
+
+go_proto_library(
+ name = "memory_events_go_proto",
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto",
+ proto = ":memory_events_proto",
+ visibility = ["//visibility:public"],
+)
diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go
new file mode 100644
index 000000000..ecc9151de
--- /dev/null
+++ b/pkg/sentry/kernel/memevent/memory_events.go
@@ -0,0 +1,98 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memevent implements the memory usage events controller, which
+// periodically emits events via the eventchannel.
+package memevent
+
+import (
+ "sync"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/eventchannel"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ pb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// MemoryEvents describes the configuration for the global memory event emitter.
+type MemoryEvents struct {
+ // k is the kernel whose platform memory usage is reported.
+ k *kernel.Kernel
+
+ // The period is how often to emit an event. The memory events goroutine
+ // will ensure a minimum of one event is emitted per this period, regardless
+ // of how much memory usage has changed. A period of zero disables the
+ // emitter: Start does nothing.
+ period time.Duration
+
+ // Closing this channel indicates the memory goroutine should stop.
+ stop chan struct{}
+
+ // done is used to signal when the memory event goroutine has exited.
+ done sync.WaitGroup
+}
+
+// New creates a MemoryEvents that reports the memory usage of k once per
+// period. The emitter is idle until Start is called.
+func New(k *kernel.Kernel, period time.Duration) *MemoryEvents {
+	m := &MemoryEvents{k: k, period: period}
+	m.stop = make(chan struct{})
+	return m
+}
+
+// Stop stops the memory usage events emitter goroutine. Stop must not be called
+// concurrently with Start and may only be called once: it closes the stop
+// channel, and closing a channel twice panics.
+func (m *MemoryEvents) Stop() {
+ close(m.stop)
+ // Wait for the emitter goroutine, if any, to signal that it is done.
+ m.done.Wait()
+}
+
+// Start starts the memory usage events emitter goroutine. Start must not be
+// called concurrently with Stop and may only be called once. If the
+// configured period is zero, Start does nothing.
+func (m *MemoryEvents) Start() {
+	if m.period == 0 {
+		return
+	}
+	// Register the goroutine with m.done before launching it. If the Add were
+	// performed by the goroutine itself, a Stop issued right after Start could
+	// reach m.done.Wait() while the counter is still zero and return before
+	// the goroutine has exited.
+	m.done.Add(1)
+	go m.run() // S/R-SAFE: doesn't interact with saved state.
+}
+
+// run is the body of the emitter goroutine. It emits one event per ticker
+// period until m.stop is closed, and marks m.done as done on exit.
+//
+// Preconditions: The caller has already called m.done.Add(1).
+func (m *MemoryEvents) run() {
+	defer m.done.Done()
+
+	ticker := time.NewTicker(m.period)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-m.stop:
+			return
+		case <-ticker.C:
+			m.emit()
+		}
+	}
+}
+
+// emit publishes a single MemoryUsageEvent on the eventchannel. The reported
+// total is the platform memory usage plus the Mapped figure from a copy of
+// the global memory accounting. If the platform usage cannot be fetched, a
+// warning is logged and no event is emitted.
+func (m *MemoryEvents) emit() {
+	platformUsage, err := m.k.Platform.Memory().TotalUsage()
+	if err != nil {
+		log.Warningf("Failed to fetch memory usage for memory events: %v", err)
+		return
+	}
+	accounting, _ := usage.MemoryAccounting.Copy()
+
+	event := &pb.MemoryUsageEvent{Total: platformUsage + accounting.Mapped}
+	eventchannel.Emit(event)
+}
diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto
new file mode 100644
index 000000000..e6e0bd628
--- /dev/null
+++ b/pkg/sentry/kernel/memevent/memory_events.proto
@@ -0,0 +1,25 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package gvisor;
+
+// MemoryUsageEvent describes the memory usage of the sandbox at a single
+// instant in time. These messages are emitted periodically on the eventchannel.
+message MemoryUsageEvent {
+ // The total memory usage of the sandboxed application in bytes, calculated
+ // using the 'fast' method.
+ //
+ // NOTE(review): the 'fast' method is not defined in this file; confirm
+ // where it is documented before relying on its exact meaning.
+ uint64 total = 1;
+}
diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go
new file mode 100644
index 000000000..d8701f47a
--- /dev/null
+++ b/pkg/sentry/kernel/pending_signals.go
@@ -0,0 +1,126 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+// Per-signal-number limits on the number of queued instances, enforced by
+// pendingSignals.enqueue.
+const (
+ // stdSignalCap is the maximum number of instances of a given standard
+ // signal that may be pending. ("[If] multiple instances of a standard
+ // signal are delivered while that signal is currently blocked, then only
+ // one instance is queued.") - signal(7)
+ stdSignalCap = 1
+
+ // rtSignalCap is the maximum number of instances of a given realtime
+ // signal that may be pending.
+ //
+ // TODO: In Linux, the minimum signal queue size is
+ // RLIMIT_SIGPENDING, which is by default max_threads/2.
+ rtSignalCap = 32
+)
+
+// pendingSignals holds a collection of pending signals. The zero value of
+// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe;
+// users must provide synchronization.
+type pendingSignals struct {
+ // signals contains all pending signals.
+ //
+ // Note that signals is zero-indexed, but signal 1 is the first valid
+ // signal, so signals[0] contains signals with signo 1 etc. This offset is
+ // usually handled by using Signal.Index().
+ signals [linux.SignalMaximum]pendingSignalQueue
+
+ // Bit i of pendingSet is set iff there is at least one signal with signo
+ // i+1 pending.
+ pendingSet linux.SignalSet
+}
+
+// pendingSignalQueue holds a pendingSignalList for a single signal number.
+type pendingSignalQueue struct {
+ pendingSignalList
+ // length is the number of entries currently in the list. It is maintained
+ // by enqueue, dequeueSpecific and discardSpecific.
+ length int
+}
+
+// pendingSignal is one queued signal: a list entry carrying the signal's
+// arch.SignalInfo.
+type pendingSignal struct {
+ // pendingSignalEntry links into a pendingSignalList.
+ pendingSignalEntry
+ *arch.SignalInfo
+}
+
+// enqueue appends info to the queue for its signal number and marks that
+// signal as pending. It returns false, leaving the collection unchanged, if
+// that queue already holds the maximum number of instances (stdSignalCap for
+// standard signals, rtSignalCap otherwise).
+//
+// Preconditions: info represents a valid signal.
+func (p *pendingSignals) enqueue(info *arch.SignalInfo) bool {
+	sig := linux.Signal(info.Signo)
+	q := &p.signals[sig.Index()]
+
+	// Pick the capacity that applies to this class of signal.
+	limit := rtSignalCap
+	if sig.IsStandard() {
+		limit = stdSignalCap
+	}
+	if q.length >= limit {
+		return false
+	}
+
+	q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info})
+	q.length++
+	p.pendingSet |= linux.SignalSetOf(sig)
+	return true
+}
+
+// dequeue removes and returns the oldest pending instance of the
+// lowest-numbered signal that is not masked by mask. If no unmasked signals
+// are pending, dequeue returns nil.
+func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo {
+	// "Real-time signals are delivered in a guaranteed order. Multiple
+	// real-time signals of the same type are delivered in the order they were
+	// sent. If different real-time signals are sent to a process, they are
+	// delivered starting with the lowest-numbered signal. (I.e., low-numbered
+	// signals have highest priority.) By contrast, if multiple standard
+	// signals are pending for a process, the order in which they are delivered
+	// is unspecified. If both standard and real-time signals are pending for a
+	// process, POSIX leaves it unspecified which is delivered first. Linux,
+	// like many other implementations, gives priority to standard signals in
+	// this case." - signal(7)
+	deliverable := p.pendingSet &^ mask
+	lowestBit := bits.TrailingZeros64(uint64(deliverable))
+	if lowestBit < linux.SignalMaximum {
+		// Bit i corresponds to signal number i+1.
+		return p.dequeueSpecific(linux.Signal(lowestBit + 1))
+	}
+	return nil
+}
+
+// dequeueSpecific removes and returns the oldest pending instance of sig, or
+// returns nil if none is pending. The signal's pending bit is cleared when
+// its queue becomes empty.
+func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo {
+	q := &p.signals[sig.Index()]
+	head := q.pendingSignalList.Front()
+	if head == nil {
+		return nil
+	}
+	q.pendingSignalList.Remove(head)
+	if q.length--; q.length == 0 {
+		p.pendingSet &^= linux.SignalSetOf(sig)
+	}
+	return head.SignalInfo
+}
+
+// discardSpecific drops every pending instance of sig: the signal's queue is
+// reset, its length zeroed, and its pending bit cleared.
+func (p *pendingSignals) discardSpecific(sig linux.Signal) {
+	queue := &p.signals[sig.Index()]
+	queue.pendingSignalList.Reset()
+	queue.length = 0
+
+	sigBit := linux.SignalSetOf(sig)
+	p.pendingSet &^= sigBit
+}
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
new file mode 100644
index 000000000..ca9825f9d
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -0,0 +1,68 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+ name = "pipe_state",
+ srcs = [
+ "buffers.go",
+ "node.go",
+ "pipe.go",
+ "reader.go",
+ "reader_writer.go",
+ "writer.go",
+ ],
+ out = "pipe_state.go",
+ package = "pipe",
+)
+
+go_library(
+ name = "pipe",
+ srcs = [
+ "buffers.go",
+ "device.go",
+ "node.go",
+ "pipe.go",
+ "pipe_state.go",
+ "reader.go",
+ "reader_writer.go",
+ "writer.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/amutex",
+ "//pkg/ilist",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/device",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
+
+go_test(
+ name = "pipe_test",
+ size = "small",
+ srcs = [
+ "node_test.go",
+ "pipe_test.go",
+ ],
+ embed = [":pipe"],
+ deps = [
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/usermem",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go
new file mode 100644
index 000000000..f300537c5
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/buffers.go
@@ -0,0 +1,50 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/ilist"
+)
+
+// Buffer encapsulates a queueable byte buffer that can
+// easily be truncated. It is designed only for use with pipes.
+type Buffer struct {
+ ilist.Entry
+ // data holds the bytes not yet removed by truncate.
+ data []byte
+}
+
+// newBuffer initializes a Buffer that wraps buf. The slice is stored as-is,
+// not copied.
+func newBuffer(buf []byte) *Buffer {
+ return &Buffer{data: buf}
+}
+
+// bytes returns the bytes contained in the buffer. The returned slice is the
+// buffer's own storage, not a copy.
+func (b *Buffer) bytes() []byte {
+ return b.data
+}
+
+// size returns the number of bytes contained in the buffer, i.e. the bytes
+// that have not been removed by truncate.
+func (b *Buffer) size() int {
+ return len(b.data)
+}
+
+// truncate drops the first n bytes of the buffer and returns the number of
+// bytes that remain. It panics if n exceeds the current size.
+func (b *Buffer) truncate(n int) int {
+	if len(b.data) < n {
+		panic("Trying to truncate past end of array.")
+	}
+	rest := b.data[n:]
+	b.data = rest
+	return len(rest)
+}
diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go
new file mode 100644
index 000000000..8d383577a
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/device.go
@@ -0,0 +1,20 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import "gvisor.googlesource.com/gvisor/pkg/sentry/device"
+
+// pipeDevice is used for all pipe files. It is a single anonymous device
+// obtained from device.NewAnonDevice when the package is initialized.
+var pipeDevice = device.NewAnonDevice()
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
new file mode 100644
index 000000000..5b47427ef
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -0,0 +1,175 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/amutex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// inodeOperations wraps an fs.InodeOperations and overrides GetFile to add the common pipe opening semantics.
+type inodeOperations struct {
+ fs.InodeOperations
+
+ // mu protects the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // p is the underlying Pipe object representing this fifo.
+ p *Pipe
+
+ // Channels for synchronizing the creation of new readers and writers of
+ // this fifo. See waitFor and newHandleLocked.
+ //
+ // These are not saved/restored because all waiters are unblocked on save,
+ // and either automatically restart (via ERESTARTSYS) or return EINTR on
+ // resume. On restarts via ERESTARTSYS, the appropriate channel will be
+ // recreated.
+ rWakeup chan struct{} `state:"nosave"` // Created lazily by waitFor; closed and reset by newHandleLocked on a read open.
+ wWakeup chan struct{} `state:"nosave"` // Created lazily by waitFor; closed and reset by newHandleLocked on a write open.
+}
+
+// NewInodeOperations returns an fs.InodeOperations that layers the pipe open
+// semantics of GetFile for p on top of base.
+func NewInodeOperations(base fs.InodeOperations, p *Pipe) fs.InodeOperations {
+	ops := &inodeOperations{p: p}
+	ops.InodeOperations = base
+	return ops
+}
+
+// GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking
+// semantics during open:
+//
+// "Normally, opening the FIFO blocks until the other end is opened also. A
+// process can open a FIFO in nonblocking mode. In this case, opening for
+// read-only will succeed even if no-one has opened on the write side yet,
+// opening for write-only will fail with ENXIO (no such device or address)
+// unless the other end has already been opened. Under Linux, opening a FIFO
+// for read and write will succeed both in blocking and nonblocking mode. POSIX
+// leaves this behavior undefined. This can be used to open a FIFO for writing
+// while there are no readers available." - fifo(7)
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ i.mu.Lock()
+ defer i.mu.Unlock()
+
+ switch {
+ case flags.Read && !flags.Write: // O_RDONLY.
+ r := i.p.ROpen(ctx)
+ i.newHandleLocked(&i.rWakeup) // Release writers blocked waiting for a reader.
+
+ if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
+ if !i.waitFor(&i.wWakeup, ctx) {
+ r.DecRef() // Interrupted: give up the read handle opened above.
+ return nil, syserror.ErrInterrupted
+ }
+ }
+
+ // By now, the pipe is anonymous, the open is nonblocking, or a writer has
+ // been announced. On a nonblocking read-only open, the open succeeds even
+ // if no-one has opened the write side yet.
+ return r, nil
+
+ case flags.Write && !flags.Read: // O_WRONLY.
+ w := i.p.WOpen(ctx)
+ i.newHandleLocked(&i.wWakeup) // Release readers blocked waiting for a writer.
+
+ if i.p.isNamed && !i.p.HasReaders() {
+ // On a nonblocking, write-only open, the open fails with ENXIO if the
+ // read side isn't open yet.
+ if flags.NonBlocking {
+ w.DecRef() // Give up the write handle opened above.
+ return nil, syserror.ENXIO
+ }
+
+ if !i.waitFor(&i.rWakeup, ctx) {
+ w.DecRef() // Interrupted: give up the write handle opened above.
+ return nil, syserror.ErrInterrupted
+ }
+ }
+ return w, nil
+
+ case flags.Read && flags.Write: // O_RDWR.
+ // Pipes opened for read-write always succeed without blocking.
+ rw := i.p.RWOpen(ctx)
+ i.newHandleLocked(&i.rWakeup)
+ i.newHandleLocked(&i.wWakeup)
+ return rw, nil
+
+ default:
+ return nil, syserror.EINVAL // Neither Read nor Write was requested.
+ }
+}
+
+// waitFor blocks until a new reader/writer of the underlying pipe is
+// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this
+// function will block for either readers or writers, depending on where
+// 'wakeupChan' points. It returns true iff the wakeup channel was closed.
+//
+// i.mu must be held by the caller. waitFor returns with i.mu held, but it will
+// drop i.mu before blocking for any reader/writers.
+func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool {
+ // Ideally this function would simply use a condition variable. However, the
+ // wait needs to be interruptible via 'sleeper', so we must synchronize via a
+ // channel. The synchronization below relies on the fact that closing a
+ // channel unblocks all receives on the channel.
+
+ // Does an appropriate wakeup channel already exist? If not, create a new
+ // one. This is all done under i.mu to avoid races.
+ if *wakeupChan == nil {
+ *wakeupChan = make(chan struct{})
+ }
+
+ // Grab a local reference to the wakeup channel since it may disappear as
+ // soon as we drop i.mu.
+ wakeup := *wakeupChan
+
+ // Drop the lock and prepare to sleep.
+ i.mu.Unlock()
+ cancel := sleeper.SleepStart()
+
+ // Wait for either a new reader/writer to be signalled via 'wakeup', or
+ // for the sleep to be cancelled.
+ select {
+ case <-wakeup:
+ sleeper.SleepFinish(true)
+ case <-cancel:
+ sleeper.SleepFinish(false)
+ }
+
+ // Take the lock and check if we were woken. If we were woken and
+ // interrupted, the former takes priority.
+ i.mu.Lock()
+ select {
+ case <-wakeup:
+ return true
+ default:
+ return false
+ }
+}
+
+// newHandleLocked announces a new reader or writer, depending on which channel
+// 'wakeupChan' points at, by closing it; that releases every waitFor caller
+// blocked on it. The slot is then cleared. i.mu must be held.
+func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) {
+	pending := *wakeupChan
+	if pending == nil {
+		return // No wakeup channel has been created since the last reset.
+	}
+	close(pending)
+	*wakeupChan = nil
+}
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
new file mode 100644
index 000000000..cc1ebf4f6
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -0,0 +1,308 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "testing"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+type sleeper struct { // Test context that adds the SleepStart/SleepFinish methods waitFor calls.
+ context.Context
+ ch chan struct{} // Handed out by SleepStart; a send on it cancels the sleep.
+}
+
+func newSleeperContext(t *testing.T) context.Context {
+ return &sleeper{
+ Context: contexttest.Context(t),
+ ch: make(chan struct{}),
+ }
+}
+
+func (s *sleeper) SleepStart() <-chan struct{} {
+ return s.ch
+}
+
+func (s *sleeper) SleepFinish(bool) { // Intentionally a no-op in these tests.
+}
+
+func (s *sleeper) Cancel() {
+ s.ch <- struct{}{} // Unbuffered send: blocks until someone receives on the SleepStart channel.
+}
+
+type openResult struct { // Pairs GetFile's two results so testOpen can send them on one channel.
+ *fs.File
+ error
+}
+
+func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) {
+ file, err := n.GetFile(ctx, nil, flags)
+ if err != nil {
+ t.Fatalf("open with flags %+v failed: %v", flags, err) // NOTE(review): most callers run this via 'go'; Fatalf is only valid on the test goroutine.
+ }
+ if doneChan != nil {
+ doneChan <- struct{}{} // Signal the waiting test that the open has completed.
+ }
+ return file, err
+}
+
+func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) {
+ file, err := n.GetFile(ctx, nil, flags) // Unlike testOpenOrDie, an error is returned rather than failing the test.
+ if resChan != nil {
+ resChan <- openResult{file, err} // Also deliver the result to a waiting test goroutine.
+ }
+ return file, err
+}
+
+func newNamedPipe(t *testing.T) *Pipe {
+ return NewPipe(contexttest.Context(t), true, DefaultPipeSize, usermem.PageSize) // isNamed = true, so GetFile applies the blocking open rules.
+}
+
+func newAnonPipe(t *testing.T) *Pipe {
+ return NewPipe(contexttest.Context(t), false, DefaultPipeSize, usermem.PageSize) // isNamed = false, so opens never wait for the other end.
+}
+
+// assertRecvBlocks fails the test with failMsg (reported verbatim, not as a
+// format string) unless a recv attempt on c blocks for at least blockDuration.
+// It is used to check that a goroutine meant to be blocking really is blocked.
+func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Duration, failMsg string) {
+	select {
+	case <-c:
+		t.Fatal(failMsg)
+	case <-time.After(blockDuration):
+		// Ok, blocked for the required duration.
+	}
+}
+
+func TestReadOpenBlocksForWriteOpen(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ rDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone)
+
+ // Verify that the open for read is blocking.
+ assertRecvBlocks(t, rDone, time.Millisecond*100,
+ "open for read not blocking with no writers")
+
+ wDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone)
+
+ <-wDone // The pending read open already counts as a reader, so the write open completes.
+ <-rDone // The write open announces a writer, which releases the blocked read open.
+}
+
+func TestWriteOpenBlocksForReadOpen(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ wDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone)
+
+ // Verify that the open for write is blocking.
+ assertRecvBlocks(t, wDone, time.Millisecond*100,
+ "open for write not blocking with no readers")
+
+ rDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone)
+
+ <-rDone // The pending write open already counts as a writer, so the read open completes.
+ <-wDone // The read open announces a reader, which releases the blocked write open.
+}
+
+func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { // NOTE(review): the body opens two readers and one writer; the name looks inverted — confirm.
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ rDone1 := make(chan struct{})
+ rDone2 := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone1)
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone2)
+
+ assertRecvBlocks(t, rDone1, time.Millisecond*100,
+ "open for read didn't block with no writers")
+ assertRecvBlocks(t, rDone2, time.Millisecond*100,
+ "open for read didn't block with no writers")
+
+ wDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone)
+
+ <-wDone
+ <-rDone2 // A single write open must release both blocked read opens.
+ <-rDone1
+}
+
+func TestClosedReaderBlocksWriteOpen(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil)
+ rFile.DecRef() // Presumably drops the last reference and so releases the reader — confirm against fs.File.
+
+ wDone := make(chan struct{})
+ // This open for write should block because the reader is now gone.
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone)
+ assertRecvBlocks(t, wDone, time.Millisecond*100,
+ "open for write didn't block with no concurrent readers")
+
+ // Open for read again. This should unblock the open for write.
+ rDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone)
+
+ <-rDone
+ <-wDone
+}
+
+func TestReadWriteOpenNeverBlocks(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ rwDone := make(chan struct{})
+ // An open for read-write never waits for a reader or writer, even if the
+ // nonblocking flag is not set.
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true, NonBlocking: false}, rwDone)
+ <-rwDone
+}
+
+func TestReadWriteOpenUnblocksReadOpen(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ rDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone)
+
+ rwDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone)
+
+ <-rwDone
+ <-rDone // Completes whichever goroutine ran first: the read-write open counts as a writer.
+}
+
+func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ wDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone)
+
+ rwDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone)
+
+ <-rwDone
+ <-wDone // Completes whichever goroutine ran first: the read-write open counts as a reader.
+}
+
+// TestBlockedOpenIsCancellable checks that cancelling the sleeper interrupts a
+// blocked read-only open, which must then fail with syserror.ErrInterrupted.
+func TestBlockedOpenIsCancellable(t *testing.T) {
+	f := NewInodeOperations(nil, newNamedPipe(t))
+	ctx := newSleeperContext(t)
+
+	done := make(chan openResult)
+	go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done)
+	select {
+	case <-done:
+		t.Fatalf("open for read didn't block with no writers")
+	case <-time.After(time.Millisecond * 100):
+		// Ok.
+	}
+
+	ctx.(*sleeper).Cancel()
+	// If the cancel didn't work, the open for read would never return.
+	res := <-done
+	if res.error != syserror.ErrInterrupted {
+		t.Fatalf("Cancellation didn't cause GetFile to return syserror.ErrInterrupted, got %v.", res.error)
+	}
+}
+
+func TestNonblockingReadOpenNoWriters(t *testing.T) { // A nonblocking read-only open succeeds even with no writer.
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil {
+ t.Fatalf("Nonblocking open for read failed with error %v.", err)
+ }
+}
+
+func TestNonblockingWriteOpenNoReaders(t *testing.T) { // A nonblocking write-only open with no reader must fail with ENXIO.
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO {
+ t.Fatalf("Nonblocking open for write failed unexpected error %v.", err)
+ }
+}
+
+func TestNonBlockingReadOpenWithWriter(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ wDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone)
+
+ // The open for write blocks since there are no readers yet.
+ assertRecvBlocks(t, wDone, time.Millisecond*100,
+ "Open for write didn't block with no reader.")
+
+ if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil {
+ t.Fatalf("Nonblocking open for read failed with error %v.", err)
+ }
+
+ // The open for write should now be unblocked.
+ <-wDone
+}
+
+func TestNonBlockingWriteOpenWithReader(t *testing.T) {
+ f := NewInodeOperations(nil, newNamedPipe(t))
+ ctx := newSleeperContext(t)
+
+ rDone := make(chan struct{})
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone)
+
+ // The open for read blocks since there are no writers yet.
+ assertRecvBlocks(t, rDone, time.Millisecond*100,
+ "Open for reader didn't block with no writer.")
+
+ if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != nil {
+ t.Fatalf("Nonblocking open for write failed with error %v.", err)
+ }
+
+ // The open for read should now be unblocked.
+ <-rDone
+}
+
+func TestAnonReadOpen(t *testing.T) { // A blocking read-only open of an anonymous pipe must not wait for a writer.
+ f := NewInodeOperations(nil, newAnonPipe(t))
+ ctx := newSleeperContext(t)
+
+ if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil {
+ t.Fatalf("open anon pipe for read failed: %v", err)
+ }
+}
+
+func TestAnonWriteOpen(t *testing.T) { // A blocking write-only open of an anonymous pipe must not wait for a reader.
+ f := NewInodeOperations(nil, newAnonPipe(t))
+ ctx := newSleeperContext(t)
+
+ if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true}, nil); err != nil {
+ t.Fatalf("open anon pipe for write failed: %v", err)
+ }
+}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
new file mode 100644
index 000000000..1656c6ff3
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -0,0 +1,335 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipe provides an in-memory implementation of a unidirectional
+// pipe.
+//
+// The goal of this pipe is to emulate the pipe syscall in all of its
+// edge cases and guarantees of atomic IO.
+package pipe
+
+import (
+ "fmt"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/ilist"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// DefaultPipeSize is the system-wide default size of a pipe in bytes.
+const DefaultPipeSize = 65536 // 64 KiB.
+
+// Pipe is an encapsulation of a platform-independent pipe.
+// It manages a buffered byte queue shared between a reader/writer
+// pair.
+type Pipe struct {
+ waiter.Queue `state:"nosave"`
+
+ // Whether this is a named or anonymous pipe.
+ isNamed bool
+
+ // The dirent backing this pipe. Shared by all readers and writers.
+ dirent *fs.Dirent
+
+ // The buffered byte queue; each element is a *Buffer.
+ data ilist.List
+
+ // Max size of the pipe in bytes. When this max has been reached,
+ // writers will get EWOULDBLOCK.
+ max int
+
+ // Current size of the pipe in bytes.
+ size int
+
+ // Max number of bytes the pipe can guarantee to read or write
+ // atomically.
+ atomicIOBytes int
+
+ // The number of active readers for this pipe. Load/store atomically.
+ readers int32
+
+ // The number of active writers for this pipe. Load/store atomically.
+ writers int32
+
+ // This flag indicates if this pipe ever had a writer. Note that this does
+ // not necessarily indicate there is *currently* a writer, just that there
+ // has been a writer at some point since the pipe was created.
+ //
+ // Protected by mu.
+ hadWriter bool
+
+ // Lock protecting pipe internal state (readers and writers are accessed atomically instead).
+ mu sync.Mutex `state:"nosave"`
+}
+
+// NewPipe initializes and returns a pipe. A pipe created by this function is
+// persistent, and will remain valid even without any open fds to it. Named
+// pipes for mknod(2) are created via this function. Note that the blocking
+// semantics for opening the read and write ends of a named pipe are not
+// implemented here; they are applied at open time by inodeOperations.GetFile.
+func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *Pipe {
+ p := &Pipe{
+ isNamed: isNamed,
+ max: sizeBytes, // Capacity limit checked by write.
+ atomicIOBytes: atomicIOBytes,
+ }
+
+ // Build the fs.Dirent of this pipe, shared by all fs.Files associated
+ // with this pipe.
+ ino := pipeDevice.NextIno()
+ base := fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{
+ FSType: linux.PIPEFS_MAGIC,
+ UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{
+ Owner: fs.FileOwnerFromContext(ctx),
+ Perms: fs.FilePermissions{
+ User: fs.PermMask{Read: true, Write: true}, // Owner read/write only.
+ },
+ Links: 1,
+ }),
+ })
+ sattr := fs.StableAttr{
+ Type: fs.Pipe,
+ DeviceID: pipeDevice.DeviceID(),
+ InodeID: ino,
+ BlockSize: int64(atomicIOBytes), // The atomic IO unit doubles as the reported block size.
+ }
+ // There is no real filesystem backing this pipe, so we pass in a nil
+ // Filesystem.
+ sb := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})
+ p.dirent = fs.NewDirent(fs.NewInode(NewInodeOperations(base, p), sb, sattr), fmt.Sprintf("pipe:[%d]", ino))
+
+ return p
+}
+
+// NewConnectedPipe initializes an anonymous pipe and returns a pair of
+// *fs.File objects representing the read and write ends of the pipe, in that
+// order. A pipe created by this function becomes invalid as soon as either the
+// read or write end is closed, and errors on subsequent operations on either
+// end. Pipes for pipe(2) and pipe2(2) are generally created this way.
+func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*fs.File, *fs.File) {
+ p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes)
+ return p.ROpen(ctx), p.WOpen(ctx)
+}
+
+// ROpen registers a new reader and returns a read-only file for the pipe.
+func (p *Pipe) ROpen(ctx context.Context) *fs.File {
+ p.rOpen()
+ return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true}, &Reader{
+ ReaderWriter: ReaderWriter{Pipe: p},
+ })
+}
+
+// WOpen registers a new writer and returns a write-only file for the pipe.
+func (p *Pipe) WOpen(ctx context.Context) *fs.File {
+ p.wOpen()
+ return fs.NewFile(ctx, p.dirent, fs.FileFlags{Write: true}, &Writer{
+ ReaderWriter: ReaderWriter{Pipe: p},
+ })
+}
+
+// RWOpen registers both a reader and a writer and returns a read-write file for the pipe.
+func (p *Pipe) RWOpen(ctx context.Context) *fs.File {
+ p.rOpen()
+ p.wOpen()
+ return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{
+ Pipe: p,
+ })
+}
+
+// read reads data from the pipe into dst and returns the number of bytes
+// read. An empty pipe yields ErrWouldBlock, or (0, nil) if it has no writers.
+func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+ if !p.HasReaders() {
+ return 0, syscall.EBADF // No reader is registered on this pipe.
+ }
+
+ // Don't block for a zero-length read even if the pipe is empty.
+ if dst.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ // If there is nothing to read at the moment but there is a writer, tell the
+ // caller to block.
+ if p.size == 0 {
+ if !p.HasWriters() {
+ // There are no writers, return EOF.
+ return 0, nil
+ }
+ return 0, syserror.ErrWouldBlock
+ }
+ var n int64
+ for b := p.data.Front(); b != nil; b = p.data.Front() { // Always consume from the head of the queue.
+ buffer := b.(*Buffer)
+ n0, err := dst.CopyOut(ctx, buffer.bytes())
+ n += int64(n0)
+ p.size -= n0
+ if buffer.truncate(n0) == 0 {
+ p.data.Remove(b) // Fully consumed; unlink it from the queue.
+ }
+ dst = dst.DropFirst(n0)
+ if dst.NumBytes() == 0 || err != nil {
+ return n, err // Destination is full or the copy failed; report what was read.
+ }
+ }
+ return n, nil
+}
+
+// write writes data from src into the pipe and returns the number of bytes
+// written. If no bytes are written because the pipe is full (or has less than
+// atomicIOBytes free capacity), write returns ErrWouldBlock.
+func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ if !p.HasWriters() {
+ return 0, syscall.EBADF // No writer is registered on this pipe.
+ }
+ if !p.HasReaders() {
+ return 0, syscall.EPIPE // Nobody is left to read what would be written.
+ }
+
+ // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be
+ // atomic, but requires no atomicity for writes larger than this. However,
+ // Linux appears to provide stronger semantics than this in practice:
+ // unmerged writes are done one PAGE_SIZE buffer at a time, so for larger
+ // writes, the writing of each PIPE_BUF-sized chunk is atomic. We implement
+ // this by writing at most atomicIOBytes at a time if we can't service the
+ // write in its entirety.
+ canWrite := src.NumBytes()
+ if canWrite > int64(p.max-p.size) {
+ if p.max-p.size >= p.atomicIOBytes {
+ canWrite = int64(p.atomicIOBytes)
+ } else {
+ return 0, syserror.ErrWouldBlock
+ }
+ }
+
+ // Copy data from user memory into a pipe-owned buffer.
+ buf := make([]byte, canWrite)
+ n, err := src.CopyIn(ctx, buf)
+ if n > 0 {
+ p.data.PushBack(newBuffer(buf[:n])) // Queue only the bytes actually copied in.
+ p.size += n
+ }
+ if int64(n) < src.NumBytes() && err == nil {
+ // Partial write due to full pipe.
+ err = syserror.ErrWouldBlock
+ }
+ return int64(n), err
+}
+
+// rOpen signals a new reader of the pipe by atomically bumping the reader count.
+func (p *Pipe) rOpen() {
+ atomic.AddInt32(&p.readers, 1)
+}
+
+// wOpen signals a new writer of the pipe and records that a writer has existed.
+func (p *Pipe) wOpen() {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ p.hadWriter = true // From now on rReadinessLocked may report a hangup once writers reach zero.
+ atomic.AddInt32(&p.writers, 1)
+}
+
+// rClose records that a reader has closed its end of the pipe. It panics if
+// the reader count drops below zero.
+func (p *Pipe) rClose() {
+	if remaining := atomic.AddInt32(&p.readers, -1); remaining < 0 {
+		panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", remaining))
+	}
+}
+
+// wClose records that a writer has closed its end of the pipe. It panics if
+// the writer count drops below zero.
+func (p *Pipe) wClose() {
+	if remaining := atomic.AddInt32(&p.writers, -1); remaining < 0 {
+		panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", remaining))
+	}
+}
+
+// HasReaders returns whether the pipe has any active readers (atomic load; p.mu is not needed).
+func (p *Pipe) HasReaders() bool {
+ return atomic.LoadInt32(&p.readers) > 0
+}
+
+// HasWriters returns whether the pipe has any active writers (atomic load; p.mu is not needed).
+func (p *Pipe) HasWriters() bool {
+ return atomic.LoadInt32(&p.writers) > 0
+}
+
+func (p *Pipe) rReadinessLocked() waiter.EventMask { // Read-side readiness; callers hold p.mu.
+ ready := waiter.EventMask(0)
+ if p.HasReaders() && p.data.Front() != nil {
+ ready |= waiter.EventIn
+ }
+ if !p.HasWriters() && p.hadWriter {
+ // POLLHUP must be suppressed until the pipe has had at least one writer
+ // at some point. Otherwise a reader thread may poll and immediately get
+ // a POLLHUP before the writer ever opens the pipe, which the reader may
+ // interpret as the writer opening then closing the pipe.
+ ready |= waiter.EventHUp
+ }
+ return ready
+}
+
+// rReadiness returns a mask that states whether the read end of the pipe is
+// ready for reading. It takes p.mu around rReadinessLocked.
+func (p *Pipe) rReadiness() waiter.EventMask {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.rReadinessLocked()
+}
+
+func (p *Pipe) wReadinessLocked() waiter.EventMask { // Write-side readiness; callers hold p.mu.
+ ready := waiter.EventMask(0)
+ if p.HasWriters() && p.size < p.max {
+ ready |= waiter.EventOut // There is a writer and spare capacity.
+ }
+ if !p.HasReaders() {
+ ready |= waiter.EventErr // No readers remain.
+ }
+ return ready
+}
+
+// wReadiness returns a mask that states whether the write end of the pipe
+// is ready for writing. It takes p.mu around wReadinessLocked.
+func (p *Pipe) wReadiness() waiter.EventMask {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.wReadinessLocked()
+}
+
+// rwReadiness returns a mask that states whether a read-write handle to the
+// pipe is ready for IO: the union of the read-side and write-side masks.
+func (p *Pipe) rwReadiness() waiter.EventMask {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.rReadinessLocked() | p.wReadinessLocked()
+}
+
+func (p *Pipe) queuedSize() int { // Returns the number of bytes currently buffered, read under p.mu.
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.size
+}
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
new file mode 100644
index 000000000..49ef8c8ac
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -0,0 +1,138 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "bytes"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+func TestPipeRW(t *testing.T) { // Round-trips one short message through a connected pipe.
+ ctx := contexttest.Context(t)
+ r, w := NewConnectedPipe(ctx, 65536, 4096)
+ defer r.DecRef()
+ defer w.DecRef()
+
+ msg := []byte("here's some bytes")
+ wantN := int64(len(msg))
+ n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) // The message fits, so it must be accepted whole.
+ if n != wantN || err != nil {
+ t.Fatalf("Writev: got (%d, %v), wanted (%d, nil)", n, err, wantN)
+ }
+
+ buf := make([]byte, len(msg))
+ n, err = r.Readv(ctx, usermem.BytesIOSequence(buf))
+ if n != wantN || err != nil || !bytes.Equal(buf, msg) {
+ t.Fatalf("Readv: got (%d, %v) %q, wanted (%d, nil) %q", n, err, buf, wantN, msg)
+ }
+}
+
+func TestPipeReadBlock(t *testing.T) { // Reading an empty pipe that still has a writer must report ErrWouldBlock.
+ ctx := contexttest.Context(t)
+ r, w := NewConnectedPipe(ctx, 65536, 4096)
+ defer r.DecRef()
+ defer w.DecRef()
+
+ n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1)))
+ if n != 0 || err != syserror.ErrWouldBlock {
+ t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock)
+ }
+}
+
+func TestPipeWriteBlock(t *testing.T) { // An oversized write accepts one atomicIOBytes chunk and reports ErrWouldBlock.
+ const atomicIOBytes = 2
+
+ ctx := contexttest.Context(t)
+ r, w := NewConnectedPipe(ctx, 10, atomicIOBytes)
+ defer r.DecRef()
+ defer w.DecRef()
+
+ msg := []byte("here's some bytes") // 17 bytes, more than the 10-byte pipe can hold.
+ n, err := w.Writev(ctx, usermem.BytesIOSequence(msg))
+ if wantN, wantErr := int64(atomicIOBytes), syserror.ErrWouldBlock; n != wantN || err != wantErr {
+ t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr)
+ }
+}
+
+// TestPipeWriteUntilEnd streams msg through a pipe of capacity atomicIOBytes.
+func TestPipeWriteUntilEnd(t *testing.T) {
+	const atomicIOBytes = 2
+
+	ctx := contexttest.Context(t)
+	r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes)
+	defer r.DecRef()
+	defer w.DecRef()
+
+	msg := []byte("here's some bytes")
+	wDone, rDone := make(chan struct{}), make(chan struct{})
+	defer func() {
+		// Signal the reader to stop and wait until it does so.
+		close(wDone)
+		<-rDone
+	}()
+
+	go func() {
+		defer close(rDone)
+		// Read from r until wDone is closed.
+		ctx := contexttest.Context(t)
+		buf := make([]byte, len(msg)+1)
+		dst := usermem.BytesIOSequence(buf)
+		e, ch := waiter.NewChannelEntry(nil)
+		r.EventRegister(&e, waiter.EventIn)
+		defer r.EventUnregister(&e)
+		for {
+			n, err := r.Readv(ctx, dst)
+			dst = dst.DropFirst64(n)
+			if err == syserror.ErrWouldBlock {
+				select {
+				case <-ch:
+					continue
+				case <-wDone:
+					// We expect to have 1 byte left in dst since len(buf) ==
+					// len(msg)+1.
+					if dst.NumBytes() != 1 || !bytes.Equal(buf[:len(msg)], msg) {
+						t.Errorf("Reader: got %q (%d bytes remaining), wanted %q", buf, dst.NumBytes(), msg)
+					}
+					return
+				}
+			}
+			if err != nil {
+				t.Errorf("Readv: got unexpected error %v", err)
+				return // Errorf+return: FailNow must not be called off the test goroutine.
+			}
+		}
+	}()
+
+	src := usermem.BytesIOSequence(msg)
+	e, ch := waiter.NewChannelEntry(nil)
+	w.EventRegister(&e, waiter.EventOut)
+	defer w.EventUnregister(&e)
+	for src.NumBytes() != 0 {
+		n, err := w.Writev(ctx, src)
+		src = src.DropFirst64(n)
+		if err == syserror.ErrWouldBlock {
+			<-ch // Wait for the reader to free space before retrying.
+			continue
+		}
+		if err != nil {
+			t.Fatalf("Writev: got (%d, %v)", n, err)
+		}
+	}
+}
diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go
new file mode 100644
index 000000000..40d5e4943
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/reader.go
@@ -0,0 +1,37 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Reader satisfies the fs.FileOperations interface for read-only pipes.
+// Reader should be used with !fs.FileFlags.Write to reject writes.
+type Reader struct {
+ ReaderWriter // Embedded for the shared operations; Release and Readiness are overridden below.
+}
+
+// Release implements fs.FileOperations.Release. It drops only the reader count.
+func (r *Reader) Release() {
+ r.Pipe.rClose()
+ // Wake up writers.
+ r.Pipe.Notify(waiter.EventOut)
+}
+
+// Readiness returns the read-side ready events of the underlying pipe, filtered by mask.
+func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return r.Pipe.rReadiness() & mask
+}
diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go
new file mode 100644
index 000000000..dc642a3a6
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/reader_writer.go
@@ -0,0 +1,91 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "fmt"
+ "math"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// ReaderWriter satisfies the fs.FileOperations interface and services both
+// read and write requests. This should only be used directly for named pipes.
+// pipe(2) and pipe2(2) only support unidirectional pipes and should use
+// either pipe.Reader or pipe.Writer.
+type ReaderWriter struct {
+ fsutil.PipeSeek `state:"nosave"`
+ fsutil.NotDirReaddir `state:"nosave"`
+ fsutil.NoFsync `state:"nosave"`
+ fsutil.NoopFlush `state:"nosave"`
+ fsutil.NoMMap `state:"nosave"`
+ *Pipe
+}
+
+// Release implements fs.FileOperations.Release.
+func (rw *ReaderWriter) Release() {
+ rw.Pipe.rClose()
+ rw.Pipe.wClose()
+ // Wake up readers and writers.
+ rw.Pipe.Notify(waiter.EventIn | waiter.EventOut)
+}
+
+// Read implements fs.FileOperations.Read.
+func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
+ n, err := rw.Pipe.read(ctx, dst)
+ if n > 0 {
+ rw.Pipe.Notify(waiter.EventOut)
+ }
+ return n, err
+}
+
+// Write implements fs.FileOperations.Write.
+func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
+ n, err := rw.Pipe.write(ctx, src)
+ if n > 0 {
+ rw.Pipe.Notify(waiter.EventIn)
+ }
+ return n, err
+}
+
+// Readiness returns the ready events in the underlying pipe.
+func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return rw.Pipe.rwReadiness() & mask
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ // Switch on ioctl request.
+ switch int(args[1].Int()) {
+ case syscall.TIOCINQ:
+ v := rw.queuedSize()
+ if v > math.MaxInt32 {
+ panic(fmt.Sprintf("Impossibly large pipe queued size: %d", v))
+ }
+ // Copy result to user-space.
+ _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+ default:
+ return 0, syscall.ENOTTY
+ }
+}
diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go
new file mode 100644
index 000000000..fd13008ac
--- /dev/null
+++ b/pkg/sentry/kernel/pipe/writer.go
@@ -0,0 +1,37 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Writer satisfies the fs.FileOperations interface for write-only pipes.
+// Writer should be used with !fs.FileFlags.Read to reject reads.
+type Writer struct {
+ ReaderWriter
+}
+
+// Release implements fs.FileOperations.Release.
+func (w *Writer) Release() {
+ w.Pipe.wClose()
+ // Wake up readers.
+ w.Pipe.Notify(waiter.EventHUp)
+}
+
+// Readiness returns the ready events in the underlying pipe.
+func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return w.Pipe.wReadiness() & mask
+}
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
new file mode 100644
index 000000000..20b1c4cd4
--- /dev/null
+++ b/pkg/sentry/kernel/ptrace.go
@@ -0,0 +1,1054 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// ptrace constants from Linux's include/uapi/linux/ptrace.h.
+const (
+ _PTRACE_EVENT_SECCOMP = 7
+ PTRACE_SEIZE = 0x4206
+ PTRACE_INTERRUPT = 0x4207
+ PTRACE_LISTEN = 0x4208
+ PTRACE_PEEKSIGINFO = 0x4209
+ PTRACE_GETSIGMASK = 0x420a
+ PTRACE_SETSIGMASK = 0x420b
+ _PTRACE_O_EXITKILL = 1 << 20
+ _PTRACE_O_TRACESECCOMP = 1 << _PTRACE_EVENT_SECCOMP
+)
+
+// ptraceOptions are the subset of options controlling a task's ptrace behavior
+// that are set by ptrace(PTRACE_SETOPTIONS).
+type ptraceOptions struct {
+ // ExitKill is true if the tracee should be sent SIGKILL when the tracer
+ // exits.
+ ExitKill bool
+
+ // If SysGood is true, set bit 7 in the signal number for
+ // syscall-entry-stop and syscall-exit-stop traps delivered to this task's
+ // tracer.
+ SysGood bool
+
+ // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE
+ // events.
+ TraceClone bool
+
+ // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC
+ // events.
+ TraceExec bool
+
+ // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT
+ // events.
+ TraceExit bool
+
+ // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK
+ // events.
+ TraceFork bool
+
+ // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP
+ // events.
+ TraceSeccomp bool
+
+ // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK
+ // events.
+ TraceVfork bool
+
+ // TraceVforkDone is true if the tracer wants to receive
+ // PTRACE_EVENT_VFORK_DONE events.
+ TraceVforkDone bool
+}
+
+// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry
+// and exit.
+type ptraceSyscallMode int
+
+const (
+ // ptraceSyscallNone indicates that the task has never ptrace-stopped, or
+ // that it was resumed from its last ptrace-stop by PTRACE_CONT or
+ // PTRACE_DETACH. The task's syscalls will not be intercepted.
+ ptraceSyscallNone ptraceSyscallMode = iota
+
+ // ptraceSyscallIntercept indicates that the task was resumed from its last
+ // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a
+ // syscall, a ptrace-stop will occur.
+ ptraceSyscallIntercept
+
+ // ptraceSyscallEmu indicates that the task was resumed from its last
+ // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time
+ // the task enters a syscall, the syscall will be skipped, and a
+ // ptrace-stop will occur.
+ ptraceSyscallEmu
+)
+
+// CanTrace checks that t is permitted to access target's state, as defined by
+// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it
+// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access
+// mode PTRACE_MODE_READ.
+func (t *Task) CanTrace(target *Task, attach bool) bool {
+ // "1. If the calling thread and the target thread are in the same thread
+ // group, access is always allowed." - ptrace(2)
+ //
+ // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access()
+ // should not deny sub-threads", first released in Linux 3.12), the rule
+ // only applies if t and target are the same task. But, as that commit
+ // message puts it, "[any] security check is pointless when the tasks share
+ // the same ->mm."
+ if t.tg == target.tg {
+ return true
+ }
+
+ // """
+ // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped,
+ // doesn't exist until Linux 4.5).
+ //
+ // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the
+ // caller's real UID and GID for the checks in the next step. (Most APIs
+ // that check the caller's UID and GID use the effective IDs. For
+ // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs
+ // instead.)
+ //
+ // 3. Deny access if neither of the following is true:
+ //
+ // - The real, effective, and saved-set user IDs of the target match the
+ // caller's user ID, *and* the real, effective, and saved-set group IDs of
+ // the target match the caller's group ID.
+ //
+ // - The caller has the CAP_SYS_PTRACE capability in the user namespace of
+ // the target.
+ //
+ // 4. Deny access if the target process "dumpable" attribute has a value
+ // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in
+ // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in
+ // the user namespace of the target process.
+ //
+ // 5. The kernel LSM security_ptrace_access_check() interface is invoked to
+ // see if ptrace access is permitted. The results depend on the LSM(s). The
+ // implementation of this interface in the commoncap LSM performs the
+ // following steps:
+ //
+ // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the
+ // caller's effective capability set; otherwise (the access mode specifies
+ // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set.
+ //
+ // b) Deny access if neither of the following is true:
+ //
+ // - The caller and the target process are in the same user namespace, and
+ // the caller's capabilities are a proper superset of the target process's
+ // permitted capabilities.
+ //
+ // - The caller has the CAP_SYS_PTRACE capability in the target process's
+ // user namespace.
+ //
+ // Note that the commoncap LSM does not distinguish between
+ // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this
+ // section: "the commoncap LSM ... is always invoked".)
+ // """
+ callerCreds := t.Credentials()
+ targetCreds := target.Credentials()
+ if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) {
+ return true
+ }
+ if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID {
+ return false
+ }
+ if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID {
+ return false
+ }
+ // TODO: dumpability check
+ if callerCreds.UserNamespace != targetCreds.UserNamespace {
+ return false
+ }
+ if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
+ return false
+ }
+ // TODO: Yama LSM
+ return true
+}
+
+// Tracer returns t's ptrace Tracer.
+func (t *Task) Tracer() *Task {
+ return t.ptraceTracer.Load().(*Task)
+}
+
+// hasTracer returns true if t has a ptrace tracer attached.
+func (t *Task) hasTracer() bool {
+ // This isn't just inlined into callers so that if Task.Tracer() turns out
+ // to be too expensive because of e.g. interface conversion, we can switch
+ // to having a separate atomic flag more easily.
+ return t.Tracer() != nil
+}
+
+// ptraceStop is a TaskStop placed on tasks in a ptrace-stop.
+type ptraceStop struct {
+ // If frozen is true, the stopped task's tracer is currently operating on
+ // it, so Task.Kill should not remove the stop.
+ frozen bool
+}
+
+// Killable implements TaskStop.Killable.
+func (s *ptraceStop) Killable() bool {
+ return !s.frozen
+}
+
+// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been
+// killed, the stop is skipped, and beginPtraceStopLocked returns false.
+//
+// beginPtraceStopLocked does not signal t's tracer or wake it if it is
+// waiting.
+//
+// Preconditions: The TaskSet mutex must be locked. The caller must be running
+// on the task goroutine.
+func (t *Task) beginPtraceStopLocked() bool {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... =>
+ // kernel/sched/core.c:__schedule() => signal_pending_state() check, which
+ // is what prevents tasks from entering ptrace-stops after being killed.
+ // Note that if t was SIGKILLed and beginPtraceStopLocked is being called
+ // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before
+ // entering the exit path, so t.killedLocked() will no longer return true.
+ // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still cause
+ // a PTRACE_EVENT_EXIT stop before actual signal death. This may be changed
+ // in the future; SIGKILL is meant to always immediately kill tasks even
+ // under ptrace. Last confirmed on Linux 3.13." - ptrace(2)
+ if t.killedLocked() {
+ return false
+ }
+ t.beginInternalStopLocked(&ptraceStop{})
+ return true
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) ptraceTrapLocked(code int32) {
+ t.ptraceCode = code
+ t.ptraceSiginfo = &arch.SignalInfo{
+ Signo: int32(linux.SIGTRAP),
+ Code: code,
+ }
+ t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
+ t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ if t.beginPtraceStopLocked() {
+ tracer := t.Tracer()
+ tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP))
+ tracer.tg.eventQueue.Notify(EventTraceeStop)
+ }
+}
+
+// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the
+// ptraceStop, temporarily preventing it from being removed by a concurrent
+// Task.Kill, and returns true. Otherwise it returns false.
+//
+// Preconditions: The TaskSet mutex must be locked. The caller must be running
+// on the task goroutine of t's tracer.
+func (t *Task) ptraceFreeze() bool {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.stop == nil {
+ return false
+ }
+ s, ok := t.stop.(*ptraceStop)
+ if !ok {
+ return false
+ }
+ s.frozen = true
+ return true
+}
+
+// ptraceUnfreeze ends the effect of a previous successful call to
+// ptraceFreeze.
+//
+// Preconditions: t must be in a frozen ptraceStop.
+func (t *Task) ptraceUnfreeze() {
+ // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop,
+ // preventing its thread group from completing execve.
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ // Do this even if the task has been killed to ensure a panic if t.stop is
+ // nil or not a ptraceStop.
+ t.stop.(*ptraceStop).frozen = false
+ if t.killedLocked() {
+ t.endInternalStopLocked()
+ }
+}
+
+// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL,
+// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on
+// mode and singlestep.
+//
+// Preconditions: t must be in a frozen ptrace stop.
+//
+// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace
+// stop.
+func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error {
+ if sig != 0 && !sig.IsValid() {
+ return syserror.EIO
+ }
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.ptraceCode = int32(sig)
+ t.ptraceSyscallMode = mode
+ t.ptraceSinglestep = singlestep
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.endInternalStopLocked()
+ return nil
+}
+
+func (t *Task) ptraceTraceme() error {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ if t.hasTracer() {
+ return syserror.EPERM
+ }
+ if t.parent == nil {
+ // In Linux, only init can lack a parent, and init is assumed never
+ // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user
+ // application that may invoke PTRACE_TRACEME; having no parent can
+ // also occur if all tasks in the parent thread group have exited, and
+ // failed to find a living thread group to reparent to. The former case
+ // is treated as if TGID 1 has an exited parent in an invisible
+ // ancestor PID namespace that is an owner of the root user namespace
+ // (and consequently has CAP_SYS_PTRACE), and the latter case is a
+ // special form of the exited parent case below. In either case,
+ // returning nil here is correct.
+ return nil
+ }
+ if !t.parent.CanTrace(t, true) {
+ return syserror.EPERM
+ }
+ if t.parent.exitState != TaskExitNone {
+ // Fail silently, as if we were successfully attached but then
+ // immediately detached. This is consistent with Linux.
+ return nil
+ }
+ t.ptraceTracer.Store(t.parent)
+ t.parent.ptraceTracees[t] = struct{}{}
+ return nil
+}
+
+// ptraceAttach implements ptrace(PTRACE_ATTACH, target). t is the caller.
+func (t *Task) ptraceAttach(target *Task) error {
+ if t.tg == target.tg {
+ return syserror.EPERM
+ }
+ if !t.CanTrace(target, true) {
+ return syserror.EPERM
+ }
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ if target.hasTracer() {
+ return syserror.EPERM
+ }
+ // Attaching to zombies and dead tasks is not permitted; the exit
+ // notification logic relies on this. Linux allows attaching to PF_EXITING
+ // tasks, though.
+ if target.exitState >= TaskExitZombie {
+ return syserror.EPERM
+ }
+ target.ptraceTracer.Store(t)
+ t.ptraceTracees[target] = struct{}{}
+ target.tg.signalHandlers.mu.Lock()
+ target.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGSTOP),
+ Code: arch.SignalInfoUser,
+ }, false /* group */)
+ // Undocumented Linux feature: If the tracee is already group-stopped (and
+ // consequently will not report the SIGSTOP just sent), force it to leave
+ // and re-enter the stop so that it will switch to a ptrace-stop.
+ if target.stop == (*groupStop)(nil) {
+ target.groupStopRequired = true
+ target.endInternalStopLocked()
+ }
+ target.tg.signalHandlers.mu.Unlock()
+ return nil
+}
+
+// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the
+// caller.
+//
+// Preconditions: target must be a tracee of t in a frozen ptrace stop.
+//
+// Postconditions: If ptraceDetach returns nil, target will no longer be in a
+// ptrace stop.
+func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error {
+ if sig != 0 && !sig.IsValid() {
+ return syserror.EIO
+ }
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ target.ptraceCode = int32(sig)
+ target.forgetTracerLocked()
+ delete(t.ptraceTracees, target)
+ return nil
+}
+
+// exitPtrace is called in the exit path to detach all of t's tracees.
+func (t *Task) exitPtrace() {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ for target := range t.ptraceTracees {
+ if target.ptraceOpts.ExitKill {
+ target.tg.signalHandlers.mu.Lock()
+ target.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGKILL),
+ }, false /* group */)
+ target.tg.signalHandlers.mu.Unlock()
+ }
+ // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it
+ // observes the ptraceCode it set before it entered the stop. I believe
+ // this is consistent with Linux.
+ target.forgetTracerLocked()
+ }
+ // "nil maps cannot be saved"
+ t.ptraceTracees = make(map[*Task]struct{})
+}
+
+// forgetTracerLocked detaches t's tracer and ensures that t is no longer
+// ptrace-stopped.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) forgetTracerLocked() {
+ t.ptraceOpts = ptraceOptions{}
+ t.ptraceSyscallMode = ptraceSyscallNone
+ t.ptraceSinglestep = false
+ t.ptraceTracer.Store((*Task)(nil))
+ if t.exitTracerNotified && !t.exitTracerAcked {
+ t.exitTracerAcked = true
+ t.exitNotifyLocked(true)
+ }
+ // If t is ptrace-stopped, but its thread group is in a group stop and t is
+ // eligible to participate, make it do so. This is essentially the reverse
+ // of the special case in ptraceAttach, which converts a group stop to a
+ // ptrace stop. ("Handling of restart from group-stop is currently buggy,
+ // but the "as planned" behavior is to leave tracee stopped and waiting for
+ // SIGCONT." - ptrace(2))
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.stop == nil {
+ return
+ }
+ if _, ok := t.stop.(*ptraceStop); ok {
+ if t.exitState < TaskExitInitiated && t.tg.groupStopPhase >= groupStopInitiated {
+ t.groupStopRequired = true
+ }
+ t.endInternalStopLocked()
+ }
+}
+
+// ptraceSignalLocked is called after signal dequeueing to check if t should
+// enter ptrace signal-delivery-stop.
+//
+// Preconditions: The signal mutex must be locked. The caller must be running
+// on the task goroutine.
+func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
+ if linux.Signal(info.Signo) == linux.SIGKILL {
+ return false
+ }
+ if !t.hasTracer() {
+ return false
+ }
+ // The tracer might change this signal into a stop signal, in which case
+ // any SIGCONT received after the signal was originally dequeued should
+ // cancel it. This is consistent with Linux.
+ if t.tg.groupStopPhase == groupStopNone {
+ t.tg.groupStopPhase = groupStopDequeued
+ }
+ // Can't lock the TaskSet mutex while holding a signal mutex.
+ t.tg.signalHandlers.mu.Unlock()
+ defer t.tg.signalHandlers.mu.Lock()
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ tracer := t.Tracer()
+ if tracer == nil {
+ return false
+ }
+ t.ptraceCode = info.Signo
+ t.ptraceSiginfo = info
+ t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo)
+ if t.beginPtraceStopLocked() {
+ tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo)
+ tracer.tg.eventQueue.Notify(EventTraceeStop)
+ }
+ return true
+}
+
+// ptraceSeccomp is called when a seccomp-bpf filter returns action
+// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data
+// is the lower 16 bits of the filter's return value.
+func (t *Task) ptraceSeccomp(data uint16) bool {
+ if !t.hasTracer() {
+ return false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if !t.ptraceOpts.TraceSeccomp {
+ return false
+ }
+ t.Debugf("Entering PTRACE_EVENT_SECCOMP stop")
+ t.ptraceEventLocked(_PTRACE_EVENT_SECCOMP, uint64(data))
+ return true
+}
+
+// ptraceSyscallEnter is called immediately before entering a syscall to check
+// if t should enter ptrace syscall-enter-stop.
+func (t *Task) ptraceSyscallEnter() (taskRunState, bool) {
+ if !t.hasTracer() {
+ return nil, false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ switch t.ptraceSyscallMode {
+ case ptraceSyscallNone:
+ return nil, false
+ case ptraceSyscallIntercept:
+ t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL")
+ t.ptraceSyscallStopLocked()
+ return (*runSyscallAfterSyscallEnterStop)(nil), true
+ case ptraceSyscallEmu:
+ t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU")
+ t.ptraceSyscallStopLocked()
+ return (*runSyscallAfterSysemuStop)(nil), true
+ }
+ panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode))
+}
+
+// ptraceSyscallExit is called immediately after leaving a syscall to check if
+// t should enter ptrace syscall-exit-stop.
+func (t *Task) ptraceSyscallExit() {
+ if !t.hasTracer() {
+ return
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if t.ptraceSyscallMode != ptraceSyscallIntercept {
+ return
+ }
+ t.Debugf("Entering syscall-exit-stop")
+ t.ptraceSyscallStopLocked()
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) ptraceSyscallStopLocked() {
+ code := int32(linux.SIGTRAP)
+ if t.ptraceOpts.SysGood {
+ code |= 0x80
+ }
+ t.ptraceTrapLocked(code)
+}
+
+type ptraceCloneKind int32
+
+const (
+ // ptraceCloneKindClone represents a call to Task.Clone where
+ // TerminationSignal is not SIGCHLD and Vfork is false.
+ ptraceCloneKindClone ptraceCloneKind = iota
+
+ // ptraceCloneKindFork represents a call to Task.Clone where
+ // TerminationSignal is SIGCHLD and Vfork is false.
+ ptraceCloneKindFork
+
+ // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is
+ // true.
+ ptraceCloneKindVfork
+)
+
+// ptraceClone is called at the end of a clone or fork syscall to check if t
+// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK
+// stop. child is the new task.
+func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool {
+ if !t.hasTracer() {
+ return false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ event := false
+ if !opts.Untraced {
+ switch kind {
+ case ptraceCloneKindClone:
+ if t.ptraceOpts.TraceClone {
+ t.Debugf("Entering PTRACE_EVENT_CLONE stop")
+ t.ptraceEventLocked(syscall.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child]))
+ event = true
+ }
+ case ptraceCloneKindFork:
+ if t.ptraceOpts.TraceFork {
+ t.Debugf("Entering PTRACE_EVENT_FORK stop")
+ t.ptraceEventLocked(syscall.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child]))
+ event = true
+ }
+ case ptraceCloneKindVfork:
+ if t.ptraceOpts.TraceVfork {
+ t.Debugf("Entering PTRACE_EVENT_VFORK stop")
+ t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child]))
+ event = true
+ }
+ default:
+ panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind))
+ }
+ }
+ // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE
+ // options are in effect, then children created by, respectively, vfork(2)
+ // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit
+ // signal set to SIGCHLD, and other kinds of clone(2), are automatically
+ // attached to the same tracer which traced their parent. SIGSTOP is
+ // delivered to the children, causing them to enter signal-delivery-stop
+ // after they exit the system call which created them." - ptrace(2)
+ //
+ // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is
+ // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() =>
+ // include/linux/ptrace.h:ptrace_init_task().
+ if event || opts.InheritTracer {
+ tracer := t.Tracer()
+ if tracer != nil {
+ child.ptraceTracer.Store(tracer)
+ tracer.ptraceTracees[child] = struct{}{}
+ // "Flags are inherited by new tracees created and "auto-attached"
+ // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or
+ // PTRACE_O_TRACECLONE options."
+ child.ptraceOpts = t.ptraceOpts
+ child.tg.signalHandlers.mu.Lock()
+ // If the child is PT_SEIZED (currently not possible in the sentry
+ // because PTRACE_SEIZE is unimplemented, but for future
+ // reference), Linux just sets JOBCTL_TRAP_STOP instead, so the
+ // child skips signal-delivery-stop and goes directly to
+ // group-stop.
+ //
+ // The child will self-t.interrupt() when its task goroutine starts
+ // running, so we don't have to.
+ child.pendingSignals.enqueue(&arch.SignalInfo{
+ Signo: int32(linux.SIGSTOP),
+ })
+ child.tg.signalHandlers.mu.Unlock()
+ }
+ }
+ return event
+}
+
+// ptraceVforkDone is called after the end of a vfork stop to check if t should
+// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's
+// PID namespace.
+func (t *Task) ptraceVforkDone(child ThreadID) bool {
+ if !t.hasTracer() {
+ return false
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if !t.ptraceOpts.TraceVforkDone {
+ return false
+ }
+ t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop")
+ t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK_DONE, uint64(child))
+ return true
+}
+
+// ptraceExec is called at the end of an execve syscall to check if t should
+// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID
+// namespace, prior to the execve. (If t did not have a tracer at the time
+// oldTID was read, oldTID may be 0. This is consistent with Linux.)
+func (t *Task) ptraceExec(oldTID ThreadID) {
+ if !t.hasTracer() {
+ return
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ // Recheck with the TaskSet mutex locked. Most ptrace points don't need to
+ // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC
+ // is special because both TraceExec and !TraceExec do something if a
+ // tracer is attached.
+ if !t.hasTracer() {
+ return
+ }
+ if t.ptraceOpts.TraceExec {
+ t.Debugf("Entering PTRACE_EVENT_EXEC stop")
+ t.ptraceEventLocked(syscall.PTRACE_EVENT_EXEC, uint64(oldTID))
+ return
+ }
+ // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing
+ // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic]
+ // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after
+ // execve(2) returns. This is an ordinary signal (similar to one which can
+ // be generated by `kill -TRAP`), not a special kind of ptrace-stop.
+ // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0
+ // (SI_USER). This signal may be blocked by signal mask, and thus may be
+ // delivered (much) later." - ptrace(2)
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGTRAP),
+ Code: arch.SignalInfoUser,
+ }, false /* group */)
+}
+
+// ptraceExit is called early in the task exit path to check if t should enter
+// PTRACE_EVENT_EXIT stop.
+func (t *Task) ptraceExit() {
+ if !t.hasTracer() {
+ return
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if !t.ptraceOpts.TraceExit {
+ return
+ }
+ t.tg.signalHandlers.mu.Lock()
+ status := t.exitStatus.Status()
+ t.tg.signalHandlers.mu.Unlock()
+ t.Debugf("Entering PTRACE_EVENT_EXIT stop")
+ t.ptraceEventLocked(syscall.PTRACE_EVENT_EXIT, uint64(status))
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) ptraceEventLocked(event int32, msg uint64) {
+ t.ptraceEventMsg = msg
+ // """
+ // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning
+ // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An
+ // additional bit is set in the higher byte of the status word: the value
+ // status>>8 will be
+ //
+ // (SIGTRAP | PTRACE_EVENT_foo << 8).
+ //
+ // ...
+ //
+ // """ - ptrace(2)
+ t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8))
+}
+
+// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller.
+func (t *Task) ptraceKill(target *Task) error {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ if target.Tracer() != t {
+ return syserror.ESRCH
+ }
+ target.tg.signalHandlers.mu.Lock()
+ defer target.tg.signalHandlers.mu.Unlock()
+ // "This operation is deprecated; do not use it! Instead, send a SIGKILL
+ // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is
+ // that it requires the tracee to be in signal-delivery-stop, otherwise it
+ // may not work (i.e., may complete successfully but won't kill the
+ // tracee)." - ptrace(2)
+ if target.stop == nil {
+ return nil
+ }
+ if _, ok := target.stop.(*ptraceStop); !ok {
+ return nil
+ }
+ target.ptraceCode = int32(linux.SIGKILL)
+ target.endInternalStopLocked()
+ return nil
+}
+
+// Ptrace implements the ptrace system call.
+func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
+ // PTRACE_TRACEME ignores all other arguments.
+ if req == syscall.PTRACE_TRACEME {
+ return t.ptraceTraceme()
+ }
+ // All other ptrace requests operate on a current or future tracee
+ // specified by pid.
+ target := t.tg.pidns.TaskWithID(pid)
+ if target == nil {
+ return syserror.ESRCH
+ }
+
+ // PTRACE_ATTACH (and PTRACE_SEIZE, which is unimplemented) do not require
+ // that target is already a tracee.
+ if req == syscall.PTRACE_ATTACH {
+ return t.ptraceAttach(target)
+ }
+ // PTRACE_KILL (and PTRACE_INTERRUPT, which is unimplemented) require that
+ // the target is a tracee, but does not require that it is ptrace-stopped.
+ if req == syscall.PTRACE_KILL {
+ return t.ptraceKill(target)
+ }
+ // All other ptrace requests require that the target is a ptrace-stopped
+ // tracee, and freeze the ptrace-stop so the tracee can be operated on.
+ t.tg.pidns.owner.mu.RLock()
+ if target.Tracer() != t {
+ t.tg.pidns.owner.mu.RUnlock()
+ return syserror.ESRCH
+ }
+ if !target.ptraceFreeze() {
+ t.tg.pidns.owner.mu.RUnlock()
+ // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE,
+ // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the
+ // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." -
+ // ptrace(2)
+ return syserror.ESRCH
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+ // Even if the target has a ptrace-stop active, the tracee's task goroutine
+ // may not yet have reached Task.doStop; wait for it to do so. This is safe
+ // because there's no way for target to initiate a ptrace-stop and then
+ // block (by calling Task.block) before entering it.
+ //
+ // Caveat: If tasks were just restored, the tracee's first call to
+ // Task.Activate (in Task.run) occurs before its first call to Task.doStop,
+ // which may block if the tracer's address space is active.
+ t.UninterruptibleSleepStart(true)
+ target.waitGoroutineStoppedOrExited()
+ t.UninterruptibleSleepFinish(true)
+
+ // Resuming commands end the ptrace stop, but only if successful.
+ switch req {
+ case syscall.PTRACE_DETACH:
+ if err := t.ptraceDetach(target, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+ case syscall.PTRACE_CONT:
+ if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+ case syscall.PTRACE_SYSCALL:
+ if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+ case syscall.PTRACE_SINGLESTEP:
+ if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+ case syscall.PTRACE_SYSEMU:
+ if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+ case syscall.PTRACE_SYSEMU_SINGLESTEP:
+ if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil {
+ target.ptraceUnfreeze()
+ return err
+ }
+ return nil
+ }
+ // All other ptrace requests expect us to unfreeze the stop.
+ defer target.ptraceUnfreeze()
+
+ switch req {
+ case syscall.PTRACE_PEEKTEXT, syscall.PTRACE_PEEKDATA:
+ // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and
+ // PTRACE_PEEKUSER requests have a different API: they store the result
+ // at the address specified by the data parameter, and the return value
+ // is the error flag." - ptrace(2)
+ word := t.Arch().Native(0)
+ if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{
+ IgnorePermissions: true,
+ }); err != nil {
+ return err
+ }
+ _, err := t.CopyOut(data, word)
+ return err
+
+ case syscall.PTRACE_POKETEXT, syscall.PTRACE_POKEDATA:
+ _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{
+ IgnorePermissions: true,
+ })
+ return err
+
+ case syscall.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER
+ n, err := target.Arch().PtracePeekUser(uintptr(addr))
+ if err != nil {
+ return err
+ }
+ _, err = t.CopyOut(data, n)
+ return err
+
+ case syscall.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
+ return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data))
+
+ case syscall.PTRACE_GETREGS:
+ // "Copy the tracee's general-purpose ... registers ... to the address
+ // data in the tracer. ... (addr is ignored.) Note that SPARC systems
+ // have the meaning of data and addr reversed ..."
+ _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ case syscall.PTRACE_GETFPREGS:
+ _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ case syscall.PTRACE_GETREGSET:
+ // "Read the tracee's registers. addr specifies, in an
+ // architecture-dependent way, the type of registers to be read. ...
+ // data points to a struct iovec, which describes the destination
+ // buffer's location and length. On return, the kernel modifies iov.len
+ // to indicate the actual number of bytes returned." - ptrace(2)
+ ars, err := t.CopyInIovecs(data, 1)
+ if err != nil {
+ return err
+ }
+ ar := ars.Head()
+ n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: ar.Start,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }, int(ar.Length()))
+ if err != nil {
+ return err
+ }
+ ar.End -= usermem.Addr(n)
+ return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
+
+ case syscall.PTRACE_SETREGS:
+ _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ case syscall.PTRACE_SETFPREGS:
+ _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: data,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ })
+ return err
+
+ case syscall.PTRACE_SETREGSET:
+ ars, err := t.CopyInIovecs(data, 1)
+ if err != nil {
+ return err
+ }
+ ar := ars.Head()
+ n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{
+ Ctx: t,
+ IO: t.MemoryManager(),
+ Addr: ar.Start,
+ Opts: usermem.IOOpts{
+ AddressSpaceActive: true,
+ },
+ }, int(ar.Length()))
+ if err != nil {
+ return err
+ }
+ ar.End -= usermem.Addr(n)
+ return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))
+
+ case syscall.PTRACE_GETSIGINFO:
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if target.ptraceSiginfo == nil {
+ return syserror.EINVAL
+ }
+ _, err := t.CopyOut(data, target.ptraceSiginfo)
+ return err
+
+ case syscall.PTRACE_SETSIGINFO:
+ var info arch.SignalInfo
+ if _, err := t.CopyIn(data, &info); err != nil {
+ return err
+ }
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if target.ptraceSiginfo == nil {
+ return syserror.EINVAL
+ }
+ target.ptraceSiginfo = &info
+ return nil
+
+ case PTRACE_GETSIGMASK:
+ if addr != linux.SignalSetSize {
+ return syserror.EINVAL
+ }
+ target.mu.Lock()
+ defer target.mu.Unlock()
+ _, err := t.CopyOut(data, target.tr.SignalMask)
+ return err
+
+ case PTRACE_SETSIGMASK:
+ if addr != linux.SignalSetSize {
+ return syserror.EINVAL
+ }
+ var mask linux.SignalSet
+ if _, err := t.CopyIn(data, &mask); err != nil {
+ return err
+ }
+ // The target's task goroutine is stopped, so this is safe:
+ target.SetSignalMask(mask &^ UnblockableSignals)
+ return nil
+
+ case syscall.PTRACE_SETOPTIONS:
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ validOpts := uintptr(_PTRACE_O_EXITKILL | syscall.PTRACE_O_TRACESYSGOOD | syscall.PTRACE_O_TRACECLONE |
+ syscall.PTRACE_O_TRACEEXEC | syscall.PTRACE_O_TRACEEXIT | syscall.PTRACE_O_TRACEFORK |
+ _PTRACE_O_TRACESECCOMP | syscall.PTRACE_O_TRACEVFORK | syscall.PTRACE_O_TRACEVFORKDONE)
+ if uintptr(data)&^validOpts != 0 {
+ return syserror.EINVAL
+ }
+ target.ptraceOpts = ptraceOptions{
+ ExitKill: data&_PTRACE_O_EXITKILL != 0,
+ SysGood: data&syscall.PTRACE_O_TRACESYSGOOD != 0,
+ TraceClone: data&syscall.PTRACE_O_TRACECLONE != 0,
+ TraceExec: data&syscall.PTRACE_O_TRACEEXEC != 0,
+ TraceExit: data&syscall.PTRACE_O_TRACEEXIT != 0,
+ TraceFork: data&syscall.PTRACE_O_TRACEFORK != 0,
+ TraceSeccomp: data&_PTRACE_O_TRACESECCOMP != 0,
+ TraceVfork: data&syscall.PTRACE_O_TRACEVFORK != 0,
+ TraceVforkDone: data&syscall.PTRACE_O_TRACEVFORKDONE != 0,
+ }
+ return nil
+
+ case syscall.PTRACE_GETEVENTMSG:
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg)
+ return err
+
+ default:
+ // PEEKSIGINFO is unimplemented but seems to have no users anywhere.
+ return syserror.EIO
+ }
+}
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
new file mode 100644
index 000000000..635372993
--- /dev/null
+++ b/pkg/sentry/kernel/rseq.go
@@ -0,0 +1,118 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Restartable sequences, as described in https://lwn.net/Articles/650333/.
+
+// RSEQCriticalRegion describes a restartable sequence critical region.
+type RSEQCriticalRegion struct {
+ // When a task in this thread group has its CPU preempted (as defined by
+ // platform.ErrContextCPUPreempted) or has a signal delivered to an
+ // application handler while its instruction pointer is in CriticalSection,
+ // set the instruction pointer to Restart and application register r10 (on
+ // amd64) to the former instruction pointer.
+ //
+ // SetRSEQCriticalRegion treats CriticalSection.Start == 0 as "disabled"
+ // and stores an all-zero region in that case.
+ CriticalSection usermem.AddrRange
+ Restart usermem.Addr
+}
+
+// RSEQAvailable reports whether t supports restartable sequences: the kernel
+// must be using host cores and the platform must detect CPU preemption.
+func (t *Task) RSEQAvailable() bool {
+ if !t.k.useHostCores {
+ return false
+ }
+ return t.k.Platform.DetectsCPUPreemption()
+}
+
+// RSEQCriticalRegion returns a copy of the restartable sequence currently
+// stored for t's thread group.
+func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion {
+ current := t.tg.rscr.Load().(*RSEQCriticalRegion)
+ return *current
+}
+
+// SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
+//
+// A region whose CriticalSection.Start is 0 disables the critical region and
+// is stored with all fields zeroed. Otherwise EINVAL is returned if the
+// critical section is empty or inverted, or if Restart lies inside it.
+//
+// Preconditions: t.RSEQAvailable() == true.
+func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error {
+ // These checks are somewhat more lenient than in Linux, which (bizarrely)
+ // requires rscr.CriticalSection to be non-empty and rscr.Restart to be
+ // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0
+ // (which disables the critical region).
+ switch {
+ case rscr.CriticalSection.Start == 0:
+ // Disabling: normalize the stored value to the zero region.
+ rscr.CriticalSection.End = 0
+ rscr.Restart = 0
+ case rscr.CriticalSection.Start >= rscr.CriticalSection.End:
+ return syserror.EINVAL
+ case rscr.CriticalSection.Contains(rscr.Restart):
+ return syserror.EINVAL
+ }
+ // TODO: check that rscr.CriticalSection and rscr.Restart are in
+ // the application address range, for consistency with Linux
+ t.tg.rscr.Store(&rscr)
+ return nil
+}
+
+// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU
+// number.
+//
+// SetRSEQCPUAddr stores 0 here when no CPU number is being copied out, so a
+// zero result presumably means the feature is disabled for t.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) RSEQCPUAddr() usermem.Addr {
+ return t.rseqCPUAddr
+}
+
+// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU
+// number. For a non-zero addr the current CPU number is copied out
+// immediately; if that fails, the address is reset to 0 and EINVAL is
+// returned.
+//
+// Preconditions: t.RSEQAvailable() == true. The caller must be running on the
+// task goroutine. t's AddressSpace must be active.
+func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error {
+ t.rseqCPUAddr = addr
+ if addr == 0 {
+ // Nothing to keep updated; forget the last CPU number written.
+ t.rseqCPU = -1
+ return nil
+ }
+ if err := t.rseqCopyOutCPU(); err != nil {
+ t.rseqCPUAddr = 0
+ t.rseqCPU = -1
+ return syserror.EINVAL // yes, EINVAL, not err or EFAULT
+ }
+ return nil
+}
+
+// rseqCopyOutCPU records the current host CPU number in t.rseqCPU and writes
+// it as a 32-bit value to t.rseqCPUAddr.
+//
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqCopyOutCPU() error {
+ cpu := int32(hostcpu.GetCPU())
+ t.rseqCPU = cpu
+ scratch := t.CopyScratchBuffer(4)
+ usermem.ByteOrder.PutUint32(scratch, uint32(cpu))
+ if _, err := t.CopyOutBytes(t.rseqCPUAddr, scratch); err != nil {
+ return err
+ }
+ return nil
+}
+
+// rseqInterrupt redirects t to its thread group's restart address if t's
+// instruction pointer is currently inside the RSEQ critical section, and
+// records the interrupted instruction pointer in t's architecture state.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) rseqInterrupt() {
+ region := t.tg.rscr.Load().(*RSEQCriticalRegion)
+ ip := t.Arch().IP()
+ if !region.CriticalSection.Contains(usermem.Addr(ip)) {
+ return
+ }
+ t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, region.Restart)
+ t.Arch().SetIP(uintptr(region.Restart))
+ t.Arch().SetRSEQInterruptedIP(ip)
+}
diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD
new file mode 100644
index 000000000..b533c51c4
--- /dev/null
+++ b/pkg/sentry/kernel/sched/BUILD
@@ -0,0 +1,20 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+# Go library for package sched, built from cpuset.go and sched.go and visible
+# only to //pkg/sentry:internal.
+go_library(
+ name = "sched",
+ srcs = [
+ "cpuset.go",
+ "sched.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched",
+ visibility = ["//pkg/sentry:internal"],
+)
+
+# Small unit-test target running cpuset_test.go with the :sched library
+# embedded.
+go_test(
+ name = "sched_test",
+ size = "small",
+ srcs = ["cpuset_test.go"],
+ embed = [":sched"],
+)
diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go
new file mode 100644
index 000000000..0a97603f0
--- /dev/null
+++ b/pkg/sentry/kernel/sched/cpuset.go
@@ -0,0 +1,105 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sched
+
+import "math/bits"
+
+// Sizing constants used to lay out a CPUSet.
+const (
+ // bitsPerByte is the number of CPUs represented by each byte of a CPUSet.
+ bitsPerByte = 8
+ // bytesPerLong is the granularity, in bytes, to which CPUSetSize rounds.
+ bytesPerLong = 8 // only for 64-bit architectures
+)
+
+// CPUSet contains a bitmap to record CPU information. CPU n is represented by
+// bit n%8 of byte n/8.
+//
+// Note that this definition is only correct for little-endian architectures,
+// since Linux's cpumask_t uses unsigned long.
+type CPUSet []byte
+
+// CPUSetSize returns the number of bytes needed for a CPUSet holding num cpus,
+// rounded up to a whole number of 8-byte longs.
+func CPUSetSize(num uint) uint {
+ // NOTE: Applications may expect that the size of a CPUSet in
+ // bytes is always a multiple of sizeof(unsigned long), since this is true
+ // in Linux. Thus we always round up.
+ ceilDiv := func(value, unit uint) uint {
+ return (value + unit - 1) / unit
+ }
+ byteCount := ceilDiv(num, bitsPerByte)
+ longCount := ceilDiv(byteCount, bytesPerLong)
+ return longCount * bytesPerLong
+}
+
+// NewCPUSet allocates a CPUSet large enough for num CPUs with every bit
+// cleared, i.e. containing no CPUs.
+func NewCPUSet(num uint) CPUSet {
+ size := CPUSetSize(num)
+ return make(CPUSet, size)
+}
+
+// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which
+// are present in the set.
+func NewFullCPUSet(num uint) CPUSet {
+ c := NewCPUSet(num)
+ var i uint
+ for ; i < num/bitsPerByte; i++ {
+ c[i] = 0xff
+ }
+ if rem := num % bitsPerByte; rem != 0 {
+ c[i] = (1 << rem) - 1
+ }
+ return c
+}
+
+// Size returns the size of 'c' in bytes (not the number of CPUs it can hold,
+// which is eight times larger).
+func (c CPUSet) Size() uint {
+ return uint(len(c))
+}
+
+// NumCPUs returns the number of CPUs present in the set, i.e. the total count
+// of set bits.
+func (c CPUSet) NumCPUs() uint {
+ var total uint
+ for idx := range c {
+ total += uint(bits.OnesCount8(c[idx]))
+ }
+ return total
+}
+
+// Copy returns a copy of the CPUSet that does not share storage with c.
+//
+// The copy is built by appending to a nil CPUSet, so copying an empty set
+// yields a nil CPUSet.
+func (c CPUSet) Copy() CPUSet {
+ return append(CPUSet(nil), c...)
+}
+
+// Set adds cpu to the set by setting its bit. No bounds check is made here:
+// cpu must index a byte within the set.
+func (c *CPUSet) Set(cpu uint) {
+ byteIdx := cpu / bitsPerByte
+ mask := byte(1) << (cpu % bitsPerByte)
+ (*c)[byteIdx] |= mask
+}
+
+// ClearAbove removes cpu and every higher-numbered CPU from the set. It is a
+// no-op if cpu lies beyond the end of the set.
+func (c *CPUSet) ClearAbove(cpu uint) {
+ set := *c
+ size := uint(len(set))
+ first := cpu / bitsPerByte
+ if first >= size {
+ return
+ }
+ // Within the byte containing cpu, keep only the bits below cpu.
+ set[first] &^= 0xff << (cpu % bitsPerByte)
+ // Every later byte is entirely above cpu.
+ for idx := first + 1; idx < size; idx++ {
+ set[idx] = 0
+ }
+}
+
+// ForEachCPU calls fn once for each CPU present in the set, in ascending
+// order of CPU index.
+func (c CPUSet) ForEachCPU(fn func(uint)) {
+ for byteIdx := range c {
+ base := uint(byteIdx) * bitsPerByte
+ for bit := uint(0); bit < bitsPerByte; bit++ {
+ // Re-read the byte on every step so that changes made by fn to the
+ // shared backing array are observed.
+ if c[byteIdx]&(1<<bit) != 0 {
+ fn(base + bit)
+ }
+ }
+ }
+}
diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go
new file mode 100644
index 000000000..8a6e12958
--- /dev/null
+++ b/pkg/sentry/kernel/sched/cpuset_test.go
@@ -0,0 +1,44 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sched
+
+import (
+ "testing"
+)
+
+// TestNumCPUs checks that NumCPUs reports exactly the number of bits set, for
+// every set size from 0 up to 1023 CPUs.
+func TestNumCPUs(t *testing.T) {
+ for want := uint(0); want < 1024; want++ {
+ set := NewCPUSet(want)
+ for cpu := uint(0); cpu < want; cpu++ {
+ set.Set(cpu)
+ }
+ if got := set.NumCPUs(); got != want {
+ t.Errorf("got wrong number of cpus %d, want %d", got, want)
+ }
+ }
+}
+
+// TestClearAbove starts from a full 1024-CPU set and repeatedly clears from a
+// decreasing CPU index, checking that exactly that many CPUs remain each time.
+func TestClearAbove(t *testing.T) {
+ const n = 1024
+ set := NewFullCPUSet(n)
+ for want := uint(n); want > 0; want-- {
+ set.ClearAbove(want)
+ if got := set.NumCPUs(); got != want {
+ t.Errorf("iteration %d: got %d cpus, wanted %d", n-want, got, want)
+ }
+ }
+}
diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go
new file mode 100644
index 000000000..f1de1da60
--- /dev/null
+++ b/pkg/sentry/kernel/sched/sched.go
@@ -0,0 +1,16 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sched implements scheduler related features.
+package sched
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
new file mode 100644
index 000000000..b7c4a507f
--- /dev/null
+++ b/pkg/sentry/kernel/seccomp.go
@@ -0,0 +1,205 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// maxSyscallFilterInstructions is the limit that AppendSyscallFilter enforces
+// on the combined length of a task's syscall filters (including the
+// per-filter penalty).
+const maxSyscallFilterInstructions = 1 << 15
+
+// seccompResult is the outcome of checkSeccompSyscall, telling the caller how
+// to proceed with the system call.
+type seccompResult int
+
+const (
+ // seccompResultDeny indicates that a syscall should not be executed.
+ seccompResultDeny seccompResult = iota
+
+ // seccompResultAllow indicates that a syscall should be executed.
+ seccompResultAllow
+
+ // seccompResultKill indicates that the task should be killed immediately,
+ // with the exit status indicating that the task was killed by SIGSYS.
+ seccompResultKill
+
+ // seccompResultTrace indicates that a ptracer was successfully notified as
+ // a result of a SECCOMP_RET_TRACE.
+ seccompResultTrace
+)
+
+// seccompData is equivalent to struct seccomp_data, which contains the data
+// passed to seccomp-bpf filters.
+//
+// asBPFInput marshals this struct directly into the filter's input bytes, so
+// the order and sizes of the fields are significant.
+type seccompData struct {
+ // nr is the system call number.
+ nr int32
+
+ // arch is an AUDIT_ARCH_* value indicating the system call convention.
+ arch uint32
+
+ // instructionPointer is the value of the instruction pointer at the time
+ // of the system call.
+ instructionPointer uint64
+
+ // args contains the first 6 system call arguments.
+ args [6]uint64
+}
+
+// asBPFInput marshals d in usermem.ByteOrder and wraps the resulting bytes as
+// the input for a BPF program.
+func (d *seccompData) asBPFInput() bpf.Input {
+ raw := binary.Marshal(nil, usermem.ByteOrder, d)
+ return bpf.InputBytes{raw, usermem.ByteOrder}
+}
+
+// seccompSiginfo builds the SIGSYS siginfo (code SYS_SECCOMP) for a syscall
+// sysno made by t at instruction pointer ip, carrying errno as si_errno.
+func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo {
+ info := new(arch.SignalInfo)
+ info.Signo = int32(linux.SIGSYS)
+ info.Errno = errno
+ info.Code = arch.SYS_SECCOMP
+ info.SetCallAddr(uint64(ip))
+ info.SetSyscall(sysno)
+ info.SetArch(t.SyscallTable().AuditNumber)
+ return info
+}
+
+// checkSeccompSyscall runs t's seccomp filters for syscall sysno issued at
+// instruction pointer ip, performs the side effect required by the resulting
+// action (signal, errno return value or tracer notification) and reports how
+// the caller should proceed. (sysno and ip must be passed in because vsyscalls
+// do not use the values in t.Arch().)
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) seccompResult {
+ ret := t.evaluateSyscallFilters(sysno, args, ip)
+ retData := ret & linux.SECCOMP_RET_DATA
+ switch ret & linux.SECCOMP_RET_ACTION {
+ case linux.SECCOMP_RET_ALLOW:
+ // "Results in the system call being executed."
+ return seccompResultAllow
+
+ case linux.SECCOMP_RET_ERRNO:
+ // "Results in the lower 16-bits of the return value being passed to
+ // userland as the errno without executing the system call."
+ t.Arch().SetReturn(-uintptr(retData))
+ return seccompResultDeny
+
+ case linux.SECCOMP_RET_TRAP:
+ // "Results in the kernel sending a SIGSYS signal to the triggering
+ // task without executing the system call. ... The SECCOMP_RET_DATA
+ // portion of the return value will be passed as si_errno." -
+ // Documentation/prctl/seccomp_filter.txt
+ t.SendSignal(seccompSiginfo(t, int32(retData), sysno, ip))
+ return seccompResultDeny
+
+ case linux.SECCOMP_RET_TRACE:
+ // "When returned, this value will cause the kernel to attempt to
+ // notify a ptrace()-based tracer prior to executing the system call.
+ // If there is no tracer present, -ENOSYS is returned to userland and
+ // the system call is not executed."
+ if !t.ptraceSeccomp(uint16(retData)) {
+ // This useless-looking temporary is needed because Go.
+ enosys := uintptr(syscall.ENOSYS)
+ t.Arch().SetReturn(-enosys)
+ return seccompResultDeny
+ }
+ return seccompResultTrace
+
+ default:
+ // SECCOMP_RET_KILL: "Results in the task exiting immediately without
+ // executing the system call. The exit status of the task will be
+ // SIGSYS, not SIGKILL." Unrecognized actions are treated the same way,
+ // consistent with Linux.
+ return seccompResultKill
+ }
+}
+
+func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
+ data := seccompData{
+ nr: sysno,
+ arch: t.tc.st.AuditNumber,
+ instructionPointer: uint64(ip),
+ }
+ // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
+ // we can't do any slicing tricks or even use copy/append here.
+ for i, arg := range args {
+ if i >= len(data.args) {
+ break
+ }
+ data.args[i] = arg.Uint64()
+ }
+ input := data.asBPFInput()
+
+ ret := uint32(linux.SECCOMP_RET_ALLOW)
+ // "Every filter successfully installed will be evaluated (in reverse
+ // order) for each system call the task makes." - kernel/seccomp.c
+ for i := len(t.syscallFilters) - 1; i >= 0; i-- {
+ thisRet, err := bpf.Exec(t.syscallFilters[i], input)
+ if err != nil {
+ t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
+ thisRet = linux.SECCOMP_RET_KILL
+ }
+ // "If multiple filters exist, the return value for the evaluation of a
+ // given system call will always use the highest precedent value." -
+ // Documentation/prctl/seccomp_filter.txt
+ //
+ // (Note that this contradicts prctl(2): "If the filters permit prctl()
+ // calls, then additional filters can be added; they are run in order
+ // until the first non-allow result is seen." prctl(2) is incorrect.)
+ //
+ // "The ordering ensures that a min_t() over composed return values
+ // always selects the least permissive choice." -
+ // include/uapi/linux/seccomp.h
+ if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) {
+ ret = thisRet
+ }
+ }
+
+ return ret
+}
+
+// AppendSyscallFilter installs BPF program p as an additional system call
+// filter for t. It returns ENOMEM if doing so would exceed
+// maxSyscallFilterInstructions.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) AppendSyscallFilter(p bpf.Program) error {
+ // Cap the combined length of all syscall filters (plus a penalty of 4
+ // instructions per filter beyond the first) to
+ // maxSyscallFilterInstructions. (This restriction is inherited from
+ // Linux.)
+ combined := p.Length()
+ for i := range t.syscallFilters {
+ combined += t.syscallFilters[i].Length() + 4
+ }
+ if combined > maxSyscallFilterInstructions {
+ return syserror.ENOMEM
+ }
+ // The filter list is also read by SeccompMode under t.mu, so the update is
+ // made with t.mu held.
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.syscallFilters = append(t.syscallFilters, p)
+ return nil
+}
+
+// SeccompMode returns the SECCOMP_MODE_* constant describing the task's
+// current seccomp syscall filtering mode, appropriate for both
+// prctl(PR_GET_SECCOMP) and /proc/[pid]/status: SECCOMP_MODE_FILTER if any
+// filter is installed, SECCOMP_MODE_NONE otherwise.
+func (t *Task) SeccompMode() int {
+ t.mu.Lock()
+ hasFilters := len(t.syscallFilters) > 0
+ t.mu.Unlock()
+ if !hasFilters {
+ return linux.SECCOMP_MODE_NONE
+ }
+ return linux.SECCOMP_MODE_FILTER
+}
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
new file mode 100644
index 000000000..1656ad126
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -0,0 +1,62 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+# Generates waiter_list.go by instantiating the generic list template from
+# //pkg/ilist with "waiter"-prefixed names and *waiter as the Linker type.
+go_template_instance(
+ name = "waiter_list",
+ out = "waiter_list.go",
+ package = "semaphore",
+ prefix = "waiter",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Linker": "*waiter",
+ },
+)
+
+# Generates semaphore_autogen_state.go from semaphore.go and the generated
+# waiter_list.go using the go_stateify rule.
+go_stateify(
+ name = "semaphore_state",
+ srcs = [
+ "semaphore.go",
+ "waiter_list.go",
+ ],
+ out = "semaphore_autogen_state.go",
+ package = "semaphore",
+)
+
+# Go library for package semaphore: the hand-written source plus the two
+# generated files above. Visible only to //pkg/sentry:internal.
+go_library(
+ name = "semaphore",
+ srcs = [
+ "semaphore.go",
+ "semaphore_autogen_state.go",
+ "waiter_list.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/log",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
+ "//pkg/state",
+ "//pkg/state/statefile",
+ "//pkg/syserror",
+ ],
+)
+
+# Small unit-test target running semaphore_test.go with the :semaphore library
+# embedded.
+go_test(
+ name = "semaphore_test",
+ size = "small",
+ srcs = ["semaphore_test.go"],
+ embed = [":semaphore"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
new file mode 100644
index 000000000..19ad5d537
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -0,0 +1,473 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package semaphore implements System V semaphores.
+package semaphore
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+const (
+ // valueMax is the largest value a single semaphore may hold (SEMVMX).
+ valueMax = 32767 // SEMVMX
+
+ // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
+ semaphoresMax = 32000
+
+ // setsMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
+ setsMax = 32000
+
+ // semaphoresTotalMax is "system-wide limit on the number of semaphores"
+ // (SEMMNS = SEMMNI*SEMMSL).
+ semaphoresTotalMax = 1024000000
+)
+
+// Registry maintains a set of semaphores that can be found by key or ID.
+type Registry struct {
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+ // semaphores maps a set ID to its semaphore set.
+ semaphores map[int32]*Set
+ // lastIDUsed is the ID most recently handed out; the search for the next
+ // free ID starts right after it.
+ lastIDUsed int32
+}
+
+// Set represents a set of semaphores that can be operated atomically.
+type Set struct {
+ // ID is a handle that identifies the set.
+ ID int32
+
+ // key is a user-provided key that can be shared between processes.
+ key int32
+
+ // creator is the user that created the set. Immutable.
+ creator fs.FileOwner
+
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+ // owner is the current owner of the set; it can be replaced via Change.
+ owner fs.FileOwner
+ // perms are the access permissions checked against callers.
+ perms fs.FilePermissions
+ // opTime is updated when a batch of operations is applied.
+ opTime ktime.Time
+ // changeTime is updated when the set is created, changed, or a value is
+ // overridden with SetVal.
+ changeTime ktime.Time
+ // sems holds the semaphores of the set, indexed by semaphore number.
+ sems []sem
+
+ // dead is set to true when the set is removed and can't be reached anymore.
+ // All waiters must wake up and fail when set is dead.
+ dead bool
+}
+
+// sem represents a single semaphore from a set.
+type sem struct {
+ // value is the current value of the semaphore.
+ value int16
+ // waiters lists the callers blocked on this semaphore.
+ waiters waiterList `state:"zerovalue"`
+}
+
+// waiter represents a caller that is waiting for the semaphore value to
+// become positive or zero.
+type waiter struct {
+ waiterEntry
+
+ // value represents how much resource the waiter needs to wake up.
+ // NOTE(review): executeOps stores the raw SemOp here (zero or negative);
+ // confirm the sign convention expected by wakeWaiters.
+ value int16
+ // ch is signalled to wake the waiter up; it is created with a buffer of
+ // one.
+ ch chan struct{}
+}
+
+// NewRegistry returns an empty Registry that is ready to hold semaphore sets.
+func NewRegistry() *Registry {
+	reg := &Registry{}
+	reg.semaphores = make(map[int32]*Set)
+	return reg
+}
+
+// FindOrCreate searches for a semaphore set that matches 'key'. If not found,
+// it may create a new one if requested.
+//
+//   - If private is true, key is ignored and a new set is always created.
+//   - If create is false, it fails with ENOENT if a set cannot be found.
+//   - If create and exclusive are both true, it fails with EEXIST if a set
+//     with the same key already exists.
+//
+// nsems is the number of semaphores requested. It must be within
+// [0, semaphoresMax]; zero is only accepted when an existing set is returned,
+// and it may not exceed the size of an existing set (EINVAL otherwise).
+//
+// Creating a set that would exceed the system-wide limits (setsMax sets or
+// semaphoresTotalMax semaphores) fails with ENOSPC, as semget(2) specifies.
+func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
+	if nsems < 0 || nsems > semaphoresMax {
+		return nil, syserror.EINVAL
+	}
+
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if !private {
+		// Look up an existing semaphore.
+		if set := r.findByKey(key); set != nil {
+			// Check that caller can access semaphore set. owner and perms
+			// are guarded by set.mu, so hold it for the duration of the
+			// check (lock order: Registry.mu, then Set.mu).
+			creds := auth.CredentialsFromContext(ctx)
+			set.mu.Lock()
+			allowed := set.checkPerms(creds, fs.PermsFromMode(mode))
+			set.mu.Unlock()
+			if !allowed {
+				return nil, syserror.EACCES
+			}
+
+			// Validate parameters.
+			if nsems > int32(set.size()) {
+				return nil, syserror.EINVAL
+			}
+			if create && exclusive {
+				return nil, syserror.EEXIST
+			}
+			return set, nil
+		}
+
+		if !create {
+			// Semaphore not found and should not be created.
+			return nil, syserror.ENOENT
+		}
+	}
+
+	// Zero is only valid if an existing set is found.
+	if nsems == 0 {
+		return nil, syserror.EINVAL
+	}
+
+	// Apply system limits. semget(2): "ENOSPC A semaphore set has to be
+	// created but the system limit for the maximum number of semaphore sets
+	// (SEMMNI), or the system wide maximum number of semaphores (SEMMNS),
+	// would be exceeded."
+	if len(r.semaphores) >= setsMax {
+		return nil, syserror.ENOSPC
+	}
+	if r.totalSems() > int(semaphoresTotalMax-nsems) {
+		return nil, syserror.ENOSPC
+	}
+
+	// Finally create a new set. The caller is both owner and creator.
+	owner := fs.FileOwnerFromContext(ctx)
+	perms := fs.FilePermsFromMode(mode)
+	return r.newSet(ctx, key, owner, owner, perms, nsems)
+}
+
+// RemoveID removes the set with the given 'id' from the registry and marks
+// it as dead. All waiters will be awakened and fail.
+//
+// It returns EINVAL if no set has that ID and EACCES if the caller is not
+// allowed to remove it.
+func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	target := r.semaphores[id]
+	if target == nil {
+		return syserror.EINVAL
+	}
+
+	// "The effective user ID of the calling process must match the creator or
+	// owner of the semaphore set, or the caller must be privileged."
+	permitted := target.checkCredentials(creds) || target.checkCapability(creds)
+	if !permitted {
+		return syserror.EACCES
+	}
+
+	delete(r.semaphores, target.ID)
+	target.destroy()
+	return nil
+}
+
+// newSet creates a set with nsems semaphores, registers it under the next
+// free ID and returns it. The set's change time is initialised from ctx.
+//
+// It returns ENOMEM if no free ID could be found.
+//
+// Precondition: r.mu must be held.
+func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
+	set := &Set{
+		key:        key,
+		owner:      owner,
+		creator:    creator, // previously the creator argument was ignored
+		perms:      perms,
+		changeTime: ktime.NowFromContext(ctx),
+		sems:       make([]sem, nsems),
+	}
+
+	// Find the next available ID.
+	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
+		// Handle wrap around.
+		if id < 0 {
+			id = 0
+			continue
+		}
+		if r.semaphores[id] == nil {
+			r.lastIDUsed = id
+			r.semaphores[id] = set
+			set.ID = id
+			return set, nil
+		}
+	}
+
+	log.Warningf("Semaphore map is full, they must be leaking")
+	return nil, syserror.ENOMEM
+}
+
+// FindByID returns the set registered under 'id', or nil if there is none.
+func (r *Registry) FindByID(id int32) *Set {
+	r.mu.Lock()
+	found := r.semaphores[id]
+	r.mu.Unlock()
+	return found
+}
+
+// findByKey scans the registry for a set created with 'key' and returns it,
+// or nil if no set matches.
+func (r *Registry) findByKey(key int32) *Set {
+	for _, candidate := range r.semaphores {
+		if candidate.key != key {
+			continue
+		}
+		return candidate
+	}
+	return nil
+}
+
+func (r *Registry) totalSems() int {
+ totalSems := 0
+ for _, v := range r.semaphores {
+ totalSems += v.size()
+ }
+ return totalSems
+}
+
+// findSem returns a pointer to semaphore number 'num' of the set, or nil if
+// 'num' is out of range.
+func (s *Set) findSem(num int32) *sem {
+	if num >= 0 && int(num) < s.size() {
+		return &s.sems[num]
+	}
+	return nil
+}
+
+// size returns the number of semaphores in the set.
+func (s *Set) size() int {
+	n := len(s.sems)
+	return n
+}
+
+// Change atomically replaces the owner and permissions of the set and
+// records the change time. It returns EACCES if the caller is not allowed to
+// modify the set.
+func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The effective UID of the calling process must match the owner or creator
+	// of the semaphore set, or the caller must be privileged."
+	if authorized := s.checkCredentials(creds) || s.checkCapability(creds); !authorized {
+		return syserror.EACCES
+	}
+
+	s.owner, s.perms = owner, perms
+	s.changeTime = ktime.NowFromContext(ctx)
+	return nil
+}
+
+// SetVal overwrites the value of semaphore 'num' with 'val', waking up
+// waiters as needed.
+//
+// It returns ERANGE if 'val' is outside [0, valueMax] or 'num' does not name
+// a semaphore of the set, and EACCES if the caller lacks write permission.
+func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials) error {
+	if outOfRange := val < 0 || val > valueMax; outOfRange {
+		return syserror.ERANGE
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The calling process must have alter permission on the semaphore set."
+	alter := fs.PermMask{Write: true}
+	if !s.checkPerms(creds, alter) {
+		return syserror.EACCES
+	}
+
+	target := s.findSem(num)
+	if target == nil {
+		return syserror.ERANGE
+	}
+
+	// TODO: Clear undo entries in all processes
+	target.value = val
+	s.changeTime = ktime.NowFromContext(ctx)
+	target.wakeWaiters()
+	return nil
+}
+
+// GetVal returns the current value of semaphore 'num'.
+//
+// It returns EACCES if the caller lacks read permission and ERANGE if 'num'
+// does not name a semaphore of the set.
+func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The calling process must have read permission on the semaphore set."
+	read := fs.PermMask{Read: true}
+	if !s.checkPerms(creds, read) {
+		return 0, syserror.EACCES
+	}
+
+	if target := s.findSem(num); target != nil {
+		return target.value, nil
+	}
+	return 0, syserror.ERANGE
+}
+
+// ExecuteOps attempts to execute a list of operations on the set. It only
+// succeeds when all operations can be applied. No changes are made if it
+// fails.
+//
+// On failure, it may return an error (retries are hopeless) or it may return
+// a channel that can be waited on before attempting again, together with the
+// number of the semaphore the caller was queued on.
+//
+// Errors: EIDRM if the set was removed, EFBIG if an operation names a
+// semaphore outside the set, EACCES if the caller lacks the required
+// permission, plus whatever the operations themselves produce.
+func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials) (chan struct{}, int32, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Did it race with a removal operation?
+	if s.dead {
+		return nil, 0, syserror.EIDRM
+	}
+
+	// Validate the operations. A batch made only of wait-for-zero operations
+	// needs read permission; anything else needs write permission.
+	readOnly := true
+	for i := range ops {
+		if s.findSem(int32(ops[i].SemNum)) == nil {
+			return nil, 0, syserror.EFBIG
+		}
+		if ops[i].SemOp != 0 {
+			readOnly = false
+		}
+	}
+
+	required := fs.PermMask{Read: readOnly, Write: !readOnly}
+	if !s.checkPerms(creds, required) {
+		return nil, 0, syserror.EACCES
+	}
+
+	waitCh, semNum, err := s.executeOps(ctx, ops)
+	if err != nil {
+		return nil, 0, err
+	}
+	return waitCh, semNum, nil
+}
+
+// executeOps applies ops to the set as a single all-or-nothing unit.
+//
+// It returns:
+//   - (nil, 0, nil) when every operation was applied;
+//   - (ch, num, nil) when an operation cannot proceed yet: the caller has
+//     been queued on semaphore num and should wait on ch before retrying;
+//   - (nil, 0, err) when the batch cannot succeed: ErrWouldBlock if the
+//     blocking operation has IPC_NOWAIT set, ERANGE if an operation would move
+//     a value outside [0, valueMax].
+//
+// Callers are expected to hold s.mu and to have validated every op.SemNum
+// (ExecuteOps does both); semaphores are indexed here without a bounds check.
+func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{}, int32, error) {
+	// Changes to semaphores go to this slice temporarily until they all succeed.
+	tmpVals := make([]int16, len(s.sems))
+	for i := range s.sems {
+		tmpVals[i] = s.sems[i].value
+	}
+
+	for _, op := range ops {
+		sem := &s.sems[op.SemNum]
+		if op.SemOp == 0 {
+			// Handle 'wait for zero' operation.
+			if tmpVals[op.SemNum] != 0 {
+				// Semaphore isn't 0, must wait.
+				if op.SemFlg&linux.IPC_NOWAIT != 0 {
+					return nil, 0, syserror.ErrWouldBlock
+				}
+
+				w := newWaiter(op.SemOp)
+				sem.waiters.PushBack(w)
+				return w.ch, int32(op.SemNum), nil
+			}
+		} else {
+			if op.SemOp < 0 {
+				// Handle 'wait' operation. The amount is computed in int32:
+				// negating an int16 of -32768 overflows back to -32768, which
+				// would slip past both checks below and drive the semaphore
+				// negative.
+				need := -int32(op.SemOp)
+				if need > valueMax {
+					return nil, 0, syserror.ERANGE
+				}
+				if need > int32(tmpVals[op.SemNum]) {
+					// Not enough resources, must wait.
+					if op.SemFlg&linux.IPC_NOWAIT != 0 {
+						return nil, 0, syserror.ErrWouldBlock
+					}
+
+					w := newWaiter(op.SemOp)
+					sem.waiters.PushBack(w)
+					return w.ch, int32(op.SemNum), nil
+				}
+			} else {
+				// op.SemOp > 0: Handle 'signal' operation.
+				if tmpVals[op.SemNum] > valueMax-op.SemOp {
+					return nil, 0, syserror.ERANGE
+				}
+			}
+
+			tmpVals[op.SemNum] += op.SemOp
+		}
+	}
+
+	// All operations succeeded, apply them.
+	// TODO: handle undo operations.
+	for i, v := range tmpVals {
+		s.sems[i].value = v
+		s.sems[i].wakeWaiters()
+	}
+	s.opTime = ktime.NowFromContext(ctx)
+	return nil, 0, nil
+}
+
+// AbortWait tells the set that the waiter that was queued on semaphore 'num'
+// with channel 'ch' is giving up and will not wait on the channel anymore.
+func (s *Set) AbortWait(num int32, ch chan struct{}) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	queue := &s.sems[num].waiters
+	for w := queue.Front(); w != nil; w = w.Next() {
+		if w.ch != ch {
+			continue
+		}
+		queue.Remove(w)
+		return
+	}
+	// Waiter may not be found in case it raced with wakeWaiters().
+}
+
+// checkCredentials reports whether the caller's effective UID matches the
+// owner or the creator of the set.
+//
+// Only the UID is compared: "The effective user ID of the calling process
+// must match the creator or owner of the semaphore set, or the caller must be
+// privileged." Sharing a group with the owner or creator grants no right to
+// change or remove the set.
+func (s *Set) checkCredentials(creds *auth.Credentials) bool {
+	return s.owner.UID == creds.EffectiveKUID ||
+		s.creator.UID == creds.EffectiveKUID
+}
+
+// checkCapability reports whether the caller holds CAP_IPC_OWNER and the
+// owner's UID is mapped in the caller's user namespace.
+func (s *Set) checkCapability(creds *auth.Credentials) bool {
+	if !creds.HasCapability(linux.CAP_IPC_OWNER) {
+		return false
+	}
+	return creds.UserNamespace.MapFromKUID(s.owner.UID).Ok()
+}
+
+// checkPerms reports whether the caller may access the set with reqPerms.
+// The applicable permission class (user, group or other) is chosen from the
+// caller's credentials; if it does not cover reqPerms the capability check
+// decides.
+func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool {
+	// Are we owner, or in group, or other?
+	granted := s.perms.Other
+	switch {
+	case s.owner.UID == creds.EffectiveKUID:
+		granted = s.perms.User
+	case creds.InGroup(s.owner.GID):
+		granted = s.perms.Group
+	}
+
+	// Permissions satisfied directly, or else fall back to capabilities.
+	return granted.SupersetOf(reqPerms) || s.checkCapability(creds)
+}
+
+// destroy marks the set as dead, signals every queued waiter and empties the
+// waiter lists.
+func (s *Set) destroy() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Notify all waiters. They will fail on the next attempt to execute
+	// operations and return error.
+	s.dead = true
+	// Iterate by index: ranging by value would hand us a copy of each sem, and
+	// resetting the copy's waiter list would leave the real list untouched.
+	for i := range s.sems {
+		sem := &s.sems[i]
+		for w := sem.waiters.Front(); w != nil; w = w.Next() {
+			w.ch <- struct{}{}
+		}
+		sem.waiters.Reset()
+	}
+}
+
+// wakeWaiters goes over all waiters and checks which of them can be notified.
+// Notified waiters are signalled on their channel and removed from the list;
+// waiters that are still blocked stay queued.
+//
+// NOTE(review): executeOps queues waiters with the raw SemOp (zero or
+// negative) as their value, so the "still blocked" comparison below is
+// unlikely to ever hold; confirm the intended threshold.
+func (s *sem) wakeWaiters() {
+	// Note that this will release all waiters waiting for 0 too.
+	for w := s.waiters.Front(); w != nil; {
+		if s.value < w.value {
+			// Still blocked, skip it. The cursor must advance here, otherwise
+			// the loop would spin forever on this waiter.
+			w = w.Next()
+			continue
+		}
+		w.ch <- struct{}{}
+		// Advance before removing: Remove unlinks the entry.
+		old := w
+		w = w.Next()
+		s.waiters.Remove(old)
+	}
+}
+
+// newWaiter builds a waiter for 'val' with a wake-up channel buffered for a
+// single notification.
+func newWaiter(val int16) *waiter {
+	w := &waiter{value: val}
+	w.ch = make(chan struct{}, 1)
+	return w
+}
diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go
new file mode 100644
index 000000000..0386586ab
--- /dev/null
+++ b/pkg/sentry/kernel/semaphore/semaphore_test.go
@@ -0,0 +1,172 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package semaphore
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// executeOps runs ops against set and fails the test if the call errors or
+// if its blocking behaviour differs from 'block'. When block is true the
+// returned channel must be non-nil and not yet signalled; otherwise it must
+// be nil. The channel is returned to the caller.
+func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} {
+	ch, _, err := set.executeOps(ctx, ops)
+	if err != nil {
+		t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops)
+	}
+	switch {
+	case !block:
+		if ch != nil {
+			t.Fatalf("ExecuteOps(ops) got: %v, expected: nil, ops: %+v", ch, ops)
+		}
+	case ch == nil:
+		t.Fatalf("ExecuteOps(ops) got: nil, expected: !nil, ops: %+v", ops)
+	case signalled(ch):
+		t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops)
+	}
+	return ch
+}
+
+// signalled reports whether a notification is pending on ch, consuming it if
+// so. It never blocks.
+func signalled(ch chan struct{}) bool {
+	fired := false
+	select {
+	case <-ch:
+		fired = true
+	default:
+	}
+	return fired
+}
+
+// TestBasic checks signal and wait on a single semaphore: a wait that
+// cannot be satisfied blocks, and a later signal notifies the blocked waiter.
+func TestBasic(t *testing.T) {
+	ctx := contexttest.Context(t)
+	set := &Set{ID: 123, sems: make([]sem, 1)}
+	ops := []linux.Sembuf{
+		linux.Sembuf{SemOp: 1},
+	}
+	// 0 -> 1.
+	executeOps(ctx, t, set, ops, false)
+
+	// 1 -> 0.
+	ops[0].SemOp = -1
+	executeOps(ctx, t, set, ops, false)
+
+	// Nothing left to take: must block.
+	ops[0].SemOp = -1
+	ch1 := executeOps(ctx, t, set, ops, true)
+
+	// Signalling must wake the blocked waiter.
+	ops[0].SemOp = 1
+	executeOps(ctx, t, set, ops, false)
+	if !signalled(ch1) {
+		t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v", ops)
+	}
+}
+
+// TestWaitForZero checks 'wait for zero' operations: they pass while the
+// value is zero, block while it is not, and blocked callers are notified once
+// the value changes.
+func TestWaitForZero(t *testing.T) {
+ ctx := contexttest.Context(t)
+ set := &Set{ID: 123, sems: make([]sem, 1)}
+ ops := []linux.Sembuf{
+ linux.Sembuf{SemOp: 0},
+ }
+ // Value starts at zero, so waiting for zero does not block.
+ executeOps(ctx, t, set, ops, false)
+
+ // Taking 2 from an empty semaphore blocks.
+ ops[0].SemOp = -2
+ ch1 := executeOps(ctx, t, set, ops, true)
+
+ // Value is still zero.
+ ops[0].SemOp = 0
+ executeOps(ctx, t, set, ops, false)
+
+ // 0 -> 1.
+ ops[0].SemOp = 1
+ executeOps(ctx, t, set, ops, false)
+
+ // Value is now non-zero: both wait-for-zero callers block.
+ ops[0].SemOp = 0
+ chZero1 := executeOps(ctx, t, set, ops, true)
+
+ ops[0].SemOp = 0
+ chZero2 := executeOps(ctx, t, set, ops, true)
+
+ // 1 -> 2: by now the waiter that needs 2 must have been notified.
+ ops[0].SemOp = 1
+ executeOps(ctx, t, set, ops, false)
+ if !signalled(ch1) {
+ t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v, set: %+v", ops, set)
+ }
+
+ // 2 -> 0: both wait-for-zero callers must have been notified.
+ ops[0].SemOp = -2
+ executeOps(ctx, t, set, ops, false)
+ if !signalled(chZero1) {
+ t.Fatalf("ExecuteOps(ops) channel zero 1 should have been signalled, ops: %+v, set: %+v", ops, set)
+ }
+ if !signalled(chZero2) {
+ t.Fatalf("ExecuteOps(ops) channel zero 2 should have been signalled, ops: %+v, set: %+v", ops, set)
+ }
+}
+
+// TestNoWait checks that operations flagged IPC_NOWAIT fail with
+// ErrWouldBlock instead of queueing a waiter.
+func TestNoWait(t *testing.T) {
+ ctx := contexttest.Context(t)
+ set := &Set{ID: 123, sems: make([]sem, 1)}
+ ops := []linux.Sembuf{
+ linux.Sembuf{SemOp: 1},
+ }
+ // 0 -> 1.
+ executeOps(ctx, t, set, ops, false)
+
+ // Taking 2 from a value of 1 would block.
+ ops[0].SemOp = -2
+ ops[0].SemFlg = linux.IPC_NOWAIT
+ if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock {
+ t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock)
+ }
+
+ // Waiting for zero on a value of 1 would block too.
+ ops[0].SemOp = 0
+ ops[0].SemFlg = linux.IPC_NOWAIT
+ if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock {
+ t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock)
+ }
+}
+
+// TestUnregister checks that removing a set from the registry marks it dead,
+// makes it unreachable by ID and notifies every blocked waiter.
+func TestUnregister(t *testing.T) {
+	ctx := contexttest.Context(t)
+	r := NewRegistry()
+	// File modes are octal: 0600 is owner read/write (0x600 would carry no
+	// permission bits at all).
+	set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0600), true, true, true)
+	if err != nil {
+		t.Fatalf("FindOrCreate() failed, err: %v", err)
+	}
+	// Guard against nil so that a failed lookup is reported, not a panic.
+	if got := r.FindByID(set.ID); got == nil || got.ID != set.ID {
+		t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set)
+	}
+
+	// Queue five waiters on the (empty) first semaphore.
+	ops := []linux.Sembuf{
+		linux.Sembuf{SemOp: -1},
+	}
+	chs := make([]chan struct{}, 0, 5)
+	for i := 0; i < 5; i++ {
+		ch := executeOps(ctx, t, set, ops, true)
+		chs = append(chs, ch)
+	}
+
+	creds := auth.CredentialsFromContext(ctx)
+	if err := r.RemoveID(set.ID, creds); err != nil {
+		t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err)
+	}
+	if !set.dead {
+		t.Fatalf("set is not dead: %+v", set)
+	}
+	if got := r.FindByID(set.ID); got != nil {
+		t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got)
+	}
+	for i, ch := range chs {
+		if !signalled(ch) {
+			t.Fatalf("channel %d should have been signalled", i)
+		}
+	}
+}
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
new file mode 100644
index 000000000..53d8fb844
--- /dev/null
+++ b/pkg/sentry/kernel/sessions.go
@@ -0,0 +1,462 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SessionID is the public identifier of a Session. It is defined in terms of
+// ThreadID.
+type SessionID ThreadID
+
+// ProcessGroupID is the public identifier of a ProcessGroup. It is defined in
+// terms of ThreadID.
+type ProcessGroupID ThreadID
+
+// Session contains a leader threadgroup and a list of ProcessGroups.
+type Session struct {
+ // refs is the reference count; see incRef and decRef.
+ refs refs.AtomicRefCount
+
+ // leader is the originator of the Session.
+ //
+ // Note that this may no longer be running (and may be reaped), so the
+ // ID is cached upon initial creation. The leader is still required
+ // however, since its PIDNamespace defines the scope of the Session.
+ //
+ // The leader is immutable.
+ leader *ThreadGroup
+
+ // id is the cached identifier in the leader's namespace.
+ //
+ // The id is immutable.
+ id SessionID
+
+ // processGroups is a list of process groups in this Session. This is
+ // protected by TaskSet.mu.
+ processGroups processGroupList
+
+ // sessionEntry is the embed for TaskSet.sessions. This is protected by
+ // TaskSet.mu.
+ sessionEntry
+}
+
+// incRef takes an additional reference on the Session.
+func (s *Session) incRef() {
+	s.refs.IncRef()
+}
+
+// decRef releases one reference on the Session. When the destructor runs,
+// the Session's ID translations are removed from the leader's PID namespace
+// and all of its ancestors, and the Session is unlinked from the global list.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (s *Session) decRef() {
+	teardown := func() {
+		// Drop the translations, walking up from the leader's namespace.
+		ns := s.leader.pidns
+		for ns != nil {
+			sid := ns.sids[s]
+			delete(ns.sids, s)
+			delete(ns.sessions, sid)
+			ns = ns.parent
+		}
+
+		// Unlink from the list of global Sessions.
+		s.leader.pidns.owner.sessions.Remove(s)
+	}
+	s.refs.DecRefWithDestructor(teardown)
+}
+
+// ProcessGroup contains an originator threadgroup and a parent Session.
+type ProcessGroup struct {
+ // refs is the reference count; see incRefWithParent and
+ // decRefWithParent.
+ refs refs.AtomicRefCount // not exported.
+
+ // originator is the originator of the group.
+ //
+ // See note re: leader in Session. The same applies here.
+ //
+ // The originator is immutable.
+ originator *ThreadGroup
+
+ // id is the cached identifier in the originator's namespace.
+ //
+ // The id is immutable.
+ id ProcessGroupID
+
+ // session is the parent Session.
+ //
+ // The session is immutable.
+ session *Session
+
+ // ancestors is the number of thread groups in this process group whose
+ // parent is in a different process group in the same session.
+ //
+ // The name is derived from the fact that process groups where
+ // ancestors is zero are considered "orphans".
+ //
+ // ancestors is protected by TaskSet.mu.
+ ancestors uint32
+
+ // processGroupEntry is the embedded entry for Session.processGroups.
+ // This is protected by TaskSet.mu.
+ processGroupEntry
+}
+
+// incRefWithParent takes a reference on the ProcessGroup.
+//
+// It is called when this ProcessGroup is being associated with some new
+// ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent ThreadGroup.
+// If tg is init, then parentPG may be nil.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) {
+	// A nil parent also counts as an ancestor: the process being associated
+	// is then init, and init can never be orphaned (we count it as always
+	// having an ancestor).
+	hasAncestor := pg != parentPG && (parentPG == nil || pg.session == parentPG.session)
+	if hasAncestor {
+		pg.ancestors++
+	}
+
+	pg.refs.IncRef()
+}
+
+// decRefWithParent drops a reference.
+//
+// parentPG is per incRefWithParent.
+//
+// If the group is destroyed by this call, its ID translations are removed
+// from the originator's namespace chain, it is unlinked from its session and
+// the session reference it held is dropped. Otherwise the group is checked
+// for having become an orphan (see handleOrphan).
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
+ // See incRefWithParent regarding parent == nil.
+ if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) {
+ pg.ancestors--
+ }
+
+ // alive stays true unless the destructor below runs.
+ alive := true
+ pg.refs.DecRefWithDestructor(func() {
+ alive = false // don't bother with handleOrphan.
+
+ // Remove translations from the originator.
+ for ns := pg.originator.pidns; ns != nil; ns = ns.parent {
+ id := ns.pgids[pg]
+ delete(ns.pgids, pg)
+ delete(ns.processGroups, id)
+ }
+
+ // Remove the list of process groups.
+ pg.session.processGroups.Remove(pg)
+ pg.session.decRef()
+ })
+ if alive {
+ pg.handleOrphan()
+ }
+}
+
+// parentPG returns the process group of the thread group's parent, or nil if
+// the leader has no parent.
+//
+// Precondition: callers must hold TaskSet.mu.
+func (tg *ThreadGroup) parentPG() *ProcessGroup {
+	parent := tg.leader.parent
+	if parent == nil {
+		return nil
+	}
+	return parent.tg.processGroup
+}
+
+// handleOrphan checks whether the process group is an orphan and has any
+// stopped jobs. If so, SIGHUP followed by SIGCONT is sent to each thread
+// group within the process group.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (pg *ProcessGroup) handleOrphan() {
+	// Nothing to do unless this process group is an orphan.
+	if pg.ancestors != 0 {
+		return
+	}
+
+	// First pass: look for a member whose group stop has completed.
+	anyStopped := false
+	pg.originator.pidns.owner.forEachThreadGroupLocked(func(member *ThreadGroup) {
+		if member.processGroup == pg {
+			member.signalHandlers.mu.Lock()
+			anyStopped = anyStopped || member.groupStopPhase == groupStopComplete
+			member.signalHandlers.mu.Unlock()
+		}
+	})
+	if !anyStopped {
+		return
+	}
+
+	// Second pass: deliver the signals to every member.
+	pg.originator.pidns.owner.forEachThreadGroupLocked(func(member *ThreadGroup) {
+		if member.processGroup == pg {
+			member.signalHandlers.mu.Lock()
+			member.leader.sendSignalLocked(sigPriv(linux.SIGHUP), true /* group */)
+			member.leader.sendSignalLocked(sigPriv(linux.SIGCONT), true /* group */)
+			member.signalHandlers.mu.Unlock()
+		}
+	})
+}
+
+// CreateSession makes the ThreadGroup the leader of a new Session.
+//
+// EPERM may be returned if either the given ThreadGroup is already a Session
+// leader, or a ProcessGroup already exists for the ThreadGroup's ID.
+func (tg *ThreadGroup) CreateSession() error {
+	mu := &tg.pidns.owner.mu
+	mu.Lock()
+	defer mu.Unlock()
+	return tg.createSession()
+}
+
+// createSession creates a new session for a threadgroup, together with a new
+// process group inside it, both identified by the thread group leader's ID.
+//
+// EPERM is returned if, within tg's PID namespace, tg already leads a
+// Session or the ID is already used by a Session or ProcessGroup.
+//
+// Precondition: callers must hold TaskSet.mu for writing.
+func (tg *ThreadGroup) createSession() error {
+ // Get the ID for this thread in the current namespace.
+ id := tg.pidns.tids[tg.leader]
+
+ // Check if this ThreadGroup already leads a Session, or
+ // if the proposed group is already taken.
+ for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
+ // Only sessions led from the same PID namespace are considered.
+ if s.leader.pidns != tg.pidns {
+ continue
+ }
+ if s.leader == tg {
+ return syserror.EPERM
+ }
+ if s.id == SessionID(id) {
+ return syserror.EPERM
+ }
+ for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
+ if pg.id == ProcessGroupID(id) {
+ return syserror.EPERM
+ }
+ }
+ }
+
+ // Create a new Session, with a single reference.
+ s := &Session{
+ id: SessionID(id),
+ leader: tg,
+ }
+
+ // Create a new ProcessGroup, belonging to that Session.
+ // This also has a single reference (assigned below).
+ //
+ // Note that since this is a new session and a new process group, there
+ // will be zero ancestors for this process group. (It is an orphan at
+ // this point.)
+ pg := &ProcessGroup{
+ id: ProcessGroupID(id),
+ originator: tg,
+ session: s,
+ ancestors: 0,
+ }
+
+ // Tie them and return the result.
+ s.processGroups.PushBack(pg)
+ tg.pidns.owner.sessions.PushBack(s)
+
+ // Leave the current group, and assign the new one.
+ if tg.processGroup != nil {
+ oldParentPG := tg.parentPG()
+ // Re-account every child against the new parent group.
+ // NOTE(review): children are released against oldParentPG (the group
+ // of tg's parent), whereas JoinProcessGroup releases them against
+ // tg.processGroup; confirm which is intended.
+ tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+ childTG.processGroup.incRefWithParent(pg)
+ childTG.processGroup.decRefWithParent(oldParentPG)
+ })
+ tg.processGroup.decRefWithParent(oldParentPG)
+ tg.processGroup = pg
+ } else {
+ // The current process group may be nil only in the case of an
+ // unparented thread group (i.e. the init process). This would
+ // not normally occur, but we allow it for the convenience of
+ // CreateSession working from that point. There will be no
+ // child processes. We always say that the very first group
+ // created has ancestors (avoids checks elsewhere).
+ //
+ // Note that this mirrors the parent == nil logic in
+ // incRef/decRef/reparent, which counts nil as an ancestor.
+ tg.processGroup = pg
+ tg.processGroup.ancestors++
+ }
+
+ // Ensure a translation is added to all namespaces.
+ for ns := tg.pidns; ns != nil; ns = ns.parent {
+ // The leader's thread ID in ns doubles as session and group ID there.
+ local := ns.tids[tg.leader]
+ ns.sids[s] = SessionID(local)
+ ns.sessions[SessionID(local)] = s
+ ns.pgids[pg] = ProcessGroupID(local)
+ ns.processGroups[ProcessGroupID(local)] = pg
+ }
+
+ return nil
+}
+
+// CreateProcessGroup creates a new process group, identified by the thread
+// group leader's ID, inside the ThreadGroup's current Session and moves the
+// ThreadGroup into it.
+//
+// An EPERM error will be returned if the ThreadGroup belongs to a different
+// Session, is a Session leader or the group already exists.
+func (tg *ThreadGroup) CreateProcessGroup() error {
+ tg.pidns.owner.mu.Lock()
+ defer tg.pidns.owner.mu.Unlock()
+
+ // Get the ID for this thread in the current namespace.
+ id := tg.pidns.tids[tg.leader]
+
+ // Per above, check for a Session leader or existing group.
+ for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
+ // Only sessions led from the same PID namespace are considered.
+ if s.leader.pidns != tg.pidns {
+ continue
+ }
+ if s.leader == tg {
+ return syserror.EPERM
+ }
+ for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
+ if pg.id == ProcessGroupID(id) {
+ return syserror.EPERM
+ }
+ }
+ }
+
+ // Create a new ProcessGroup, belonging to the current Session.
+ //
+ // We manually adjust the ancestors if the parent is in the same
+ // session.
+ //
+ // The new group holds its own reference on the session.
+ tg.processGroup.session.incRef()
+ pg := &ProcessGroup{
+ id: ProcessGroupID(id),
+ originator: tg,
+ session: tg.processGroup.session,
+ }
+ if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
+ pg.ancestors++
+ }
+
+ // Assign the new process group; adjust children.
+ // NOTE(review): children are released against oldParentPG (the group of
+ // tg's parent), whereas JoinProcessGroup releases them against
+ // tg.processGroup; confirm which is intended.
+ oldParentPG := tg.parentPG()
+ tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+ childTG.processGroup.incRefWithParent(pg)
+ childTG.processGroup.decRefWithParent(oldParentPG)
+ })
+ tg.processGroup.decRefWithParent(oldParentPG)
+ tg.processGroup = pg
+
+ // Ensure this translation is added to all namespaces.
+ for ns := tg.pidns; ns != nil; ns = ns.parent {
+ local := ns.tids[tg.leader]
+ ns.pgids[pg] = ProcessGroupID(local)
+ ns.processGroups[ProcessGroupID(local)] = pg
+ }
+
+ return nil
+}
+
+// JoinProcessGroup joins an existing process group, looked up by pgid in
+// pidns.
+//
+// This function will return EACCES if an exec has been performed since fork
+// by the given ThreadGroup, and EPERM if the Sessions are not the same or the
+// group does not exist.
+//
+// If checkExec is set, then the join is not permitted after the process has
+// executed exec at least once.
+func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error {
+ pidns.owner.mu.Lock()
+ defer pidns.owner.mu.Unlock()
+
+ // Lookup the ProcessGroup.
+ pg := pidns.processGroups[pgid]
+ if pg == nil {
+ return syserror.EPERM
+ }
+
+ // Disallow the join if an execve has been performed, per POSIX.
+ if checkExec && tg.execed {
+ return syserror.EACCES
+ }
+
+ // See if it's in the same session as ours.
+ if pg.session != tg.processGroup.session {
+ return syserror.EPERM
+ }
+
+ // Join the group; adjust children.
+ //
+ // The reference on the new group is taken before the one on the old group
+ // is dropped.
+ parentPG := tg.parentPG()
+ pg.incRefWithParent(parentPG)
+ tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+ childTG.processGroup.incRefWithParent(pg)
+ childTG.processGroup.decRefWithParent(tg.processGroup)
+ })
+ tg.processGroup.decRefWithParent(parentPG)
+ tg.processGroup = pg
+
+ return nil
+}
+
+// Session returns the Session that the ThreadGroup's process group belongs
+// to.
+//
+// A reference is not taken on the session.
+func (tg *ThreadGroup) Session() *Session {
+	mu := &tg.pidns.owner.mu
+	mu.RLock()
+	defer mu.RUnlock()
+	return tg.processGroup.session
+}
+
+// IDOfSession returns the ID assigned to Session s in this PID namespace.
+//
+// If the session isn't visible in this namespace, zero will be returned. It
+// is the caller's responsibility to check that before using this function.
+func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID {
+	mu := &pidns.owner.mu
+	mu.RLock()
+	defer mu.RUnlock()
+	return pidns.sids[s]
+}
+
+// SessionWithID returns the Session known by 'id' in this PID namespace, or
+// nil if that ID is not defined in this namespace.
+//
+// A reference is not taken on the session.
+func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session {
+	mu := &pidns.owner.mu
+	mu.RLock()
+	defer mu.RUnlock()
+	return pidns.sessions[id]
+}
+
+// ProcessGroup returns the process group the ThreadGroup currently belongs
+// to.
+//
+// A reference is not taken on the process group.
+func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
+	mu := &tg.pidns.owner.mu
+	mu.RLock()
+	defer mu.RUnlock()
+	return tg.processGroup
+}
+
+// IDOfProcessGroup returns the ID assigned to process group pg in this PID
+// namespace.
+//
+// The same constraints apply as IDOfSession.
+func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
+	mu := &pidns.owner.mu
+	mu.RLock()
+	defer mu.RUnlock()
+	return pidns.pgids[pg]
+}
+
+// ProcessGroupWithID returns the ProcessGroup known by 'id' in this PID
+// namespace, or nil if that ID is not defined in this namespace.
+//
+// A reference is not taken on the process group.
+func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
+	mu := &pidns.owner.mu
+	mu.RLock()
+	defer mu.RUnlock()
+	return pidns.processGroups[id]
+}
diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go
new file mode 100644
index 000000000..8edd05cdf
--- /dev/null
+++ b/pkg/sentry/kernel/signal.go
@@ -0,0 +1,69 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+)
+
+// SignalPanic is used to panic the running threads. It is a signal which
+// cannot be used by the application: it must be caught and ignored by the
+// runtime (in order to catch possible races).
+//
+// Kernel.sendExternalSignal panics with "Signal-induced panic" when it is
+// handed this signal.
+const SignalPanic = linux.SIGUSR2
+
+// sendExternalSignal handles an asynchronous signal that was delivered to the
+// sentry ("in sentry context"). On some platforms it may also be invoked for
+// asynchronous signals delivered to sandboxed application threads ("in
+// application context").
+//
+// context only labels the log output so that these cases can be told apart.
+//
+// It returns false if the signal could not be forwarded because the Kernel is
+// not fully initialized yet (k.globalInit is nil), and true otherwise.
+func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) bool {
+	sig := linux.Signal(info.Signo)
+
+	if sig == platform.SignalInterrupt {
+		// Assume that a call to platform.Context.Interrupt() misfired.
+		return true
+	}
+
+	if sig == SignalPanic {
+		// SignalPanic is also specially handled in sentry setup to ensure
+		// that it causes a panic even after tasks exit, but it may also end
+		// up here if it is received while in app context.
+		panic("Signal-induced panic")
+	}
+
+	// Every other signal is forwarded to the init task, if there is one.
+	log.Infof("Received external signal %d in %s context", info.Signo, context)
+	if k.globalInit == nil {
+		log.Warningf("Received external signal %d before init created", info.Signo)
+		return false
+	}
+	k.globalInit.SendSignal(info)
+	return true
+}
+
+// sigPriv builds the SignalInfo for a signal sent by the sentry itself: Signo
+// is set to sig and Code to arch.SignalInfoKernel. (The name reflects its
+// equivalence to Linux's SEND_SIG_PRIV.)
+func sigPriv(sig linux.Signal) *arch.SignalInfo {
+	info := new(arch.SignalInfo)
+	info.Signo = int32(sig)
+	info.Code = arch.SignalInfoKernel
+	return info
+}
diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go
new file mode 100644
index 000000000..21ba4ee70
--- /dev/null
+++ b/pkg/sentry/kernel/signal_handlers.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+// SignalHandlers holds information about signal actions.
+//
+// Use NewSignalHandlers to create one; it allocates the actions map.
+type SignalHandlers struct {
+ // mu protects actions, as well as the signal state of all tasks and thread
+ // groups using this SignalHandlers object. (See comment on
+ // ThreadGroup.signalHandlers.)
+ mu sync.Mutex `state:"nosave"`
+
+ // actions is the action to be taken upon receiving each signal. A signal
+ // without an entry reads back as the zero-value arch.SignalAct.
+ actions map[linux.Signal]arch.SignalAct
+}
+
+// NewSignalHandlers allocates a SignalHandlers whose action map is empty, i.e.
+// one that specifies the default action for every signal.
+func NewSignalHandlers() *SignalHandlers {
+	sh := new(SignalHandlers)
+	sh.actions = make(map[linux.Signal]arch.SignalAct)
+	return sh
+}
+
+// Fork returns a new SignalHandlers, for use by a new thread group, that
+// carries a copy of every action currently registered in sh.
+func (sh *SignalHandlers) Fork() *SignalHandlers {
+	child := NewSignalHandlers()
+
+	// Hold sh.mu so the source map cannot change while it is copied.
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	for sig := range sh.actions {
+		child.actions[sig] = sh.actions[sig]
+	}
+	return child
+}
+
+// CopyForExec returns the SignalHandlers to be used by a thread group that is
+// undergoing an execve. (See comments in Task.finishExec.)
+//
+// Only signals whose handler in sh is arch.SignalActIgnore are carried over,
+// and for those only the Handler field is kept; every other signal is left
+// without an entry in the result.
+func (sh *SignalHandlers) CopyForExec() *SignalHandlers {
+	fresh := NewSignalHandlers()
+
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	for sig, act := range sh.actions {
+		if act.Handler != arch.SignalActIgnore {
+			// Not ignored: leave the signal unset in the new table.
+			continue
+		}
+		fresh.actions[sig] = arch.SignalAct{Handler: arch.SignalActIgnore}
+	}
+	return fresh
+}
+
+// dequeueAction returns the SignalAct that should be used to handle sig.
+//
+// If the returned action reports IsResetHandler, its entry is removed from
+// sh.actions, so the next lookup for sig yields the zero-value SignalAct.
+//
+// Preconditions: sh.mu must be locked.
+func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct {
+	act := sh.actions[sig]
+	if !act.IsResetHandler() {
+		return act
+	}
+
+	// One-shot handler: forget it now that it has been handed out.
+	delete(sh.actions, sig)
+	return act
+}
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
new file mode 100644
index 000000000..e20fa3eb6
--- /dev/null
+++ b/pkg/sentry/kernel/syscalls.go
@@ -0,0 +1,305 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// maxSyscallNum is the highest supported syscall number.
+//
+// The types below create fast lookup slices for all syscalls. This maximum
+// serves as a sanity check that we don't allocate huge slices for a very large
+// syscall number: RegisterSyscallTable panics if a table contains a syscall
+// number above this limit.
+const maxSyscallNum = 2000
+
+// SyscallFn is a syscall implementation.
+//
+// It is invoked with the calling Task and the syscall's arguments.
+// NOTE(review): the uintptr result is presumably the syscall's return value
+// and *SyscallControl an optional control directive; confirm against the
+// syscall dispatcher.
+type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
+
+// MissingFn is a syscall to be called when an implementation is missing.
+//
+// Unlike SyscallFn, it also receives the syscall number (sysno) and does not
+// return a *SyscallControl.
+type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
+
+// Possible flags for SyscallFlagsTable.enable.
+//
+// Each constant is a distinct bit (1 << iota), so they can be OR-ed together
+// in a single uint32 word.
+const (
+ // syscallPresent indicates that this is not a missing syscall.
+ //
+ // This flag is used internally in SyscallFlagsTable. (Bit value 1.)
+ syscallPresent = 1 << iota
+
+ // StraceEnableLog enables syscall log tracing. (Bit value 2.)
+ StraceEnableLog
+
+ // StraceEnableEvent enables syscall event tracing. (Bit value 4.)
+ StraceEnableEvent
+
+ // ExternalBeforeEnable enables the external hook before syscall execution.
+ // (Bit value 8.)
+ ExternalBeforeEnable
+
+ // ExternalAfterEnable enables the external hook after syscall execution.
+ // (Bit value 16.)
+ ExternalAfterEnable
+)
+
+// StraceEnableBits combines both strace log and event flags.
+const StraceEnableBits = StraceEnableLog | StraceEnableEvent
+
+// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
+// basis.
+//
+// Readers use Word; writers use Enable and EnableAll.
+type SyscallFlagsTable struct {
+ // mu protects writes to the fields below.
+ //
+ // Atomic loads are always allowed. Atomic stores are allowed only
+ // while mu is held.
+ mu sync.Mutex
+
+ // enable contains the enable bits for each syscall, indexed by syscall
+ // number.
+ //
+ // Missing syscalls have the same value in enable as missingEnable to
+ // avoid an extra branch in Word.
+ enable []uint32
+
+ // missingEnable contains the enable bits for missing syscalls. Word also
+ // returns it for syscall numbers beyond the end of enable.
+ missingEnable uint32
+}
+
+// init sizes the enable table to hold syscall numbers 0 through max and marks
+// every syscall that appears in table as present. No other bits are set.
+//
+// max is the largest syscall number in table.
+func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) {
+	e.enable = make([]uint32, max+1)
+
+	words := e.enable
+	for sysno := range table {
+		words[sysno] = syscallPresent
+	}
+}
+
+// Word returns the enable bitfield for sysno.
+//
+// Syscall numbers beyond the end of the table get the bitfield shared by all
+// missing syscalls. The value is read with an atomic load.
+func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 {
+	word := &e.missingEnable
+	if sysno < uintptr(len(e.enable)) {
+		word = &e.enable[sysno]
+	}
+	return atomic.LoadUint32(word)
+}
+
+// Enable sets or clears the enable bit `bit` for every syscall based on s.
+//
+// For each present syscall the bit is set if s maps its number to true and
+// cleared otherwise, so syscalls missing from s are disabled. For missing
+// syscalls the bit is set or cleared according to missingEnable.
+//
+// Syscalls missing from the initial table passed to init cannot be added as
+// individual syscalls. If present in s they will be ignored.
+//
+// Callers to Word may see either the old or new value while this function
+// is executing.
+func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// Recompute the word shared by all missing syscalls.
+	missing := atomic.LoadUint32(&e.missingEnable) &^ bit
+	if missingEnable {
+		missing |= bit
+	}
+	atomic.StoreUint32(&e.missingEnable, missing)
+
+	for sysno := range e.enable {
+		word := &e.enable[sysno]
+		cur := atomic.LoadUint32(word)
+		switch {
+		case !bits.IsOn32(cur, syscallPresent):
+			// Missing: mirror the shared word.
+			cur = missing
+		case s[uintptr(sysno)]:
+			cur |= bit
+		default:
+			cur &^= bit
+		}
+		atomic.StoreUint32(word, cur)
+	}
+}
+
+// EnableAll sets the enable bit `bit` for all syscalls, present and missing.
+//
+// As with Enable, stores are atomic and made while holding e.mu.
+func (e *SyscallFlagsTable) EnableAll(bit uint32) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// Update the word shared by all missing syscalls first.
+	missing := atomic.LoadUint32(&e.missingEnable) | bit
+	atomic.StoreUint32(&e.missingEnable, missing)
+
+	for sysno := range e.enable {
+		word := &e.enable[sysno]
+		if cur := atomic.LoadUint32(word); bits.IsOn32(cur, syscallPresent) {
+			atomic.StoreUint32(word, cur|bit)
+		} else {
+			// Missing: mirror the shared word.
+			atomic.StoreUint32(word, missing)
+		}
+	}
+}
+
+// Stracer traces syscall execution.
+type Stracer interface {
+ // SyscallEnter is called on syscall entry.
+ //
+ // The returned private data is passed to SyscallExit.
+ // NOTE(review): flags is presumably the FeatureEnable word for sysno;
+ // confirm against the caller.
+ //
+ // TODO: remove kernel imports from the strace package so
+ // that the type can be used directly.
+ SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}
+
+ // SyscallExit is called on syscall exit.
+ //
+ // context is the private data returned by the matching SyscallEnter call.
+ SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
+}
+
+// SyscallTable is a lookup table of system calls. Critically, a SyscallTable
+// is *immutable*. In order to make supporting suspend and resume sane, they
+// must be uniquely registered and may not change during operation.
+//
+// Tables are registered with RegisterSyscallTable, which rejects a second
+// table for the same OS/Arch pair.
+type SyscallTable struct {
+ // OS is the operating system that this syscall table implements.
+ OS abi.OS `state:"wait"`
+
+ // Arch is the architecture that this syscall table targets.
+ Arch arch.Arch `state:"wait"`
+
+ // Version is the OS version that this syscall table implements.
+ Version Version `state:"manual"`
+
+ // AuditNumber is a numeric constant that represents the syscall table. If
+ // non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by
+ // linux/audit.h.
+ AuditNumber uint32 `state:"manual"`
+
+ // Table is the collection of functions, keyed by syscall number.
+ Table map[uintptr]SyscallFn `state:"manual"`
+
+ // lookup is a slice that holds the syscalls (indexed by their numbers).
+ // It is built by RegisterSyscallTable from Table and is used for fast
+ // look ups.
+ lookup []SyscallFn `state:"manual"`
+
+ // Emulate is a collection of instruction addresses to emulate. The
+ // keys are addresses, and the values are system call numbers.
+ Emulate map[usermem.Addr]uintptr `state:"manual"`
+
+ // Missing is the function to call in case of a missing system call.
+ Missing MissingFn `state:"manual"`
+
+ // Stracer traces this syscall table.
+ Stracer Stracer `state:"manual"`
+
+ // External is used to handle an external callback.
+ External func(*Kernel) `state:"manual"`
+
+ // ExternalFilterBefore is consulted before External is invoked ahead of
+ // the syscall's execution. External is not called if it returns false.
+ ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+ // ExternalFilterAfter is consulted before External is invoked following
+ // the syscall's execution. External is not called if it returns false.
+ ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+ // FeatureEnable stores the strace and one-shot enable bits.
+ FeatureEnable SyscallFlagsTable `state:"manual"`
+}
+
+// allSyscallTables contains all known tables. RegisterSyscallTable appends to
+// it and LookupSyscallTable scans it.
+// NOTE(review): no lock guards this slice in this file; presumably all
+// registration happens before concurrent use — confirm.
+var allSyscallTables []*SyscallTable
+
+// SyscallTables returns the slice of registered SyscallTables.
+//
+// The result is the package's own slice, not a copy, so callers must treat it
+// as read-only.
+func SyscallTables() []*SyscallTable {
+	tables := allSyscallTables
+	return tables
+}
+
+// LookupSyscallTable returns the registered SyscallTable for the given OS/Arch
+// combination. The boolean result is false (and the table nil) if no such
+// table has been registered.
+func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
+	for i := range allSyscallTables {
+		candidate := allSyscallTables[i]
+		if candidate.OS != os || candidate.Arch != a {
+			continue
+		}
+		return candidate, true
+	}
+	return nil, false
+}
+
+// RegisterSyscallTable registers a new syscall table for use by a Kernel.
+func RegisterSyscallTable(s *SyscallTable) {
+ if s.Table == nil {
+ // Ensure non-nil lookup table.
+ s.Table = make(map[uintptr]SyscallFn)
+ }
+ if s.Emulate == nil {
+ // Ensure non-nil emulate table.
+ s.Emulate = make(map[usermem.Addr]uintptr)
+ }
+
+ var max uintptr
+ for num := range s.Table {
+ if num > max {
+ max = num
+ }
+ }
+
+ if max > maxSyscallNum {
+ panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
+ }
+
+ s.lookup = make([]SyscallFn, max+1)
+
+ // Initialize the fast-lookup table.
+ for num, fn := range s.Table {
+ s.lookup[num] = fn
+ }
+
+ s.FeatureEnable.init(s.Table, max)
+
+ if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
+ panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
+ }
+
+ // Save a reference to this table.
+ //
+ // This is required for a Kernel to find the table and for save/restore
+ // operations below.
+ allSyscallTables = append(allSyscallTables, s)
+}
+
+// Lookup returns the implementation of syscall sysno from the fast-lookup
+// slice, or nil if sysno is out of range or has no implementation.
+func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
+	if sysno >= uintptr(len(s.lookup)) {
+		return nil
+	}
+	return s.lookup[sysno]
+}
+
+// LookupEmulate looks up the syscall number to emulate for the instruction at
+// addr. The boolean result reports whether addr has an entry in s.Emulate; if
+// it does not, the returned number is zero.
+func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
+	if sysno, found := s.Emulate[addr]; found {
+		return sysno, true
+	}
+	return 0, false
+}
+
+// mapLookup is like Lookup but consults only the Table map, skipping the
+// fast-lookup slice. It exists for benchmarking.
+func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
+	fn := s.Table[sysno]
+	return fn
+}
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
new file mode 100644
index 000000000..826809a70
--- /dev/null
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -0,0 +1,29 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "fmt"
+
+// afterLoad is invoked by stateify.
+//
+// It replaces the contents of s with those of the table registered for the
+// same OS/Arch pair, and panics if no such table is registered.
+func (s *SyscallTable) afterLoad() {
+	registered, found := LookupSyscallTable(s.OS, s.Arch)
+	if found {
+		// Copy the table.
+		*s = *registered
+		return
+	}
+
+	// Couldn't find a reference?
+	panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+}
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
new file mode 100644
index 000000000..31541749e
--- /dev/null
+++ b/pkg/sentry/kernel/syslog.go
@@ -0,0 +1,100 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "math/rand"
+ "sync"
+)
+
+// syslog represents a sentry-global kernel log.
+//
+// Currently, it contains only fun messages for a dmesg easter egg.
+type syslog struct {
+ // mu protects the below.
+ mu sync.Mutex `state:"nosave"`
+
+ // msg is the syslog message buffer. It is lazily initialized by the first
+ // call to Log and stays nil until then.
+ msg []byte
+}
+
+// Log returns a copy of the syslog.
+//
+// The message buffer is generated on the first call: ten entries drawn
+// without replacement from a fixed list, each stamped with an increasing
+// pseudo-timestamp, followed by a final "Ready!" line. Later calls return a
+// copy of that same buffer.
+func (s *syslog) Log() []byte {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.msg == nil {
+		// Not initialized, create message.
+		pool := []string{
+			"Synthesizing system calls...",
+			"Mounting deweydecimalfs...",
+			"Moving files to filing cabinet...",
+			"Digging up root...",
+			"Constructing home...",
+			"Segmenting fault lines...",
+			"Creating bureaucratic processes...",
+			"Searching for needles in stacks...",
+			"Preparing for the zombie uprising...",
+			"Feeding the init monster...",
+			"Creating cloned children...",
+			"Daemonizing children...",
+			"Waiting for children...",
+			"Gathering forks...",
+			"Committing treasure map to memory...",
+			"Reading process obituaries...",
+			"Searching for socket adapter...",
+			"Creating process schedule...",
+			"Generating random numbers by fair dice roll...",
+			"Rewriting operating system in Javascript...",
+			"Consulting tar man page...",
+			"Forking spaghetti code...",
+			"Checking naughty and nice process list...",
+			"Checking naughty and nice process list...", // Check it up to twice.
+			"Granting licence to kill(2)...",            // British spelling for British movie.
+			"Letting the watchdogs out...",
+		}
+
+		stamp := 0.0
+		for n := 0; n < 10; n++ {
+			stamp += rand.Float64() / 2
+
+			// Pick an entry, then drop it from the pool by moving the
+			// last element into its slot, so no slot is used twice.
+			pick := rand.Intn(len(pool))
+			line := pool[pick]
+			pool[pick] = pool[len(pool)-1]
+			pool = pool[:len(pool)-1]
+
+			s.msg = append(s.msg, fmt.Sprintf("<6>[%11.6f] %s\n", stamp, line)...)
+		}
+
+		stamp += rand.Float64() / 2
+		s.msg = append(s.msg, fmt.Sprintf("<6>[%11.6f] Ready!\n", stamp)...)
+	}
+
+	// Hand out a copy so callers cannot modify the stored buffer.
+	out := make([]byte, len(s.msg))
+	copy(out, s.msg)
+	return out
+}
diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go
new file mode 100644
index 000000000..71ca75555
--- /dev/null
+++ b/pkg/sentry/kernel/table_test.go
@@ -0,0 +1,108 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+const (
+ // maxTestSyscall is the highest syscall number that createSyscallTable
+ // puts into the test table.
+ maxTestSyscall = 1000
+)
+
+// createSyscallTable builds a Linux/AMD64 SyscallTable in which syscall n
+// simply returns n, for every n from 0 through maxTestSyscall, registers it
+// and returns it.
+func createSyscallTable() *SyscallTable {
+	fns := make(map[uintptr]SyscallFn)
+	for sysno := uintptr(0); sysno <= maxTestSyscall; sysno++ {
+		// Give each closure its own copy of the syscall number.
+		ret := sysno
+		fns[sysno] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) {
+			return ret, nil, nil
+		}
+	}
+
+	table := new(SyscallTable)
+	table.OS = abi.Linux
+	table.Arch = arch.AMD64
+	table.Table = fns
+
+	RegisterSyscallTable(table)
+	return table
+}
+
+// TestTable checks that every syscall registered by createSyscallTable can be
+// found through SyscallTable.Lookup and returns its own number, and that
+// syscall numbers past the end of the table resolve to nil.
+func TestTable(t *testing.T) {
+	table := createSyscallTable()
+	defer func() {
+		// Cleanup registered tables to keep tests separate.
+		allSyscallTables = []*SyscallTable{}
+	}()
+
+	// Go through all functions and check that they return the right value.
+	// The bound is inclusive because createSyscallTable registers syscalls 0
+	// through maxTestSyscall; an exclusive bound would never check the last
+	// one.
+	for i := uintptr(0); i <= maxTestSyscall; i++ {
+		fn := table.Lookup(i)
+		if fn == nil {
+			t.Errorf("Syscall %v is set to nil", i)
+			continue
+		}
+
+		v, _, _ := fn(nil, arch.SyscallArguments{})
+		if v != i {
+			t.Errorf("Wrong return value for syscall %v: expected %v, got %v", i, i, v)
+		}
+	}
+
+	// Check that values outside the range return nil.
+	for i := uintptr(maxTestSyscall + 1); i < maxTestSyscall+100; i++ {
+		if fn := table.Lookup(i); fn != nil {
+			t.Errorf("Syscall %v is not nil: %v", i, fn)
+		}
+	}
+}
+
+func BenchmarkTableLookup(b *testing.B) {
+ table := createSyscallTable()
+
+ b.ResetTimer()
+
+ j := uintptr(0)
+ for i := 0; i < b.N; i++ {
+ table.Lookup(j)
+ j = (j + 1) % 310
+ }
+
+ b.StopTimer()
+ // Cleanup registered tables to keep tests separate.
+ allSyscallTables = []*SyscallTable{}
+}
+
+func BenchmarkTableMapLookup(b *testing.B) {
+ table := createSyscallTable()
+
+ b.ResetTimer()
+
+ j := uintptr(0)
+ for i := 0; i < b.N; i++ {
+ table.mapLookup(j)
+ j = (j + 1) % 310
+ }
+
+ b.StopTimer()
+ // Cleanup registered tables to keep tests separate.
+ allSyscallTables = []*SyscallTable{}
+}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
new file mode 100644
index 000000000..3d2e035e9
--- /dev/null
+++ b/pkg/sentry/kernel/task.go
@@ -0,0 +1,606 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ ssync "gvisor.googlesource.com/gvisor/pkg/sync"
+)
+
+// Task represents a thread of execution in the untrusted app. It
+// includes registers and any thread-specific state that you would
+// normally expect.
+//
+// Each task is associated with a goroutine, called the task goroutine, that
+// executes code (application code, system calls, etc.) on behalf of that task.
+// See Task.run (task_run.go).
+//
+// All fields that are "owned by the task goroutine" can only be mutated by the
+// task goroutine while it is running. The task goroutine does not require
+// synchronization to read these fields, although it still requires
+// synchronization as described for those fields to mutate them.
+//
+// All fields that are "exclusive to the task goroutine" can only be accessed
+// by the task goroutine while it is running. The task goroutine does not
+// require synchronization to read or write these fields.
+type Task struct {
+ taskNode
+
+ // runState is what the task goroutine is executing if it is not stopped.
+ // If runState is nil, the task goroutine should exit or has exited.
+ // runState is exclusive to the task goroutine.
+ runState taskRunState
+
+ // haveSyscallReturn is true if tc.Arch().Return() represents a value
+ // returned by a syscall (or set by ptrace after a syscall).
+ //
+ // haveSyscallReturn is exclusive to the task goroutine.
+ haveSyscallReturn bool
+
+ // interruptChan is notified whenever the task goroutine is interrupted
+ // (usually by a pending signal). interruptChan is effectively a condition
+ // variable that can be used in select statements.
+ //
+ // interruptChan is not saved; because saving interrupts all tasks,
+ // interruptChan is always notified after restore (see Task.run).
+ interruptChan chan struct{} `state:"nosave"`
+
+ // gosched contains the current scheduling state of the task goroutine.
+ //
+ // gosched is protected by goschedSeq. gosched is owned by the task
+ // goroutine.
+ goschedSeq ssync.SeqCount `state:"nosave"`
+ gosched TaskGoroutineSchedInfo
+
+ // yieldCount is the number of times the task goroutine has called
+ // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
+ // Task.Yield(), voluntarily ceasing execution.
+ //
+ // yieldCount is accessed using atomic memory operations. yieldCount is
+ // owned by the task goroutine.
+ yieldCount uint64
+
+ // pendingSignals is the set of pending signals that may be handled only by
+ // this task.
+ //
+ // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
+ // (hereafter "the signal mutex"); see comment on
+ // ThreadGroup.signalHandlers.
+ pendingSignals pendingSignals
+
+ // If haveSavedSignalMask is true, savedSignalMask is the signal mask that
+ // should be applied after the task has either delivered one signal to a
+ // user handler or is about to resume execution in the untrusted
+ // application.
+ //
+ // Both haveSavedSignalMask and savedSignalMask are exclusive to the task
+ // goroutine.
+ haveSavedSignalMask bool
+ savedSignalMask linux.SignalSet
+
+ // signalStack is the alternate signal stack used by signal handlers for
+ // which the SA_ONSTACK flag is set.
+ //
+ // signalStack is exclusive to the task goroutine.
+ signalStack arch.SignalStack
+
+ // If groupStopRequired is true, the task should enter a group stop in the
+ // interrupt path. groupStopRequired is not redundant with
+ // tg.groupStopPhase != groupStopNone, because ptrace allows tracers to
+ // resume individual tasks from a group stop without ending the group stop
+ // as a whole.
+ //
+ // groupStopRequired is analogous to JOBCTL_TRAP_STOP in Linux, except that
+ // Linux only uses that flag for ptraced tasks.
+ //
+ // groupStopRequired is protected by the signal mutex.
+ groupStopRequired bool
+
+ // If groupStopAcknowledged is true, the task has already acknowledged that
+ // it is entering the most recent group stop that has been initiated on its
+ // thread group. groupStopAcknowledged is only meaningful if
+ // tg.groupStopPhase == groupStopInitiated.
+ //
+ // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
+ //
+ // groupStopAcknowledged is protected by the signal mutex.
+ groupStopAcknowledged bool
+
+ // If stop is not nil, it is the internally-initiated condition that
+ // currently prevents the task goroutine from running.
+ //
+ // stop is protected by the signal mutex.
+ stop TaskStop
+
+ // stopCount is the number of active external stops (calls to
+ // Task.BeginExternalStop that have not been paired with a call to
+ // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
+ // non-zero if the task goroutine should stop.
+ //
+ // Mutating stopCount requires both locking the signal mutex and using
+ // atomic memory operations. Reading stopCount requires either locking the
+ // signal mutex or using atomic memory operations. This allows Task.doStop
+ // to require only a single atomic read in the common case where stopCount
+ // is 0.
+ //
+ // stopCount is not saved, because external stops cannot be retained across
+ // a save/restore cycle. (Suppose a sentryctl command issues an external
+ // stop; after a save/restore cycle, the restored sentry has no knowledge
+ // of the pre-save sentryctl command, and the stopped task would remain
+ // stopped forever.)
+ stopCount int32 `state:"nosave"`
+
+ // endStopCond is signaled when stopCount transitions to 0. The combination
+ // of stopCount and endStopCond effectively form a sync.WaitGroup, but
+ // WaitGroup provides no way to read its counter value.
+ //
+ // Invariant: endStopCond.L is the signal mutex. (This is not racy because
+ // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
+ // calls sync.Cond.Wait; and only the task goroutine can change the
+ // identity of the signal mutex, in Task.finishExec.)
+ endStopCond sync.Cond `state:"nosave"`
+
+ // exitStatus is the task's exit status.
+ //
+ // exitStatus is protected by the signal mutex.
+ exitStatus ExitStatus
+
+ // syscallRestartBlock represents a custom restart function to run in
+ // restart_syscall(2) to resume an interrupted syscall.
+ //
+ // syscallRestartBlock is exclusive to the task goroutine.
+ syscallRestartBlock SyscallRestartBlock
+
+ // mu protects some of the following fields.
+ mu sync.Mutex `state:"nosave"`
+
+ // tc and tr form the majority of the task's data.
+ //
+ // tc and tr are protected by mu. tc and tr are owned by the task
+ // goroutine. tr.signalMask is protected by the signal mutex and must be
+ // written using atomic memory operations (such that reading tr.signalMask
+ // is safe if the signal mutex is locked or if atomic memory operations are
+ // used), but is also owned by the task goroutine.
+ tc TaskContext
+ tr TaskResources
+
+ // p provides the mechanism by which the task runs code in userspace. The p
+ // interface object is immutable.
+ p platform.Context `state:"nosave"`
+
+ // k is the Kernel that this task belongs to. The k pointer is immutable.
+ k *Kernel
+
+ // If vforkParent is not nil, it is the task that created this task with
+ // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
+ // this TaskContext is released.
+ //
+ // vforkParent is protected by the TaskSet mutex.
+ vforkParent *Task
+
+ // exitState is the task's progress through the exit path.
+ //
+ // exitState is protected by the TaskSet mutex. exitState is owned by the
+ // task goroutine.
+ exitState TaskExitState
+
+ // exitTracerNotified is true if the exit path has either signaled the
+ // task's tracer to indicate the exit, or determined that no such signal is
+ // needed. exitTracerNotified can only be true if exitState is
+ // TaskExitZombie or TaskExitDead.
+ //
+ // exitTracerNotified is protected by the TaskSet mutex.
+ exitTracerNotified bool
+
+ // exitTracerAcked is true if exitTracerNotified is true and either the
+ // task's tracer has acknowledged the exit notification, or the exit path
+ // has determined that no such notification is needed.
+ //
+ // exitTracerAcked is protected by the TaskSet mutex.
+ exitTracerAcked bool
+
+ // exitParentNotified is true if the exit path has either signaled the
+ // task's parent to indicate the exit, or determined that no such signal is
+ // needed. exitParentNotified can only be true if exitState is
+ // TaskExitZombie or TaskExitDead.
+ //
+ // exitParentNotified is protected by the TaskSet mutex.
+ exitParentNotified bool
+
+ // exitParentAcked is true if exitParentNotified is true and either the
+ // task's parent has acknowledged the exit notification, or the exit path
+ // has determined that no such acknowledgment is needed.
+ //
+ // exitParentAcked is protected by the TaskSet mutex.
+ exitParentAcked bool
+
+ // goroutineStopped is a WaitGroup whose counter value is 1 when the task
+ // goroutine is running and 0 when the task goroutine is stopped or has
+ // exited.
+ goroutineStopped sync.WaitGroup `state:"nosave"`
+
+ // ptraceTracer is the task that is ptrace-attached to this one. If
+ // ptraceTracer is nil, this task is not being traced. Note that due to
+ // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil
+ // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)).
+ //
+ // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
+ // operations. This allows paths that wouldn't otherwise lock the TaskSet
+ // mutex, notably the syscall path, to check if ptraceTracer is nil without
+ // additional synchronization.
+ ptraceTracer atomic.Value `state:".(*Task)"`
+
+ // ptraceTracees is the set of tasks that this task is ptrace-attached to.
+ //
+ // ptraceTracees is protected by the TaskSet mutex.
+ ptraceTracees map[*Task]struct{}
+
+ // ptraceOpts contains ptrace options explicitly set by the tracer. If
+ // ptraceTracer is nil, ptraceOpts is expected to be the zero value.
+ //
+ // ptraceOpts is protected by the TaskSet mutex.
+ ptraceOpts ptraceOptions
+
+ // ptraceSyscallMode controls ptrace behavior around syscall entry and
+ // exit.
+ //
+ // ptraceSyscallMode is protected by the TaskSet mutex.
+ ptraceSyscallMode ptraceSyscallMode
+
+ // If ptraceSinglestep is true, the next time the task executes application
+ // code, single-stepping should be enabled. ptraceSinglestep is stored
+ // independently of the architecture-specific trap flag because tracer
+ // detaching (which can happen concurrently with the tracee's execution if
+ // the tracer exits) must disable single-stepping, and the task's
+ // architectural state is implicitly exclusive to the task goroutine (no
+ // synchronization occurs before passing registers to SwitchToApp).
+ //
+ // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
+ //
+ // ptraceSinglestep is protected by the TaskSet mutex.
+ ptraceSinglestep bool
+
+ // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
+ // time that t entered the ptrace stop, reset to 0 when the tracer
+ // acknowledges the stop with a wait*() syscall. Otherwise, it is the
+ // signal number passed to the ptrace operation that ended the last ptrace
+ // stop on this task. In the latter case, the effect of ptraceCode depends
+ // on the nature of the ptrace stop; signal-delivery-stop uses it to
+ // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
+ // signal to the task after leaving the stop, and PTRACE_EVENT stops and
+ // traced group stops ignore it entirely.
+ //
+ // Linux contextually stores the equivalent of ptraceCode in
+ // task_struct::exit_code.
+ //
+ // ptraceCode is protected by the TaskSet mutex.
+ ptraceCode int32
+
+ // ptraceSiginfo is the value returned to the tracer by
+ // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
+ // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
+ // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
+ // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
+ // is in turn required to distinguish group stops from other ptrace stops,
+ // per subsection "Group-stop" in ptrace(2)).
+ //
+ // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
+ //
+ // ptraceSiginfo is protected by the TaskSet mutex.
+ ptraceSiginfo *arch.SignalInfo
+
+ // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
+ // the tracer by ptrace(PTRACE_GETEVENTMSG).
+ //
+ // ptraceEventMsg is protected by the TaskSet mutex.
+ ptraceEventMsg uint64
+
+ // The struct that holds the IO-related usage. The ioUsage pointer is
+ // immutable.
+ ioUsage *usage.IO
+
+ // logPrefix is a string containing the task's thread ID in the root PID
+ // namespace, and is prepended to log messages emitted by Task.Infof etc.
+ logPrefix atomic.Value `state:".(string)"`
+
+ // creds is the task's credentials.
+ //
+ // creds is protected by mu.
+ creds *auth.Credentials
+
+ // utsns is the task's UTS namespace.
+ //
+ // utsns is protected by mu.
+ utsns *UTSNamespace
+
+ // ipcns is the task's IPC namespace.
+ //
+ // ipcns is protected by mu.
+ ipcns *IPCNamespace
+
+ // parentDeathSignal is sent to this task's thread group when its parent exits.
+ //
+ // parentDeathSignal is protected by mu.
+ parentDeathSignal linux.Signal
+
+ // syscallFilters is all seccomp-bpf syscall filters applicable to the
+ // task, in the order in which they were installed.
+ //
+ // syscallFilters is protected by mu. syscallFilters is owned by the task
+ // goroutine.
+ syscallFilters []bpf.Program
+
+ // If cleartid is non-zero, treat it as a pointer to a ThreadID in the
+ // task's virtual address space; when the task exits, set the pointed-to
+ // ThreadID to 0, and wake any futex waiters.
+ //
+ // cleartid is exclusive to the task goroutine.
+ cleartid usermem.Addr
+
+ // This is mostly a fake cpumask just for sched_set/getaffinity as we
+ // don't really control the affinity.
+ //
+ // Invariant: allowedCPUMask.Size() ==
+ // sched.CPUMaskSize(Kernel.applicationCores).
+ //
+ // allowedCPUMask is protected by mu.
+ allowedCPUMask sched.CPUSet
+
+ // cpu is the fake cpu number returned by getcpu(2). cpu is ignored
+ // entirely if Kernel.useHostCores is true.
+ //
+ // cpu is accessed using atomic memory operations.
+ cpu int32
+
+ // This is used to keep track of changes made to a process' priority/niceness.
+ // It is mostly used to provide some reasonable return value from
+ // getpriority(2) after a call to setpriority(2) has been made.
+ // We currently do not actually modify a process' scheduling priority.
+ // NOTE: This represents the userspace view of priority (nice).
+ // This means that the value should be in the range [-20, 19].
+ //
+ // niceness is protected by mu.
+ niceness int
+
+ // This is used to track the numa policy for the current thread. This can be
+ // modified through a set_mempolicy(2) syscall. Since we always report a
+ // single numa node, all policies are no-ops. We only track this information
+ // so that we can return reasonable values if the application calls
+ // get_mempolicy(2) after setting a non-default policy. Note that in the
+ // real syscall, nodemask can be longer than 4 bytes, but we always report a
+ // single node so never need to save more than a single bit.
+ //
+ // numaPolicy and numaNodeMask are protected by mu.
+ numaPolicy int32
+ numaNodeMask uint32
+
+ // If netns is true, the task is in a non-root network namespace. Network
+ // namespaces aren't currently implemented in full; being in a network
+ // namespace simply prevents the task from observing any network devices
+ // (including loopback) or using abstract socket addresses (see unix(7)).
+ //
+ // netns is protected by mu. netns is owned by the task goroutine.
+ netns bool
+
+ // If rseqPreempted is true, before the next call to p.Switch(), interrupt
+ // RSEQ critical regions as defined by tg.rseq and write the task
+ // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number
+ // written to rseqCPUAddr.
+ //
+ // If rseqCPUAddr is 0, rseqCPU is -1.
+ //
+ // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task
+ // goroutine.
+ rseqPreempted bool `state:"nosave"`
+ rseqCPUAddr usermem.Addr
+ rseqCPU int32
+
+ // copyScratchBuffer is a buffer available to CopyIn/CopyOut
+ // implementations that require an intermediate buffer to copy data
+ // into/out of. It prevents these buffers from being allocated/zeroed in
+ // each syscall and eventually garbage collected.
+ //
+ // copyScratchBuffer is exclusive to the task goroutine.
+ copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`
+
+ // blockingTimer is used for blocking timeouts. blockingTimerChan is the
+ // channel that is sent to when blockingTimer fires.
+ //
+ // blockingTimer is exclusive to the task goroutine.
+ blockingTimer *ktime.Timer `state:"nosave"`
+ blockingTimerChan <-chan struct{} `state:"nosave"`
+
+ // futexWaiter is used for futex(FUTEX_WAIT) syscalls.
+ //
+ // futexWaiter is exclusive to the task goroutine.
+ futexWaiter *futex.Waiter `state:"nosave"`
+
+ // startTime is the real time at which the task started. It is set when
+ // a Task is created or invokes execve(2).
+ //
+ // startTime is protected by mu.
+ startTime ktime.Time
+}
+
+// savePtraceTracer returns the *Task stored in t.ptraceTracer. Its name matches
+// the `state:".(*Task)"` tag on that field, so it is presumably the save-side
+// hook called by generated checkpoint code — TODO confirm against stateify.
+//
+// The unchecked type assertion panics if nothing has been stored yet; the
+// field's own documentation requires "no tracer" to be stored as (*Task)(nil).
+func (t *Task) savePtraceTracer() *Task {
+ return t.ptraceTracer.Load().(*Task)
+}
+
+// loadPtraceTracer stores tracer in t.ptraceTracer. It is the counterpart of
+// savePtraceTracer and is presumably called by generated restore code — TODO
+// confirm against stateify.
+//
+// tracer may be nil: because the parameter is typed, a nil tracer is stored as
+// (*Task)(nil), which atomic.Value.Store accepts.
+func (t *Task) loadPtraceTracer(tracer *Task) {
+ t.ptraceTracer.Store(tracer)
+}
+
+// saveLogPrefix returns the string stored in t.logPrefix. Its name matches the
+// `state:".(string)"` tag on that field, so it is presumably the save-side hook
+// called by generated checkpoint code — TODO confirm against stateify.
+//
+// The unchecked type assertion panics if no prefix has been stored yet.
+func (t *Task) saveLogPrefix() string {
+ return t.logPrefix.Load().(string)
+}
+
+// loadLogPrefix stores prefix in t.logPrefix. It is the counterpart of
+// saveLogPrefix and is presumably called by generated restore code — TODO
+// confirm against stateify.
+func (t *Task) loadLogPrefix(prefix string) {
+ t.logPrefix.Store(prefix)
+}
+
+// afterLoad is invoked by stateify.
+//
+// It re-creates the parts of a Task that are rebuilt on restore rather than
+// taken from the saved image.
+func (t *Task) afterLoad() {
+	// A task restored with a stop in place holds exactly one stop.
+	if t.stop != nil {
+		t.stopCount = 1
+	}
+	t.gosched.State = TaskGoroutineNonexistent
+	// Make the rseq bookkeeping run before the next switch to application
+	// code.
+	t.rseqPreempted = true
+
+	// Fresh runtime objects: the interrupt channel, the condition variable's
+	// lock, the platform context, and the futex waiter.
+	t.interruptChan = make(chan struct{}, 1)
+	t.endStopCond.L = &t.tg.signalHandlers.mu
+	t.p = t.k.Platform.NewContext()
+	t.futexWaiter = futex.NewWaiter()
+}
+
+// copyScratchBufferLen is the length, in bytes, of the copyScratchBuffer field
+// of the Task struct. Task.CopyScratchBuffer serves requests of up to this size
+// from that field and allocates a new slice for anything larger.
+const copyScratchBufferLen = 52
+
+// TaskMaybe is the interface for extracting Tasks out of things which may be
+// or contain Task objects. *Task implements it by returning itself.
+type TaskMaybe interface {
+ // ExtractTask returns the Task.
+ //
+ // NOTE(review): whether an implementation may return nil when it holds no
+ // Task is not specified here — confirm against callers.
+ ExtractTask() *Task
+}
+
+// CopyScratchBuffer hands out a size-byte scratch slice for CopyIn/CopyOut
+// implementations. Requests of at most copyScratchBufferLen bytes are carved
+// out of the Task's built-in buffer, so successive small requests share the
+// same memory; larger requests receive a newly allocated slice.
+//
+// It must only be used within those functions and only by the task goroutine:
+// it exists to improve performance and therefore deliberately performs no
+// synchronization.
+//
+// Callers should pass a constant size, which lets the compiler inline the call
+// and optimize out the size check below.
+func (t *Task) CopyScratchBuffer(size int) []byte {
+	if size <= copyScratchBufferLen {
+		return t.copyScratchBuffer[:size]
+	}
+	return make([]byte, size)
+}
+
+// FutexWaiter returns the Task's futex.Waiter.
+//
+// The underlying field is documented as exclusive to the task goroutine, so
+// the returned Waiter should only be used from that goroutine.
+func (t *Task) FutexWaiter() *futex.Waiter {
+ return t.futexWaiter
+}
+
+// ExtractTask implements TaskMaybe.ExtractTask. A Task is its own Task, so t
+// is returned unchanged.
+func (t *Task) ExtractTask() *Task {
+ return t
+}
+
+// TaskContext returns t's TaskContext.
+//
+// The result points at the field embedded in t, not at a copy, so writes
+// through it modify the task.
+//
+// Precondition: The caller must be running on the task goroutine, or t.mu must
+// be locked.
+func (t *Task) TaskContext() *TaskContext {
+ return &t.tc
+}
+
+// TaskResources returns t's TaskResources.
+//
+// The result points at the field embedded in t, not at a copy, so writes
+// through it modify the task.
+//
+// Precondition: The caller must be running on the task goroutine, or t.mu must
+// be locked.
+func (t *Task) TaskResources() *TaskResources {
+ return &t.tr
+}
+
+// WithMuLocked executes f with t.mu locked.
+//
+// f is called synchronously with t as its argument. The unlock is deferred, so
+// t.mu is released even if f panics.
+//
+// NOTE(review): f must not call methods that themselves lock t.mu (Name,
+// SetName and StartTime do), assuming t.mu is a non-reentrant mutex — confirm
+// against the field's declaration.
+func (t *Task) WithMuLocked(f func(*Task)) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ f(t)
+}
+
+// Kernel returns the Kernel containing t.
+//
+// The k pointer is documented as immutable, so no locking is performed.
+func (t *Task) Kernel() *Kernel {
+ return t.k
+}
+
+// Value implements context.Context.Value.
+//
+// It answers the context keys a Task knows about and returns nil for every
+// other key.
+func (t *Task) Value(key interface{}) interface{} {
+	switch key {
+	// The task itself and state stored directly on it.
+	case CtxTask:
+		return t
+	case CtxCanTrace:
+		return t.CanTrace
+	case CtxUTSNamespace:
+		return t.utsns
+	case CtxIPCNamespace:
+		return t.ipcns
+	case auth.CtxCredentials:
+		return t.creds
+	case fs.CtxRoot:
+		return t.FSContext().RootDirectory()
+
+	// State reached through the thread group.
+	case CtxPIDNamespace:
+		return t.tg.pidns
+	case limits.CtxLimits:
+		return t.tg.limits
+
+	// Kernel-wide state. The Kernel is returned for both the kernel key and
+	// the platform key.
+	case CtxKernel, platform.CtxPlatform:
+		return t.k
+	case ktime.CtxRealtimeClock:
+		return t.k.RealtimeClock()
+	case uniqueid.CtxGlobalUniqueID:
+		return t.k.UniqueID()
+	case uniqueid.CtxInotifyCookie:
+		return t.k.GenerateInotifyCookie()
+	}
+	return nil
+}
+
+// SetClearTID sets t's cleartid, the address in the task's virtual address
+// space whose ThreadID is zeroed (and whose futex waiters are woken) when the
+// task exits. An addr of 0 disables that behavior.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) SetClearTID(addr usermem.Addr) {
+ t.cleartid = addr
+}
+
+// SetSyscallRestartBlock sets the restart block for use in
+// restart_syscall(2). After registering a restart block, a syscall should
+// return ERESTART_RESTARTBLOCK to request a restart using the block.
+//
+// Any previously registered block is overwritten. The block is handed back,
+// and cleared, by Task.SyscallRestartBlock.
+//
+// Precondition: The caller must be running on the task goroutine.
+func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) {
+ t.syscallRestartBlock = r
+}
+
+// SyscallRestartBlock returns the currently registered restart block for use in
+// restart_syscall(2). This function is *not* idempotent and may be called once
+// per syscall. This function must not be called if a restart block has not been
+// registered for the current syscall.
+//
+// Precondition: The caller must be running on the task goroutine.
+func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
+ r := t.syscallRestartBlock
+ // Explicitly set the restart block to nil so that a future syscall can't
+ // accidentally reuse it.
+ t.syscallRestartBlock = nil
+ return r
+}
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
new file mode 100644
index 000000000..ce12cdb64
--- /dev/null
+++ b/pkg/sentry/kernel/task_acct.go
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Accounting, limits, timers.
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// IOUsage returns the io usage of the thread.
+//
+// The task's own usage.IO pointer is returned, not a copy. The ioUsage pointer
+// is documented as immutable, so no locking is performed here.
+func (t *Task) IOUsage() *usage.IO {
+ return t.ioUsage
+}
+
+// IOUsage returns the total io usage of all dead and live threads in the group.
+//
+// The result is a new usage.IO value built under the owner's read lock: a copy
+// of the group-level counters with the counters of every task currently on
+// the group's task list accumulated into it. Callers may modify it freely.
+func (tg *ThreadGroup) IOUsage() *usage.IO {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+
+	total := *tg.ioUsage
+	// Fold in the tasks that are still on the list.
+	for task := tg.tasks.Front(); task != nil; task = task.Next() {
+		total.Accumulate(task.IOUsage())
+	}
+	return &total
+}
+
+// Name returns t's name, read from its TaskContext while holding t.mu.
+func (t *Task) Name() string {
+	t.mu.Lock()
+	name := t.tc.Name
+	t.mu.Unlock()
+	return name
+}
+
+// SetName changes t's name.
+//
+// The new name is written to t's TaskContext under t.mu, and the change is
+// reported through t.Debugf before the lock is released.
+func (t *Task) SetName(name string) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.tc.Name = name
+ t.Debugf("Set thread name to %q", name)
+}
+
+// SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft
+// limits on CPU time used by this process.
+//
+// l is dereferenced unconditionally and passed by value to the thread group
+// timer, so it must not be nil.
+func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit) {
+ tg.Timer().applyCPULimits(*l)
+}
+
+// Limits implements context.Context.Limits.
+//
+// It returns the LimitSet of t's thread group.
+func (t *Task) Limits() *limits.LimitSet {
+ return t.ThreadGroup().Limits()
+}
+
+// StartTime returns t's start time, read while holding t.mu.
+func (t *Task) StartTime() ktime.Time {
+	t.mu.Lock()
+	started := t.startTime
+	t.mu.Unlock()
+	return started
+}
+
+// MaxRSS returns the maximum resident set size of the task in bytes. which
+// should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or
+// RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these
+// flags.
+func (t *Task) MaxRSS(which int32) uint64 {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+
+ switch which {
+ case linux.RUSAGE_SELF, linux.RUSAGE_THREAD:
+ // If there's an active mm we can use its value.
+ if mm := t.MemoryManager(); mm != nil {
+ if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS {
+ return mmMaxRSS
+ }
+ }
+ return t.tg.maxRSS
+ case linux.RUSAGE_CHILDREN:
+ return t.tg.childMaxRSS
+ case linux.RUSAGE_BOTH:
+ maxRSS := t.tg.maxRSS
+ if maxRSS < t.tg.childMaxRSS {
+ maxRSS = t.tg.childMaxRSS
+ }
+ if mm := t.MemoryManager(); mm != nil {
+ if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS {
+ return mmMaxRSS
+ }
+ }
+ return maxRSS
+ default:
+ // We'll only get here if which is invalid.
+ return 0
+ }
+}
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
new file mode 100644
index 000000000..9fd24f134
--- /dev/null
+++ b/pkg/sentry/kernel/task_block.go
@@ -0,0 +1,207 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "time"
+
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// BlockWithTimeout blocks t until one of the following happens: an event is
+// received from C, the application monotonic clock indicates that timeout has
+// elapsed (only if haveTimeout is true), or t is interrupted. It returns:
+//
+// - The remaining timeout, which is guaranteed to be 0 if the timeout expired,
+// is never negative, and is unspecified if haveTimeout is false.
+//
+// - An error which is nil if an event is received from C, ETIMEDOUT if the
+// timeout expired, and syserror.ErrInterrupted if t is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) {
+	if !haveTimeout {
+		return timeout, t.block(C, nil)
+	}
+
+	begin := t.Kernel().MonotonicClock().Now()
+	err := t.BlockWithDeadline(C, true, begin.Add(timeout))
+	if err == syserror.ETIMEDOUT {
+		// Timeout, explicitly return a remaining duration of 0.
+		return 0, err
+	}
+
+	// Work out how much of the timeout is left. Even when the wait ended for
+	// a reason other than the timeout, the whole budget may have been spent
+	// by now, so the result is floored at 0 to make the returned duration
+	// directly usable.
+	elapsed := t.Kernel().MonotonicClock().Now().Sub(begin)
+	if left := timeout - elapsed; left > 0 {
+		return left, err
+	}
+	return 0, err
+}
+
+// BlockWithDeadline blocks t until an event is received from C, the
+// application monotonic clock indicates a time of deadline (only if
+// haveDeadline is true), or t is interrupted. It returns nil if an event is
+// received from C, ETIMEDOUT if the deadline expired, and
+// syserror.ErrInterrupted if t is interrupted.
+//
+// If haveDeadline is false, deadline is ignored and the task's blocking timer
+// is left untouched.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error {
+ if !haveDeadline {
+ return t.block(C, nil)
+ }
+
+ // Start the timeout timer.
+ t.blockingTimer.Swap(ktime.Setting{
+ Enabled: true,
+ Next: deadline,
+ })
+
+ err := t.block(C, t.blockingTimerChan)
+
+ // Stop the timeout timer and drain the channel. The drain is non-blocking:
+ // it removes an expiry notification if one is pending (presumably so that
+ // it cannot end a later wait prematurely) and otherwise does nothing.
+ t.blockingTimer.Swap(ktime.Setting{})
+ select {
+ case <-t.blockingTimerChan:
+ default:
+ }
+
+ return err
+}
+
+// BlockWithTimer blocks t until an event is received from C or tchan, or t is
+// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an
+// event is received from tchan, and syserror.ErrInterrupted if t is
+// interrupted.
+//
+// Unlike BlockWithDeadline, this does not arm, stop or drain any timer; the
+// caller owns whatever feeds tchan.
+//
+// Most clients should use BlockWithDeadline or BlockWithTimeout instead.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithTimer(C chan struct{}, tchan <-chan struct{}) error {
+ return t.block(C, tchan)
+}
+
+// Block blocks t until an event is received from C or t is interrupted. It
+// returns nil if an event is received from C and syserror.ErrInterrupted if t
+// is interrupted. There is no timeout.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Block(C chan struct{}) error {
+ return t.block(C, nil)
+}
+
+// block blocks a task on one of many events: an event on C, an interrupt, or
+// an event on timerChan. It returns nil, syserror.ErrInterrupted or
+// syserror.ETIMEDOUT respectively.
+//
+// timerChan may be nil, in which case the timeout case can never be selected
+// (a receive from a nil channel blocks forever).
+//
+// N.B. defer is too expensive to be used here.
+func (t *Task) block(C chan struct{}, timerChan <-chan struct{}) error {
+ // Fast path if the request is already done.
+ select {
+ case <-C:
+ return nil
+ default:
+ }
+
+ // Deactivate our address space, we don't need it.
+ interrupt := t.SleepStart()
+
+ select {
+ case <-C:
+ t.SleepFinish(true)
+ return nil
+
+ case <-interrupt:
+ // SleepFinish(false) re-queues the interrupt that was just consumed.
+ t.SleepFinish(false)
+ // Return the indicated error on interrupt.
+ return syserror.ErrInterrupted
+
+ case <-timerChan:
+ // We've timed out.
+ t.SleepFinish(true)
+ return syserror.ETIMEDOUT
+ }
+}
+
+// SleepStart implements amutex.Sleeper.SleepStart.
+//
+// It deactivates t, accounts the task goroutine as entering the
+// TaskGoroutineBlockedInterruptible state, and returns the task's interrupt
+// channel for the caller to select on. Each call must be paired with a call
+// to SleepFinish.
+func (t *Task) SleepStart() <-chan struct{} {
+ t.Deactivate()
+ t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
+ return t.interruptChan
+}
+
+// SleepFinish implements amutex.Sleeper.SleepFinish.
+//
+// success must be false when the sleep ended because a value was received
+// from the channel returned by SleepStart; the interrupt is then re-queued.
+// In every case the blocked-interruptible accounting is closed and t is
+// re-activated.
+func (t *Task) SleepFinish(success bool) {
+ if !success {
+ // The interrupted notification is consumed only at the top-level
+ // (Run). Therefore we attempt to reset the pending notification.
+ // This will also elide our next entry back into the task, so we
+ // will process signals, state changes, etc.
+ t.interruptSelf()
+ }
+ t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
+ t.Activate()
+}
+
+// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
+//
+// If deactivate is true, t is deactivated first. The task goroutine is then
+// accounted as entering the TaskGoroutineBlockedUninterruptible state. No
+// interrupt channel is returned: the sleep cannot be interrupted.
+func (t *Task) UninterruptibleSleepStart(deactivate bool) {
+ if deactivate {
+ t.Deactivate()
+ }
+ t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible)
+}
+
+// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish.
+//
+// It mirrors UninterruptibleSleepStart in reverse order: the
+// TaskGoroutineBlockedUninterruptible accounting is closed first, and then t
+// is re-activated if activate is true.
+func (t *Task) UninterruptibleSleepFinish(activate bool) {
+ t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible)
+ if activate {
+ t.Activate()
+ }
+}
+
+// interrupted reports whether interrupt or interruptSelf has been called at
+// least once since the previous call to interrupted. It never blocks.
+func (t *Task) interrupted() bool {
+	select {
+	case <-t.interruptChan:
+		// A notification was pending; receiving it here consumes it.
+		return true
+	default:
+	}
+	return false
+}
+
+// interrupt unblocks the task and interrupts it if it's currently running in
+// userspace.
+//
+// It queues an interrupt notification via interruptSelf and then calls
+// Interrupt on the task's platform context.
+func (t *Task) interrupt() {
+ t.interruptSelf()
+ t.p.Interrupt()
+}
+
+// interruptSelf is like interrupt, but omits the platform context interrupt;
+// used on its own, it can only be called by the task goroutine.
+//
+// The send is non-blocking: if the interrupt channel cannot accept another
+// notification, the new one is dropped as a duplicate.
+func (t *Task) interruptSelf() {
+ select {
+ case t.interruptChan <- struct{}{}:
+ t.Debugf("Interrupt queued")
+ default:
+ t.Debugf("Dropping duplicate interrupt")
+ }
+ // platform.Context.Interrupt() is unnecessary since a task goroutine
+ // calling interruptSelf() cannot also be blocked in
+ // platform.Context.Switch().
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
new file mode 100644
index 000000000..3a74abdfb
--- /dev/null
+++ b/pkg/sentry/kernel/task_clone.go
@@ -0,0 +1,475 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bpf"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SharingOptions controls what resources are shared by a new task created by
+// Task.Clone, or an existing task affected by Task.Unshare.
+//
+// Each field is phrased as "new" (i.e. not shared); the zero value therefore
+// requests that everything be shared. The EINVAL/EPERM notes below describe
+// the validation performed at the top of Task.Clone.
+type SharingOptions struct {
+ // If NewAddressSpace is true, the task should have an independent virtual
+ // address space.
+ //
+ // Task.Clone returns EINVAL if this is set while NewSignalHandlers is
+ // false.
+ NewAddressSpace bool
+
+ // If NewSignalHandlers is true, the task should use an independent set of
+ // signal handlers.
+ //
+ // Task.Clone returns EINVAL if this is set while NewThreadGroup is false.
+ NewSignalHandlers bool
+
+ // If NewThreadGroup is true, the task should be the leader of its own
+ // thread group. TerminationSignal is the signal that the thread group
+ // will send to its parent when it exits. If NewThreadGroup is false,
+ // TerminationSignal is ignored.
+ NewThreadGroup bool
+ TerminationSignal linux.Signal
+
+ // If NewPIDNamespace is true:
+ //
+ // - In the context of Task.Clone, the new task should be the init task
+ // (TID 1) in a new PID namespace.
+ //
+ // - In the context of Task.Unshare, the task should create a new PID
+ // namespace, and all subsequent clones of the task should be members of
+ // the new PID namespace.
+ //
+ // Task.Clone returns EINVAL if this is set while NewThreadGroup is false,
+ // or while the caller already has a pending child PID namespace, and
+ // EPERM if the caller's credentials lack CAP_SYS_ADMIN.
+ NewPIDNamespace bool
+
+ // If NewUserNamespace is true, the task should have an independent user
+ // namespace.
+ //
+ // Task.Clone returns EINVAL unless NewThreadGroup and NewFSContext are
+ // both set as well, and EPERM if the caller is chrooted.
+ NewUserNamespace bool
+
+ // If NewNetworkNamespace is true, the task should have an independent
+ // network namespace. (Note that network namespaces are not really
+ // implemented; see comment on Task.netns for details.)
+ //
+ // Task.Clone returns EPERM if the caller's credentials lack
+ // CAP_SYS_ADMIN.
+ NewNetworkNamespace bool
+
+ // If NewFiles is true, the task should use an independent file descriptor
+ // table.
+ NewFiles bool
+
+ // If NewFSContext is true, the task should have an independent FSContext.
+ NewFSContext bool
+
+ // If NewUTSNamespace is true, the task should have an independent UTS
+ // namespace.
+ //
+ // Task.Clone returns EPERM if the caller's credentials lack
+ // CAP_SYS_ADMIN.
+ NewUTSNamespace bool
+
+ // If NewIPCNamespace is true, the task should have an independent IPC
+ // namespace.
+ NewIPCNamespace bool
+}
+
+// CloneOptions controls the behavior of Task.Clone.
+//
+// Options come in flag/value pairs (SetTLS/TLS, ChildSetTID and
+// ChildClearTID/ChildTID, ParentSetTID/ParentTID): the value is only
+// consulted when its flag is true.
+type CloneOptions struct {
+ // SharingOptions defines the set of resources that the new task will share
+ // with its parent.
+ SharingOptions
+
+ // Stack is the initial stack pointer of the new task. If Stack is 0, the
+ // new task will start with the same stack pointer as its parent.
+ Stack usermem.Addr
+
+ // If SetTLS is true, set the new task's TLS (thread-local storage)
+ // descriptor to TLS. If SetTLS is false, TLS is ignored.
+ SetTLS bool
+ TLS usermem.Addr
+
+ // If ChildClearTID is true, when the child exits, 0 is written to the
+ // address ChildTID in the child's memory, and if the write is successful a
+ // futex wake on the same address is performed.
+ //
+ // If ChildSetTID is true, the child's thread ID (in the child's PID
+ // namespace) is written to address ChildTID in the child's memory. (As in
+ // Linux, failed writes are silently ignored.)
+ //
+ // Both flags share the single ChildTID address.
+ ChildClearTID bool
+ ChildSetTID bool
+ ChildTID usermem.Addr
+
+ // If ParentSetTID is true, the child's thread ID (in the parent's PID
+ // namespace) is written to address ParentTID in the parent's memory. (As
+ // in Linux, failed writes are silently ignored.)
+ //
+ // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID
+ // causes the child's thread ID to be written to ptid in both the parent
+ // and child's memory, but this is a documentation error fixed by
+ // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID").
+ ParentSetTID bool
+ ParentTID usermem.Addr
+
+ // If Vfork is true, place the parent in vforkStop until the cloned task
+ // releases its TaskContext.
+ Vfork bool
+
+ // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for
+ // this clone(), and do not ptrace-attach the caller's tracer to the new
+ // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate).
+ Untraced bool
+
+ // If InheritTracer is true, ptrace-attach the caller's tracer to the new
+ // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported
+ // for it. If both Untraced and InheritTracer are true, no event will be
+ // reported, but tracer inheritance will still occur.
+ InheritTracer bool
+}
+
+// Clone implements the clone(2) syscall and returns the thread ID of the new
+// task in t's PID namespace. Clone may return both a non-zero thread ID and a
+// non-nil error.
+//
+// Preconditions: The caller must be running Task.doSyscallInvoke on the task
+// goroutine.
+func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
+ // Since signal actions may refer to application signal handlers by virtual
+ // address, any set of signal handlers must refer to the same address
+ // space.
+ if !opts.NewSignalHandlers && opts.NewAddressSpace {
+ return 0, nil, syserror.EINVAL
+ }
+ // In order for the behavior of thread-group-directed signals to be sane,
+ // all tasks in a thread group must share signal handlers.
+ if !opts.NewThreadGroup && opts.NewSignalHandlers {
+ return 0, nil, syserror.EINVAL
+ }
+ // All tasks in a thread group must be in the same PID namespace.
+ if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) {
+ return 0, nil, syserror.EINVAL
+ }
+ // The two different ways of specifying a new PID namespace are
+ // incompatible.
+ if opts.NewPIDNamespace && t.childPIDNamespace != nil {
+ return 0, nil, syserror.EINVAL
+ }
+ // Thread groups and FS contexts cannot span user namespaces.
+ if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
+ // single clone(2) or unshare(2) call, the user namespace is guaranteed to
+ // be created first, giving the child (clone(2)) or caller (unshare(2))
+ // privileges over the remaining namespaces created by the call." -
+ // user_namespaces(7)
+ creds := t.Credentials()
+ var userns *auth.UserNamespace
+ if opts.NewUserNamespace {
+ var err error
+ // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
+ // the caller is in a chroot environment (i.e., the caller's root
+ // directory does not match the root directory of the mount namespace
+ // in which it resides)." - clone(2). Neither chroot(2) nor
+ // user_namespaces(7) document this.
+ if t.IsChrooted() {
+ return 0, nil, syserror.EPERM
+ }
+ userns, err = creds.NewChildUserNamespace()
+ if err != nil {
+ return 0, nil, err
+ }
+ }
+ if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapability(linux.CAP_SYS_ADMIN) {
+ return 0, nil, syserror.EPERM
+ }
+
+ utsns := t.UTSNamespace()
+ if opts.NewUTSNamespace {
+ // Note that this must happen after NewUserNamespace so we get
+ // the new userns if there is one.
+ utsns = t.UTSNamespace().Clone(userns)
+ }
+
+ ipcns := t.IPCNamespace()
+ if opts.NewIPCNamespace {
+ // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
+ // namespace"
+ ipcns = NewIPCNamespace()
+ }
+
+ tc, err := t.tc.Fork(t, !opts.NewAddressSpace)
+ if err != nil {
+ return 0, nil, err
+ }
+ // clone() returns 0 in the child.
+ tc.Arch.SetReturn(0)
+ if opts.Stack != 0 {
+ tc.Arch.SetStack(uintptr(opts.Stack))
+ }
+ if opts.SetTLS {
+ tc.Arch.StateData().Regs.Fs_base = uint64(opts.TLS)
+ }
+
+ pidns := t.tg.pidns
+ if t.childPIDNamespace != nil {
+ pidns = t.childPIDNamespace
+ } else if opts.NewPIDNamespace {
+ pidns = pidns.NewChild(userns)
+ }
+ tg := t.tg
+ parent := t.parent
+ if opts.NewThreadGroup {
+ sh := t.tg.signalHandlers
+ if opts.NewSignalHandlers {
+ sh = sh.Fork()
+ }
+ tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+ parent = t
+ }
+ cfg := &TaskConfig{
+ Kernel: t.k,
+ Parent: parent,
+ ThreadGroup: tg,
+ TaskContext: tc,
+ TaskResources: t.tr.Fork(!opts.NewFiles, !opts.NewFSContext),
+ Niceness: t.Niceness(),
+ Credentials: creds.Fork(),
+ NetworkNamespaced: t.netns,
+ AllowedCPUMask: t.CPUMask(),
+ UTSNamespace: utsns,
+ IPCNamespace: ipcns,
+ }
+ if opts.NewNetworkNamespace {
+ cfg.NetworkNamespaced = true
+ }
+ nt, err := t.tg.pidns.owner.NewTask(cfg)
+ if err != nil {
+ if opts.NewThreadGroup {
+ tg.release()
+ }
+ return 0, nil, err
+ }
+
+ // "A child process created via fork(2) inherits a copy of its parent's
+ // alternate signal stack settings" - sigaltstack(2).
+ //
+ // However kernel/fork.c:copy_process() adds a limitation to this:
+ // "sigaltstack should be cleared when sharing the same VM".
+ if opts.NewAddressSpace || opts.Vfork {
+ nt.SetSignalStack(t.SignalStack())
+ }
+
+ if userns != nil {
+ if err := nt.SetUserNamespace(userns); err != nil {
+ // This shouldn't be possible: userns was created from nt.creds, so
+ // nt should have CAP_SYS_ADMIN in userns.
+ panic("Task.Clone: SetUserNamespace failed: " + err.Error())
+ }
+ }
+
+ // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
+ // nt that it must receive before its task goroutine starts running.
+ tid := nt.k.tasks.Root.IDOfTask(nt)
+ defer nt.Start(tid)
+
+ // "If fork/clone and execve are allowed by @prog, any child processes will
+ // be constrained to the same filters and system call ABI as the parent." -
+ // Documentation/prctl/seccomp_filter.txt
+ nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...)
+ if opts.Vfork {
+ nt.vforkParent = t
+ }
+
+ if opts.ChildClearTID {
+ nt.SetClearTID(opts.ChildTID)
+ }
+ if opts.ChildSetTID {
+ // Can't use Task.CopyOut, which assumes AddressSpaceActive.
+ usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{})
+ }
+ ntid := t.tg.pidns.IDOfTask(nt)
+ if opts.ParentSetTID {
+ t.CopyOut(opts.ParentTID, ntid)
+ }
+
+ kind := ptraceCloneKindClone
+ if opts.Vfork {
+ kind = ptraceCloneKindVfork
+ } else if opts.TerminationSignal == linux.SIGCHLD {
+ kind = ptraceCloneKindFork
+ }
+ if t.ptraceClone(kind, nt, opts) {
+ if opts.Vfork {
+ return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
+ }
+ return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
+ }
+ if opts.Vfork {
+ t.maybeBeginVforkStop(nt)
+ return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
+ }
+ return ntid, nil, nil
+}
+
+// maybeBeginVforkStop checks if a previously-started vfork child is still
+// running and has not yet released its MM, such that its parent t should enter
+// a vforkStop. If t already has a SIGKILL pending, t does not stop and child.vforkParent is cleared instead.
+//
+// Preconditions: The caller must be running on t's task goroutine.
+func (t *Task) maybeBeginVforkStop(child *Task) {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.killedLocked() {
+ child.vforkParent = nil // t will not stop, so child has no parent left to unstop
+ return
+ }
+ if child.vforkParent == t { // still set, i.e. not yet cleared by child.unstopVforkParent
+ t.beginInternalStopLocked((*vforkStop)(nil))
+ }
+}
+
+func (t *Task) unstopVforkParent() { // ends the vforkStop (if any) of t's vfork parent and then forgets that parent
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if p := t.vforkParent; p != nil {
+ p.tg.signalHandlers.mu.Lock() // the parent's signal mutex, not t's
+ defer p.tg.signalHandlers.mu.Unlock()
+ if _, ok := p.stop.(*vforkStop); ok { // only end the stop if p is currently in a vforkStop
+ p.endInternalStopLocked()
+ }
+ // Parent no longer needs to be unstopped.
+ t.vforkParent = nil
+ }
+}
+
+type runSyscallAfterPtraceEventClone struct { // run state that Clone returns when t.ptraceClone reports true
+ vforkChild *Task // the newly created task when the clone requested Vfork; nil otherwise
+
+ // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
+ // PID namespace. vforkChildTID must be stored since the child may exit and
+ // release its TID before the PTRACE_EVENT stop ends.
+ vforkChildTID ThreadID
+}
+
+func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { // resumes the clone syscall path after the ptrace clone event
+ if r.vforkChild != nil {
+ t.maybeBeginVforkStop(r.vforkChild) // vfork case: t may need to stop until the child releases its MM
+ return &runSyscallAfterVforkStop{r.vforkChildTID}
+ }
+ return (*runSyscallExit)(nil) // non-vfork case: proceed directly to syscall exit
+}
+
+type runSyscallAfterVforkStop struct { // run state that follows a vfork parent's call to maybeBeginVforkStop
+ // childTID has the same meaning as
+ // runSyscallAfterPtraceEventClone.vforkChildTID.
+ childTID ThreadID
+}
+
+func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { // reports vfork completion to ptrace, then exits the syscall
+ t.ptraceVforkDone(r.childTID) // uses the saved TID, since the child may already have released its own
+ return (*runSyscallExit)(nil)
+}
+
+// Unshare changes the set of resources t shares with other tasks, as specified
+// by opts. It returns EINVAL for unsupported sharing changes and EPERM when a namespace change is not permitted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Unshare(opts *SharingOptions) error {
+ // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and
+ // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if
+ // t is the only task using its MM, which due to clone(2)'s rules implies
+ // that it is also the only task using its signal handlers / in its thread
+ // group, and cause EINVAL to be returned otherwise.
+ //
+ // Since we don't count the number of tasks using each address space or set
+ // of signal handlers, we reject NewSignalHandlers and NewAddressSpace
+ // altogether, and interpret NewThreadGroup as requiring that t be the only
+ // member of its thread group. This seems to be logically coherent, in the
+ // sense that clone(2) allows a task to share signal handlers and address
+ // spaces with tasks in other thread groups.
+ if opts.NewAddressSpace || opts.NewSignalHandlers {
+ return syserror.EINVAL
+ }
+ if opts.NewThreadGroup {
+ t.tg.signalHandlers.mu.Lock()
+ if t.tg.tasksCount != 1 {
+ t.tg.signalHandlers.mu.Unlock()
+ return syserror.EINVAL
+ }
+ t.tg.signalHandlers.mu.Unlock()
+ // This isn't racy because we're the only living task, and therefore
+ // the only task capable of creating new ones, in our thread group.
+ }
+ if opts.NewUserNamespace {
+ if t.IsChrooted() {
+ return syserror.EPERM
+ }
+ // This temporary is needed because Go. NOTE(review): presumably the result of Credentials() is not addressable -- confirm.
+ creds := t.Credentials()
+ newUserNS, err := creds.NewChildUserNamespace()
+ if err != nil {
+ return err
+ }
+ err = t.SetUserNamespace(newUserNS)
+ if err != nil {
+ return err
+ }
+ }
+ haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) // evaluated after any user namespace switch above
+ if opts.NewPIDNamespace {
+ if !haveCapSysAdmin {
+ return syserror.EPERM
+ }
+ t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) // t itself stays in its current PID namespace
+ }
+ t.mu.Lock() // held for the remaining updates to t's namespaces and task resources
+ defer t.mu.Unlock()
+ if opts.NewNetworkNamespace {
+ if !haveCapSysAdmin {
+ return syserror.EPERM
+ }
+ t.netns = true
+ }
+ if opts.NewUTSNamespace {
+ if !haveCapSysAdmin {
+ return syserror.EPERM
+ }
+ // Note that this must happen after NewUserNamespace, so the
+ // new user namespace is used if there is one.
+ t.utsns = t.utsns.Clone(t.creds.UserNamespace)
+ }
+ if opts.NewIPCNamespace {
+ if !haveCapSysAdmin {
+ return syserror.EPERM
+ }
+ // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
+ // namespace" - clone(2)
+ t.ipcns = NewIPCNamespace()
+ }
+ if opts.NewFiles {
+ oldFDMap := t.tr.FDMap
+ t.tr.FDMap = oldFDMap.Fork() // switch to a private copy first ...
+ oldFDMap.DecRef() // ... then drop t's reference on the previously shared map
+ }
+ if opts.NewFSContext {
+ oldFS := t.tr.FSContext
+ t.tr.FSContext = oldFS.Fork() // same copy-then-release sequence as for the FD map
+ oldFS.DecRef()
+ }
+ return nil
+}
+
+// vforkStop is a TaskStop imposed on a task that creates a child with
+// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
+// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
+// that the child and parent share mappings until the child execve()s into a
+// new process image or exits.)
+type vforkStop struct{}
+
+// Killable implements TaskStop.Killable. A vforkStop is killable, so killLocked ends it.
+func (*vforkStop) Killable() bool { return true }
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
new file mode 100644
index 000000000..5c563ba08
--- /dev/null
+++ b/pkg/sentry/kernel/task_context.go
@@ -0,0 +1,179 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "errors"
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// ErrNoSyscalls is returned by LoadTaskImage if no syscall table is found for the loaded image's OS and architecture.
+var ErrNoSyscalls = errors.New("no syscall table found")
+
+// Auxmap contains miscellaneous data for the task, as untyped values keyed by string.
+type Auxmap map[string]interface{}
+
+// TaskContext is the subset of a task's data that is provided by the loader.
+type TaskContext struct {
+ // Name is the thread name set by the prctl(PR_SET_NAME) system call. LoadTaskImage initializes it from the loader.
+ Name string
+
+ // Arch is the architecture-specific context (registers, etc.)
+ Arch arch.Context
+
+ // MemoryManager is the task's address space. release sets it to nil after dropping this context's user count.
+ MemoryManager *mm.MemoryManager
+
+ // fu implements futexes in the address space. Fork shares it only when the address space is shared.
+ fu *futex.Manager
+
+ // st is the task's syscall table.
+ st *SyscallTable
+}
+
+// release releases all resources held by the TaskContext. release is called by
+// the task when it execs into a new TaskContext or exits. Calling it again is harmless: MemoryManager is nil by then.
+func (tc *TaskContext) release() {
+ // Nil out pointers so that if the task is saved after release, it doesn't
+ // follow the pointers to possibly now-invalid objects.
+ if tc.MemoryManager != nil {
+ // TODO (NOTE(review): unexplained; possibly about passing context.Background() instead of a task context -- confirm)
+ tc.MemoryManager.DecUsers(context.Background())
+ tc.MemoryManager = nil
+ }
+ tc.fu = nil
+}
+
+// Fork returns a duplicate of tc. The copied TaskContext always has an
+// independent arch.Context. If shareAddressSpace is true, the copied
+// TaskContext shares an address space with the original; otherwise, the copied
+// TaskContext has an independent address space that is initially a duplicate
+// of the original's.
+//
+// The copy keeps tc's Name and syscall table. The futex manager is shared
+// only when the address space is shared; otherwise a new one is created. Fork
+// returns an error only if duplicating the address space fails.
+func (tc *TaskContext) Fork(ctx context.Context, shareAddressSpace bool) (*TaskContext, error) {
+ newTC := &TaskContext{
+ // The thread name is inherited by the child; without this the forked
+ // context would start with an empty Name and not be a true duplicate.
+ Name: tc.Name,
+ Arch: tc.Arch.Fork(),
+ st: tc.st,
+ }
+ if shareAddressSpace {
+ newTC.MemoryManager = tc.MemoryManager
+ if newTC.MemoryManager != nil {
+ // The copy is an additional user of the shared MM.
+ if !newTC.MemoryManager.IncUsers() {
+ // Shouldn't be possible since tc.MemoryManager should be a
+ // counted user.
+ panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager"))
+ }
+ }
+ newTC.fu = tc.fu
+ } else {
+ newMM, err := tc.MemoryManager.Fork(ctx)
+ if err != nil {
+ return nil, err
+ }
+ newTC.MemoryManager = newMM
+ // TODO: revisit when shmem is supported.
+ newTC.fu = futex.NewManager()
+ }
+ return newTC, nil
+}
+
+// Arch returns t's arch.Context, as stored in t's current TaskContext.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Arch() arch.Context {
+ return t.tc.Arch
+}
+
+// MemoryManager returns t's MemoryManager. MemoryManager does not take an
+// additional reference on the returned MM. The result is nil once t's TaskContext has been released.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) MemoryManager() *mm.MemoryManager {
+ return t.tc.MemoryManager
+}
+
+// Futex returns t's futex manager. The result is nil once t's TaskContext has been released.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Futex() *futex.Manager {
+ return t.tc.fu
+}
+
+// SyscallTable returns t's syscall table, as stored in t's current TaskContext.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) SyscallTable() *SyscallTable {
+ return t.tc.st
+}
+
+// Stack returns the userspace stack, built from t's arch.Context, t's MemoryManager, and the arch.Context's stack address.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Stack() *arch.Stack {
+ return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())}
+}
+
+// LoadTaskImage loads filename into a new TaskContext. It returns the loader's error, or ErrNoSyscalls if no syscall table matches.
+//
+// It takes several arguments:
+// * mounts: MountNamespace to lookup filename in
+// * root: Root to lookup filename under
+// * wd: Working directory to lookup filename under
+// * maxTraversals: maximum number of symlinks to follow
+// * filename: path to binary to load
+// * argv: Binary argv
+// * envv: Binary envv
+// * fs: Binary FeatureSet (this parameter shadows the fs package inside the function body)
+func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) {
+ // Prepare a new user address space to load into.
+ m := mm.NewMemoryManager(k)
+ defer m.DecUsers(ctx) // runs on every return path, including success (see IncUsers below)
+
+ os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso)
+ if err != nil {
+ return nil, err
+ }
+
+ // Lookup our new syscall table.
+ st, ok := LookupSyscallTable(os, ac.Arch())
+ if !ok {
+ // No syscall table found. Yikes.
+ return nil, ErrNoSyscalls
+ }
+
+ if !m.IncUsers() { // user count held by the returned TaskContext; it offsets the deferred DecUsers
+ panic("Failed to increment users count on new MM")
+ }
+ return &TaskContext{
+ Name: name,
+ Arch: ac,
+ MemoryManager: m,
+ fu: futex.NewManager(),
+ st: st,
+ }, nil
+}
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
new file mode 100644
index 000000000..2285847a2
--- /dev/null
+++ b/pkg/sentry/kernel/task_exec.go
@@ -0,0 +1,240 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements the machinery behind the execve() syscall. In brief, a
+// thread executes an execve() by killing all other threads in its thread
+// group, assuming the leader's identity, and then switching process images.
+//
+// This design is effectively mandated by Linux. From ptrace(2):
+//
+// """
+// execve(2) under ptrace
+// When one thread in a multithreaded process calls execve(2), the
+// kernel destroys all other threads in the process, and resets the
+// thread ID of the execing thread to the thread group ID (process ID).
+// (Or, to put things another way, when a multithreaded process does an
+// execve(2), at completion of the call, it appears as though the
+// execve(2) occurred in the thread group leader, regardless of which
+// thread did the execve(2).) This resetting of the thread ID looks
+// very confusing to tracers:
+//
+// * All other threads stop in PTRACE_EVENT_EXIT stop, if the
+// PTRACE_O_TRACEEXIT option was turned on. Then all other threads
+// except the thread group leader report death as if they exited via
+// _exit(2) with exit code 0.
+//
+// * The execing tracee changes its thread ID while it is in the
+// execve(2). (Remember, under ptrace, the "pid" returned from
+// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
+// That is, the tracee's thread ID is reset to be the same as its
+// process ID, which is the same as the thread group leader's thread
+// ID.
+//
+// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
+// option was turned on.
+//
+// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop
+// by this time, it appears to the tracer that the dead thread leader
+// "reappears from nowhere". (Note: the thread group leader does not
+// report death via WIFEXITED(status) until there is at least one
+// other live thread. This eliminates the possibility that the
+// tracer will see it dying and then reappearing.) If the thread
+// group leader was still alive, for the tracer this may look as if
+// thread group leader returns from a different system call than it
+// entered, or even "returned from a system call even though it was
+// not in any system call". If the thread group leader was not
+// traced (or was traced by a different tracer), then during
+// execve(2) it will appear as if it has become a tracee of the
+// tracer of the execing tracee.
+//
+// All of the above effects are the artifacts of the thread ID change in
+// the tracee.
+// """
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// execStop is a TaskStop that a task sets on itself when it wants to execve
+// and is waiting for the other tasks in its thread group to exit first.
+type execStop struct{}
+
+// Killable implements TaskStop.Killable. An execStop is killable, so killLocked ends it.
+func (*execStop) Killable() bool { return true }
+
+// Execve implements the execve(2) syscall by killing all other tasks in its
+// thread group and switching to newTC. Execve always takes ownership of newTC.
+// It returns EINTR, after releasing newTC, if the thread group is already exiting or another task is already execing.
+// Preconditions: The caller must be running Task.doSyscallInvoke on the task
+// goroutine.
+func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+
+ if t.tg.exiting || t.tg.execing != nil {
+ // We lost to a racing group-exit, kill, or exec from another thread
+ // and should just exit.
+ newTC.release()
+ return nil, syserror.EINTR
+ }
+
+ // Cancel any racing group stops.
+ t.tg.endGroupStopLocked(false)
+
+ // If the task has any siblings, they have to exit before the exec can
+ // continue.
+ t.tg.execing = t // cleared again by runSyscallAfterExecStop.execute
+ if t.tg.tasks.Front() != t.tg.tasks.Back() { // more than one task in the thread group
+ // "[All] other threads except the thread group leader report death as
+ // if they exited via _exit(2) with exit code 0." - ptrace(2)
+ for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
+ if t != sibling {
+ sibling.killLocked()
+ }
+ }
+ // The last sibling to exit will wake t.
+ t.beginInternalStopLocked((*execStop)(nil))
+ }
+
+ return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil // the image switch itself happens in the next run state
+}
+
+// The runSyscallAfterExecStop state continues execve(2) after all siblings of
+// a thread in the execve syscall have exited.
+type runSyscallAfterExecStop struct {
+ tc *TaskContext // the new image passed to Execve; installed into the task or released by execute
+}
+
+func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // completes execve: promotes t to leader, resets per-exec state, swaps in r.tc
+ t.tg.pidns.owner.mu.Lock()
+ t.tg.execing = nil
+ if t.killed() { // t was killed while waiting; abandon the exec and release the unused image
+ t.tg.pidns.owner.mu.Unlock()
+ r.tc.release()
+ return (*runInterrupt)(nil)
+ }
+ // We are the thread group leader now. Save our old thread ID for
+ // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
+ // point it will get a PID of 0, but this is consistent with Linux.
+ oldTID := ThreadID(0)
+ if tracer := t.Tracer(); tracer != nil {
+ oldTID = tracer.tg.pidns.tids[t] // t's TID as seen from the tracer's PID namespace
+ }
+ t.promoteLocked()
+ // "During an execve(2), the dispositions of handled signals are reset to
+ // the default; the dispositions of ignored signals are left unchanged. ...
+ // [The] signal mask is preserved across execve(2). ... [The] pending
+ // signal set is preserved across an execve(2)." - signal(7)
+ //
+ // Details:
+ //
+ // - If the thread group is sharing its signal handlers with another thread
+ // group via CLONE_SIGHAND, execve forces the signal handlers to be copied
+ // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal
+ // handlers, so we always make a copy.
+ //
+ // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
+ // restorer (if present), and mask are always reset. (See Linux's
+ // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
+ t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
+ t.endStopCond.L = &t.tg.signalHandlers.mu // re-point the condition variable at the new handlers' mutex
+ // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
+ t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable}
+ // "The termination signal is reset to SIGCHLD (see clone(2))."
+ t.tg.terminationSignal = linux.SIGCHLD
+ // execed indicates that the process can no longer join a process group
+ // in some scenarios (namely, the parent call setpgid(2) on the child).
+ // See the JoinProcessGroup function in sessions.go for more context.
+ t.tg.execed = true
+ // Maximum RSS is preserved across execve(2).
+ t.updateRSSLocked()
+ // Restartable sequence state is discarded.
+ t.rseqPreempted = false
+ t.rseqCPUAddr = 0
+ t.rseqCPU = -1
+ t.tg.rscr.Store(&RSEQCriticalRegion{})
+ t.tg.pidns.owner.mu.Unlock()
+
+ // Remove FDs with the CloseOnExec flag set.
+ t.FDMap().RemoveIf(func(file *fs.File, flags FDFlags) bool {
+ return flags.CloseOnExec
+ })
+
+ // Switch to the new process.
+ t.MemoryManager().Deactivate()
+ t.mu.Lock()
+ // Update credentials to reflect the execve. This should precede switching
+ // MMs to ensure that dumpability has been reset first, if needed.
+ t.updateCredsForExecLocked()
+ t.tc.release() // drop the old image before overwriting it
+ t.tc = *r.tc
+ t.mu.Unlock()
+ t.unstopVforkParent() // t has stopped using its old MM, so a vfork parent may resume
+ // NOTE: All locks must be dropped prior to calling Activate.
+ t.MemoryManager().Activate()
+
+ t.ptraceExec(oldTID)
+ return (*runSyscallExit)(nil)
+}
+
+// promoteLocked makes t the leader of its thread group. If t is already the
+// thread group leader, promoteLocked is a no-op. Otherwise t takes over the old leader's TIDs and start time.
+//
+// Preconditions: All other tasks in t's thread group, including the existing
+// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
+// be locked for writing.
+func (t *Task) promoteLocked() {
+ oldLeader := t.tg.leader
+ if t == oldLeader {
+ return
+ }
+ // Swap the leader's TIDs with the execing task's. The latter will be
+ // released when the old leader is reaped below.
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent { // every PID namespace from t's own up through its ancestors
+ oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader]
+ ns.tids[oldLeader] = oldTID
+ ns.tids[t] = leaderTID
+ ns.tasks[oldTID] = oldLeader
+ ns.tasks[leaderTID] = t
+ }
+
+ // Inherit the old leader's start time.
+ oldStartTime := oldLeader.StartTime()
+ t.mu.Lock()
+ t.startTime = oldStartTime
+ t.mu.Unlock()
+
+ t.tg.leader = t
+ t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
+ t.updateLogPrefixLocked()
+ // Reap the original leader. If it has a tracer, detach it instead of
+ // waiting for it to acknowledge the original leader's death.
+ oldLeader.exitParentNotified = true
+ oldLeader.exitParentAcked = true
+ if tracer := oldLeader.Tracer(); tracer != nil {
+ delete(tracer.ptraceTracees, oldLeader)
+ oldLeader.forgetTracerLocked()
+ // Notify the tracer that it will no longer be receiving these events
+ // from the tracee.
+ tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue)
+ }
+ oldLeader.exitNotifyLocked(false)
+}
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
new file mode 100644
index 000000000..3d49ae350
--- /dev/null
+++ b/pkg/sentry/kernel/task_exit.go
@@ -0,0 +1,1139 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements the task exit cycle:
+//
+// - Tasks are asynchronously requested to exit with Task.Kill.
+//
+// - When able, the task goroutine enters the exit path starting from state
+// runExit.
+//
+// - Other tasks observe completed exits with Task.Wait (which implements the
+// wait*() family of syscalls).
+
+import (
+ "errors"
+ "fmt"
+ "strconv"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// An ExitStatus is a value communicated from an exiting task or thread group
+// to the party that reaps it. The zero value has Code 0 and Signo 0.
+type ExitStatus struct {
+ // Code is the numeric value passed to the call to exit or exit_group that
+ // caused the exit. If the exit was not caused by such a call, Code is 0.
+ Code int
+
+ // Signo is the signal that caused the exit. If the exit was not caused by
+ // a signal, Signo is 0.
+ Signo int
+}
+
+// Signaled returns true if the ExitStatus indicates that the exiting task or
+// thread group was killed by a signal, i.e. if Signo is non-zero.
+func (es ExitStatus) Signaled() bool {
+ return es.Signo != 0
+}
+
+// Status returns the numeric representation of the ExitStatus returned by e.g.
+// the wait4() system call: the low 8 bits of Code in bits 8-15 and the low 8 bits of Signo in bits 0-7.
+func (es ExitStatus) Status() uint32 {
+ return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff)
+}
+
+// ShellExitCode returns the numeric exit code that Bash would return for an
+// exit status of es: 128 + Signo if the exit was caused by a signal, and Code otherwise.
+func (es ExitStatus) ShellExitCode() int {
+ if es.Signaled() {
+ return 128 + es.Signo
+ }
+ return es.Code
+}
+
+// TaskExitState represents a step in the task exit path.
+//
+// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
+type TaskExitState int
+
+const (
+ // TaskExitNone indicates that the task has not begun exiting. It is the zero value.
+ TaskExitNone TaskExitState = iota
+
+ // TaskExitInitiated indicates that the task goroutine has entered the exit
+ // path, and the task is no longer eligible to participate in group stops
+ // or group signal handling. TaskExitInitiated is analogous to Linux's
+ // PF_EXITING.
+ TaskExitInitiated
+
+ // TaskExitZombie indicates that the task has released its resources, and
+ // the task no longer prevents a sibling thread from completing execve.
+ TaskExitZombie
+
+ // TaskExitDead indicates that the task's thread IDs have been released,
+ // and the task no longer prevents its thread group leader from being
+ // reaped. ("Reaping" refers to the transitioning of a task from
+ // TaskExitZombie to TaskExitDead.)
+ TaskExitDead
+)
+
+// String implements fmt.Stringer. It returns the constant's name, or the decimal value for an unknown state.
+func (t TaskExitState) String() string {
+ switch t {
+ case TaskExitNone:
+ return "TaskExitNone"
+ case TaskExitInitiated:
+ return "TaskExitInitiated"
+ case TaskExitZombie:
+ return "TaskExitZombie"
+ case TaskExitDead:
+ return "TaskExitDead"
+ default:
+ return strconv.Itoa(int(t)) // not one of the named states
+ }
+}
+
+// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
+// thread-group-affecting side effects SIGKILL usually has. It also ends any killable stop t is in and interrupts t.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) killLocked() {
+ // Clear killable stops.
+ if t.stop != nil && t.stop.Killable() {
+ t.endInternalStopLocked()
+ }
+ t.groupStopRequired = false
+ t.pendingSignals.enqueue(&arch.SignalInfo{
+ Signo: int32(linux.SIGKILL),
+ // Linux just sets SIGKILL in the pending signal bitmask without
+ // enqueueing an actual siginfo, such that
+ // kernel/signal.c:collect_signal() initializes si_code to SI_USER.
+ Code: arch.SignalInfoUser,
+ })
+ t.interrupt()
+}
+
+// killed returns true if t has a SIGKILL pending. killed is analogous to
+// Linux's fatal_signal_pending(). It takes the signal mutex itself; callers that already hold it use killedLocked.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) killed() bool {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.killedLocked()
+}
+
+func (t *Task) killedLocked() bool { // like killed, for callers that already hold the signal mutex
+ return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 // SIGKILL bit set in t's pending signal set
+}
+
+// PrepareExit indicates an exit with status es. It only records es as t's exit status, under the signal mutex.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareExit(es ExitStatus) {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.exitStatus = es
+}
+
+// PrepareGroupExit indicates a group exit with status es to t's thread group.
+//
+// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it
+// does not tail-call do_exit(), and that it *does* set Task.exitStatus.
+// (Linux does not do so until within do_exit(), since it reuses exit_code for
+// ptrace.)
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareGroupExit(es ExitStatus) {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.tg.exiting || t.tg.execing != nil {
+ // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
+ // this "group exit" is being executed by the killed sibling of an
+ // execing task, then Task.Execve never set t.tg.exitStatus, so it's
+ // still the zero value. This is consistent with Linux, both in intent
+ // ("all other threads ... report death as if they exited via _exit(2)
+ // with exit code 0" - ptrace(2), "execve under ptrace") and in
+ // implementation (compare fs/exec.c:de_thread() =>
+ // kernel/signal.c:zap_other_threads() and
+ // kernel/exit.c:do_group_exit() =>
+ // include/linux/sched.h:signal_group_exit()).
+ t.exitStatus = t.tg.exitStatus // es is ignored in this case
+ return
+ }
+ t.tg.exiting = true
+ t.tg.exitStatus = es
+ t.exitStatus = es
+ for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
+ if sibling != t { // t is already on its way to exiting and need not be killed
+ sibling.killLocked()
+ }
+ }
+}
+
+// Kill requests that all tasks in ts exit as if group exiting with status es.
+// Kill does not wait for tasks to exit. Thread groups that are already exiting keep their existing exit status.
+//
+// Kill has no analogue in Linux; it's provided for save/restore only.
+func (ts *TaskSet) Kill(es ExitStatus) {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ ts.Root.exiting = true
+ for t := range ts.Root.tids { // every task that has a TID in the root PID namespace
+ t.tg.signalHandlers.mu.Lock()
+ if !t.tg.exiting {
+ t.tg.exiting = true
+ t.tg.exitStatus = es
+ }
+ t.killLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ }
+}
+
+// advanceExitStateLocked checks that t's current exit state is oldExit, then
+// sets it to newExit. If t's current exit state is not oldExit,
+// advanceExitStateLocked panics. A successful transition is logged at debug level.
+//
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
+ if t.exitState != oldExit {
+ panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
+ }
+ t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
+ t.exitState = newExit
+}
+
+// runExit is the entry point into the task exit path. It calls t.ptraceExit, then continues in runExitMain.
+type runExit struct{}
+
+func (*runExit) execute(t *Task) taskRunState {
+ t.ptraceExit()
+ return (*runExitMain)(nil)
+}
+
+type runExitMain struct{} // run state after runExit: leaves the thread group, releases resources, detaches tracees, reparents children
+
+func (*runExitMain) execute(t *Task) taskRunState {
+ lastExiter := t.exitThreadGroup()
+
+ // If the task has a cleartid, and the thread group wasn't killed by a
+ // signal, handle that before releasing the MM.
+ if t.cleartid != 0 {
+ t.tg.signalHandlers.mu.Lock()
+ signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
+ t.tg.signalHandlers.mu.Unlock()
+ if !signaled {
+ if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { // write a zero TID to the cleartid address
+ t.Futex().Wake(uintptr(t.cleartid), ^uint32(0), 1) // NOTE(review): presumably (addr, bitmask, max waiters) -- confirm against futex.Manager.Wake
+ }
+ // If the CopyOut fails, there's nothing we can do.
+ }
+ }
+
+ // Deactivate the address space before releasing the MM.
+ t.Deactivate()
+
+ // Update the max resident set size before releasing t.tc.MemoryManager.
+ t.tg.pidns.owner.mu.Lock()
+ t.updateRSSLocked()
+ t.tg.pidns.owner.mu.Unlock()
+
+ // Release all of the task's resources.
+ t.mu.Lock()
+ t.tc.release()
+ t.tr.release()
+ t.mu.Unlock()
+ t.unstopVforkParent() // t no longer uses its MM, so a vfork parent may resume
+
+ // If this is the last task to exit from the thread group, release the
+ // thread group's resources.
+ if lastExiter {
+ t.tg.release()
+ }
+
+ // Detach tracees.
+ t.exitPtrace()
+
+ // Reparent the task's children.
+ t.exitChildren()
+
+ // Don't tail-call runExitNotify, as exitChildren may have initiated a stop
+ // to wait for a PID namespace to die.
+ return (*runExitNotify)(nil)
+}
+
+// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
+// group that it is no longer eligible to participate in group activities. It
+// returns true if t is the last task in its thread group to call
+// exitThreadGroup.
+//
+// exitThreadGroup acquires the TaskSet mutex and the signal mutex of t's
+// thread group itself. If t's exit completes an initiated group stop, the
+// parent of the thread group leader (if any) is notified of the stop.
+func (t *Task) exitThreadGroup() bool {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.tg.signalHandlers.mu.Lock()
+ // Can't defer unlock: see below.
+
+ t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
+ t.tg.activeTasks--
+ last := t.tg.activeTasks == 0
+
+ // Ensure that someone will handle the signals we can't.
+ t.setSignalMaskLocked(^linux.SignalSet(0))
+
+ // Check if this task's exit interacts with an initiated group stop.
+ if t.tg.groupStopPhase != groupStopInitiated {
+ t.tg.signalHandlers.mu.Unlock()
+ return last
+ }
+ if t.groupStopAcknowledged {
+ // Un-acknowledge the group stop.
+ t.tg.groupStopCount--
+ t.groupStopAcknowledged = false
+ // If the group stop wasn't complete before, then there is still at
+ // least one other task that hasn't acknowledged the group stop, so
+ // it is still not complete now.
+ t.tg.signalHandlers.mu.Unlock()
+ return last
+ }
+ // t had not acknowledged the group stop. If the acknowledgement count
+ // still doesn't match the number of remaining active tasks, the group
+ // stop stays incomplete.
+ if t.tg.groupStopCount != t.tg.activeTasks {
+ t.tg.signalHandlers.mu.Unlock()
+ return last
+ }
+ // Every remaining active task has acknowledged the group stop, so t's
+ // exit completes it.
+ t.Debugf("Completing group stop")
+ t.tg.groupStopPhase = groupStopComplete
+ t.tg.groupStopWaitable = true
+ sig := t.tg.groupStopSignal
+ t.tg.groupContNotify = false
+ t.tg.groupContWaitable = false
+ // signalStop must be called with t's signal mutex unlocked.
+ t.tg.signalHandlers.mu.Unlock()
+ if t.tg.leader.parent != nil {
+ t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
+ }
+ return last
+}
+
+// exitChildren reparents all of t's children to the task selected by
+// findReparentTargetLocked, first sending each child its parent death signal
+// if it has one. If there is no reparent target, exitChildren also marks t's
+// PID namespace as exiting and sends SIGKILL to every task in that namespace
+// that is not in t's thread group.
+//
+// exitChildren acquires the TaskSet mutex itself.
+func (t *Task) exitChildren() {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ newParent := t.findReparentTargetLocked()
+ if newParent == nil {
+ // "If the init process of a PID namespace terminates, the kernel
+ // terminates all of the processes in the namespace via a SIGKILL
+ // signal." - pid_namespaces(7)
+ t.Debugf("Init process terminating, killing namespace")
+ t.tg.pidns.exiting = true
+ for other := range t.tg.pidns.tids {
+ if other.tg != t.tg {
+ other.tg.signalHandlers.mu.Lock()
+ other.sendSignalLocked(&arch.SignalInfo{
+ Signo: int32(linux.SIGKILL),
+ }, false /* group */)
+ other.tg.signalHandlers.mu.Unlock()
+ }
+ }
+ // TODO: The init process waits for all processes in the
+ // namespace to exit before completing its own exit
+ // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
+ // other tasks in the namespace are dead, except possibly for this
+ // thread group's leader (which can't be reaped until this task exits).
+ }
+ // This is correct even if newParent is nil (it ensures that children don't
+ // wait for a parent to reap them.)
+ for c := range t.children {
+ if sig := c.ParentDeathSignal(); sig != 0 {
+ // The signal identifies t as the sender: the PID is t's thread ID
+ // as seen from c's PID namespace, and the UID is t's real UID
+ // mapped into c's user namespace.
+ siginfo := &arch.SignalInfo{
+ Signo: int32(sig),
+ Code: arch.SignalInfoUser,
+ }
+ siginfo.SetPid(int32(c.tg.pidns.tids[t]))
+ siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
+ c.tg.signalHandlers.mu.Lock()
+ c.sendSignalLocked(siginfo, true /* group */)
+ c.tg.signalHandlers.mu.Unlock()
+ }
+ c.reparentLocked(newParent)
+ if newParent != nil {
+ newParent.children[c] = struct{}{}
+ }
+ }
+}
+
+// findReparentTargetLocked picks the task that should adopt t's children. If
+// there is no suitable task, findReparentTargetLocked returns nil.
+//
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) findReparentTargetLocked() *Task {
+	// First choice: a sibling in t's own thread group that hasn't begun
+	// exiting.
+	if sibling := t.tg.anyNonExitingTaskLocked(); sibling != nil {
+		return sibling
+	}
+	// "A child process that is orphaned within the namespace will be
+	// reparented to [the init process for the namespace] ..." -
+	// pid_namespaces(7)
+	initTask := t.tg.pidns.tasks[InitTID]
+	if initTask == nil {
+		return nil
+	}
+	return initTask.tg.anyNonExitingTaskLocked()
+}
+
+// anyNonExitingTaskLocked returns the first task in tg's task list whose exit
+// state is TaskExitNone, or nil if there is no such task.
+func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
+	cur := tg.tasks.Front()
+	for cur != nil && cur.exitState != TaskExitNone {
+		cur = cur.Next()
+	}
+	return cur
+}
+
+// reparentLocked sets t's parent to parent, which may be nil.
+//
+// If t is a thread group leader and the change moves it to a different parent
+// thread group (or to or from having no parent), the thread group's
+// termination signal is reset to SIGCHLD and any unacknowledged parent exit
+// notification is re-issued. (Compare kernel/exit.c:reparent_leader().)
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) reparentLocked(parent *Task) {
+	prev := t.parent
+	t.parent = parent
+	switch {
+	case t != t.tg.leader:
+		// Only a leader's parent change affects exit notification.
+		return
+	case prev == nil && parent == nil:
+		return
+	case prev != nil && parent != nil && prev.tg == parent.tg:
+		// Still a child of the same thread group.
+		return
+	}
+	t.tg.terminationSignal = linux.SIGCHLD
+	if !t.exitParentNotified || t.exitParentAcked {
+		return
+	}
+	// The old parent was notified but never resolved the notification, so
+	// notify the new parent instead.
+	t.exitParentNotified = false
+	t.exitNotifyLocked(false)
+}
+
+// When a task exits, other tasks in the system, notably the task's parent and
+// ptracer, may want to be notified. The exit notification system ensures that
+// interested tasks receive signals and/or are woken from blocking calls to
+// wait*() syscalls; these notifications must be resolved before exiting tasks
+// can be reaped and disappear from the system.
+//
+// Each task may have a parent task and/or a tracer task. If both a parent and
+// a tracer exist, they may be the same task, different tasks in the same
+// thread group, or tasks in different thread groups. (In the last case, Linux
+// refers to the task as being ptrace-reparented due to an implementation
+// detail; we avoid this terminology to avoid confusion.)
+//
+// A thread group is *empty* if all non-leader tasks in the thread group are
+// dead, and the leader is either a zombie or dead. The exit of a thread group
+// leader is never waitable - by either the parent or tracer - until the thread
+// group is empty.
+//
+// There are a few ways for an exit notification to be resolved:
+//
+// - The exit notification may be acknowledged by a call to Task.Wait with
+// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
+//
+// - If the notified party is the parent, and the parent thread group is not
+// also the tracer thread group, and the notification signal is SIGCHLD, the
+// parent may explicitly ignore the notification (see quote in exitNotifyLocked).
+// Note that it's possible for the notified party to ignore the signal in other
+// cases, but the notification is only resolved under the above conditions.
+// (Actually, there is one exception; see the last paragraph of the "leader,
+// has tracer, tracer thread group is parent thread group" case below.)
+//
+// - If the notified party is the parent, and the parent does not exist, the
+// notification is resolved as if ignored. (This is only possible in the
+// sentry. In Linux, the only task / thread group without a parent is global
+// init, and killing global init causes a kernel panic.)
+//
+// - If the notified party is a tracer, the tracer may detach the traced task.
+// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.)
+//
+// In addition, if the notified party is the parent, the parent may exit and
+// cause the notifying task to be reparented to another thread group. This does
+// not resolve the notification; instead, the notification must be resent to
+// the new parent.
+//
+// The series of notifications generated for a given task's exit depends on
+// whether it is a thread group leader; whether the task is ptraced; and, if
+// so, whether the tracer thread group is the same as the parent thread group.
+//
+// - Non-leader, no tracer: No notification is generated; the task is reaped
+// immediately.
+//
+// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer
+// notification is resolved (by waiting or detaching), the task is reaped. (For
+// non-leaders, whether the tracer and parent thread groups are the same is
+// irrelevant.)
+//
+// - Leader, no tracer: The task remains a zombie, with no notification sent,
+// until all other tasks in the thread group are dead. (In Linux terms, this
+// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks
+// are removed from their thread_group list in kernel/exit.c:release_task() =>
+// __exit_signal() => __unhash_process().) Then the thread group's termination
+// signal is sent to the parent. When the parent notification is resolved (by
+// waiting or ignoring), the task is reaped.
+//
+// - Leader, has tracer, tracer thread group is not parent thread group:
+// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by
+// waiting or detaching), and all other tasks in the thread group are dead, the
+// thread group's termination signal is sent to the parent. (Note that the
+// tracer cannot resolve the exit notification by waiting until the thread
+// group is empty.) When the parent notification is resolved, the task is
+// reaped.
+//
+// - Leader, has tracer, tracer thread group is parent thread group:
+//
+// If all other tasks in the thread group are dead, the thread group's
+// termination signal is sent to the parent. At this point, the notification
+// can only be resolved by waiting. If the parent detaches from the task as a
+// tracer, the notification is not resolved, but the notification can now be
+// resolved by waiting or ignoring. When the parent notification is resolved,
+// the task is reaped.
+//
+// If at least one task in the thread group is not dead, SIGCHLD is sent to the
+// parent. At this point, the notification cannot be resolved at all; once the
+// thread group becomes empty, it can be resolved only by waiting. If the
+// parent detaches from the task as a tracer before all remaining tasks die,
+// then exit notification proceeds as in the case where the leader never had a
+// tracer. If the parent detaches from the task as a tracer after all remaining
+// tasks die, the notification is not resolved, but the notification can now be
+// resolved by waiting or ignoring. When the parent notification is resolved,
+// the task is reaped.
+//
+// In both of the above cases, when the parent detaches from the task as a
+// tracer while the thread group is empty, whether or not the parent resolves
+// the notification by ignoring it is based on the parent's SIGCHLD signal
+// action, regardless of whether the thread group's termination signal is
+// SIGCHLD (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()).
+//
+// There is one final wrinkle: A leader can become a non-leader due to a
+// sibling execve. In this case, the execing thread detaches the leader's
+// tracer (if one exists) and reaps the leader immediately. In Linux, this is
+// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().
+
+// runExitNotify is the run state in which an exiting task becomes a zombie
+// and exit notification for it is started.
+type runExitNotify struct{}
+
+func (*runExitNotify) execute(t *Task) taskRunState {
+	t.tg.pidns.owner.mu.Lock()
+	defer t.tg.pidns.owner.mu.Unlock()
+	t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
+	t.tg.liveTasks--
+	// Check if this completes a sibling's execve. execing blocks the addition
+	// of new tasks to the thread group, so if exactly one task is still
+	// live, it must be the execing one.
+	if execer := t.tg.execing; execer != nil && t.tg.liveTasks == 1 {
+		execer.tg.signalHandlers.mu.Lock()
+		_, inExecStop := execer.stop.(*execStop)
+		if inExecStop {
+			execer.endInternalStopLocked()
+		}
+		execer.tg.signalHandlers.mu.Unlock()
+	}
+	t.exitNotifyLocked(false)
+	// The task goroutine will now exit.
+	return nil
+}
+
+// exitNotifyLocked is called after changes to t's state that affect exit
+// notification. It does nothing unless t is a zombie. Otherwise it sends any
+// tracer and parent notifications that are now due, and reaps t (moving it to
+// TaskExitDead) once both the tracer and parent notifications are
+// acknowledged.
+//
+// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
+// thanks to Linux's haphazard implementation of this functionality, such cases
+// determine whether parent notifications are ignored based on the parent's
+// handling of SIGCHLD, regardless of what the exited task's thread group's
+// termination signal is.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
+ if t.exitState != TaskExitZombie {
+ return
+ }
+ // Step 1: tracer notification, performed at most once.
+ if !t.exitTracerNotified {
+ t.exitTracerNotified = true
+ tracer := t.Tracer()
+ if tracer == nil {
+ // No tracer: there is nothing to wait for on the tracer side.
+ t.exitTracerAcked = true
+ } else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
+ // Don't set exitParentNotified if t is non-leader, even if the
+ // tracer is in the parent thread group, so that if the parent
+ // detaches the following call to exitNotifyLocked passes through
+ // the !exitParentNotified case below and causes t to be reaped
+ // immediately.
+ //
+ // Tracer notification doesn't care about
+ // SIG_IGN/SA_NOCLDWAIT.
+ tracer.tg.signalHandlers.mu.Lock()
+ tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
+ tracer.tg.signalHandlers.mu.Unlock()
+ // Wake EventTraceeStop waiters as well since this task will never
+ // ptrace-stop again.
+ tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
+ } else {
+ // t is a leader and the tracer is in the parent thread group.
+ t.exitParentNotified = true
+ sig := linux.SIGCHLD
+ if t.tg.tasksCount == 1 {
+ sig = t.tg.terminationSignal
+ }
+ // This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
+ // (in Linux, the check in do_notify_parent() is gated by
+ // !tsk->ptrace.)
+ t.parent.tg.signalHandlers.mu.Lock()
+ t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
+ t.parent.tg.signalHandlers.mu.Unlock()
+ // See below for rationale for this event mask.
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
+ }
+ }
+ // Step 2: parent notification, once the tracer notification has been
+ // resolved and the parent hasn't been notified yet.
+ if t.exitTracerAcked && !t.exitParentNotified {
+ if t != t.tg.leader {
+ // Non-leaders never notify the parent.
+ t.exitParentNotified = true
+ t.exitParentAcked = true
+ } else if t.tg.tasksCount == 1 {
+ // t is the leader and the only task remaining in the thread group.
+ t.exitParentNotified = true
+ if t.parent == nil {
+ t.exitParentAcked = true
+ } else {
+ // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
+ // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
+ // sigaction(2)), then children that terminate do not become
+ // zombies and a call to wait() or waitpid() will block until all
+ // children have terminated, and then fail with errno set to
+ // ECHILD. (The original POSIX standard left the behavior of
+ // setting SIGCHLD to SIG_IGN unspecified. Note that even though
+ // the default disposition of SIGCHLD is "ignore", explicitly
+ // setting the disposition to SIG_IGN results in different
+ // treatment of zombie process children.) Linux 2.6 conforms to
+ // this specification." - wait(2)
+ //
+ // Some undocumented Linux-specific details:
+ //
+ // - All of the above is ignored if the termination signal isn't
+ // SIGCHLD.
+ //
+ // - SA_NOCLDWAIT causes the leader to be immediately reaped, but
+ // does not suppress the SIGCHLD.
+ signalParent := t.tg.terminationSignal.IsValid()
+ t.parent.tg.signalHandlers.mu.Lock()
+ if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
+ if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
+ if act.Handler == arch.SignalActIgnore {
+ t.exitParentAcked = true
+ signalParent = false
+ } else if act.Flags&arch.SignalFlagNoCldWait != 0 {
+ t.exitParentAcked = true
+ }
+ }
+ }
+ if signalParent {
+ t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
+ }
+ t.parent.tg.signalHandlers.mu.Unlock()
+ // If a task in the parent was waiting for a child group stop
+ // or continue, it needs to be notified of the exit, because
+ // there may be no remaining eligible tasks (so that wait
+ // should return ECHILD).
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
+ }
+ }
+ }
+ // Step 3: both notifications are resolved, so t can be reaped.
+ if t.exitTracerAcked && t.exitParentAcked {
+ t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
+ // Remove t's thread ID from its PID namespace and every ancestor
+ // namespace.
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent {
+ tid := ns.tids[t]
+ delete(ns.tasks, tid)
+ delete(ns.tids, t)
+ }
+ t.tg.exitedCPUStats.Accumulate(t.CPUStats())
+ t.tg.ioUsage.Accumulate(t.ioUsage)
+ t.tg.signalHandlers.mu.Lock()
+ t.tg.tasks.Remove(t)
+ if t.tg.lastTimerSignalTask == t {
+ t.tg.lastTimerSignalTask = nil
+ }
+ t.tg.tasksCount--
+ tc := t.tg.tasksCount
+ t.tg.signalHandlers.mu.Unlock()
+ if tc == 1 && t != t.tg.leader {
+ // Only the leader remains in the thread group, so re-evaluate its
+ // exit notification.
+ //
+ // Our fromPtraceDetach doesn't matter here (in Linux terms, this
+ // is via a call to release_task()).
+ t.tg.leader.exitNotifyLocked(false)
+ } else if tc == 0 {
+ t.tg.processGroup.decRefWithParent(t.tg.parentPG())
+ }
+ if t.parent != nil {
+ delete(t.parent.children, t)
+ t.parent = nil
+ }
+ }
+}
+
+// exitNotificationSignal builds the SignalInfo for signal sig that reports
+// t's exit to receiver. The PID is t's thread ID in receiver's PID namespace
+// and the UID is t's real UID in receiver's user namespace. The code and
+// status describe whether t was killed by a signal or exited normally.
+//
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo {
+	info := &arch.SignalInfo{Signo: int32(sig)}
+	info.SetPid(int32(receiver.tg.pidns.tids[t]))
+	info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+	if killed := t.exitStatus.Signaled(); !killed {
+		info.Code = arch.CLD_EXITED
+		info.SetStatus(int32(t.exitStatus.Code))
+	} else {
+		info.Code = arch.CLD_KILLED
+		info.SetStatus(int32(t.exitStatus.Signo))
+	}
+	// TODO: Set utime, stime.
+	return info
+}
+
+// ExitStatus returns t's exit status. The result is only guaranteed to be
+// meaningful if t.ExitState() != TaskExitNone.
+func (t *Task) ExitStatus() ExitStatus {
+	t.tg.pidns.owner.mu.RLock()
+	t.tg.signalHandlers.mu.Lock()
+	status := t.exitStatus
+	t.tg.signalHandlers.mu.Unlock()
+	t.tg.pidns.owner.mu.RUnlock()
+	return status
+}
+
+// ExitStatus returns the exit status that a consuming wait*() on tg would
+// report: the group exit status if tg is exiting, and the leader's exit
+// status otherwise.
+func (tg *ThreadGroup) ExitStatus() ExitStatus {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+	if !tg.exiting {
+		return tg.leader.exitStatus
+	}
+	return tg.exitStatus
+}
+
+// TerminationSignal returns tg's termination signal, read with the TaskSet
+// mutex held for reading.
+func (tg *ThreadGroup) TerminationSignal() linux.Signal {
+	tg.pidns.owner.mu.RLock()
+	sig := tg.terminationSignal
+	tg.pidns.owner.mu.RUnlock()
+	return sig
+}
+
+// Task events that can be waited for.
+//
+// Each event occupies its own bit (1 << iota), so several events can be
+// combined into a single waiter.EventMask, as is done in WaitOptions.Events.
+const (
+ // EventExit represents an exit notification generated for a child thread
+ // group leader or a tracee under the conditions specified in the comment
+ // above runExitNotify.
+ EventExit waiter.EventMask = 1 << iota
+
+ // EventChildGroupStop occurs when a child thread group completes a group
+ // stop (i.e. all tasks in the child thread group have entered a stopped
+ // state as a result of a group stop).
+ EventChildGroupStop
+
+ // EventTraceeStop occurs when a task that is ptraced by a task in the
+ // notified thread group enters a ptrace stop (see ptrace(2)).
+ EventTraceeStop
+
+ // EventGroupContinue occurs when a child thread group, or a thread group
+ // whose leader is ptraced by a task in the notified thread group, that had
+ // initiated or completed a group stop leaves the group stop, due to the
+ // child thread group or any task in the child thread group being sent
+ // SIGCONT.
+ EventGroupContinue
+)
+
+// WaitOptions controls the behavior of Task.Wait.
+type WaitOptions struct {
+ // If SpecificTID is non-zero, only events from the task with thread ID
+ // SpecificTID are eligible to be waited for. SpecificTID is resolved in
+ // the PID namespace of the waiter (the method receiver of Task.Wait). If
+ // no such task exists, or that task would not otherwise be eligible to be
+ // waited for by the waiting task, then there are no waitable tasks and
+ // Wait will return ECHILD.
+ SpecificTID ThreadID
+
+ // If SpecificPGID is non-zero, only events from ThreadGroups with a
+ // matching ProcessGroupID are eligible to be waited for. (Same
+ // constraints as SpecificTID apply.)
+ SpecificPGID ProcessGroupID
+
+ // Terminology note: Per waitpid(2), "a clone child is one which delivers
+ // no signal, or a signal other than SIGCHLD to its parent upon
+ // termination." In Linux, termination signal is technically a per-task
+ // property rather than a per-thread-group property. However, clone()
+ // forces no termination signal for tasks created with CLONE_THREAD, and
+ // execve() resets the termination signal to SIGCHLD, so all
+ // non-group-leader threads have no termination signal and are therefore
+ // "clone tasks".
+ //
+ // Accordingly, Wait treats a task as a non-clone task only if it is a
+ // thread group leader whose thread group's termination signal is SIGCHLD;
+ // every other task is treated as a clone task.
+
+ // If NonCloneTasks is true, events from non-clone tasks are eligible to be
+ // waited for.
+ NonCloneTasks bool
+
+ // If CloneTasks is true, events from clone tasks are eligible to be waited
+ // for.
+ CloneTasks bool
+
+ // Events is a bitwise combination of the events defined above that specify
+ // what events are of interest to the call to Wait.
+ Events waiter.EventMask
+
+ // If ConsumeEvent is true, the Wait should consume the event such that it
+ // cannot be returned by a future Wait. Note that if a task exit is
+ // consumed in this way, in most cases the task will be reaped.
+ ConsumeEvent bool
+
+ // If BlockInterruptErr is not nil, Wait will block until either an event
+ // is available or there are no tasks that could produce a waitable event;
+ // if that blocking is interrupted, Wait returns BlockInterruptErr. If
+ // BlockInterruptErr is nil, Wait will not block.
+ BlockInterruptErr error
+}
+
+// matchesTask reports whether events from t are eligible under o. Thread and
+// process group IDs are resolved in pidns. A thread group leader whose thread
+// group's termination signal is SIGCHLD is a non-clone task; any other task
+// is a clone task.
+//
+// Preconditions: The TaskSet mutex must be locked (for reading or writing).
+func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace) bool {
+	if want := o.SpecificTID; want != 0 && pidns.tids[t] != want {
+		return false
+	}
+	if want := o.SpecificPGID; want != 0 && pidns.pgids[t.tg.processGroup] != want {
+		return false
+	}
+	nonClone := t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD
+	if nonClone {
+		return o.NonCloneTasks
+	}
+	return o.CloneTasks
+}
+
+// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
+// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
+// events may exist in the future. (In contrast, if a non-blocking or blocking
+// Wait determines that there are no tasks that can produce a waitable event,
+// Task.Wait returns ECHILD.)
+//
+// A blocking Task.Wait never returns ErrNoWaitableEvent; it treats the error
+// as the cue to block and then try again.
+var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")
+
+// WaitResult contains information about a waited-for event.
+type WaitResult struct {
+ // Task is the task that reported the event.
+ Task *Task
+
+ // TID is the thread ID of Task in the PID namespace of the task that
+ // called Wait (that is, the method receiver of the call to Task.Wait). TID
+ // is provided because consuming exit waits cause the thread ID to be
+ // deallocated.
+ TID ThreadID
+
+ // UID is the real UID of Task in the user namespace of the task that
+ // called Wait.
+ UID auth.UID
+
+ // Event is exactly one of the events defined above.
+ Event waiter.EventMask
+
+ // Status is the numeric status associated with the event. Its encoding
+ // depends on Event:
+ //
+ // - EventExit: the value of ExitStatus.Status() for the task (or, for a
+ // consuming wait on an exiting thread group, for the thread group).
+ //
+ // - EventChildGroupStop: (group stop signal & 0xff) << 8 | 0x7f.
+ //
+ // - EventTraceeStop: ptrace code << 8 | 0x7f.
+ //
+ // - EventGroupContinue: 0xffff.
+ Status uint32
+}
+
+// Wait waits for an event, subject to opts, from one of the following: a
+// thread group that is a child of t's thread group, a task in such a thread
+// group, or a task that is ptraced by t.
+//
+// If opts.BlockInterruptErr is nil, Wait makes a single non-blocking attempt.
+// Otherwise it retries, blocking between attempts, for as long as the attempt
+// reports ErrNoWaitableEvent; if blocking is interrupted, the resulting error
+// is converted using opts.BlockInterruptErr.
+func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
+	if opts.BlockInterruptErr == nil {
+		return t.waitOnce(opts)
+	}
+	entry, wakeCh := waiter.NewChannelEntry(nil)
+	t.tg.eventQueue.EventRegister(&entry, opts.Events)
+	defer t.tg.eventQueue.EventUnregister(&entry)
+	for {
+		res, waitErr := t.waitOnce(opts)
+		if waitErr != ErrNoWaitableEvent {
+			// This includes waitErr == nil.
+			return res, waitErr
+		}
+		blockErr := t.Block(wakeCh)
+		if blockErr == nil {
+			continue
+		}
+		return res, syserror.ConvertIntr(blockErr, opts.BlockInterruptErr)
+	}
+}
+
+// waitOnce makes a single, non-blocking pass over the children and ptrace
+// tracees of every task in t's thread group, looking for an event that
+// matches opts. It returns the first such event found.
+//
+// If no event is found, waitOnce returns ErrNoWaitableEvent if at least one
+// task was found that could still produce a matching event, and ECHILD
+// otherwise.
+//
+// waitOnce acquires the TaskSet mutex (for writing) itself.
+func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
+ // anyWaitableTasks records whether any task eligible to produce a
+ // requested event was seen; it selects between ErrNoWaitableEvent and
+ // ECHILD when no event is found.
+ anyWaitableTasks := false
+
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+
+ // Without the (unimplemented) __WNOTHREAD flag, a task can wait on the
+ // children and tracees of any task in the same thread group.
+ for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
+ for child := range parent.children {
+ if !opts.matchesTask(child, parent.tg.pidns) {
+ continue
+ }
+ // Non-leaders don't notify parents on exit and aren't eligible to
+ // be waited on.
+ if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
+ anyWaitableTasks = true
+ if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
+ return wr, nil
+ }
+ }
+ // Check for group stops and continues. Tasks that have passed
+ // TaskExitInitiated can no longer participate in group stops.
+ if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 {
+ continue
+ }
+ if child.exitState >= TaskExitInitiated {
+ continue
+ }
+ // If the waiter is in the same thread group as the task's
+ // tracer, do not report its group stops; they will be reported
+ // as ptrace stops instead. This also skips checking for group
+ // continues, but they'll be checked for when scanning tracees
+ // below. (Per kernel/exit.c:wait_consider_task(): "If a
+ // ptracer wants to distinguish the two events for its own
+ // children, it should create a separate process which takes
+ // the role of real parent.")
+ if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg {
+ continue
+ }
+ anyWaitableTasks = true
+ if opts.Events&EventChildGroupStop != 0 {
+ if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil {
+ return wr, nil
+ }
+ }
+ if opts.Events&EventGroupContinue != 0 {
+ if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil {
+ return wr, nil
+ }
+ }
+ }
+ for tracee := range parent.ptraceTracees {
+ if !opts.matchesTask(tracee, parent.tg.pidns) {
+ continue
+ }
+ // Non-leaders do notify tracers on exit.
+ if opts.Events&EventExit != 0 && !tracee.exitTracerAcked {
+ anyWaitableTasks = true
+ if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil {
+ return wr, nil
+ }
+ }
+ if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 {
+ continue
+ }
+ // As with children above, tracees that have begun exiting are
+ // not checked for stops or continues.
+ if tracee.exitState >= TaskExitInitiated {
+ continue
+ }
+ anyWaitableTasks = true
+ if opts.Events&EventTraceeStop != 0 {
+ if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil {
+ return wr, nil
+ }
+ }
+ if opts.Events&EventGroupContinue != 0 {
+ if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil {
+ return wr, nil
+ }
+ }
+ }
+ }
+
+ if anyWaitableTasks {
+ return nil, ErrNoWaitableEvent
+ }
+ return nil, syserror.ECHILD
+}
+
+// waitCollectZombieLocked checks whether target has an exit notification that
+// t may collect and, if so, returns a WaitResult describing it; otherwise it
+// returns nil.
+//
+// asPtracer selects which notification is examined: the tracer notification
+// if true, the parent notification if false. A zombie thread group leader is
+// never waitable until it is the only task left in its thread group.
+//
+// If opts.ConsumeEvent is false, target's state is left untouched. If it is
+// true, every notification that t's thread group is entitled to resolve (as
+// tracer, as parent, or both) is acknowledged, target is detached from its
+// tracer, a leader's CPU usage and max RSS are accumulated into t's thread
+// group, and target.exitNotifyLocked is called so that target is reaped if
+// nothing else is pending.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult {
+	if asPtracer && !target.exitTracerNotified {
+		return nil
+	}
+	if !asPtracer && !target.exitParentNotified {
+		return nil
+	}
+	// Zombied thread group leaders are never waitable until their thread group
+	// is otherwise empty. Usually this is caught by the
+	// target.exitParentNotified check above, but if t is both (in the thread
+	// group of) target's tracer and parent, asPtracer may be true.
+	if target == target.tg.leader && target.tg.tasksCount != 1 {
+		return nil
+	}
+	pid := t.tg.pidns.tids[target]
+	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
+	status := target.exitStatus.Status()
+	if !opts.ConsumeEvent {
+		return &WaitResult{
+			Task:   target,
+			TID:    pid,
+			UID:    uid,
+			Event:  EventExit,
+			Status: status,
+		}
+	}
+	// Surprisingly, the exit status reported by a non-consuming wait can
+	// differ from that reported by a consuming wait; the latter will return
+	// the group exit code if one is available.
+	if target.tg.exiting {
+		status = target.tg.exitStatus.Status()
+	}
+	// t may be (in the thread group of) target's parent, tracer, or both. We
+	// don't need to check for !exitTracerAcked because tracees are detached
+	// here, and we don't need to check for !exitParentAcked because zombies
+	// will be reaped here.
+	if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified {
+		target.exitTracerAcked = true
+		target.ptraceTracer.Store((*Task)(nil))
+		// The tracer may be any task in t's thread group, not necessarily t
+		// itself (waitOnce scans the tracees of every task in the group), so
+		// remove target from the tracer's own tracee set.
+		delete(tracer.ptraceTracees, target)
+	}
+	if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified {
+		target.exitParentAcked = true
+		if target == target.tg.leader {
+			// target.tg.exitedCPUStats doesn't include target.CPUStats() yet,
+			// and won't until after target.exitNotifyLocked() (maybe). Include
+			// target.CPUStats() explicitly. This is consistent with Linux,
+			// which accounts an exited task's cputime to its thread group in
+			// kernel/exit.c:release_task() => __exit_signal(), and uses
+			// thread_group_cputime_adjusted() in wait_task_zombie().
+			t.tg.childCPUStats.Accumulate(target.CPUStats())
+			t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats)
+			t.tg.childCPUStats.Accumulate(target.tg.childCPUStats)
+			// Update t's child max resident set size. The size will be the maximum
+			// of this thread's size and all its childrens' sizes.
+			if t.tg.childMaxRSS < target.tg.maxRSS {
+				t.tg.childMaxRSS = target.tg.maxRSS
+			}
+			if t.tg.childMaxRSS < target.tg.childMaxRSS {
+				t.tg.childMaxRSS = target.tg.childMaxRSS
+			}
+		}
+	}
+	target.exitNotifyLocked(false)
+	return &WaitResult{
+		Task:   target,
+		TID:    pid,
+		UID:    uid,
+		Event:  EventExit,
+		Status: status,
+	}
+}
+
+// updateRSSLocked raises t.tg.maxRSS to the max resident set size reported by
+// t's memory manager, if that value is larger.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) updateRSSLocked() {
+	peak := t.MemoryManager().MaxResidentSetSize()
+	if peak > t.tg.maxRSS {
+		t.tg.maxRSS = peak
+	}
+}
+
+// waitCollectChildGroupStopLocked returns a WaitResult for the group stop of
+// target's thread group if one is waitable, and nil otherwise. If
+// opts.ConsumeEvent is set, the group stop is marked as no longer waitable.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult {
+	target.tg.signalHandlers.mu.Lock()
+	defer target.tg.signalHandlers.mu.Unlock()
+	if !target.tg.groupStopWaitable {
+		return nil
+	}
+	res := &WaitResult{
+		Task:  target,
+		TID:   t.tg.pidns.tids[target],
+		UID:   target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow(),
+		Event: EventChildGroupStop,
+		// There is no name for these status constants.
+		Status: 0x7f | (uint32(target.tg.groupStopSignal)&0xff)<<8,
+	}
+	if opts.ConsumeEvent {
+		target.tg.groupStopWaitable = false
+	}
+	return res
+}
+
+// waitCollectGroupContinueLocked returns a WaitResult for a group continue of
+// target's thread group if one is waitable, and nil otherwise. If
+// opts.ConsumeEvent is set, the group continue is marked as no longer
+// waitable.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult {
+	target.tg.signalHandlers.mu.Lock()
+	defer target.tg.signalHandlers.mu.Unlock()
+	if !target.tg.groupContWaitable {
+		return nil
+	}
+	res := &WaitResult{
+		Task:   target,
+		TID:    t.tg.pidns.tids[target],
+		UID:    target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow(),
+		Event:  EventGroupContinue,
+		Status: 0xffff,
+	}
+	if opts.ConsumeEvent {
+		target.tg.groupContWaitable = false
+	}
+	return res
+}
+
+// waitCollectTraceeStopLocked returns a WaitResult for target's ptrace stop
+// if target is in a ptraceStop and has a non-zero ptrace code, and nil
+// otherwise. If opts.ConsumeEvent is set, the ptrace code is cleared so the
+// stop is not reported again.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult {
+	target.tg.signalHandlers.mu.Lock()
+	defer target.tg.signalHandlers.mu.Unlock()
+	// The type assertion also fails when target.stop is nil, so a task that
+	// isn't stopped at all is rejected here too.
+	_, inPtraceStop := target.stop.(*ptraceStop)
+	if !inPtraceStop || target.ptraceCode == 0 {
+		return nil
+	}
+	res := &WaitResult{
+		Task:   target,
+		TID:    t.tg.pidns.tids[target],
+		UID:    target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow(),
+		Event:  EventTraceeStop,
+		Status: uint32(target.ptraceCode)<<8 | 0x7f,
+	}
+	if opts.ConsumeEvent {
+		target.ptraceCode = 0
+	}
+	return res
+}
+
+// ExitState reports how far t has progressed through the exit path. The
+// state is read with the TaskSet mutex held for reading.
+func (t *Task) ExitState() TaskExitState {
+	t.tg.pidns.owner.mu.RLock()
+	state := t.exitState
+	t.tg.pidns.owner.mu.RUnlock()
+	return state
+}
+
+// ParentDeathSignal returns the signal recorded as t's parent death signal,
+// read under t.mu.
+func (t *Task) ParentDeathSignal() linux.Signal {
+	t.mu.Lock()
+	sig := t.parentDeathSignal
+	t.mu.Unlock()
+	return sig
+}
+
+// SetParentDeathSignal records sig as t's parent death signal, under t.mu.
+func (t *Task) SetParentDeathSignal(sig linux.Signal) {
+	t.mu.Lock()
+	t.parentDeathSignal = sig
+	t.mu.Unlock()
+}
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
new file mode 100644
index 000000000..a51fa9d7e
--- /dev/null
+++ b/pkg/sentry/kernel/task_identity.go
@@ -0,0 +1,557 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Credentials returns a copy of t's credentials. Because the result is a
+// value, later changes to t's credentials are not reflected in it.
+func (t *Task) Credentials() auth.Credentials {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	snapshot := *t.creds // The copy must be taken while t.mu is held.
+	return snapshot
+}
+
+// UserNamespace returns the user namespace recorded in t's credentials.
+func (t *Task) UserNamespace() *auth.UserNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	ns := t.creds.UserNamespace
+	return ns
+}
+
+// HasCapabilityIn reports whether t's credentials grant capability cp in
+// user namespace ns. The check is performed with t.mu held.
+func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	granted := t.creds.HasCapabilityIn(cp, ns)
+	return granted
+}
+
+// HasCapability reports whether t's credentials grant capability cp in t's
+// own user namespace. The check is performed with t.mu held.
+func (t *Task) HasCapability(cp linux.Capability) bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	granted := t.creds.HasCapability(cp)
+	return granted
+}
+
+// SetUID implements the semantics of setuid(2).
+//
+// It returns EINVAL if uid is -1 or has no mapping in t's user namespace, and
+// EPERM if t lacks CAP_SETUID and uid matches neither t's real UID nor its
+// saved set-user-ID.
+func (t *Task) SetUID(uid auth.UID) error {
+	// setuid considers -1 to be invalid.
+	if !uid.Ok() {
+		return syserror.EINVAL
+	}
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	creds := t.creds
+	target := creds.UserNamespace.MapToKUID(uid)
+	if !target.Ok() {
+		return syserror.EINVAL
+	}
+	switch {
+	case creds.HasCapability(linux.CAP_SETUID):
+		// "setuid() sets the effective user ID of the calling process. If
+		// the effective UID of the caller is root (more precisely: if the
+		// caller has the CAP_SETUID capability), the real UID and saved
+		// set-user-ID are also set." - setuid(2)
+		t.setKUIDsUncheckedLocked(target, target, target)
+	case target == creds.RealKUID || target == creds.SavedKUID:
+		// Unprivileged callers may only change the effective UID, and only
+		// to the real UID or the saved set-user-ID.
+		t.setKUIDsUncheckedLocked(creds.RealKUID, target, creds.SavedKUID)
+	default:
+		// "EPERM: The user is not privileged (Linux: does not have the
+		// CAP_SETUID capability) and uid does not match the real UID or
+		// saved set-user-ID of the calling process."
+		return syserror.EPERM
+	}
+	return nil
+}
+
+// SetREUID implements the semantics of setreuid(2).
+//
+// r and e are the requested real and effective UIDs, interpreted in t's user
+// namespace. A value for which Ok() is false (i.e. -1) leaves the
+// corresponding ID unchanged. SetREUID returns EINVAL if a supplied UID has no
+// mapping in t's user namespace, and EPERM if t lacks CAP_SETUID and the
+// requested change is not one that unprivileged callers may make.
+func (t *Task) SetREUID(r, e auth.UID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	// "Supplying a value of -1 for either the real or effective user ID forces
+	// the system to leave that ID unchanged." - setreuid(2)
+	newR := t.creds.RealKUID
+	if r.Ok() {
+		newR = t.creds.UserNamespace.MapToKUID(r)
+		if !newR.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	newE := t.creds.EffectiveKUID
+	if e.Ok() {
+		newE = t.creds.UserNamespace.MapToKUID(e)
+		if !newE.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	if !t.creds.HasCapability(linux.CAP_SETUID) {
+		// "Unprivileged processes may only set the effective user ID to the
+		// real user ID, the effective user ID, or the saved set-user-ID."
+		if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID {
+			return syserror.EPERM
+		}
+		// "Unprivileged users may only set the real user ID to the real user
+		// ID or the effective user ID."
+		if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID {
+			return syserror.EPERM
+		}
+	}
+	// "If the real user ID is set (i.e., ruid is not -1) or the effective user
+	// ID is set to a value not equal to the previous real user ID, the saved
+	// set-user-ID will be set to the new effective user ID."
+	//
+	// The comparison is deliberately against the previous *real* UID (compare
+	// Linux's kernel/sys.c:setreuid()). Comparing against the previous
+	// effective UID would overwrite the saved set-user-ID when a task merely
+	// switches its effective UID back to its real UID.
+	newS := t.creds.SavedKUID
+	if r.Ok() || (e.Ok() && newE != t.creds.RealKUID) {
+		newS = newE
+	}
+	t.setKUIDsUncheckedLocked(newR, newE, newS)
+	return nil
+}
+
+// SetRESUID implements the semantics of the setresuid(2) syscall.
+func (t *Task) SetRESUID(r, e, s auth.UID) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ // "Unprivileged user processes may change the real UID, effective UID, and
+ // saved set-user-ID, each to one of: the current real UID, the current
+ // effective UID or the current saved set-user-ID. Privileged processes (on
+ // Linux, those having the CAP_SETUID capability) may set the real UID,
+ // effective UID, and saved set-user-ID to arbitrary values. If one of the
+ // arguments equals -1, the corresponding value is not changed." -
+ // setresuid(2)
+ var err error
+ newR := t.creds.RealKUID
+ if r.Ok() {
+ newR, err = t.creds.UseUID(r)
+ if err != nil {
+ return err
+ }
+ }
+ newE := t.creds.EffectiveKUID
+ if e.Ok() {
+ newE, err = t.creds.UseUID(e)
+ if err != nil {
+ return err
+ }
+ }
+ newS := t.creds.SavedKUID
+ if s.Ok() {
+ newS, err = t.creds.UseUID(s)
+ if err != nil {
+ return err
+ }
+ }
+ t.setKUIDsUncheckedLocked(newR, newE, newS)
+ return nil
+}
+
+// setKUIDsUncheckedLocked installs newR, newE and newS as t's real, effective
+// and saved set-user-IDs without performing any permission checks, then
+// applies the capability transitions that capabilities(7) ties to UID changes
+// relative to root in t's user namespace. It also clears t's parent death
+// signal if the effective UID changed.
+//
+// Preconditions: t.mu must be locked.
+func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
+ root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
+ oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID
+ t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS
+
+ // "1. If one or more of the real, effective or saved set user IDs was
+ // previously 0, and as a result of the UID changes all of these IDs have a
+ // nonzero value, then all capabilities are cleared from the permitted and
+ // effective capability sets." - capabilities(7)
+ if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) {
+ // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's
+ // "keep capabilities" flag, which determines whether the thread's permitted
+ // capability set is cleared when a change is made to the
+ // thread's user IDs such that the thread's real UID, effective
+ // UID, and saved set-user-ID all become nonzero when at least
+ // one of them previously had the value 0. By default, the
+ // permitted capability set is cleared when such a change is
+ // made; setting the "keep capabilities" flag prevents it from
+ // being cleared." (A thread's effective capability set is always
+ // cleared when such a credential change is made,
+ // regardless of the setting of the "keep capabilities" flag.)
+ if !t.creds.KeepCaps {
+ t.creds.PermittedCaps = 0
+ t.creds.EffectiveCaps = 0
+ }
+ }
+ // """
+ // 2. If the effective user ID is changed from 0 to nonzero, then all
+ // capabilities are cleared from the effective set.
+ //
+ // 3. If the effective user ID is changed from nonzero to 0, then the
+ // permitted set is copied to the effective set.
+ // """
+ if oldE == root && newE != root {
+ t.creds.EffectiveCaps = 0
+ } else if oldE != root && newE == root {
+ t.creds.EffectiveCaps = t.creds.PermittedCaps
+ }
+ // "4. If the filesystem user ID is changed from 0 to nonzero (see
+ // setfsuid(2)), then the following capabilities are cleared from the
+ // effective set: ..."
+ // (filesystem UIDs aren't implemented, nor are any of the capabilities in
+ // question)
+
+ // Not documented, but compare Linux's kernel/cred.c:commit_creds().
+ if oldE != newE {
+ t.parentDeathSignal = 0
+ }
+}
+
+// SetGID implements the semantics of setgid(2).
+//
+// It returns EINVAL if gid is -1 or has no mapping in t's user namespace, and
+// EPERM if t lacks CAP_SETGID and gid matches neither t's real GID nor its
+// saved set-group-ID.
+func (t *Task) SetGID(gid auth.GID) error {
+	if !gid.Ok() {
+		return syserror.EINVAL
+	}
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	creds := t.creds
+	target := creds.UserNamespace.MapToKGID(gid)
+	if !target.Ok() {
+		return syserror.EINVAL
+	}
+	switch {
+	case creds.HasCapability(linux.CAP_SETGID):
+		// Privileged: real, effective and saved GIDs are all set.
+		t.setKGIDsUncheckedLocked(target, target, target)
+	case target == creds.RealKGID || target == creds.SavedKGID:
+		// Unprivileged: only the effective GID changes.
+		t.setKGIDsUncheckedLocked(creds.RealKGID, target, creds.SavedKGID)
+	default:
+		return syserror.EPERM
+	}
+	return nil
+}
+
+// SetREGID implements the semantics of setregid(2).
+//
+// r and e are the requested real and effective GIDs, interpreted in t's user
+// namespace. A value for which Ok() is false (i.e. -1) leaves the
+// corresponding ID unchanged. SetREGID returns EINVAL if a supplied GID has no
+// mapping in t's user namespace, and EPERM if t lacks CAP_SETGID and the
+// requested change is not one that unprivileged callers may make.
+func (t *Task) SetREGID(r, e auth.GID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	newR := t.creds.RealKGID
+	if r.Ok() {
+		newR = t.creds.UserNamespace.MapToKGID(r)
+		if !newR.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	newE := t.creds.EffectiveKGID
+	if e.Ok() {
+		newE = t.creds.UserNamespace.MapToKGID(e)
+		if !newE.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	if !t.creds.HasCapability(linux.CAP_SETGID) {
+		// Unprivileged callers may only set the effective GID to the real,
+		// effective or saved GID ...
+		if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID {
+			return syserror.EPERM
+		}
+		// ... and the real GID to the real or effective GID.
+		if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID {
+			return syserror.EPERM
+		}
+	}
+	// "If the real group ID is set (i.e., rgid is not -1) or the effective
+	// group ID is set to a value not equal to the previous real group ID, the
+	// saved set-group-ID will be set to the new effective group ID." -
+	// setreuid(2)
+	//
+	// The comparison is deliberately against the previous *real* GID (compare
+	// Linux's kernel/sys.c:setregid()), mirroring SetREUID.
+	newS := t.creds.SavedKGID
+	if r.Ok() || (e.Ok() && newE != t.creds.RealKGID) {
+		newS = newE
+	}
+	t.setKGIDsUncheckedLocked(newR, newE, newS)
+	return nil
+}
+
+// SetRESGID implements the semantics of the setresgid(2) syscall.
+func (t *Task) SetRESGID(r, e, s auth.GID) error {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ var err error
+ newR := t.creds.RealKGID
+ if r.Ok() {
+ newR, err = t.creds.UseGID(r)
+ if err != nil {
+ return err
+ }
+ }
+ newE := t.creds.EffectiveKGID
+ if e.Ok() {
+ newE, err = t.creds.UseGID(e)
+ if err != nil {
+ return err
+ }
+ }
+ newS := t.creds.SavedKGID
+ if s.Ok() {
+ newS, err = t.creds.UseGID(s)
+ if err != nil {
+ return err
+ }
+ }
+ t.setKGIDsUncheckedLocked(newR, newE, newS)
+ return nil
+}
+
+// setKGIDsUncheckedLocked installs newR, newE and newS as t's real, effective
+// and saved set-group-IDs without performing any permission checks. If the
+// effective GID changes, t's parent death signal is cleared.
+//
+// Preconditions: t.mu must be locked.
+func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
+	egidChanged := t.creds.EffectiveKGID != newE
+	t.creds.RealKGID = newR
+	t.creds.EffectiveKGID = newE
+	t.creds.SavedKGID = newS
+
+	// Not documented, but compare Linux's kernel/cred.c:commit_creds().
+	if egidChanged {
+		t.parentDeathSignal = 0
+	}
+}
+
+// SetExtraGIDs attempts to replace t's supplemental groups with gids. All IDs
+// are interpreted as being in t's user namespace.
+//
+// It returns EPERM if t lacks CAP_SETGID and EINVAL if any GID has no mapping
+// in t's user namespace; in either case the existing groups are left as they
+// were.
+func (t *Task) SetExtraGIDs(gids []auth.GID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	creds := t.creds
+	if !creds.HasCapability(linux.CAP_SETGID) {
+		return syserror.EPERM
+	}
+	mapped := make([]auth.KGID, 0, len(gids))
+	for _, gid := range gids {
+		kgid := creds.UserNamespace.MapToKGID(gid)
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+		mapped = append(mapped, kgid)
+	}
+	creds.ExtraKGIDs = mapped
+	return nil
+}
+
+// SetCapabilitySets attempts to change t's permitted, inheritable, and
+// effective capability sets. It returns EPERM, leaving all three sets
+// unchanged, if the requested sets violate any of the capabilities(7) rules
+// checked below.
+func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	creds := t.creds
+	switch {
+	case effective&^permitted != 0:
+		// "Permitted: This is a limiting superset for the effective
+		// capabilities that the thread may assume." - capabilities(7)
+		return syserror.EPERM
+	case !creds.HasCapability(linux.CAP_SETPCAP) && inheritable&^(creds.InheritableCaps|creds.PermittedCaps) != 0:
+		// "It is also a limiting superset for the capabilities that may be
+		// added to the inheritable set by a thread that does not have the
+		// CAP_SETPCAP capability in its effective set."
+		return syserror.EPERM
+	case permitted&^creds.PermittedCaps != 0:
+		// "If a thread drops a capability from its permitted set, it can
+		// never reacquire that capability (unless it execve(2)s ..."
+		return syserror.EPERM
+	case inheritable&^(creds.InheritableCaps|creds.BoundingCaps) != 0:
+		// "... if a capability is not in the bounding set, then a thread
+		// can't add this capability to its inheritable set, even if it was
+		// in its permitted capabilities ..."
+		return syserror.EPERM
+	}
+	creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps = permitted, inheritable, effective
+	return nil
+}
+
+// DropBoundingCapability attempts to remove capability cp from t's
+// capability bounding set. It returns EPERM if t does not have CAP_SETPCAP.
+func (t *Task) DropBoundingCapability(cp linux.Capability) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	creds := t.creds
+	if !creds.HasCapability(linux.CAP_SETPCAP) {
+		return syserror.EPERM
+	}
+	dropped := auth.CapabilitySetOf(cp)
+	creds.BoundingCaps = creds.BoundingCaps &^ dropped
+	return nil
+}
+
+// SetUserNamespace attempts to move t into ns. It returns EPERM if t's
+// credentials do not have CAP_SYS_ADMIN in ns. On success t's credentials
+// refer to ns, carry full permitted, effective and bounding capability sets
+// and an empty inheritable set, and have the keep-capabilities flag cleared.
+func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	creds := t.creds
+	// "A process reassociating itself with a user namespace must have the
+	// CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
+	//
+	// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
+	// in ns (by rule 3 in auth.Credentials.HasCapability).
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
+		return syserror.EPERM
+	}
+	creds.UserNamespace = ns
+
+	// "The child process created by clone(2) with the CLONE_NEWUSER flag
+	// starts out with a complete set of capabilities in the new user
+	// namespace. Likewise, a process that creates a new user namespace using
+	// unshare(2) or joins an existing user namespace using setns(2) gains a
+	// full set of capabilities in that namespace."
+	creds.PermittedCaps, creds.EffectiveCaps, creds.BoundingCaps =
+		auth.AllCapabilities, auth.AllCapabilities, auth.AllCapabilities
+	creds.InheritableCaps = 0
+
+	// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
+	// flag sets the "securebits" flags (see capabilities(7)) to their default
+	// values (all flags disabled) in the child (for clone(2)) or caller (for
+	// unshare(2), or setns(2)." - user_namespaces(7)
+	creds.KeepCaps = false
+	return nil
+}
+
+// SetKeepCaps sets t's "keep capabilities" flag (see prctl(2),
+// PR_SET_KEEPCAP) to k.
+func (t *Task) SetKeepCaps(k bool) {
+	t.mu.Lock()
+	t.creds.KeepCaps = k
+	t.mu.Unlock()
+}
+
+// updateCredsForExecLocked updates t.creds to reflect an execve().
+//
+// NOTE: We currently do not implement privileged executables
+// (set-user/group-ID bits and file capabilities). This allows us to make a lot
+// of simplifying assumptions:
+//
+// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which
+// disables the features we don't support anyway, is always set. This
+// drastically simplifies this function.
+//
+// - We don't implement AT_SECURE, because no_new_privs always being set means
+// that the conditions that require AT_SECURE never arise. (Compare Linux's
+// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().)
+//
+// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since
+// seccomp-bpf is also allowed if the task has no_new_privs set.
+//
+// - Task.ptraceAttach does not serialize with execve as it does in Linux,
+// since no_new_privs being set has the same effect as the presence of an
+// unprivileged tracer.
+//
+// Preconditions: t.mu must be locked.
+func (t *Task) updateCredsForExecLocked() {
+ // """
+ // During an execve(2), the kernel calculates the new capabilities of
+ // the process using the following algorithm:
+ //
+ // P'(permitted) = (P(inheritable) & F(inheritable)) |
+ // (F(permitted) & cap_bset)
+ //
+ // P'(effective) = F(effective) ? P'(permitted) : 0
+ //
+ // P'(inheritable) = P(inheritable) [i.e., unchanged]
+ //
+ // where:
+ //
+ // P denotes the value of a thread capability set before the
+ // execve(2)
+ //
+ // P' denotes the value of a thread capability set after the
+ // execve(2)
+ //
+ // F denotes a file capability set
+ //
+ // cap_bset is the value of the capability bounding set
+ //
+ // ...
+ //
+ // In order to provide an all-powerful root using capability sets, during
+ // an execve(2):
+ //
+ // 1. If a set-user-ID-root program is being executed, or the real user ID
+ // of the process is 0 (root) then the file inheritable and permitted sets
+ // are defined to be all ones (i.e. all capabilities enabled).
+ //
+ // 2. If a set-user-ID-root program is being executed, then the file
+ // effective bit is defined to be one (enabled).
+ //
+ // The upshot of the above rules, combined with the capabilities
+ // transformations described above, is that when a process execve(2)s a
+ // set-user-ID-root program, or when a process with an effective UID of 0
+ // execve(2)s a program, it gains all capabilities in its permitted and
+ // effective capability sets, except those masked out by the capability
+ // bounding set.
+ // """ - capabilities(7)
+ // (ambient capability sets omitted)
+ //
+ // As the last paragraph implies, the case of "a set-user-ID root program
+ // is being executed" also includes the case where (namespace) root is
+ // executing a non-set-user-ID program; the actual check is just based on
+ // the effective user ID.
+ var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0
+ fileEffective := false
+ root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
+ if t.creds.EffectiveKUID == root || t.creds.RealKUID == root {
+ newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps
+ if t.creds.EffectiveKUID == root {
+ fileEffective = true
+ }
+ }
+
+ // Now we enter poorly-documented, somewhat confusing territory. (The
+ // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds
+ // is not very helpful.) My reading of it is:
+ //
+ // If at least one of the following is true:
+ //
+ // A1. The execing task is ptraced, and the tracer did not have
+ // CAP_SYS_PTRACE in the execing task's user namespace at the time of
+ // PTRACE_ATTACH.
+ //
+ // A2. The execing task shares its FS context with at least one task in
+ // another thread group.
+ //
+ // A3. The execing task has no_new_privs set.
+ //
+ // AND at least one of the following is true:
+ //
+ // B1. The new effective user ID (which may come from set-user-ID, or be the
+ // execing task's existing effective user ID) is not equal to the task's
+ // real UID.
+ //
+ // B2. The new effective group ID (which may come from set-group-ID, or be
+ // the execing task's existing effective group ID) is not equal to the
+ // task's real GID.
+ //
+ // B3. The new permitted capability set contains capabilities not in the
+ // task's permitted capability set.
+ //
+ // Then:
+ //
+ // C1. Limit the new permitted capability set to the task's permitted
+ // capability set.
+ //
+ // C2. If either the task does not have CAP_SETUID in its user namespace, or
+ // the task has no_new_privs set, force the new effective UID and GID to
+ // the task's real UID and GID.
+ //
+ // But since no_new_privs is always set (A3 is always true), this becomes
+ // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1
+ // is a no-op. So we can just do C1 and C2 unconditionally.
+ if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
+ t.creds.EffectiveKUID = t.creds.RealKUID
+ t.creds.EffectiveKGID = t.creds.RealKGID
+ t.parentDeathSignal = 0
+ }
+ // (Saved set-user-ID is always set to the new effective user ID, and saved
+ // set-group-ID is always set to the new effective group ID, regardless of
+ // the above.)
+ t.creds.SavedKUID = t.creds.RealKUID
+ t.creds.SavedKGID = t.creds.RealKGID
+ t.creds.PermittedCaps &= newPermitted
+ if fileEffective {
+ t.creds.EffectiveCaps = t.creds.PermittedCaps
+ } else {
+ t.creds.EffectiveCaps = 0
+ }
+
+ // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
+ // calls to execve(2).
+ t.creds.KeepCaps = false
+
+ // "The bounding set is inherited at fork(2) from the thread's parent, and
+ // is preserved across an execve(2)". So we're done.
+}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
new file mode 100644
index 000000000..18efacb19
--- /dev/null
+++ b/pkg/sentry/kernel/task_log.go
@@ -0,0 +1,137 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sort"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+ // maxStackDebugBytes is the maximum number of user stack bytes that may be
+ // printed by debugDumpStack. debugDumpStack emits 16 bytes per log line,
+ // so this bounds a single dump at 64 lines.
+ maxStackDebugBytes = 1024
+)
+
+// Infof logs a formatted info message, prefixed with the task's cached log
+// prefix, by calling log.Infof. Nothing is done if info logging is disabled.
+func (t *Task) Infof(fmt string, v ...interface{}) {
+	if !log.IsLogging(log.Info) {
+		return
+	}
+	prefix := t.logPrefix.Load().(string)
+	log.Infof(prefix+fmt, v...)
+}
+
+// Warningf logs a formatted warning message, prefixed with the task's cached
+// log prefix, by calling log.Warningf. Nothing is done if warning logging is
+// disabled.
+func (t *Task) Warningf(fmt string, v ...interface{}) {
+	if !log.IsLogging(log.Warning) {
+		return
+	}
+	prefix := t.logPrefix.Load().(string)
+	log.Warningf(prefix+fmt, v...)
+}
+
+// Debugf logs a formatted debug message, prefixed with the task's cached log
+// prefix, by calling log.Debugf. Nothing is done if debug logging is
+// disabled.
+func (t *Task) Debugf(fmt string, v ...interface{}) {
+	if !log.IsLogging(log.Debug) {
+		return
+	}
+	prefix := t.logPrefix.Load().(string)
+	log.Debugf(prefix+fmt, v...)
+}
+
+// IsLogging returns true iff messages at the given level are being logged.
+// It defers entirely to log.IsLogging; the result does not depend on t.
+func (t *Task) IsLogging(level log.Level) bool {
+	enabled := log.IsLogging(level)
+	return enabled
+}
+
+// DebugDumpState logs task state at log level debug: registers, user stack
+// contents, memory mappings (if the task still has a memory manager) and the
+// FD map, in that order.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) DebugDumpState() {
+	t.debugDumpRegisters()
+	t.debugDumpStack()
+	mm := t.MemoryManager()
+	if mm != nil {
+		t.Debugf("Mappings:\n%s", mm)
+	}
+	fdm := t.FDMap()
+	t.Debugf("FDMap:\n%s", fdm)
+}
+
+// debugDumpRegisters logs register state at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpRegisters() {
+ if !t.IsLogging(log.Debug) {
+ return
+ }
+ regmap, err := t.Arch().RegisterMap()
+ if err != nil {
+ t.Debugf("Registers: %v", err)
+ } else {
+ t.Debugf("Registers:")
+ var regs []string
+ for reg := range regmap {
+ regs = append(regs, reg)
+ }
+ sort.Strings(regs)
+ for _, reg := range regs {
+ t.Debugf("%-8s = %016x", reg, regmap[reg])
+ }
+ }
+}
+
+// debugDumpStack logs user stack contents at log level debug. Starting from
+// the stack pointer rounded down to a 16-byte boundary, it prints up to
+// maxStackDebugBytes bytes, 16 per line, and stops at the first read error or
+// address overflow.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpStack() {
+	if !t.IsLogging(log.Debug) {
+		return
+	}
+	mm := t.MemoryManager()
+	if mm == nil {
+		t.Debugf("Memory manager for task is gone, skipping application stack dump.")
+		return
+	}
+	t.Debugf("Stack:")
+
+	// Each log line covers lineBytes bytes of stack.
+	const lineBytes = 16
+	// Round the stack pointer down to a lineBytes boundary.
+	base := usermem.Addr(t.Arch().Stack()) &^ usermem.Addr(lineBytes-1)
+	opts := usermem.IOOpts{
+		IgnorePermissions: true,
+	}
+	for off := uint64(0); off < maxStackDebugBytes; off += lineBytes {
+		lineAddr, ok := base.AddLength(off)
+		if !ok {
+			return
+		}
+		var line [lineBytes]byte
+		n, err := mm.CopyIn(t, lineAddr, line[:], opts)
+		// Print as much of the line as we can, even if an error was
+		// encountered.
+		if n > 0 {
+			t.Debugf("%x: % x", lineAddr, line[:n])
+		}
+		if err != nil {
+			t.Debugf("Error reading stack at address %x: %v", lineAddr+usermem.Addr(n), err)
+			return
+		}
+	}
+}
+
+// updateLogPrefixLocked updates the task's cached log prefix to reflect its
+// current thread ID.
+//
+// Preconditions: The task's owning TaskSet.mu must be locked.
+func (t *Task) updateLogPrefixLocked() {
+	// Use the task's TID in the root PID namespace for logging.
+	rootTID := t.tg.pidns.owner.Root.tids[t]
+	prefix := fmt.Sprintf("[% 4d] ", rootTID)
+	t.logPrefix.Store(prefix)
+}
diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go
new file mode 100644
index 000000000..4df2e53d3
--- /dev/null
+++ b/pkg/sentry/kernel/task_net.go
@@ -0,0 +1,35 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
+)
+
+// IsNetworkNamespaced returns true if t is in a non-root network namespace.
+// The flag is read under t.mu.
+func (t *Task) IsNetworkNamespaced() bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	namespaced := t.netns
+	return namespaced
+}
+
+// NetworkContext returns the network stack used by the task: the kernel's
+// network stack, unless t is in a non-root network namespace, in which case
+// it returns nil. NetworkContext may therefore return nil if no network stack
+// is available.
+func (t *Task) NetworkContext() inet.Stack {
+	if !t.IsNetworkNamespaced() {
+		return t.k.networkStack
+	}
+	return nil
+}
diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go
new file mode 100644
index 000000000..e529f0c2d
--- /dev/null
+++ b/pkg/sentry/kernel/task_resources.go
@@ -0,0 +1,126 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+)
+
+// TaskResources is the subset of a task's data provided by its creator that is
+// not provided by the loader.
+type TaskResources struct {
+ // SignalMask is the set of signals whose delivery is currently blocked.
+ //
+ // FIXME: Determine if we also need RealSignalMask
+ SignalMask linux.SignalSet
+
+ // FSContext is the filesystem context. The TaskResources holds a
+ // reference on it, which release drops.
+ *FSContext
+
+ // FDMap provides access to files to the task. The TaskResources holds a
+ // reference on it, which release drops.
+ *FDMap
+
+ // AbstractSockets tracks abstract sockets that are in use. Fork shares
+ // the same namespace with the duplicate rather than copying it.
+ AbstractSockets *AbstractSocketNamespace
+}
+
+// newTaskResources returns a new TaskResources backed by fdm and fc, with a
+// freshly created abstract socket namespace. It takes an additional reference
+// on fdm; no reference is taken on fc.
+func newTaskResources(fdm *FDMap, fc *FSContext) *TaskResources {
+	fdm.IncRef()
+	tr := new(TaskResources)
+	tr.FDMap = fdm
+	tr.FSContext = fc
+	tr.AbstractSockets = NewAbstractSocketNamespace()
+	return tr
+}
+
+// release drops the references tr holds on its FDMap and FSContext and
+// clears all of its pointers. release is called by the task when it exits.
+func (tr *TaskResources) release() {
+	tr.FDMap.DecRef()
+	tr.FSContext.DecRef()
+	tr.FDMap, tr.FSContext, tr.AbstractSockets = nil, nil, nil
+}
+
+// Fork returns a duplicate of tr.
+//
+// FIXME: Preconditions: When tr is owned by a Task, that task's
+// signal mutex must be locked, or Fork must be called by the task's goroutine.
+func (tr *TaskResources) Fork(shareFiles bool, shareFSContext bool) *TaskResources {
+ var fdmap *FDMap
+ if shareFiles {
+ fdmap = tr.FDMap
+ fdmap.IncRef()
+ } else {
+ fdmap = tr.FDMap.Fork()
+ }
+
+ var fsc *FSContext
+ if shareFSContext {
+ fsc = tr.FSContext
+ fsc.IncRef()
+ } else {
+ fsc = tr.FSContext.Fork()
+ }
+
+ return &TaskResources{
+ SignalMask: tr.SignalMask,
+ FDMap: fdmap,
+ FSContext: fsc,
+ AbstractSockets: tr.AbstractSockets,
+ }
+}
+
+// FDMap returns the FDMap held in t's TaskResources. No additional reference
+// is taken by this accessor.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) FDMap() *FDMap {
+	fdm := t.tr.FDMap
+	return fdm
+}
+
+// FSContext returns the FSContext held in t's TaskResources. No additional
+// reference is taken by this accessor.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) FSContext() *FSContext {
+	fsc := t.tr.FSContext
+	return fsc
+}
+
+// MountNamespace returns t's MountNamespace, which is the one held by t's
+// kernel. MountNamespace does not take an additional reference on the
+// returned MountNamespace.
+func (t *Task) MountNamespace() *fs.MountNamespace {
+	mns := t.k.mounts
+	return mns
+}
+
+// AbstractSockets returns the AbstractSocketNamespace held in t's
+// TaskResources.
+func (t *Task) AbstractSockets() *AbstractSocketNamespace {
+	ns := t.tr.AbstractSockets
+	return ns
+}
+
+// IsChrooted returns true if the root directory of t's FSContext is not the
+// root directory of t's MountNamespace. The reference obtained on the mount
+// namespace's root for the comparison is dropped before returning.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) IsChrooted() bool {
+	nsRoot := t.k.mounts.Root()
+	defer nsRoot.DecRef()
+	chrooted := t.tr.FSContext.root != nsRoot
+	return chrooted
+}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
new file mode 100644
index 000000000..94ce5582b
--- /dev/null
+++ b/pkg/sentry/kernel/task_run.go
@@ -0,0 +1,346 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "bytes"
+ "runtime"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// A taskRunState is a reified state in the task state machine. See README.md
+// for details. The canonical list of all run states, as well as transitions
+// between them, is given in run_states.dot.
+//
+// The set of possible states is enumerable and completely defined by the
+// kernel package, so taskRunState would ideally be represented by a
+// discriminated union. However, Go does not support sum types.
+//
+// Hence, as with TaskStop, data-free taskRunStates should be represented as
+// typecast nils (for example, (*runApp)(nil)) to avoid unnecessary
+// allocation.
+type taskRunState interface {
+ // execute executes the code associated with this state over the given task
+ // and returns the following state. If execute returns nil, the task
+ // goroutine should exit.
+ //
+ // It is valid to tail-call a following state's execute to avoid the
+ // overhead of converting the following state to an interface object and
+ // checking for stops, provided that the tail-call cannot recurse.
+ execute(*Task) taskRunState
+}
+
+// run runs the task goroutine. It repeatedly waits out any stops and then
+// executes the current run state, until a run state's execute returns nil.
+//
+// threadID is a dummy value set to the task's TID in the root PID namespace to
+// make it visible in stack dumps. A goroutine for a given task can be
+// identified by searching for Task.run()'s argument value.
+func (t *Task) run(threadID uintptr) {
+ // Construct t.blockingTimer here. We do this here because we can't
+ // reconstruct t.blockingTimer during restore in Task.afterLoad(), because
+ // kernel.timekeeper.SetClocks() hasn't been called yet.
+ blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
+ t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
+ defer t.blockingTimer.Destroy()
+ t.blockingTimerChan = blockingTimerChan
+
+ // Activate our address space.
+ t.Activate()
+ // The corresponding t.Deactivate occurs in the exit path
+ // (runExitMain.execute) so that when
+ // Platform.CooperativelySharesAddressSpace() == true, we give up the
+ // AddressSpace before the task goroutine finishes executing.
+
+ // Ensure that thread group timers for execution time reflect that this
+ // task now exists.
+ t.tg.tm.kick()
+
+ // If this is a newly-started task, it should check for participation in
+ // group stops. If this is a task resuming after restore, it was
+ // interrupted by saving. In either case, the task is initially
+ // interrupted.
+ t.interruptSelf()
+
+ for {
+ // Explanation for this ordering:
+ //
+ // - A freshly-started task that is stopped should not do anything
+ // before it enters the stop.
+ //
+ // - If taskRunState.execute returns nil, the task goroutine should
+ // exit without checking for a stop.
+ //
+ // - Task.Start won't start Task.run if t.runState is nil, so this
+ // ordering is safe.
+ t.doStop()
+ t.runState = t.runState.execute(t)
+ if t.runState == nil {
+ // The task goroutine is exiting: record the Nonexistent state and
+ // call Done on each of the wait-group-style counters that track
+ // this goroutine (per task, per thread group, and per task set).
+ t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
+ t.goroutineStopped.Done()
+ t.tg.liveGoroutines.Done()
+ t.tg.pidns.owner.liveGoroutines.Done()
+ t.tg.pidns.owner.runningGoroutines.Done()
+
+ // Keep argument alive because stack trace for dead variables may not be correct.
+ runtime.KeepAlive(threadID)
+ return
+ }
+ }
+}
+
+// doStop is called by Task.run to block until the task is not stopped.
+//
+// While blocked, the task's address space is deactivated, the goroutine is
+// accounted as TaskGoroutineStopped, and the runningGoroutines and
+// goroutineStopped counters are each decremented by one. All of this is
+// undone by the deferred calls, which run in the reverse of the order in
+// which they are registered below.
+func (t *Task) doStop() {
+ // Fast path: no stop is in effect, so there is nothing to wait for.
+ if atomic.LoadInt32(&t.stopCount) == 0 {
+ return
+ }
+ t.Deactivate()
+ // NOTE: t.Activate() must be called without any locks held, so
+ // this defer must precede the defer for unlocking the signal mutex.
+ defer t.Activate()
+ t.accountTaskGoroutineEnter(TaskGoroutineStopped)
+ defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.tg.pidns.owner.runningGoroutines.Add(-1)
+ defer t.tg.pidns.owner.runningGoroutines.Add(1)
+ t.goroutineStopped.Add(-1)
+ defer t.goroutineStopped.Add(1)
+ // t.stopCount is read here with the signal mutex held.
+ // NOTE(review): endStopCond is presumably a condition variable bound to
+ // the signal mutex, so Wait releases it while blocked - confirm.
+ for t.stopCount > 0 {
+ t.endStopCond.Wait()
+ }
+}
+
+// The runApp state checks for interrupts before executing untrusted
+// application code.
+type runApp struct{}
+
+// execute implements taskRunState.execute.
+//
+// It first deals with pending interrupts, syscall restart, restoration of a
+// saved signal mask, restartable sequences and ptrace single-stepping. It
+// then switches to application code via t.p.Switch and dispatches on the
+// returned error: nil is treated as an application system call,
+// platform.ErrContextInterrupt re-enters runApp, platform.ErrContextSignal is
+// handled as a fault or signal, platform.ErrContextCPUPreempted sets
+// t.rseqPreempted, and any other error makes the task exit.
+func (*runApp) execute(t *Task) taskRunState {
+ if t.interrupted() {
+ // Checkpointing instructs tasks to stop by sending an interrupt, so we
+ // must check for stops before entering runInterrupt (instead of
+ // tail-calling it).
+ return (*runInterrupt)(nil)
+ }
+
+ // We're about to switch to the application again. If there's still an
+ // unhandled SyscallRestartErrno that wasn't translated to an EINTR,
+ // restart the syscall that was interrupted. If there's a saved signal
+ // mask, restore it. (Note that restoring the saved signal mask may unblock
+ // a pending signal, causing another interruption, but that signal should
+ // not interact with the interrupted syscall.)
+ if t.haveSyscallReturn {
+ if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ if sre == ERESTART_RESTARTBLOCK {
+ t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
+ t.Arch().RestartSyscallWithRestartBlock()
+ } else {
+ t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
+ t.Arch().RestartSyscall()
+ }
+ }
+ t.haveSyscallReturn = false
+ }
+ if t.haveSavedSignalMask {
+ t.SetSignalMask(t.savedSignalMask)
+ t.haveSavedSignalMask = false
+ if t.interrupted() {
+ return (*runInterrupt)(nil)
+ }
+ }
+
+ // Apply restartable sequences.
+ if t.rseqPreempted {
+ t.rseqPreempted = false
+ if t.rseqCPUAddr != 0 {
+ if err := t.rseqCopyOutCPU(); err != nil {
+ t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
+ t.forceSignal(linux.SIGSEGV, false)
+ t.SendSignal(sigPriv(linux.SIGSEGV))
+ // Re-enter the task run loop for signal delivery.
+ return (*runApp)(nil)
+ }
+ }
+ t.rseqInterrupt()
+ }
+
+ // Check if we need to enable single-stepping. Tracers expect that the
+ // kernel preserves the value of the single-step flag set by PTRACE_SETREGS
+ // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
+ // includes our ptrace platform, by the way), so we should only clear the
+ // single-step flag if we're responsible for setting it. (clearSinglestep
+ // is therefore analogous to Linux's TIF_FORCED_TF.)
+ //
+ // Strictly speaking, we should also not clear the single-step flag if we
+ // single-step through an instruction that sets the single-step flag
+ // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
+ // own TF. (Famous last words, I know.)
+ clearSinglestep := false
+ if t.hasTracer() {
+ t.tg.pidns.owner.mu.RLock()
+ if t.ptraceSinglestep {
+ clearSinglestep = !t.Arch().SingleStep()
+ t.Arch().SetSingleStep()
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+ }
+
+ // Run application code; the time spent in Switch is accounted as
+ // TaskGoroutineRunningApp.
+ t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
+ info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
+ t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
+
+ if clearSinglestep {
+ t.Arch().ClearSingleStep()
+ }
+
+ switch err {
+ case nil:
+ // Handle application system call.
+ return t.doSyscall()
+
+ case platform.ErrContextInterrupt:
+ // Interrupted by platform.Context.Interrupt(). Re-enter the run
+ // loop to figure out why.
+ return (*runApp)(nil)
+
+ case platform.ErrContextSignal:
+ // Looks like a signal has been delivered to us. If it's a synchronous
+ // signal (SEGV, SIGBUS, etc.), it should be sent to the application
+ // thread that received it.
+ sig := linux.Signal(info.Signo)
+
+ // Was it a fault that we should handle internally? If so, this wasn't
+ // an application-generated signal and we should continue execution
+ // normally.
+ if at.Any() {
+ addr := usermem.Addr(info.Addr())
+ err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+ if err == nil {
+ // The fault was handled appropriately.
+ // We can resume running the application.
+ return (*runApp)(nil)
+ }
+
+ // Is this a vsyscall that we need to emulate?
+ if at.Execute {
+ if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
+ return t.doVsyscall(addr, sysno)
+ }
+ }
+
+ // The JVM will trigger these errors constantly, so don't
+ // spam logs with this error.
+ if err == syserror.EFAULT || err == syserror.EPERM {
+ t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
+ } else {
+ t.Warningf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
+ }
+ t.DebugDumpState()
+
+ // Continue to signal handling.
+ //
+ // Convert a BusError error to a SIGBUS from a SIGSEGV. All
+ // other info bits stay the same (address, etc.).
+ if _, ok := err.(*memmap.BusError); ok {
+ sig = linux.SIGBUS
+ info.Signo = int32(linux.SIGBUS)
+ }
+ }
+
+ switch sig {
+ case linux.SIGILL:
+ // N.B. The debug stuff here is arguably
+ // expensive. Don't fret. This gets called
+ // about 5 times for a typical application, if
+ // that.
+ t.Debugf("SIGILL @ %x", t.Arch().IP())
+
+ // Is this a CPUID instruction?
+ expected := arch.CPUIDInstruction[:]
+ found := make([]byte, len(expected))
+ _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+ if err == nil && bytes.Equal(expected, found) {
+ // Skip the cpuid instruction.
+ t.Arch().CPUIDEmulate(t)
+ t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+ break
+ }
+
+ // Treat it like any other synchronous signal.
+ fallthrough
+
+ case linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
+ // Synchronous signal. Send it to ourselves. Assume the signal is
+ // legitimate and force it (work around the signal being ignored or
+ // blocked) like Linux does. Conveniently, this is even the correct
+ // behavior for SIGTRAP from single-stepping.
+ t.forceSignal(linux.Signal(sig), false /* unconditional */)
+ t.SendSignal(info)
+
+ case platform.SignalInterrupt:
+ // Assume that a call to platform.Context.Interrupt() misfired.
+
+ case linux.SIGPROF:
+ // It's a profiling interrupt: there's not much
+ // we can do. We've already paid a decent cost
+ // by intercepting the signal, at this point we
+ // simply ignore it.
+
+ default:
+ // Asynchronous signal. Let the system deal with it.
+ t.k.sendExternalSignal(info, "application")
+ }
+
+ return (*runApp)(nil)
+
+ case platform.ErrContextCPUPreempted:
+ // Ensure that RSEQ critical sections are interrupted and per-thread
+ // CPU values are updated before the next platform.Context.Switch().
+ t.rseqPreempted = true
+ return (*runApp)(nil)
+
+ default:
+ // What happened? Can't continue.
+ t.Warningf("Unexpected SwitchToApp error: %v", err)
+ t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)})
+ return (*runExit)(nil)
+ }
+}
+
+// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
+//
+// It waits on t.goroutineStopped, which Task.doStop decrements for the
+// duration of a stop and on which Task.run calls Done when the task goroutine
+// exits.
+func (t *Task) waitGoroutineStoppedOrExited() {
+ t.goroutineStopped.Wait()
+}
+
+// WaitExited blocks until all task goroutines in tg have exited.
+//
+// WaitExited does not correspond to anything in Linux; it's provided so that
+// external callers of Kernel.CreateProcess can wait for the created thread
+// group to terminate.
+//
+// It waits on tg.liveGoroutines, on which Task.run calls Done as each task
+// goroutine exits.
+func (tg *ThreadGroup) WaitExited() {
+ tg.liveGoroutines.Wait()
+}
+
+// Yield yields the processor for the calling task.
+//
+// It atomically increments t.yieldCount (reported as VoluntarySwitches by
+// cpuStatsAt) and then calls runtime.Gosched.
+func (t *Task) Yield() {
+ atomic.AddUint64(&t.yieldCount, 1)
+ runtime.Gosched()
+}
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
new file mode 100644
index 000000000..b50139077
--- /dev/null
+++ b/pkg/sentry/kernel/task_sched.go
@@ -0,0 +1,329 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// CPU scheduling, real and fake.
+
+import (
+ "fmt"
+ "sync/atomic"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TaskGoroutineState is a coarse representation of the current execution
+// status of a kernel.Task goroutine.
+//
+// The values below are assigned with iota, so their declaration order
+// determines their numeric values.
+type TaskGoroutineState int
+
+const (
+ // TaskGoroutineNonexistent indicates that the task goroutine has either
+ // not yet been created by Task.Start() or has returned from Task.run().
+ // This must be the zero value for TaskGoroutineState.
+ TaskGoroutineNonexistent TaskGoroutineState = iota
+
+ // TaskGoroutineRunningSys indicates that the task goroutine is executing
+ // sentry code.
+ TaskGoroutineRunningSys
+
+ // TaskGoroutineRunningApp indicates that the task goroutine is executing
+ // application code.
+ TaskGoroutineRunningApp
+
+ // TaskGoroutineBlockedInterruptible indicates that the task goroutine is
+ // blocked in Task.block(), and hence may be woken by Task.interrupt()
+ // (e.g. due to signal delivery).
+ TaskGoroutineBlockedInterruptible
+
+ // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
+ // stopped outside of Task.block() and Task.doStop(), and hence cannot be
+ // woken by Task.interrupt().
+ TaskGoroutineBlockedUninterruptible
+
+ // TaskGoroutineStopped indicates that the task goroutine is blocked in
+ // Task.doStop(). TaskGoroutineStopped is similar to
+ // TaskGoroutineBlockedUninterruptible, but is a separate state to make it
+ // possible to determine when Task.stop is meaningful.
+ TaskGoroutineStopped
+)
+
+// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
+// be read and updated atomically.
+//
+// For a Task, writers bracket updates with t.goschedSeq.BeginWrite and
+// EndWrite (see accountTaskGoroutineEnter and accountTaskGoroutineLeave), and
+// Task.TaskGoroutineSchedInfo reads a copy through
+// SeqAtomicLoadTaskGoroutineSchedInfo.
+type TaskGoroutineSchedInfo struct {
+ // Timestamp was the value of Kernel.cpuClock when this
+ // TaskGoroutineSchedInfo was last updated.
+ Timestamp uint64
+
+ // State is the current state of the task goroutine.
+ State TaskGoroutineState
+
+ // UserTicks is the amount of time the task goroutine has spent executing
+ // its associated Task's application code, in units of linux.ClockTick.
+ UserTicks uint64
+
+ // SysTicks is the amount of time the task goroutine has spent executing in
+ // the sentry, in units of linux.ClockTick.
+ SysTicks uint64
+}
+
+// accountTaskGoroutineEnter records that the task goroutine is leaving
+// TaskGoroutineRunningSys and entering state. The CPU clock time elapsed
+// since the previous update is added to t.gosched.SysTicks. It panics if the
+// recorded state is not TaskGoroutineRunningSys.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
+ now := t.k.CPUClockNow()
+ if t.gosched.State != TaskGoroutineRunningSys {
+ panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
+ }
+ t.goschedSeq.BeginWrite()
+ // This function is very hot; avoid defer.
+ t.gosched.SysTicks += now - t.gosched.Timestamp
+ t.gosched.Timestamp = now
+ t.gosched.State = state
+ t.goschedSeq.EndWrite()
+}
+
+// accountTaskGoroutineLeave records that the task goroutine is leaving state
+// and returning to TaskGoroutineRunningSys. The CPU clock time elapsed since
+// the previous update is added to t.gosched.UserTicks only when state is
+// TaskGoroutineRunningApp; time spent in any other state is not accumulated.
+// It panics if the recorded state is not state.
+//
+// Preconditions: The caller must be running on the task goroutine, and leaving
+// a state indicated by a previous call to
+// t.accountTaskGoroutineEnter(state).
+func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
+ now := t.k.CPUClockNow()
+ if t.gosched.State != state {
+ panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
+ }
+ t.goschedSeq.BeginWrite()
+ // This function is very hot; avoid defer.
+ if state == TaskGoroutineRunningApp {
+ t.gosched.UserTicks += now - t.gosched.Timestamp
+ }
+ t.gosched.Timestamp = now
+ t.gosched.State = TaskGoroutineRunningSys
+ t.goschedSeq.EndWrite()
+}
+
+// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
+// Most clients should use t.CPUStats() instead.
+//
+// The copy is obtained with SeqAtomicLoadTaskGoroutineSchedInfo using
+// t.goschedSeq, the same sequence counter that writers of t.gosched bracket
+// their updates with.
+func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
+ return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
+}
+
+// CPUStats returns the CPU usage statistics of t, evaluated at the current
+// value of the Kernel CPU clock (see cpuStatsAt).
+func (t *Task) CPUStats() usage.CPUStats {
+ return t.cpuStatsAt(t.k.CPUClockNow())
+}
+
+// cpuStatsAt returns t's CPU usage statistics as of CPU clock value now.
+//
+// The tick counts stored in t's scheduling info only cover time up to its
+// Timestamp. If now is later and the goroutine is currently running sentry or
+// application code, the time since Timestamp is added to the matching
+// counter before the counts are converted to durations.
+//
+// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
+// monotonic, this is satisfied if now is the result of a previous call to
+// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
+// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making
+// the returned stats non-monotonic.
+func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
+	info := t.TaskGoroutineSchedInfo()
+	userTicks, sysTicks := info.UserTicks, info.SysTicks
+	if now > info.Timestamp {
+		// Credit execution since the last update to t.gosched.
+		elapsed := now - info.Timestamp
+		if info.State == TaskGoroutineRunningSys {
+			sysTicks += elapsed
+		} else if info.State == TaskGoroutineRunningApp {
+			userTicks += elapsed
+		}
+	}
+	tick := uint64(linux.ClockTick)
+	return usage.CPUStats{
+		UserTime:          time.Duration(userTicks * tick),
+		SysTime:           time.Duration(sysTicks * tick),
+		VoluntarySwitches: atomic.LoadUint64(&t.yieldCount),
+	}
+}
+
+// CPUStats returns the combined CPU usage statistics of all past and present
+// threads in tg.
+func (tg *ThreadGroup) CPUStats() usage.CPUStats {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ // Hack to get a pointer to the Kernel.
+ if tg.leader == nil {
+ // Per comment on tg.leader, this is only possible if nothing in the
+ // ThreadGroup has ever executed anyway.
+ return usage.CPUStats{}
+ }
+ now := tg.leader.k.CPUClockNow()
+ stats := tg.exitedCPUStats
+ // Account for active tasks.
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ stats.Accumulate(t.cpuStatsAt(now))
+ }
+ return stats
+}
+
+// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
+// resource usage statistics for all children of [tg] that have terminated and
+// been waited for. These statistics will include the resources used by
+// grandchildren, and further removed descendants, if all of the intervening
+// descendants waited on their terminated children."
+//
+// The value returned is a copy of tg.childCPUStats taken while holding the
+// task set mutex for reading.
+func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
+	tg.pidns.owner.mu.RLock()
+	joined := tg.childCPUStats
+	tg.pidns.owner.mu.RUnlock()
+	return joined
+}
+
+// StateStatus returns a string representation of the task's current state,
+// appropriate for /proc/[pid]/status.
+//
+// The result is derived from the task goroutine's scheduling state. For a
+// nonexistent goroutine t.exitState is consulted under the task set mutex,
+// and for a stopped goroutine the type of t.stop is consulted under the
+// signal mutex. It panics on a TaskGoroutineState that is not handled below.
+func (t *Task) StateStatus() string {
+ switch s := t.TaskGoroutineSchedInfo().State; s {
+ case TaskGoroutineNonexistent:
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ switch t.exitState {
+ case TaskExitZombie:
+ return "Z (zombie)"
+ case TaskExitDead:
+ return "X (dead)"
+ default:
+ // The task goroutine can't exit before passing through
+ // runExitNotify, so this indicates that the task has been created,
+ // but the task goroutine hasn't yet started. The Linux equivalent
+ // is struct task_struct::state == TASK_NEW
+ // (kernel/fork.c:copy_process() =>
+ // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
+ // masked out by TASK_REPORT for /proc/[pid]/status, leaving only
+ // TASK_RUNNING.
+ return "R (running)"
+ }
+ case TaskGoroutineRunningSys, TaskGoroutineRunningApp:
+ return "R (running)"
+ case TaskGoroutineBlockedInterruptible:
+ return "S (sleeping)"
+ case TaskGoroutineStopped:
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ switch t.stop.(type) {
+ case *groupStop:
+ return "T (stopped)"
+ case *ptraceStop:
+ return "t (tracing stop)"
+ }
+ // Any other kind of stop is reported like an uninterruptible sleep.
+ fallthrough
+ case TaskGoroutineBlockedUninterruptible:
+ // This is the name Linux uses for TASK_UNINTERRUPTIBLE and
+ // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
+ // fs/proc/array.c:task_state_array.
+ return "D (disk sleep)"
+ default:
+ panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
+ }
+}
+
+// CPUMask returns a copy of t's allowed CPU mask.
+//
+// The copy is made with t.mu held, so the caller may keep or modify the
+// result without affecting t.allowedCPUMask.
+func (t *Task) CPUMask() sched.CPUSet {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.allowedCPUMask.Copy()
+}
+
+// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
+// mask.
+//
+// CPUs at or above Kernel.applicationCores are cleared from mask first. If no
+// CPU remains, EINVAL is returned and t is left unchanged. When the kernel
+// uses host cores the call succeeds without storing anything. Otherwise the
+// mask is stored and t's virtual CPU number is reassigned from it.
+//
+// Preconditions: mask.Size() ==
+// sched.CPUSetSize(t.Kernel().ApplicationCores()).
+func (t *Task) SetCPUMask(mask sched.CPUSet) error {
+	wantSize := sched.CPUSetSize(t.k.applicationCores)
+	if mask.Size() != wantSize {
+		panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, wantSize))
+	}
+
+	// Remove CPUs in mask above Kernel.applicationCores.
+	mask.ClearAbove(t.k.applicationCores)
+
+	switch {
+	case mask.NumCPUs() == 0:
+		// At least 1 CPU must still be allowed.
+		return syserror.EINVAL
+	case t.k.useHostCores:
+		// No-op; pretend the mask was immediately changed back.
+		return nil
+	}
+
+	// Look up t's TID in the root PID namespace; it seeds the CPU choice.
+	t.tg.pidns.owner.mu.RLock()
+	rootTID := t.tg.pidns.owner.Root.tids[t]
+	t.tg.pidns.owner.mu.RUnlock()
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.allowedCPUMask = mask
+	atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID))
+	return nil
+}
+
+// CPU returns the cpu id for a given task: the atomically-loaded virtual CPU
+// number stored in t.cpu, or the host CPU reported by hostcpu.GetCPU when the
+// kernel is configured to use host cores.
+func (t *Task) CPU() int32 {
+	if !t.k.useHostCores {
+		return atomic.LoadInt32(&t.cpu)
+	}
+
+	hostCPU := hostcpu.GetCPU()
+	return int32(hostCPU)
+}
+
+// assignCPU returns the virtualized CPU number for the task with global TID
+// tid and allowedCPUMask allowed.
+//
+// The choice is deterministic: the CPUs in allowed are visited in the order
+// produced by allowed.ForEachCPU, and the one at zero-based position
+// tid % allowed.NumCPUs() is returned.
+//
+// Preconditions: allowed.NumCPUs() > 0; otherwise the modulo below panics
+// with an integer division by zero.
+func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
+	// To pretend that threads are evenly distributed to allowed CPUs, choose n
+	// to be less than the number of CPUs in allowed ...
+	n := int(tid) % int(allowed.NumCPUs())
+	// ... then pick the nth (zero-based) CPU in allowed.
+	//
+	// n must be tested before it is decremented. n lies in
+	// [0, NumCPUs-1], so decrementing first never matches when n starts at 0
+	// (leaving cpu at its zero value even if CPU 0 is not in allowed) and
+	// can never select the last CPU in allowed.
+	allowed.ForEachCPU(func(c uint) {
+		if n == 0 {
+			cpu = int32(c)
+		}
+		n--
+	})
+	return cpu
+}
+
+// Niceness returns t's niceness, read with t.mu held.
+func (t *Task) Niceness() int {
+	t.mu.Lock()
+	nice := t.niceness
+	t.mu.Unlock()
+	return nice
+}
+
+// Priority returns t's priority, which is t's niceness (read with t.mu held)
+// plus 20.
+func (t *Task) Priority() int {
+	t.mu.Lock()
+	nice := t.niceness
+	t.mu.Unlock()
+	return nice + 20
+}
+
+// SetNiceness sets t's niceness to n. The value is stored with t.mu held and
+// is not range-checked here.
+func (t *Task) SetNiceness(n int) {
+	t.mu.Lock()
+	t.niceness = n
+	t.mu.Unlock()
+}
+
+// NumaPolicy returns t's current numa policy and node mask. Both values are
+// read together with t.mu held.
+func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) {
+	t.mu.Lock()
+	policy, nodeMask = t.numaPolicy, t.numaNodeMask
+	t.mu.Unlock()
+	return policy, nodeMask
+}
+
+// SetNumaPolicy sets t's numa policy and node mask. Both values are stored
+// together with t.mu held; neither is validated here.
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) {
+	t.mu.Lock()
+	t.numaPolicy, t.numaNodeMask = policy, nodeMask
+	t.mu.Unlock()
+}
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
new file mode 100644
index 000000000..2340256b0
--- /dev/null
+++ b/pkg/sentry/kernel/task_signals.go
@@ -0,0 +1,1056 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file defines the behavior of task signal handling.
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SignalAction is an internal signal action.
+type SignalAction int
+
+// Available signal actions.
+// Note that although we refer to the complete set internally,
+// the application is only capable of using the Default and
+// Ignore actions from the system call interface.
+//
+// SignalActionTerm is the zero value of SignalAction.
+const (
+ SignalActionTerm SignalAction = iota // terminate the thread group
+ SignalActionCore // handled like SignalActionTerm by Task.deliverSignal
+ SignalActionStop // initiate a group stop
+ SignalActionIgnore // discard the signal
+ SignalActionHandler // run the user-configured handler
+)
+
+// Default signal handler actions. Note that for most signals,
+// (except SIGKILL and SIGSTOP) these can be overridden by the app.
+//
+// Looking up a signal that has no entry here yields the zero SignalAction,
+// which is SignalActionTerm.
+var defaultActions = map[linux.Signal]SignalAction{
+ // POSIX.1-1990 standard.
+ linux.SIGHUP: SignalActionTerm,
+ linux.SIGINT: SignalActionTerm,
+ linux.SIGQUIT: SignalActionCore,
+ linux.SIGILL: SignalActionCore,
+ linux.SIGABRT: SignalActionCore,
+ linux.SIGFPE: SignalActionCore,
+ linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects
+ linux.SIGSEGV: SignalActionCore,
+ linux.SIGPIPE: SignalActionTerm,
+ linux.SIGALRM: SignalActionTerm,
+ linux.SIGTERM: SignalActionTerm,
+ linux.SIGUSR1: SignalActionTerm,
+ linux.SIGUSR2: SignalActionTerm,
+ linux.SIGCHLD: SignalActionIgnore,
+ linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects
+ linux.SIGSTOP: SignalActionStop,
+ linux.SIGTSTP: SignalActionStop,
+ linux.SIGTTIN: SignalActionStop,
+ linux.SIGTTOU: SignalActionStop,
+ // POSIX.1-2001 standard.
+ linux.SIGBUS: SignalActionCore,
+ linux.SIGPROF: SignalActionTerm,
+ linux.SIGSYS: SignalActionCore,
+ linux.SIGTRAP: SignalActionCore,
+ linux.SIGURG: SignalActionIgnore,
+ linux.SIGVTALRM: SignalActionTerm,
+ linux.SIGXCPU: SignalActionCore,
+ linux.SIGXFSZ: SignalActionCore,
+ // The rest on linux.
+ linux.SIGSTKFLT: SignalActionTerm,
+ linux.SIGIO: SignalActionTerm,
+ linux.SIGPWR: SignalActionTerm,
+ linux.SIGWINCH: SignalActionIgnore,
+}
+
+// computeAction maps a signal number and its arch.SignalAct to the
+// SignalAction that should be taken.
+//
+// Three signals are decided by number alone, regardless of act:
+//   - SIGSTOP always yields SignalActionStop;
+//   - SIGKILL always yields SignalActionTerm;
+//   - signal 0 always yields SignalActionIgnore, as many programs use it for
+//     various internal functions and don't expect it to do anything.
+//
+// For every other signal act.Handler decides: arch.SignalActDefault selects
+// the entry for sig in defaultActions, arch.SignalActIgnore yields
+// SignalActionIgnore, and any other value yields SignalActionHandler.
+func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction {
+	// Dispositions that act cannot override.
+	if sig == linux.SIGSTOP {
+		return SignalActionStop
+	}
+	if sig == linux.SIGKILL {
+		return SignalActionTerm
+	}
+	if sig == linux.Signal(0) {
+		return SignalActionIgnore
+	}
+
+	// Everything else follows the configured handler.
+	handler := act.Handler
+	if handler == arch.SignalActDefault {
+		return defaultActions[sig]
+	}
+	if handler == arch.SignalActIgnore {
+		return SignalActionIgnore
+	}
+	return SignalActionHandler
+}
+
+// UnblockableSignals contains the set of signals which cannot be blocked:
+// SIGKILL and SIGSTOP. Task.SignalReturn clears these from the signal mask it
+// restores.
+var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP)
+
+// StopSignals is the set of signals whose default action is SignalActionStop.
+// It lists the same four signals that map to SignalActionStop in
+// defaultActions.
+var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU)
+
+// dequeueSignalLocked removes and returns one pending signal that is not
+// blocked by t's current signal mask, or nil if there is none. t's own queue
+// is tried first, then the queue of t's thread group.
+//
+// Preconditions: t.tg.signalHandlers.mu must be locked.
+func (t *Task) dequeueSignalLocked() *arch.SignalInfo {
+	blocked := t.tr.SignalMask
+	info := t.pendingSignals.dequeue(blocked)
+	if info == nil {
+		info = t.tg.pendingSignals.dequeue(blocked)
+	}
+	return info
+}
+
+// TakeSignal removes and returns one pending signal that is not blocked by
+// mask, or a nil SignalInfo if there is none. t's own queue is tried first,
+// then the queue of t's thread group. Signal handlers are not affected.
+//
+// The task set mutex (for reading) and the signal mutex are held for the
+// duration of the call.
+func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+
+	info := t.pendingSignals.dequeue(mask)
+	if info == nil {
+		info = t.tg.pendingSignals.dequeue(mask)
+	}
+	return info
+}
+
+// discardSpecificLocked drops every pending instance of sig, both from the
+// thread-group-wide queue and from the queue of each task in tg.
+//
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) {
+	tg.pendingSignals.discardSpecific(sig)
+	t := tg.tasks.Front()
+	for t != nil {
+		t.pendingSignals.discardSpecific(sig)
+		t = t.Next()
+	}
+}
+
+// PendingSignals returns the set of pending signals: the union of the signals
+// pending for t itself and those pending for t's thread group.
+//
+// The task set mutex (for reading) and the signal mutex are held while the
+// two sets are read.
+func (t *Task) PendingSignals() linux.SignalSet {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+
+	pending := t.pendingSignals.pendingSet
+	pending |= t.tg.pendingSignals.pendingSet
+	return pending
+}
+
+// deliverSignal delivers the given signal and returns the following run state.
+//
+// The action is chosen by computeAction(info.Signo, act). Terminating actions
+// (SignalActionTerm, SignalActionCore) prepare a group exit and return
+// runExit; every other action returns runInterrupt. If the task has a pending
+// syscall return carrying a restart errno and the signal is going to a user
+// handler, the syscall is either made to return EINTR or is restarted before
+// the handler is set up. It panics on an unknown SignalAction.
+func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState {
+ sigact := computeAction(linux.Signal(info.Signo), act)
+
+ if t.haveSyscallReturn {
+ if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+ // Signals that are ignored, cause a thread group stop, or
+ // terminate the thread group do not interact with interrupted
+ // syscalls; in Linux terms, they are never returned to the signal
+ // handling path from get_signal => get_signal_to_deliver. The
+ // behavior of an interrupted syscall is determined by the first
+ // signal that is actually handled (by userspace).
+ if sigact == SignalActionHandler {
+ switch {
+ case sre == ERESTARTNOHAND:
+ fallthrough
+ case sre == ERESTART_RESTARTBLOCK:
+ fallthrough
+ case (sre == ERESTARTSYS && !act.IsRestart()):
+ t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1)))
+ default:
+ t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
+ t.Arch().RestartSyscall()
+ }
+ }
+ }
+ }
+
+ switch sigact {
+ case SignalActionTerm, SignalActionCore:
+ // "Default action is to terminate the process." - signal(7)
+ t.Debugf("Signal %d: terminating thread group", info.Signo)
+ t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)})
+ return (*runExit)(nil)
+
+ case SignalActionStop:
+ // "Default action is to stop the process."
+ t.initiateGroupStop(info)
+
+ case SignalActionIgnore:
+ // "Default action is to ignore the signal."
+ t.Debugf("Signal %d: ignored", info.Signo)
+
+ case SignalActionHandler:
+ // Try to deliver the signal to the user-configured handler.
+ t.Debugf("Signal %d: delivering to handler", info.Signo)
+ if err := t.deliverSignalToHandler(info, act); err != nil {
+ t.Warningf("Failed to deliver signal %+v to user handler: %v", info, err)
+ // Send a forced SIGSEGV. If the signal that couldn't be delivered
+ // was a SIGSEGV, force the handler to SIG_DFL.
+ t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */)
+ t.SendSignal(sigPriv(linux.SIGSEGV))
+ }
+
+ default:
+ // The action is recomputed here only for the panic message; it has
+ // the same inputs as sigact above.
+ panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act)))
+ }
+ return (*runInterrupt)(nil)
+}
+
+// deliverSignalToHandler changes the task's userspace state to enter the given
+// user-configured handler for the given signal. If Arch().SignalSetup fails, its error is returned and neither haveSavedSignalMask nor the signal mask is updated.
+func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error {
+ // Signal delivery to an application handler interrupts restartable
+ // sequences.
+ t.rseqInterrupt()
+
+ // Are we executing on the main stack,
+ // or on the provided alternate stack?
+ sp := usermem.Addr(t.Arch().Stack())
+
+ // N.B. This is a *copy* of the alternate stack that the user's signal
+ // handler expects to see in its ucontext (even if it's not in use).
+ alt := t.signalStack
+ if act.IsOnStack() && alt.IsEnabled() {
+ alt.SetOnStack()
+ if !t.OnSignalStack(alt) {
+ sp = usermem.Addr(alt.Top())
+ }
+ }
+
+ // Set up the signal handler. If we have a saved signal mask, the signal
+ // handler should run with the current mask, but sigreturn should restore
+ // the saved one.
+ st := &arch.Stack{t.Arch(), t.MemoryManager(), sp}
+ mask := t.tr.SignalMask
+ if t.haveSavedSignalMask {
+ mask = t.savedSignalMask
+ }
+ if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
+ return err
+ }
+ t.haveSavedSignalMask = false
+
+ // Extend the current signal mask with act.Mask and, unless act.IsNoDefer(), the signal being delivered.
+ newMask := t.tr.SignalMask | act.Mask
+ if !act.IsNoDefer() {
+ newMask |= linux.SignalSetOf(linux.Signal(info.Signo))
+ }
+ t.SetSignalMask(newMask)
+
+ return nil
+}
+
+var ctrlResume = &SyscallControl{ignoreReturn: true}
+
+// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if
+// rt is true).
+func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
+ st := t.Stack()
+ sigset, err := t.Arch().SignalRestore(st, rt)
+ if err != nil {
+ return nil, err
+ }
+
+ // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked.
+ t.SetSignalMask(sigset &^ UnblockableSignals)
+
+ // TODO: sys_rt_sigreturn also calls restore_altstack from
+ // uc.stack, allowing the signal handler to implicitly mutate the signal
+ // stack.
+
+ return ctrlResume, nil
+}
+
+// SendSignal sends the given signal to t.
+//
+// The following errors may be returned:
+//
+// syserror.ESRCH - The task has exited.
+// syserror.EINVAL - The signal is not valid.
+// syserror.EAGAIN - The signal is realtime, and cannot be queued.
+//
+func (t *Task) SendSignal(info *arch.SignalInfo) error {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.sendSignalLocked(info, false /* group */)
+}
+
+// SendGroupSignal sends the given signal to t's thread group (sendSignalLocked with group set). It can return the same errors as Task.SendSignal.
+func (t *Task) SendGroupSignal(info *arch.SignalInfo) error {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ return t.sendSignalLocked(info, true /* group */)
+}
+
+// SendSignal sends the given signal to tg, using tg's leader to determine if
+// the signal is blocked.
+func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+ return tg.leader.sendSignalLocked(info, true /* group */)
+}
+
+// onCPULocked returns false if t is exiting; otherwise it returns true if t's task goroutine state is TaskGoroutineRunningApp, includeSys if it is TaskGoroutineRunningSys, and false for any other state. Preconditions: The TaskSet mutex must be locked.
+func (t *Task) onCPULocked(includeSys bool) bool {
+ // Task is exiting.
+ if t.exitState != TaskExitNone {
+ return false
+ }
+
+ switch t.TaskGoroutineSchedInfo().State {
+ case TaskGoroutineRunningSys:
+ return includeSys
+ case TaskGoroutineRunningApp:
+ return true
+ default:
+ return false
+ }
+}
+
+// SendTimerSignal mimics the process timer signal delivery behavior in Linux:
+// signals are delivered to the thread that triggers the timer expiration (see
+// kernel/time/posix-cpu-timers.c:check_process_timers()). This
+// means
+// 1) the thread is running on cpu at the time.
+// 2) a thread that runs more frequently will get more of those signals.
+//
+// We approximate this behavior by selecting a running task in a round-robin
+// fashion. Statistically, a thread running more often should have a higher
+// probability to be selected.
+func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+
+ // Find the next running thread.
+ var t *Task
+ if tg.lastTimerSignalTask == nil {
+ t = tg.tasks.Front()
+ } else {
+ t = tg.lastTimerSignalTask.Next()
+ }
+
+ // Iterate from lastTimerSignalTask.Next() to the last task in the task list.
+ for t != nil {
+ if t.onCPULocked(includeSys) {
+ tg.lastTimerSignalTask = t
+ return t.sendSignalLocked(info, true /* group */)
+ }
+ t = t.Next()
+ }
+
+ // t is nil when we reach here. If lastTimerSignalTask is not nil, iterate
+ // from Front to lastTimerSignalTask.
+ if tg.lastTimerSignalTask != nil {
+ for t := tg.tasks.Front(); t != tg.lastTimerSignalTask.Next(); t = t.Next() {
+ if t.onCPULocked(includeSys) {
+ tg.lastTimerSignalTask = t
+ return t.sendSignalLocked(info, true /* group */)
+ }
+ }
+ }
+
+ // No running threads? Just try the leader.
+ tg.lastTimerSignalTask = tg.leader
+ return tg.leader.sendSignalLocked(info, true /* group */)
+}
+
+func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error {
+ if t.exitState == TaskExitDead {
+ return syserror.ESRCH
+ }
+ sig := linux.Signal(info.Signo)
+ if sig == 0 {
+ return nil
+ }
+ if !sig.IsValid() {
+ return syserror.EINVAL
+ }
+
+ // Signal side effects apply even if the signal is ultimately discarded.
+ t.tg.applySignalSideEffectsLocked(sig)
+
+ // TODO: "Only signals for which the "init" process has established a
+ // signal handler can be sent to the "init" process by other members of the
+ // PID namespace. This restriction applies even to privileged processes,
+ // and prevents other members of the PID namespace from accidentally
+ // killing the "init" process." - pid_namespaces(7). We don't currently do
+ // this for child namespaces, though we should; we also don't do this for
+ // the root namespace (the same restriction applies to global init on
+ // Linux), where whether or not we should is much murkier. In practice,
+ // most sandboxed applications are not prepared to function as an init
+ // process.
+
+ // Unmasked, ignored signals are discarded without being queued, unless
+ // they will be visible to a tracer. Even for group signals, it's the
+ // originally-targeted task's signal mask and tracer that matter; compare
+ // Linux's kernel/signal.c:__send_signal() => prepare_signal() =>
+ // sig_ignored().
+ ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore
+ if linux.SignalSetOf(sig)&t.tr.SignalMask == 0 && ignored && !t.hasTracer() {
+ t.Debugf("Discarding ignored signal %d", sig)
+ return nil
+ }
+
+ q := &t.pendingSignals
+ if group {
+ q = &t.tg.pendingSignals
+ }
+ if !q.enqueue(info) {
+ if sig.IsRealtime() {
+ return syserror.EAGAIN
+ }
+ t.Debugf("Discarding duplicate signal %d", sig)
+ return nil
+ }
+
+ // Find a receiver to notify. Note that the task we choose to notify, if
+ // any, may not be the task that actually dequeues and handles the signal;
+ // e.g. a racing signal mask change may cause the notified task to become
+ // ineligible, or a racing sibling task may dequeue the signal first.
+ if t.canReceiveSignalLocked(sig) {
+ t.Debugf("Notified of signal %d", sig)
+ t.interrupt()
+ return nil
+ }
+ if group {
+ if nt := t.tg.findSignalReceiverLocked(sig); nt != nil {
+ nt.Debugf("Notified of group signal %d", sig)
+ nt.interrupt()
+ return nil
+ }
+ }
+ t.Debugf("No task notified of signal %d", sig)
+ return nil
+}
+
+func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) {
+ switch {
+ case linux.SignalSetOf(sig)&StopSignals != 0:
+ // Stop signals cause all prior SIGCONT to be discarded. (This is
+ // despite the fact this has little effect since SIGCONT's most
+ // important effect is applied when the signal is sent in the branch
+ // below, not when the signal is delivered.)
+ tg.discardSpecificLocked(linux.SIGCONT)
+ case sig == linux.SIGCONT:
+ // "The SIGCONT signal has a side effect of waking up (all threads of)
+ // a group-stopped process. This side effect happens before
+ // signal-delivery-stop. The tracer can't suppress this side effect (it
+ // can only suppress signal injection, which only causes the SIGCONT
+ // handler to not be executed in the tracee, if such a handler is
+ // installed)." - ptrace(2)
+ tg.endGroupStopLocked(true)
+ case sig == linux.SIGKILL:
+ // "SIGKILL does not generate signal-delivery-stop and therefore the
+ // tracer can't suppress it. SIGKILL kills even within system calls
+ // (syscall-exit-stop is not generated prior to death by SIGKILL)." -
+ // ptrace(2)
+ //
+ // Note that this differs from ThreadGroup.requestExit in that it
+ // ignores tg.execing.
+ if !tg.exiting {
+ tg.exiting = true
+ tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)}
+ }
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ t.killLocked()
+ }
+ }
+}
+
+// canReceiveSignalLocked returns true if t should be interrupted to receive
+// the given signal. canReceiveSignalLocked is analogous to Linux's
+// kernel/signal.c:wants_signal(), but see below for divergences.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
+ // - Do not choose tasks that are blocking the signal.
+ if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 {
+ return false
+ }
+ // - No need to check Task.exitState, as the exit path sets every bit in the
+ // signal mask when it transitions from TaskExitNone to TaskExitInitiated.
+ // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the
+ // task group via applySignalSideEffectsLocked => killLocked.
+ // - Do not choose stopped tasks, which cannot handle signals.
+ if t.stop != nil {
+ return false
+ }
+ // - TODO: No special case for when t is also the sending task,
+ // because the identity of the sender is unknown.
+ // - Do not choose tasks that have already been interrupted, as they may be
+ // busy handling another signal.
+ if len(t.interruptChan) != 0 {
+ return false
+ }
+ return true
+}
+
+// findSignalReceiverLocked returns a task in tg that should be interrupted to
+// receive the given signal. If no such task exists, findSignalReceiverLocked
+// returns nil.
+//
+// Linux actually records curr_target to balance the group signal targets; this implementation always scans from the front of tg.tasks and returns the first eligible task.
+//
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task {
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ if t.canReceiveSignalLocked(sig) {
+ return t
+ }
+ }
+ return nil
+}
+
+// forceSignal ensures that the task is not ignoring or blocking the given
+// signal. If unconditional is true, forceSignal takes action even if the
+// signal isn't being ignored or blocked.
+func (t *Task) forceSignal(sig linux.Signal, unconditional bool) {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.forceSignalLocked(sig, unconditional)
+}
+
+func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) {
+ blocked := linux.SignalSetOf(sig)&t.tr.SignalMask != 0
+ act := t.tg.signalHandlers.actions[sig]
+ ignored := act.Handler == arch.SignalActIgnore
+ if blocked || ignored || unconditional {
+ act.Handler = arch.SignalActDefault
+ t.tg.signalHandlers.actions[sig] = act
+ if blocked {
+ t.setSignalMaskLocked(t.tr.SignalMask &^ linux.SignalSetOf(sig))
+ }
+ }
+}
+
+// SignalMask returns a copy of t's signal mask.
+func (t *Task) SignalMask() linux.SignalSet {
+ return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.tr.SignalMask)))
+}
+
+// SetSignalMask sets t's signal mask.
+//
+// Preconditions: SetSignalMask can only be called by the task goroutine.
+// t.exitState < TaskExitZombie.
+func (t *Task) SetSignalMask(mask linux.SignalSet) {
+ // By precondition, t prevents t.tg from completing an execve and mutating
+ // t.tg.signalHandlers, so we can skip the TaskSet mutex.
+ t.tg.signalHandlers.mu.Lock()
+ t.setSignalMaskLocked(mask)
+ t.tg.signalHandlers.mu.Unlock()
+}
+
+// setSignalMaskLocked atomically stores mask as t's signal mask, then issues the interrupts made necessary by signals the change newly blocks or unblocks. Preconditions: The signal mutex must be locked.
+func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
+ oldMask := t.tr.SignalMask
+ atomic.StoreUint64((*uint64)(&t.tr.SignalMask), uint64(mask))
+
+ // If the new mask blocks any signals that were not blocked by the old
+ // mask, and at least one such signal is pending in tg.pendingSignals, and
+ // t has been woken, it could be the case that t was woken to handle that
+ // signal, but will no longer do so as a result of its new signal mask, so
+ // we have to pick a replacement.
+ blocked := mask &^ oldMask
+ blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet
+ if blockedGroupPending != 0 && t.interrupted() {
+ linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) {
+ if nt := t.tg.findSignalReceiverLocked(sig); nt != nil {
+ nt.interrupt()
+ return
+ }
+ })
+ // We have to re-issue the interrupt consumed by t.interrupted() since
+ // it might have been for a different reason.
+ t.interruptSelf()
+ }
+
+ // Conversely, if the new mask unblocks any signals that were blocked by
+ // the old mask, and at least one such signal is pending, we may now need
+ // to handle that signal.
+ unblocked := oldMask &^ mask
+ unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet)
+ if unblockedPending != 0 {
+ t.interruptSelf()
+ }
+}
+
+// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
+// comment).
+//
+// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
+func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
+ t.savedSignalMask = mask
+ t.haveSavedSignalMask = true
+}
+
+// SignalStack returns the task-private signal stack.
+func (t *Task) SignalStack() arch.SignalStack {
+ return t.signalStack
+}
+
+// OnSignalStack returns true if, when the task resumes running, it will run on
+// the given signal stack s, i.e. t.Arch().Stack() is in [s.Addr, s.Addr+s.Size).
+func (t *Task) OnSignalStack(s arch.SignalStack) bool {
+ sp := usermem.Addr(t.Arch().Stack())
+ return usermem.Addr(s.Addr) <= sp && sp < usermem.Addr(s.Addr+s.Size)
+}
+
+// SetSignalStack sets the task-private signal stack to alt, discarding all of
+// alt's flags except SignalStackFlagDisable. It always returns nil.
+func (t *Task) SetSignalStack(alt arch.SignalStack) error {
+ // Mask out irrelevant parts: only disable matters.
+ alt.Flags &= arch.SignalStackFlagDisable
+ t.signalStack = alt
+ return nil
+}
+
+// SetSignalAct atomically sets the thread group's signal action for signal sig
+// to *actptr (if actptr is not nil) and returns the old signal action. It returns EINVAL if sig is not valid, or if actptr is not nil and sig is SIGKILL or SIGSTOP.
+func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) {
+ if !sig.IsValid() {
+ return arch.SignalAct{}, syserror.EINVAL
+ }
+
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ sh := tg.signalHandlers
+ sh.mu.Lock()
+ defer sh.mu.Unlock()
+ oldact := sh.actions[sig]
+ if actptr != nil {
+ if sig == linux.SIGKILL || sig == linux.SIGSTOP {
+ return oldact, syserror.EINVAL
+ }
+
+ act := *actptr
+ act.Mask &^= UnblockableSignals
+ sh.actions[sig] = act
+ // From POSIX, by way of Linux:
+ //
+ // "Setting a signal action to SIG_IGN for a signal that is pending
+ // shall cause the pending signal to be discarded, whether or not it is
+ // blocked."
+ //
+ // "Setting a signal action to SIG_DFL for a signal that is pending and
+ // whose default action is to ignore the signal (for example, SIGCHLD),
+ // shall cause the pending signal to be discarded, whether or not it is
+ // blocked."
+ if computeAction(sig, act) == SignalActionIgnore {
+ tg.discardSpecificLocked(sig)
+ }
+ }
+ return oldact, nil
+}
+
+// CopyOutSignalAct converts the given SignalAct into an architecture-specific
+// type and then copies it out to task memory.
+func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
+ n := t.Arch().NewSignalAct()
+ n.SerializeFrom(s)
+ _, err := t.CopyOut(addr, n)
+ return err
+}
+
+// CopyInSignalAct copies an architecture-specific sigaction type from task
+// memory and then converts it into a SignalAct.
+func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
+ n := t.Arch().NewSignalAct()
+ var s arch.SignalAct
+ if _, err := t.CopyIn(addr, n); err != nil {
+ return s, err
+ }
+ n.DeserializeTo(&s)
+ return s, nil
+}
+
+// CopyOutSignalStack converts the given SignalStack into an
+// architecture-specific type and then copies it out to task memory.
+func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error {
+ n := t.Arch().NewSignalStack()
+ n.SerializeFrom(s)
+ _, err := t.CopyOut(addr, n)
+ return err
+}
+
+// CopyInSignalStack copies an architecture-specific stack_t from task memory
+// and then converts it into a SignalStack.
+func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) {
+ n := t.Arch().NewSignalStack()
+ var s arch.SignalStack
+ if _, err := t.CopyIn(addr, n); err != nil {
+ return s, err
+ }
+ n.DeserializeTo(&s)
+ return s, nil
+}
+
+// groupStop is a TaskStop placed on tasks that have received a stop signal
+// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from
+// the ptrace man page.)
+type groupStop struct{}
+
+// Killable implements TaskStop.Killable.
+func (*groupStop) Killable() bool { return true }
+
+type groupStopPhase int
+
+const (
+ // groupStopNone indicates that a thread group is not in, or attempting to
+ // enter or leave, a group stop.
+ groupStopNone groupStopPhase = iota
+
+ // groupStopDequeued indicates that at least one task in a thread group has
+ // dequeued a stop signal (or dequeued any signal and entered a
+ // signal-delivery-stop as a result, which allows ptrace to change the
+ // signal into a stop signal), but temporarily dropped the signal mutex
+ // without initiating the group stop.
+ //
+ // groupStopDequeued is analogous to JOBCTL_STOP_DEQUEUED in Linux.
+ groupStopDequeued
+
+ // groupStopInitiated indicates that a task in a thread group has initiated
+ // a group stop, but not all tasks in the thread group have acknowledged
+ // entering the group stop.
+ //
+ // groupStopInitiated is represented by JOBCTL_STOP_PENDING &&
+ // !SIGNAL_STOP_STOPPED in Linux.
+ groupStopInitiated
+
+ // groupStopComplete indicates that all tasks in a thread group have
+ // acknowledged entering the group stop, and the last one to do so has
+ // notified the thread group's parent.
+ //
+ // groupStopComplete is represented by JOBCTL_STOP_PENDING &&
+ // SIGNAL_STOP_STOPPED in Linux.
+ groupStopComplete
+)
+
+// initiateGroupStop attempts to initiate a group stop based on a
+// previously-dequeued stop signal.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) initiateGroupStop(info *arch.SignalInfo) {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ if t.tg.groupStopPhase != groupStopDequeued {
+ t.Debugf("Signal %d: not stopping thread group: lost to racing signal", info.Signo)
+ return
+ }
+ if t.tg.exiting {
+ t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo)
+ return
+ }
+ if t.tg.execing != nil {
+ t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo)
+ return
+ }
+ t.Debugf("Signal %d: stopping thread group", info.Signo)
+ t.tg.groupStopPhase = groupStopInitiated
+ t.tg.groupStopSignal = linux.Signal(info.Signo)
+ t.tg.groupStopCount = 0
+ for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() {
+ t2.groupStopRequired = true
+ t2.groupStopAcknowledged = false
+ t2.interrupt()
+ }
+}
+
+// endGroupStopLocked ensures that all prior stop signals received by tg are
+// not stopping tg and will not stop tg in the future. If broadcast is true,
+// parent and tracer notification will be scheduled if appropriate.
+//
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) {
+ // Discard all previously-queued stop signals.
+ linux.ForEachSignal(StopSignals, tg.discardSpecificLocked)
+
+ if tg.groupStopPhase != groupStopNone {
+ tg.leader.Debugf("Ending group stop currently in phase %d", tg.groupStopPhase)
+ if tg.groupStopPhase == groupStopInitiated || tg.groupStopPhase == groupStopComplete {
+ tg.groupStopSignal = 0
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ if _, ok := t.stop.(*groupStop); ok {
+ t.endInternalStopLocked()
+ }
+ }
+ if broadcast {
+ // Instead of notifying the parent here, set groupContNotify so
+ // that one of the continuing tasks does so. (Linux does
+ // something similar.) The reason we do this is to keep locking
+ // sane. In order to send a signal to the parent, we need to
+ // lock its signal mutex, but we're already holding tg's signal
+ // mutex, and the TaskSet mutex must be locked for writing for
+ // us to hold two signal mutexes. Since we don't want to
+ // require this for endGroupStopLocked (which is called from
+ // signal-sending paths), nor do we want to lose atomicity by
+ // releasing the mutexes we're already holding, just let the
+ // continuing thread group deal with it.
+ tg.groupContNotify = true
+ tg.groupContInterrupted = tg.groupStopPhase == groupStopInitiated
+ tg.groupContWaitable = true
+ }
+ }
+ // If groupStopPhase was groupStopDequeued, setting it to groupStopNone
+ // will cause following calls to initiateGroupStop to recognize that
+ // the group stop has been cancelled.
+ tg.groupStopPhase = groupStopNone
+ }
+}
+
+// signalStop sends SIGCHLD to t's thread group to report a new group stop,
+// group continue, or ptrace stop of target, unless that group's SIGCHLD action
+// is SignalActIgnore or has SignalFlagNoCldStop set. code and status are set in the signal sent.
+//
+// Preconditions: The TaskSet mutex must be locked (for reading or writing).
+func (t *Task) signalStop(target *Task, code int32, status int32) {
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD]
+ if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) {
+ sigchld := &arch.SignalInfo{
+ Signo: int32(linux.SIGCHLD),
+ Code: code,
+ }
+ sigchld.SetPid(int32(t.tg.pidns.tids[target]))
+ sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ sigchld.SetStatus(status)
+ // TODO: Set utime, stime.
+ t.sendSignalLocked(sigchld, true /* group */)
+ }
+}
+
+// The runInterrupt state handles conditions indicated by interrupts.
+type runInterrupt struct{}
+
+func (*runInterrupt) execute(t *Task) taskRunState {
+ // Interrupts are de-duplicated (if t is interrupted twice before
+ // t.interrupted() is called, t.interrupted() will only return true once),
+ // so early exits from this function must re-enter the runInterrupt state
+ // to check for more interrupt-signaled conditions.
+
+ t.tg.signalHandlers.mu.Lock()
+
+ // Did we just leave a group stop?
+ if t.tg.groupContNotify {
+ t.tg.groupContNotify = false
+ sig := t.tg.groupStopSignal
+ intr := t.tg.groupContInterrupted
+ t.tg.signalHandlers.mu.Unlock()
+ t.tg.pidns.owner.mu.RLock()
+ // For consistency with Linux, if the parent and (thread group
+ // leader's) tracer are in the same thread group, deduplicate
+ // notifications.
+ notifyParent := t.tg.leader.parent != nil
+ if tracer := t.tg.leader.ptraceTracer.Load().(*Task); tracer != nil {
+ if notifyParent && tracer.tg == t.tg.leader.parent.tg {
+ notifyParent = false
+ }
+ // Sending CLD_STOPPED to the tracer doesn't really make any sense;
+ // the thread group leader may have already entered the stop and
+ // notified its tracer accordingly. But it's consistent with
+ // Linux...
+ if intr {
+ tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
+ if !notifyParent {
+ tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop)
+ } else {
+ tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop)
+ }
+ } else {
+ tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
+ tracer.tg.eventQueue.Notify(EventGroupContinue)
+ }
+ }
+ if notifyParent {
+ // If groupContInterrupted, do as Linux does and pretend the group
+ // stop completed just before it ended. The theoretical behavior in
+ // this case would be to send a SIGCHLD indicating the completed
+ // stop, followed by a SIGCHLD indicating the continue. However,
+ // SIGCHLD is a standard signal, so the latter would always be
+ // dropped. Hence sending only the former is equivalent.
+ if intr {
+ t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop)
+ } else {
+ t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue)
+ }
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+ return (*runInterrupt)(nil)
+ }
+
+ // Do we need to enter a group stop?
+ if t.groupStopRequired {
+ t.groupStopRequired = false
+ sig := t.tg.groupStopSignal
+ notifyParent := false
+ if !t.groupStopAcknowledged {
+ t.groupStopAcknowledged = true
+ t.tg.groupStopCount++
+ if t.tg.groupStopCount == t.tg.activeTasks {
+ t.Debugf("Completing group stop")
+ notifyParent = true
+ t.tg.groupStopPhase = groupStopComplete
+ t.tg.groupStopWaitable = true
+ t.tg.groupContNotify = false
+ t.tg.groupContWaitable = false
+ }
+ }
+ // Drop the signal mutex so we can take the TaskSet mutex.
+ t.tg.signalHandlers.mu.Unlock()
+
+ t.tg.pidns.owner.mu.RLock()
+ if t.tg.leader.parent == nil {
+ notifyParent = false
+ }
+ if tracer := t.Tracer(); tracer != nil {
+ t.ptraceCode = int32(sig)
+ t.ptraceSiginfo = nil
+ if t.beginPtraceStopLocked() {
+ tracer.signalStop(t, arch.CLD_STOPPED, int32(sig))
+ // For consistency with Linux, if the parent and tracer are in the
+ // same thread group, deduplicate notification signals.
+ if notifyParent && tracer.tg == t.tg.leader.parent.tg {
+ notifyParent = false
+ tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop)
+ } else {
+ tracer.tg.eventQueue.Notify(EventTraceeStop)
+ }
+ }
+ } else {
+ t.tg.signalHandlers.mu.Lock()
+ if !t.killedLocked() {
+ t.beginInternalStopLocked((*groupStop)(nil))
+ }
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if notifyParent {
+ t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
+ t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
+ }
+ t.tg.pidns.owner.mu.RUnlock()
+
+ return (*runInterrupt)(nil)
+ }
+
+ // Are there signals pending?
+ if info := t.dequeueSignalLocked(); info != nil {
+ if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 && t.tg.groupStopPhase == groupStopNone {
+ // Indicate that we've dequeued a stop signal before
+ // unlocking the signal mutex; initiateGroupStop will check
+ // that the phase hasn't changed (or is at least another
+ // "stop signal dequeued" phase) after relocking it.
+ t.tg.groupStopPhase = groupStopDequeued
+ }
+ if t.ptraceSignalLocked(info) {
+ // Dequeueing the signal action must wait until after the
+ // signal-delivery-stop ends since the tracer can change or
+ // suppress the signal.
+ t.tg.signalHandlers.mu.Unlock()
+ return (*runInterruptAfterSignalDeliveryStop)(nil)
+ }
+ act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
+ t.tg.signalHandlers.mu.Unlock()
+ return t.deliverSignal(info, act)
+ }
+
+ t.tg.signalHandlers.mu.Unlock()
+ return (*runApp)(nil)
+}
+
+type runInterruptAfterSignalDeliveryStop struct{}
+
+func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState {
+ t.tg.pidns.owner.mu.Lock()
+ // Can't defer unlock: deliverSignal must be called without holding TaskSet
+ // mutex.
+ sig := linux.Signal(t.ptraceCode)
+ defer func() {
+ t.ptraceSiginfo = nil
+ }()
+ if !sig.IsValid() {
+ t.tg.pidns.owner.mu.Unlock()
+ return (*runInterrupt)(nil)
+ }
+ info := t.ptraceSiginfo
+ if sig != linux.Signal(info.Signo) {
+ info.Signo = int32(sig)
+ info.Errno = 0
+ info.Code = arch.SignalInfoUser
+ // pid isn't a valid field for all signal numbers, but Linux
+ // doesn't care (kernel/signal.c:ptrace_signal()).
+ //
+ // Linux uses t->parent for the tid and uid here, which is the tracer
+ // if it hasn't detached or the real parent otherwise.
+ parent := t.parent
+ if tracer := t.Tracer(); tracer != nil {
+ parent = tracer
+ }
+ if parent == nil {
+ // Tracer has detached and t was created by Kernel.CreateProcess().
+ // Pretend the parent is in an ancestor PID + user namespace.
+ info.SetPid(0)
+ info.SetUid(int32(auth.OverflowUID))
+ } else {
+ info.SetPid(int32(t.tg.pidns.tids[parent]))
+ info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+ }
+ }
+ t.tg.signalHandlers.mu.Lock()
+ t.tg.pidns.owner.mu.Unlock()
+ // If the signal is masked, re-queue it.
+ if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 {
+ t.sendSignalLocked(info, false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ return (*runInterrupt)(nil)
+ }
+ act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
+ t.tg.signalHandlers.mu.Unlock()
+ return t.deliverSignal(info, act)
+}
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
new file mode 100644
index 000000000..801cb3395
--- /dev/null
+++ b/pkg/sentry/kernel/task_start.go
@@ -0,0 +1,252 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TaskConfig defines the configuration of a new Task (see below). It is the
+// sole argument to TaskSet.NewTask.
+type TaskConfig struct {
+ // Kernel is the owning Kernel.
+ *Kernel
+
+ // Parent is the new task's parent. Parent may be nil.
+ Parent *Task
+
+ // ThreadGroup is the ThreadGroup the new task belongs to.
+ *ThreadGroup
+
+ // TaskContext is the TaskContext of the new task. TaskSet.NewTask takes
+ // ownership of it whether or not task creation succeeds.
+ *TaskContext
+
+ // TaskResources is the TaskResources of the new task. TaskSet.NewTask
+ // takes ownership of it whether or not task creation succeeds.
+ *TaskResources
+
+ // Credentials is the Credentials of the new task.
+ Credentials *auth.Credentials
+
+ // Niceness is the niceness of the new task.
+ Niceness int
+
+ // If NetworkNamespaced is true, the new task should observe a non-root
+ // network namespace.
+ NetworkNamespaced bool
+
+ // AllowedCPUMask contains the cpus that this task can run on. The new
+ // task stores a copy of this set.
+ AllowedCPUMask sched.CPUSet
+
+ // UTSNamespace is the UTSNamespace of the new task.
+ UTSNamespace *UTSNamespace
+
+ // IPCNamespace is the IPCNamespace of the new task.
+ IPCNamespace *IPCNamespace
+}
+
+// NewTask creates a new task defined by TaskConfig.
+// Whether or not NewTask is successful, it takes ownership of both TaskContext
+// and TaskResources of the TaskConfig.
+//
+// NewTask does not start the returned task; the caller must call Task.Start.
+//
+// On failure NewTask returns a nil Task and the error from newTask.
+func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
+ t, err := ts.newTask(cfg)
+ if err != nil {
+ // newTask only takes ownership on success, so release both here to
+ // honor the "always takes ownership" contract above.
+ cfg.TaskContext.release()
+ cfg.TaskResources.release()
+ return nil, err
+ }
+ return t, nil
+}
+
+// newTask is a helper for TaskSet.NewTask that only takes ownership of TaskContext
+// and TaskResources of the TaskConfig if it succeeds.
+//
+// newTask returns syserror.EINTR if the target thread group is exiting or
+// execing, and otherwise propagates any error from assignTIDsLocked. In both
+// failure cases nothing has been published to the rest of the system.
+func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
+ tg := cfg.ThreadGroup
+ tc := cfg.TaskContext
+ t := &Task{
+ taskNode: taskNode{
+ tg: tg,
+ parent: cfg.Parent,
+ children: make(map[*Task]struct{}),
+ },
+ runState: (*runApp)(nil),
+ interruptChan: make(chan struct{}, 1),
+ signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
+ tc: *tc,
+ tr: *cfg.TaskResources,
+ p: cfg.Kernel.Platform.NewContext(),
+ k: cfg.Kernel,
+ ptraceTracees: make(map[*Task]struct{}),
+ allowedCPUMask: cfg.AllowedCPUMask.Copy(),
+ ioUsage: &usage.IO{},
+ creds: cfg.Credentials,
+ niceness: cfg.Niceness,
+ netns: cfg.NetworkNamespaced,
+ utsns: cfg.UTSNamespace,
+ ipcns: cfg.IPCNamespace,
+ rseqCPU: -1,
+ futexWaiter: futex.NewWaiter(),
+ }
+ // endStopCond uses the thread group's signal mutex as its locker.
+ t.endStopCond.L = &t.tg.signalHandlers.mu
+ // Store a typed nil so that the stored value always has type *Task.
+ t.ptraceTracer.Store((*Task)(nil))
+ // We don't construct t.blockingTimer until Task.run(); see that function
+ // for justification.
+
+ // Make the new task (and possibly thread group) visible to the rest of
+ // the system atomically.
+ //
+ // Lock order here: TaskSet mutex, then signal mutex, then (below) t.mu.
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+ if tg.exiting || tg.execing != nil {
+ // If the caller is in the same thread group, then what we return
+ // doesn't matter too much since the caller will exit before it returns
+ // to userspace. If the caller isn't in the same thread group, then
+ // we're in uncharted territory and can return whatever we want.
+ return nil, syserror.EINTR
+ }
+ if err := ts.assignTIDsLocked(t); err != nil {
+ return nil, err
+ }
+ // Below this point, newTask is expected not to fail (there is no rollback
+ // of assignTIDsLocked or any of the following).
+
+ // Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
+ // This is the earliest point at which we can do so (since t now has thread
+ // IDs).
+ t.updateLogPrefixLocked()
+
+ if t.parent != nil {
+ t.parent.children[t] = struct{}{}
+ }
+
+ if tg.leader == nil {
+ // New thread group.
+ tg.leader = t
+ if parentPG := tg.parentPG(); parentPG == nil {
+ // No parent process group to inherit from.
+ tg.createSession()
+ } else {
+ // Inherit the process group.
+ parentPG.incRefWithParent(parentPG)
+ tg.processGroup = parentPG
+ }
+ }
+ tg.tasks.PushBack(t)
+ tg.tasksCount++
+ tg.liveTasks++
+ tg.activeTasks++
+
+ // Propagate external TaskSet stops to the new task.
+ t.stopCount = ts.stopCount
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ // The CPU is chosen from the task's TID in the root PID namespace.
+ t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t])
+
+ t.startTime = t.k.RealtimeClock().Now()
+
+ return t, nil
+}
+
+// assignTIDsLocked ensures that new task t is visible in all PID namespaces in
+// which it should be visible.
+//
+// A thread ID is allocated in t's own PID namespace and in every ancestor
+// namespace. If allocation fails in any namespace, the entries already
+// installed for t are removed and the allocation error is returned.
+//
+// Preconditions: ts.mu must be locked for writing.
+func (ts *TaskSet) assignTIDsLocked(t *Task) error {
+ // allocatedTID records one (namespace, tid) pair so it can be rolled back.
+ type allocatedTID struct {
+ ns *PIDNamespace
+ tid ThreadID
+ }
+ var allocatedTIDs []allocatedTID
+ // Walk from t's namespace up through each parent namespace.
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent {
+ tid, err := ns.allocateTID()
+ if err != nil {
+ // Failure. Remove the tids we already allocated in descendant
+ // namespaces.
+ for _, a := range allocatedTIDs {
+ delete(a.ns.tasks, a.tid)
+ delete(a.ns.tids, t)
+ }
+ return err
+ }
+ ns.tasks[tid] = t
+ ns.tids[t] = tid
+ allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid})
+ }
+ return nil
+}
+
+// allocateTID returns an unused ThreadID from ns.
+//
+// The search starts just after ns.last and wraps around to InitTID + 1 once
+// it passes TasksLimit. On success ns.last is updated to the returned ID.
+// allocateTID returns syserror.ENOMEM if ns is exiting and syserror.EAGAIN if
+// the search wraps all the way back to ns.last without finding a free ID.
+//
+// Preconditions: ns.owner.mu must be locked for writing.
+func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
+ if ns.exiting {
+ // "In this case, a subsequent fork(2) into this PID namespace will
+ // fail with the error ENOMEM; it is not possible to create a new
+ // processes [sic] in a PID namespace whose init process has
+ // terminated." - pid_namespaces(7)
+ return 0, syserror.ENOMEM
+ }
+ tid := ns.last
+ for {
+ // Next.
+ tid++
+ if tid > TasksLimit {
+ tid = InitTID + 1
+ }
+
+ // Is it available?
+ _, ok := ns.tasks[tid]
+ if !ok {
+ ns.last = tid
+ return tid, nil
+ }
+
+ // Did we do a full cycle?
+ // ns.last is only written on success, so it still holds the
+ // starting point of this search.
+ if tid == ns.last {
+ // No tid available.
+ return 0, syserror.EAGAIN
+ }
+ }
+}
+
+// Start starts the task goroutine. Start must be called exactly once for each
+// task returned by NewTask.
+//
+// 'tid' must be the task's TID in the root PID namespace and it's used for
+// debugging purposes only (set as parameter to Task.run to make it visible
+// in stack dumps).
+//
+// Start is a no-op if t.runState is nil.
+func (t *Task) Start(tid ThreadID) {
+ // If the task was restored, it may be "starting" after having already exited.
+ if t.runState == nil {
+ return
+ }
+ // Account for the goroutine in every counter before it is launched.
+ // NOTE(review): these look like sync.WaitGroup-style counters; confirm
+ // against their declarations.
+ t.goroutineStopped.Add(1)
+ t.tg.liveGoroutines.Add(1)
+ t.tg.pidns.owner.liveGoroutines.Add(1)
+ t.tg.pidns.owner.runningGoroutines.Add(1)
+
+ // Task is now running in system mode.
+ t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)
+
+ // Use the task's TID in the root PID namespace to make it visible in stack dumps.
+ go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
+}
diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go
new file mode 100644
index 000000000..feaf6cae4
--- /dev/null
+++ b/pkg/sentry/kernel/task_stop.go
@@ -0,0 +1,226 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements task stops, which represent the equivalent of Linux's
+// uninterruptible sleep states in a way that is compatible with save/restore.
+// Task stops comprise both internal stops (which form part of the task's
+// "normal" control flow) and external stops (which do not); see README.md for
+// details.
+//
+// There are multiple interfaces for interacting with stops because there are
+// multiple cases to consider:
+//
+// - A task goroutine can begin a stop on its associated task (e.g. a
+// vfork() syscall stopping the calling task until the child task releases its
+// MM). In this case, calling Task.interrupt is both unnecessary (the task
+// goroutine obviously cannot be blocked in Task.block or executing application
+// code) and undesirable (as it may spuriously interrupt an in-progress
+// syscall).
+//
+// Beginning internal stops in this case is implemented by
+// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing,
+// there are no instances of this case that begin external stops, except for
+// autosave; however, autosave terminates the sentry without ending the
+// external stop, so the spurious interrupt is moot.
+//
+// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all
+// tasks being stopped in preparation for state checkpointing). If the task
+// goroutine may be in Task.block or executing application code, it must be
+// interrupted by Task.interrupt for it to actually enter the stop; since,
+// strictly speaking, we have no way of determining this, we call
+// Task.interrupt unconditionally.
+//
+// Beginning external stops in this case is implemented by
+// Task.BeginExternalStop. As of this writing, there are no instances of this
+// case that begin internal stops.
+//
+// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an
+// exiting task resuming a sibling task that has been blocked in an execve()
+// syscall waiting for other tasks to exit). In this case, Task.endStopCond
+// must be notified to kick the task goroutine out of Task.doStop.
+//
+// Ending internal stops in this case is implemented by
+// Task.endInternalStopLocked. Ending external stops in this case is
+// implemented by Task.EndExternalStop.
+//
+// - Hypothetically, a task goroutine can end an internal stop on its
+// associated task. As of this writing, there are no instances of this case.
+// However, any instances of this case could still use the above functions,
+// since notifying Task.endStopCond would be unnecessary but harmless.
+
+import (
+ "fmt"
+ "sync/atomic"
+)
+
+// A TaskStop is a condition visible to the task control flow graph that
+// prevents a task goroutine from running or exiting, i.e. an internal stop.
+// The active TaskStop is recorded in Task.stop between
+// Task.beginInternalStopLocked and Task.endInternalStopLocked.
+//
+// NOTE: Most TaskStops don't contain any data; they're
+// distinguished by their type. The obvious way to implement such a TaskStop
+// is:
+//
+// type groupStop struct{}
+// func (groupStop) Killable() bool { return true }
+// ...
+// t.beginInternalStop(groupStop{})
+//
+// However, this doesn't work because the state package can't serialize values,
+// only pointers. Furthermore, the correctness of save/restore depends on the
+// ability to pass a TaskStop to endInternalStop that will compare equal to the
+// TaskStop that was passed to beginInternalStop, even if a save/restore cycle
+// occurred between the two. As a result, the current idiom is to always use a
+// typecast nil for data-free TaskStops:
+//
+// type groupStop struct{}
+// func (*groupStop) Killable() bool { return true }
+// ...
+// t.beginInternalStop((*groupStop)(nil))
+//
+// This is pretty gross, but the alternatives seem grosser.
+type TaskStop interface {
+ // Killable returns true if Task.Kill should end the stop prematurely.
+ // Killable is analogous to Linux's TASK_WAKEKILL.
+ Killable() bool
+}
+
+// beginInternalStop indicates the start of an internal stop that applies to t.
+// It acquires the TaskSet mutex (for reading) and then the signal mutex before
+// delegating to beginInternalStopLocked; both are released on return.
+//
+// Preconditions: The task must not already be in an internal stop (i.e. t.stop
+// == nil). The caller must be running on the task goroutine.
+func (t *Task) beginInternalStop(s TaskStop) {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.beginInternalStopLocked(s)
+}
+
+// beginInternalStopLocked records s as t's current internal stop and
+// increments t.stopCount. It panics if t is already in an internal stop.
+//
+// Preconditions: The signal mutex must be locked. All preconditions for
+// Task.beginInternalStop also apply.
+func (t *Task) beginInternalStopLocked(s TaskStop) {
+ if t.stop != nil {
+ panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop))
+ }
+ t.Debugf("Entering internal stop %#v", s)
+ t.stop = s
+ t.beginStopLocked()
+}
+
+// endInternalStopLocked indicates the end of an internal stop that applies to
+// t. endInternalStopLocked does not wait for the task to resume.
+//
+// The caller is responsible for ensuring that the internal stop they expect
+// actually applies to t; this requires holding the signal mutex which protects
+// t.stop, which is why there is no endInternalStop that locks the signal mutex
+// for you.
+//
+// endInternalStopLocked panics if t is not in an internal stop.
+//
+// Preconditions: The signal mutex must be locked. The task must be in an
+// internal stop (i.e. t.stop != nil).
+func (t *Task) endInternalStopLocked() {
+ if t.stop == nil {
+ panic("Attempting to leave non-existent internal stop")
+ }
+ t.Debugf("Leaving internal stop %#v", t.stop)
+ t.stop = nil
+ t.endStopLocked()
+}
+
+// BeginExternalStop indicates the start of an external stop that applies to t.
+// BeginExternalStop does not wait for t's task goroutine to stop.
+//
+// It acquires the TaskSet mutex (for reading) and the signal mutex itself.
+func (t *Task) BeginExternalStop() {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.beginStopLocked()
+ // Interrupt unconditionally: the caller may be an arbitrary goroutine and
+ // the task goroutine may be blocked or running application code.
+ t.interrupt()
+}
+
+// EndExternalStop indicates the end of an external stop started by a previous
+// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task
+// goroutine to resume.
+//
+// It acquires the TaskSet mutex (for reading) and the signal mutex itself.
+func (t *Task) EndExternalStop() {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ t.endStopLocked()
+}
+
+// beginStopLocked increments t.stopCount to indicate that a new internal or
+// external stop applies to t. It panics if the incremented count is not
+// positive.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) beginStopLocked() {
+ if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 {
+ // Most likely overflow.
+ panic(fmt.Sprintf("Invalid stopCount: %d", newval))
+ }
+}
+
+// endStopLocked decrements t.stopCount to indicate that an existing internal
+// or external stop no longer applies to t. When the count reaches zero,
+// t.endStopCond is signaled. endStopLocked panics if the count would become
+// negative.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) endStopLocked() {
+ if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 {
+ panic(fmt.Sprintf("Invalid stopCount: %d", newval))
+ } else if newval == 0 {
+ // Last stop ended; wake a waiter on endStopCond.
+ t.endStopCond.Signal()
+ }
+}
+
+// BeginExternalStop indicates the start of an external stop that applies to
+// all current and future tasks in ts. BeginExternalStop does not wait for
+// task goroutines to stop.
+//
+// Future tasks are covered because ts.stopCount is incremented here. Current
+// tasks are covered by beginning a stop on, and interrupting, every task in
+// the root PID namespace. BeginExternalStop panics if ts.stopCount is not
+// positive after the increment.
+func (ts *TaskSet) BeginExternalStop() {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ ts.stopCount++
+ if ts.stopCount <= 0 {
+ panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount))
+ }
+ // No root PID namespace means there are no tasks to iterate over.
+ if ts.Root == nil {
+ return
+ }
+ for t := range ts.Root.tids {
+ t.tg.signalHandlers.mu.Lock()
+ t.beginStopLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ t.interrupt()
+ }
+}
+
+// EndExternalStop indicates the end of an external stop started by a previous
+// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task
+// goroutines to resume.
+//
+// It decrements ts.stopCount (panicking if it becomes negative) and ends one
+// stop on every task in the root PID namespace.
+func (ts *TaskSet) EndExternalStop() {
+ ts.mu.Lock()
+ defer ts.mu.Unlock()
+ ts.stopCount--
+ if ts.stopCount < 0 {
+ panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount))
+ }
+ // No root PID namespace means there are no tasks to iterate over.
+ if ts.Root == nil {
+ return
+ }
+ for t := range ts.Root.tids {
+ t.tg.signalHandlers.mu.Lock()
+ t.endStopLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ }
+}
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
new file mode 100644
index 000000000..79f4ff60c
--- /dev/null
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -0,0 +1,434 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "os"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SyscallRestartErrno represents an ERESTART* errno defined in the Linux
+// kernel's include/linux/errno.h. These errnos are never returned to userspace
+// directly, but are used to communicate the expected behavior of an
+// interrupted syscall from the syscall to signal handling.
+type SyscallRestartErrno int
+
+// These numeric values are significant because ptrace syscall exit tracing can
+// observe them.
+//
+// For all of the following errnos, if the syscall is not interrupted by a
+// signal delivered to a user handler, the syscall is restarted.
+const (
+ // ERESTARTSYS is returned by an interrupted syscall to indicate that it
+ // should be converted to EINTR if interrupted by a signal delivered to a
+ // user handler without SA_RESTART set, and restarted otherwise.
+ ERESTARTSYS = SyscallRestartErrno(512)
+
+ // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it
+ // should always be restarted.
+ ERESTARTNOINTR = SyscallRestartErrno(513)
+
+ // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it
+ // should be converted to EINTR if interrupted by a signal delivered to a
+ // user handler, and restarted otherwise.
+ ERESTARTNOHAND = SyscallRestartErrno(514)
+
+ // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate
+ // that it should be restarted using a custom function. The interrupted
+ // syscall must register a custom restart function by calling
+ // Task.SetRestartSyscallFn.
+ ERESTART_RESTARTBLOCK = SyscallRestartErrno(516)
+)
+
+// Error implements error.Error. It returns a fixed description for each of
+// the ERESTART* constants and "(unknown interrupt error)" for any other value.
+func (e SyscallRestartErrno) Error() string {
+ // Descriptions are borrowed from strace.
+ switch e {
+ case ERESTARTSYS:
+ return "to be restarted if SA_RESTART is set"
+ case ERESTARTNOINTR:
+ return "to be restarted"
+ case ERESTARTNOHAND:
+ return "to be restarted if no handler"
+ case ERESTART_RESTARTBLOCK:
+ return "interrupted by signal"
+ default:
+ return "(unknown interrupt error)"
+ }
+}
+
+// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
+// rv, the value in a syscall return register.
+//
+// rv is reinterpreted as a signed integer and compared against the negation
+// of each ERESTART* constant. The boolean result is false (and the errno is
+// 0) if rv matches none of them.
+func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
+ switch int(rv) {
+ case -int(ERESTARTSYS):
+ return ERESTARTSYS, true
+ case -int(ERESTARTNOINTR):
+ return ERESTARTNOINTR, true
+ case -int(ERESTARTNOHAND):
+ return ERESTARTNOHAND, true
+ case -int(ERESTART_RESTARTBLOCK):
+ return ERESTART_RESTARTBLOCK, true
+ default:
+ return 0, false
+ }
+}
+
+// SyscallRestartBlock represents the restart block for a syscall restartable
+// with a custom function. It encapsulates the state required to restart a
+// syscall across a S/R.
+type SyscallRestartBlock interface {
+ // Restart is the custom restart function, invoked for task t.
+ // NOTE(review): the results are presumably treated like a syscall's
+ // return value and error; confirm against the caller.
+ Restart(t *Task) (uintptr, error)
+}
+
+// SyscallControl is returned by syscalls to control the behavior of
+// Task.doSyscallInvoke. A nil *SyscallControl requests the default handling,
+// in which the syscall's return value or error is stored in the return
+// register and the task proceeds to syscall exit.
+type SyscallControl struct {
+ // next is the state that the task goroutine should switch to. If next is
+ // nil, the task goroutine should continue to syscall exit as usual.
+ next taskRunState
+
+ // If ignoreReturn is true, Task.doSyscallInvoke should not store any value
+ // in the task's syscall return value register.
+ ignoreReturn bool
+}
+
+// Predefined SyscallControl values shared by syscall implementations.
+var (
+ // CtrlDoExit is returned by the implementations of the exit and exit_group
+ // syscalls to enter the task exit path directly, skipping syscall exit
+ // tracing.
+ CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}
+
+ // ctrlStopAndReinvokeSyscall is returned by syscalls using the external
+ // feature before syscall execution. This causes Task.doSyscallInvoke
+ // to return runSyscallReinvoke, allowing Task.run to check for stops
+ // before immediately re-invoking the syscall (skipping the re-checking
+ // of seccomp filters and ptrace which would confuse userspace
+ // tracing).
+ ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}
+
+ // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
+ // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
+ // than tail-calling it, allowing stops to be checked before syscall exit.
+ // ignoreReturn is left false, so the syscall's return value is still
+ // stored.
+ ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
+)
+
+// invokeExternal begins an external stop on t and then runs the syscall
+// table's External hook on a separate goroutine. The external stop is ended
+// when the hook returns. invokeExternal itself does not wait for the hook.
+func (t *Task) invokeExternal() {
+ t.BeginExternalStop()
+ go func() { // S/R-SAFE: External control flow.
+ defer t.EndExternalStop()
+ t.SyscallTable().External(t.Kernel())
+ }()
+}
+
+// executeSyscall runs syscall sysno with arguments args using t's syscall
+// table and returns the implementation's return value, control, and error.
+//
+// Depending on the table's feature-enable bits for sysno, it also:
+//
+// - calls the Stracer's SyscallEnter and SyscallExit hooks around the call;
+// - invokes the External hook before the syscall, in which case the syscall
+// implementation is NOT called in this pass and ctrl is
+// ctrlStopAndReinvokeSyscall;
+// - invokes the External hook after the syscall.
+//
+// If the table has no implementation for sysno, the table's Missing handler
+// is used and ctrl is left nil.
+func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
+ s := t.SyscallTable()
+
+ fe := s.FeatureEnable.Word(sysno)
+
+ var straceContext interface{}
+ if bits.IsAnyOn32(fe, StraceEnableBits) {
+ straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
+ }
+
+ // A nil filter means the external hook applies to every invocation.
+ if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
+ t.invokeExternal()
+ // Ensure we check for stops, then invoke the syscall again.
+ ctrl = ctrlStopAndReinvokeSyscall
+ } else {
+ fn := s.Lookup(sysno)
+ if fn != nil {
+ // Call our syscall implementation.
+ rval, ctrl, err = fn(t, args)
+ } else {
+ // Use the missing function if not found.
+ rval, err = t.SyscallTable().Missing(t, sysno, args)
+ }
+ }
+
+ if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
+ t.invokeExternal()
+ // Don't reinvoke the syscall.
+ }
+
+ if bits.IsAnyOn32(fe, StraceEnableBits) {
+ s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
+ }
+
+ return
+}
+
+// doSyscall is the entry point for an invocation of a system call specified by
+// the current state of t's registers.
+//
+// It pre-loads -ENOSYS into the return register, consults any installed
+// seccomp filters, and then continues to doSyscallEnter. The returned value is
+// the task's next run state.
+//
+// The syscall path is very hot; avoid defer.
+func (t *Task) doSyscall() taskRunState {
+ sysno := t.Arch().SyscallNo()
+ args := t.Arch().SyscallArgs()
+
+ // Tracers expect to see this between when the task traps into the kernel
+ // to perform a syscall and when the syscall is actually invoked.
+ // This useless-looking temporary is needed because Go.
+ tmp := uintptr(syscall.ENOSYS)
+ t.Arch().SetReturn(-tmp)
+
+ // Check seccomp filters. The nil check is for performance (as seccomp use
+ // is rare), not needed for correctness.
+ if t.syscallFilters != nil {
+ switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r {
+ case seccompResultDeny:
+ // Skip the syscall and go straight to syscall exit.
+ t.Debugf("Syscall %d: denied by seccomp", sysno)
+ return (*runSyscallExit)(nil)
+ case seccompResultAllow:
+ // ok
+ case seccompResultKill:
+ // The task exits with SIGSYS as its exit status.
+ t.Debugf("Syscall %d: killed by seccomp", sysno)
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ case seccompResultTrace:
+ t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
+ return (*runSyscallAfterPtraceEventSeccomp)(nil)
+ default:
+ panic(fmt.Sprintf("Unknown seccomp result %d", r))
+ }
+ }
+
+ return t.doSyscallEnter(sysno, args)
+}
+
+// runSyscallAfterPtraceEventSeccomp is the run state that doSyscall returns
+// when seccomp filtering yields seccompResultTrace.
+type runSyscallAfterPtraceEventSeccomp struct{}
+
+// execute re-reads the syscall number and arguments from t's registers and
+// continues to doSyscallEnter, unless t has been killed or the syscall number
+// has been set to -1 (in which case the syscall is skipped).
+func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
+ if t.killed() {
+ // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
+ // ptrace(2)
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ // "The tracer can skip the system call by changing the syscall number to
+ // -1." - Documentation/prctl/seccomp_filter.txt
+ if sysno == ^uintptr(0) {
+ return (*runSyscallExit)(nil).execute(t)
+ }
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallEnter(sysno, args)
+}
+
+// doSyscallEnter gives ptrace a chance to intercept syscall entry. If
+// t.ptraceSyscallEnter supplies a next run state, that state is returned;
+// otherwise the syscall is invoked immediately via doSyscallInvoke.
+func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
+ if next, ok := t.ptraceSyscallEnter(); ok {
+ return next
+ }
+ return t.doSyscallInvoke(sysno, args)
+}
+
+// runSyscallAfterSyscallEnterStop is a task run state.
+// NOTE(review): presumably entered when a ptrace syscall-enter-stop ends;
+// confirm against ptraceSyscallEnter.
+type runSyscallAfterSyscallEnterStop struct{}
+
+// execute sends t the signal in t.ptraceCode if it is a valid signal number,
+// then re-reads the syscall number and arguments from t's registers and
+// invokes the syscall. It returns runInterrupt if t has been killed and
+// runSyscallExit if the syscall number has been set to -1.
+func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
+ if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
+ t.tg.signalHandlers.mu.Lock()
+ t.sendSignalLocked(sigPriv(sig), false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ // A syscall number of -1 means the syscall is skipped.
+ if sysno == ^uintptr(0) {
+ return (*runSyscallExit)(nil)
+ }
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallInvoke(sysno, args)
+}
+
+// runSyscallAfterSysemuStop is a task run state.
+// NOTE(review): presumably entered when a PTRACE_SYSEMU stop ends; confirm
+// against ptraceSyscallEnter.
+type runSyscallAfterSysemuStop struct{}
+
+// execute sends t the signal in t.ptraceCode if it is a valid signal number
+// and then proceeds directly to syscall exit without invoking the syscall. It
+// returns runInterrupt if t has been killed.
+func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
+ if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
+ t.tg.signalHandlers.mu.Lock()
+ t.sendSignalLocked(sigPriv(sig), false /* group */)
+ t.tg.signalHandlers.mu.Unlock()
+ }
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ return (*runSyscallExit)(nil).execute(t)
+}
+
+// doSyscallInvoke executes syscall sysno with args and stores the result in
+// t's syscall return register:
+//
+// - If the syscall returned a non-nil SyscallControl, rval is stored unless
+// ctrl.ignoreReturn is set, and ctrl.next (if non-nil) is returned as the
+// next run state.
+// - Otherwise, if the syscall returned an error, the negated errno is stored
+// and t.haveSyscallReturn is set.
+// - Otherwise rval is stored.
+//
+// Unless ctrl.next was returned, the task proceeds directly to syscall exit.
+func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
+ rval, ctrl, err := t.executeSyscall(sysno, args)
+
+ if ctrl != nil {
+ if !ctrl.ignoreReturn {
+ t.Arch().SetReturn(rval)
+ }
+ if ctrl.next != nil {
+ return ctrl.next
+ }
+ } else if err != nil {
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ t.haveSyscallReturn = true
+ } else {
+ t.Arch().SetReturn(rval)
+ }
+
+ return (*runSyscallExit)(nil).execute(t)
+}
+
+// runSyscallReinvoke is the run state requested by
+// ctrlStopAndReinvokeSyscall.
+type runSyscallReinvoke struct{}
+
+// execute re-reads the syscall number and arguments from t's registers and
+// invokes the syscall again via doSyscallInvoke, unless t has been killed.
+func (*runSyscallReinvoke) execute(t *Task) taskRunState {
+ if t.killed() {
+ // It's possible that since the last execution, the task has
+ // been forcibly killed. Invoking the system call here could
+ // result in an infinite loop if it is again preempted by an
+ // external stop and reinvoked.
+ return (*runInterrupt)(nil)
+ }
+
+ sysno := t.Arch().SyscallNo()
+ args := t.Arch().SyscallArgs()
+ return t.doSyscallInvoke(sysno, args)
+}
+
+// runSyscallExit is the run state for the syscall exit path.
+type runSyscallExit struct{}
+
+// execute calls t.ptraceSyscallExit and then returns runApp.
+func (*runSyscallExit) execute(t *Task) taskRunState {
+ t.ptraceSyscallExit()
+ return (*runApp)(nil)
+}
+
+// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
+// indicated by an execution fault at address addr. doVsyscall returns the
+// task's next run state.
+//
+// If the return address cannot be read from the top of the stack, SIGSEGV is
+// sent to t and the task resumes in runApp. If seccomp filters are installed
+// they are consulted before the syscall is emulated; every seccomp result
+// handled by doSyscall (deny, allow, kill, trace) is handled here as well.
+func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState {
+ // Grab the caller up front, to make sure there's a sensible stack.
+ caller := t.Arch().Native(uintptr(0))
+ if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil {
+ t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(sigPriv(linux.SIGSEGV))
+ return (*runApp)(nil)
+ }
+
+ // For _vsyscalls_, there is no need to translate System V calling convention
+ // to syscall ABI because they both use RDI, RSI, and RDX for the first three
+ // arguments and none of the vsyscalls uses more than two arguments.
+ args := t.Arch().SyscallArgs()
+ if t.syscallFilters != nil {
+ switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
+ case seccompResultDeny:
+ t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
+ return (*runApp)(nil)
+ case seccompResultAllow:
+ // ok
+ case seccompResultKill:
+ // Mirror doSyscall: exit the task with SIGSYS rather than falling
+ // through to the "unknown result" panic below.
+ t.Debugf("vsyscall %d, caller %x: killed by seccomp", sysno, t.Arch().Value(caller))
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ case seccompResultTrace:
+ t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
+ return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
+ default:
+ panic(fmt.Sprintf("Unknown seccomp result %d", r))
+ }
+ }
+
+ return t.doVsyscallInvoke(sysno, args, caller)
+}
+
+// runVsyscallAfterPtraceEventSeccomp is the run state that doVsyscall returns
+// when seccomp filtering yields seccompResultTrace. It carries the state
+// captured by doVsyscall so that it can be validated and reused afterwards.
+type runVsyscallAfterPtraceEventSeccomp struct {
+ // addr is the vsyscall address passed to doVsyscall.
+ addr usermem.Addr
+ // sysno is the syscall number passed to doVsyscall.
+ sysno uintptr
+ // caller holds the return address that doVsyscall read from the stack.
+ caller interface{}
+}
+
+// execute validates the task's registers against the state captured in r and
+// then either emulates the vsyscall, skips it (syscall number -1), or exits
+// the task with SIGSYS if the syscall number or instruction pointer was
+// changed to anything else. It returns runInterrupt if t has been killed.
+func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
+ if t.killed() {
+ return (*runInterrupt)(nil)
+ }
+ sysno := t.Arch().SyscallNo()
+ // "... the syscall may not be changed to another system call using the
+ // orig_rax register. It may only be changed to -1 order [sic] to skip the
+ // currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
+ // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
+ // causes do_exit(SIGSYS), and changing sp is ignored.
+ if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr {
+ t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
+ return (*runExit)(nil)
+ }
+ if sysno == ^uintptr(0) {
+ return (*runApp)(nil)
+ }
+ return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
+}
+
+// doVsyscallInvoke executes syscall sysno with args on behalf of a vsyscall
+// and then emulates the return to caller: the instruction pointer is set to
+// the caller's address and the stack pointer is advanced by one word.
+//
+// The value stored in the return register is 0 if the syscall returned a
+// SyscallControl, rval on success, and the negated errno on error. The one
+// exception is syserror.EFAULT: SIGSEGV is sent to t and no return is
+// emulated. The next run state is always runApp.
+func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
+ rval, ctrl, err := t.executeSyscall(sysno, args)
+ if ctrl != nil {
+ t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
+ // Set the return value. The stack has already been adjusted.
+ t.Arch().SetReturn(0)
+ } else if err == nil {
+ t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
+ // Set the return value. The stack has already been adjusted.
+ t.Arch().SetReturn(uintptr(rval))
+ } else {
+ t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
+ if err == syserror.EFAULT {
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(sigPriv(linux.SIGSEGV))
+ // A return is not emulated in this case.
+ return (*runApp)(nil)
+ }
+ t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
+ }
+ // Emulate the return: jump to the caller and pop the return address.
+ t.Arch().SetIP(t.Arch().Value(caller))
+ t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
+ return (*runApp)(nil)
+}
+
+// ExtractErrno extracts an integer error number from the error.
+// The syscall number is purely for context in the error case. Use -1 if
+// syscall number is unknown.
+//
+// A nil error yields 0. *os.PathError, *os.LinkError and *os.SyscallError are
+// unwrapped and their inner error is extracted recursively. ExtractErrno
+// panics if err is of an unrecognized type that syserror.TranslateError cannot
+// translate.
+func (t *Task) ExtractErrno(err error, sysno int) int {
+ switch err := err.(type) {
+ case nil:
+ return 0
+ case syscall.Errno:
+ return int(err)
+ case SyscallRestartErrno:
+ return int(err)
+ case *memmap.BusError:
+ // Bus errors may generate SIGBUS, but for syscalls they still
+ // return EFAULT. See case in task_run.go where the fault is
+ // handled (and the SIGBUS is delivered).
+ return int(syscall.EFAULT)
+ case *os.PathError:
+ return t.ExtractErrno(err.Err, sysno)
+ case *os.LinkError:
+ return t.ExtractErrno(err.Err, sysno)
+ case *os.SyscallError:
+ return t.ExtractErrno(err.Err, sysno)
+ default:
+ if errno, ok := syserror.TranslateError(err); ok {
+ return int(errno)
+ }
+ }
+ // Only reached from the default case when translation fails.
+ panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
+}
diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go
new file mode 100644
index 000000000..82ef858a1
--- /dev/null
+++ b/pkg/sentry/kernel/task_test.go
@@ -0,0 +1,69 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+)
+
+func TestTaskCPU(t *testing.T) {
+ for _, test := range []struct {
+ mask sched.CPUSet
+ tid ThreadID
+ cpu int32
+ }{
+ {
+ mask: []byte{0xff},
+ tid: 1,
+ cpu: 0,
+ },
+ {
+ mask: []byte{0xff},
+ tid: 10,
+ cpu: 1,
+ },
+ {
+ // more than 8 cpus.
+ mask: []byte{0xff, 0xff},
+ tid: 10,
+ cpu: 9,
+ },
+ {
+ // missing the first cpu.
+ mask: []byte{0xfe},
+ tid: 1,
+ cpu: 1,
+ },
+ {
+ mask: []byte{0xfe},
+ tid: 10,
+ cpu: 3,
+ },
+ {
+ // missing the fifth cpu.
+ mask: []byte{0xef},
+ tid: 10,
+ cpu: 2,
+ },
+ } {
+ assigned := assignCPU(test.mask, test.tid)
+ if test.cpu != assigned {
+ t.Errorf("assignCPU(%v, %v) got %v, want %v", test.mask, test.tid, assigned, test.cpu)
+ }
+ }
+
+}
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
new file mode 100644
index 000000000..7a62ab674
--- /dev/null
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -0,0 +1,298 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "math"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// _MAX_RW_COUNT is the maximum size in bytes of a single read or write.
+// Reads and writes that exceed this size may be silently truncated.
+// (Linux: include/linux/fs.h:MAX_RW_COUNT)
+var _MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown())
+
+// Activate ensures that the task has an active address space.
+func (t *Task) Activate() {
+ if mm := t.MemoryManager(); mm != nil {
+ if err := mm.Activate(); err != nil {
+ panic("unable to activate mm: " + err.Error())
+ }
+ }
+}
+
+// Deactivate relinquishes the task's active address space.
+func (t *Task) Deactivate() {
+ if mm := t.MemoryManager(); mm != nil {
+ if err := mm.Deactivate(); err != nil {
+ panic("unable to deactivate mm: " + err.Error())
+ }
+ }
+}
+
+// CopyIn copies a fixed-size value or slice of fixed-size values in from the
+// task's memory. The copy will fail with syscall.EFAULT if it traverses user
+// memory that is unmapped or not readable by the user.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) {
+ return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyInBytes is a fast version of CopyIn if the caller can serialize the
+// data without reflection and pass in a byte slice.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyOut copies a fixed-size value or slice of fixed-size values out to the
+// task's memory. The copy will fail with syscall.EFAULT if it traverses user
+// memory that is unmapped or not writeable by the user.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) {
+ return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
+// data without reflection and pass in a byte slice.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyInString copies a NUL-terminated string of length at most maxlen in from
+// the task's memory. The copy will fail with syscall.EFAULT if it traverses
+// user memory that is unmapped or not readable by the user.
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) {
+ return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+}
+
+// CopyInVector copies a NULL-terminated vector of strings from the task's
+// memory. The copy will fail with syscall.EFAULT if it traverses
+// user memory that is unmapped or not readable by the user.
+//
+// maxElemSize is the maximum size of each individual element.
+//
+// maxTotalSize is the maximum total length of all elements plus the total
+// number of elements. For example, the following strings correspond to
+// the following set of sizes:
+//
+// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements)
+// { "abc" } => 4 (3 for length, 1 for elements)
+//
+// This Task's AddressSpace must be active.
+func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) {
+ var v []string
+ for {
+ argAddr := t.Arch().Native(0)
+ if _, err := t.CopyIn(addr, argAddr); err != nil {
+ return v, err
+ }
+ if t.Arch().Value(argAddr) == 0 {
+ break
+ }
+ // Each string has a zero terminating byte counted, so copying out a string
+ // requires at least one byte of space. Also, see the calculation below.
+ if maxTotalSize <= 0 {
+ return nil, syserror.ENOMEM
+ }
+ thisMax := maxElemSize
+ if maxTotalSize < thisMax {
+ thisMax = maxTotalSize
+ }
+ arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax)
+ if err != nil {
+ return v, err
+ }
+ v = append(v, arg)
+ addr += usermem.Addr(t.Arch().Width())
+ maxTotalSize -= len(arg) + 1
+ }
+ return v, nil
+}
+
+// CopyOutIovecs converts src to an array of struct iovecs and copies it to the
+// memory mapped at addr.
+//
+// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the
+// task goroutine. t's AddressSpace must be active.
+func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error {
+ switch t.Arch().Width() {
+ case 8:
+ const itemLen = 16
+ if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok {
+ return syserror.EFAULT
+ }
+
+ b := t.CopyScratchBuffer(itemLen)
+ for ; !src.IsEmpty(); src = src.Tail() {
+ ar := src.Head()
+ usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start))
+ usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length()))
+ if _, err := t.CopyOutBytes(addr, b); err != nil {
+ return err
+ }
+ addr += itemLen
+ }
+
+ default:
+ return syserror.ENOSYS
+ }
+
+ return nil
+}
+
+// CopyInIovecs copies an array of numIovecs struct iovecs from the memory
+// mapped at addr, converts them to usermem.AddrRanges, and returns them as a
+// usermem.AddrRangeSeq.
+//
+// CopyInIovecs shares the following properties with Linux's
+// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector():
+//
+// - If the length of any AddrRange would exceed the range of an ssize_t,
+// CopyInIovecs returns EINVAL.
+//
+// - If the length of any AddrRange would cause its end to overflow,
+// CopyInIovecs returns EFAULT.
+//
+// - The combined length of all AddrRanges is limited to _MAX_RW_COUNT. If the
+// combined length of all AddrRanges would otherwise exceed this amount, ranges
+// beyond _MAX_RW_COUNT are silently truncated.
+//
+// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the
+// task goroutine. t's AddressSpace must be active.
+func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) {
+ if numIovecs == 0 {
+ return usermem.AddrRangeSeq{}, nil
+ }
+
+ var dst []usermem.AddrRange
+ if numIovecs > 1 {
+ dst = make([]usermem.AddrRange, 0, numIovecs)
+ }
+
+ switch t.Arch().Width() {
+ case 8:
+ const itemLen = 16
+ if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok {
+ return usermem.AddrRangeSeq{}, syserror.EFAULT
+ }
+
+ b := t.CopyScratchBuffer(itemLen)
+ for i := 0; i < numIovecs; i++ {
+ if _, err := t.CopyInBytes(addr, b); err != nil {
+ return usermem.AddrRangeSeq{}, err
+ }
+
+ base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8]))
+ length := usermem.ByteOrder.Uint64(b[8:16])
+ if length > math.MaxInt64 {
+ return usermem.AddrRangeSeq{}, syserror.EINVAL
+ }
+ ar, ok := base.ToRange(length)
+ if !ok {
+ return usermem.AddrRangeSeq{}, syserror.EFAULT
+ }
+
+ if numIovecs == 1 {
+ // Special case to avoid allocating dst.
+ return usermem.AddrRangeSeqOf(ar).TakeFirst(_MAX_RW_COUNT), nil
+ }
+ dst = append(dst, ar)
+
+ addr += itemLen
+ }
+
+ default:
+ return usermem.AddrRangeSeq{}, syserror.ENOSYS
+ }
+
+ // Truncate to _MAX_RW_COUNT.
+ var total uint64
+ for i := range dst {
+ dstlen := uint64(dst[i].Length())
+ if rem := uint64(_MAX_RW_COUNT) - total; rem < dstlen {
+ dst[i].End -= usermem.Addr(dstlen - rem)
+ dstlen = rem
+ }
+ total += dstlen
+ }
+
+ return usermem.AddrRangeSeqFromSlice(dst), nil
+}
+
+// SingleIOSequence returns a usermem.IOSequence representing [addr,
+// addr+length) in t's address space. If length exceeds _MAX_RW_COUNT, it is
+// silently truncated.
+//
+// SingleIOSequence is analogous to Linux's
+// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and
+// write syscalls in Linux do not use import_single_range(), but are still
+// truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().)
+func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) {
+ if length > _MAX_RW_COUNT {
+ length = _MAX_RW_COUNT
+ }
+ ar, ok := addr.ToRange(uint64(length))
+ if !ok {
+ return usermem.IOSequence{}, syserror.EFAULT
+ }
+ return usermem.IOSequence{
+ IO: t.MemoryManager(),
+ Addrs: usermem.AddrRangeSeqOf(ar),
+ Opts: opts,
+ }, nil
+}
+
+// IovecsIOSequence returns a usermem.IOSequence representing the array of
+// iovcnt struct iovecs at addr in t's address space. opts applies to the
+// returned IOSequence, not the reading of the struct iovec array.
+//
+// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
+//
+// Preconditions: As for Task.CopyInIovecs.
+func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) {
+ if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV {
+ return usermem.IOSequence{}, syserror.EINVAL
+ }
+ ars, err := t.CopyInIovecs(addr, iovcnt)
+ if err != nil {
+ return usermem.IOSequence{}, err
+ }
+ return usermem.IOSequence{
+ IO: t.MemoryManager(),
+ Addrs: ars,
+ Opts: opts,
+ }, nil
+}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
new file mode 100644
index 000000000..8fffd3446
--- /dev/null
+++ b/pkg/sentry/kernel/thread_group.go
@@ -0,0 +1,269 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// A ThreadGroup is a logical grouping of tasks that has widespread
+// significance to other kernel features (e.g. signal handling). ("Thread
+// groups" are usually called "processes" in userspace documentation.)
+//
+// ThreadGroup is a superset of Linux's struct signal_struct.
+type ThreadGroup struct {
+ threadGroupNode
+
+ // signalHandlers is the set of signal handlers used by every task in this
+ // thread group. (signalHandlers may also be shared with other thread
+ // groups.)
+ //
+ // signalHandlers.mu (hereafter "the signal mutex") protects state related
+ // to signal handling, as well as state that usually needs to be atomic
+ // with signal handling, for all ThreadGroups and Tasks using
+ // signalHandlers. (This is analogous to Linux's use of struct
+ // sighand_struct::siglock.)
+ //
+ // The signalHandlers pointer can only be mutated during an execve
+ // (Task.finishExec). Consequently, when it's possible for a task in the
+ // thread group to be completing an execve, signalHandlers is protected by
+ // the owning TaskSet.mu. Otherwise, it is possible to read the
+ // signalHandlers pointer without synchronization. In particular,
+ // completing an execve requires that all other tasks in the thread group
+ // have exited, so task goroutines do not need the owning TaskSet.mu to
+ // read the signalHandlers pointer of their thread groups.
+ signalHandlers *SignalHandlers
+
+ // pendingSignals is the set of pending signals that may be handled by any
+ // task in this thread group.
+ //
+ // pendingSignals is protected by the signal mutex.
+ pendingSignals pendingSignals
+
+ // lastTimerSignalTask records the last task we delivered a process timer signal to.
+ // Please see SendTimerSignal for more details.
+ //
+ // lastTimerSignalTask is protected by the signal mutex.
+ lastTimerSignalTask *Task
+
+ // groupStopPhase indicates the state of a group stop in progress on the
+ // thread group, if any.
+ //
+ // groupStopPhase is protected by the signal mutex.
+ groupStopPhase groupStopPhase
+
+ // groupStopSignal is the signal that caused a group stop to be initiated.
+ // groupStopSignal is only meaningful if groupStopPhase is
+ // groupStopInitiated or groupStopComplete.
+ //
+ // groupStopSignal is protected by the signal mutex.
+ groupStopSignal linux.Signal
+
+ // groupStopCount is the number of non-exited tasks in the thread group
+ // that have acknowledged an initiated group stop. groupStopCount is only
+ // meaningful if groupStopPhase is groupStopInitiated.
+ //
+ // groupStopCount is protected by the signal mutex.
+ groupStopCount int
+
+ // If groupStopWaitable is true, the thread group is indicating a waitable
+ // group stop event (as defined by EventChildGroupStop).
+ //
+ // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set
+ // and group_exit_code being non-zero.
+ //
+ // groupStopWaitable is protected by the signal mutex.
+ groupStopWaitable bool
+
+ // If groupContNotify is true, then a SIGCONT has recently ended a group
+ // stop on this thread group, and the first task to observe it should
+ // notify its parent.
+ //
+ // groupContNotify is protected by the signal mutex.
+ groupContNotify bool
+
+ // If groupContNotify is true, groupContInterrupted is true iff SIGCONT
+ // ended a group stop in phase groupStopInitiated. If groupContNotify is
+ // false, groupContInterrupted is meaningless.
+ //
+ // Analogues in Linux:
+ //
+ // - groupContNotify && groupContInterrupted is represented by
+ // SIGNAL_CLD_STOPPED.
+ //
+ // - groupContNotify && !groupContInterrupted is represented by
+ // SIGNAL_CLD_CONTINUED.
+ //
+ // - !groupContNotify is represented by neither flag being set.
+ //
+ // groupContInterrupted is protected by the signal mutex.
+ groupContInterrupted bool
+
+ // If groupContWaitable is true, the thread group is indicating a waitable
+ // continue event (as defined by EventGroupContinue).
+ //
+ // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED.
+ //
+ // groupContWaitable is protected by the signal mutex.
+ groupContWaitable bool
+
+ // exiting is true if all tasks in the ThreadGroup should exit. exiting is
+ // analogous to Linux's SIGNAL_GROUP_EXIT.
+ //
+ // exiting is protected by the signal mutex. exiting can only transition
+ // from false to true.
+ exiting bool
+
+ // exitStatus is the thread group's exit status.
+ //
+ // While exiting is false, exitStatus is protected by the signal mutex.
+ // When exiting becomes true, exitStatus becomes immutable.
+ exitStatus ExitStatus
+
+ // terminationSignal is the signal that this thread group's leader will
+ // send to its parent when it exits.
+ //
+ // terminationSignal is protected by the TaskSet mutex.
+ terminationSignal linux.Signal
+
+ // liveGoroutines is the number of non-exited task goroutines in the thread
+ // group.
+ //
+ // liveGoroutines is not saved; it is reset as task goroutines are
+ // restarted by Task.Start.
+ liveGoroutines sync.WaitGroup `state:"nosave"`
+
+ // tm contains process timers. TimerManager fields are immutable.
+ tm TimerManager
+
+ // exitedCPUStats is the CPU usage for all exited tasks in the thread
+ // group. exitedCPUStats is protected by the TaskSet mutex.
+ exitedCPUStats usage.CPUStats
+
+ // childCPUStats is the CPU usage of all joined descendants of this thread
+ // group. childCPUStats is protected by the TaskSet mutex.
+ childCPUStats usage.CPUStats
+
+ // ioUsage is the I/O usage for all exited tasks in the thread group.
+ // The ioUsage pointer is immutable.
+ ioUsage *usage.IO
+
+ // maxRSS is the historical maximum resident set size of the thread group, updated when:
+ //
+ // - A task in the thread group exits, since after all tasks have
+ // exited the MemoryManager is no longer reachable.
+ //
+ // - The thread group completes an execve, since this changes
+ // MemoryManagers.
+ //
+ // maxRSS is protected by the TaskSet mutex.
+ maxRSS uint64
+
+ // childMaxRSS is the maximum resident set size in bytes of all joined
+ // descendants of this thread group.
+ //
+ // childMaxRSS is protected by the TaskSet mutex.
+ childMaxRSS uint64
+
+ // Resource limits for this ThreadGroup. The limits pointer is immutable.
+ limits *limits.LimitSet
+
+ // processGroup is the processGroup for this thread group.
+ //
+ // processGroup is protected by the TaskSet mutex.
+ processGroup *ProcessGroup
+
+ // execed indicates an exec has occurred since creation. This will be
+ // set by finishExec, and new ThreadGroups will have this field cleared.
+ // When execed is set, the processGroup may no longer be changed.
+ //
+ // execed is protected by the TaskSet mutex.
+ execed bool
+
+ // rscr is the thread group's RSEQ critical region.
+ rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+}
+
+// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// thread group leader will send its parent terminationSignal when it exits.
+// The new thread group isn't visible to the system until a task has been
+// created inside of it by a successful call to TaskSet.NewTask.
+func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+ tg := &ThreadGroup{
+ threadGroupNode: threadGroupNode{
+ pidns: ns,
+ },
+ signalHandlers: sh,
+ terminationSignal: terminationSignal,
+ ioUsage: &usage.IO{},
+ limits: limits,
+ }
+ tg.tm = newTimerManager(tg, monotonicClock)
+ tg.rscr.Store(&RSEQCriticalRegion{})
+ return tg
+}
+
+// saveRscr is invoked by stateify.
+func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
+ return tg.rscr.Load().(*RSEQCriticalRegion)
+}
+
+// loadRscr is invoked by stateify.
+func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
+ tg.rscr.Store(rscr)
+}
+
+// SignalHandlers returns the signal handlers used by tg.
+//
+// Preconditions: The caller must provide the synchronization required to read
+// tg.signalHandlers, as described in the field's comment.
+func (tg *ThreadGroup) SignalHandlers() *SignalHandlers {
+ return tg.signalHandlers
+}
+
+// Timer returns tg's timers.
+func (tg *ThreadGroup) Timer() *TimerManager {
+ return &tg.tm
+}
+
+// Limits returns tg's limits.
+func (tg *ThreadGroup) Limits() *limits.LimitSet {
+ return tg.limits
+}
+
+// release releases the thread group's resources.
+func (tg *ThreadGroup) release() {
+ // This must be done without holding the TaskSet mutex since thread group
+ // timers call SendSignal with Timer.mu locked.
+ tg.tm.destroy()
+}
+
+// forEachChildThreadGroupLocked iterates over all child ThreadGroups.
+//
+// Precondition: TaskSet.mu must be held.
+func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ for child := range t.children {
+ if child == child.tg.leader {
+ fn(child.tg)
+ }
+ }
+ }
+}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
new file mode 100644
index 000000000..440da9dad
--- /dev/null
+++ b/pkg/sentry/kernel/threads.go
@@ -0,0 +1,443 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// TasksLimit is the maximum number of threads for an untrusted application.
+// Linux doesn't really limit this directly, rather it is limited by total
+// memory size, stacks allocated and a global maximum. There's no real reason
+// for us to limit it either, (esp. since threads are backed by goroutines),
+// and we would expect to hit resource limits long before hitting this number.
+// However, for correctness, we still check that the user doesn't exceed this
+// number.
+//
+// Note that because of the way futexes are implemented, there *are* in fact
+// serious restrictions on valid thread IDs. They are limited to 2^30 - 1
+// (kernel/fork.c:MAX_THREADS).
+const TasksLimit = (1 << 16)
+
+// ThreadID is a generic thread identifier.
+type ThreadID int32
+
+// String returns a decimal representation of the ThreadID.
+func (tid ThreadID) String() string {
+ return fmt.Sprintf("%d", tid)
+}
+
+// InitTID is the TID given to the first task added to each PID namespace. The
+// thread group led by InitTID is called the namespace's init process. The
+// death of a PID namespace's init process causes all tasks visible in that
+// namespace to be killed.
+const InitTID ThreadID = 1
+
+// A TaskSet comprises all tasks in a system.
+type TaskSet struct {
+ // mu protects all relationships between tasks and thread groups in the
+ // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
+ mu sync.RWMutex `state:"nosave"`
+
+ // Root is the root PID namespace, in which all tasks in the TaskSet are
+ // visible. The Root pointer is immutable.
+ Root *PIDNamespace
+
+ // sessions is the set of all sessions.
+ sessions sessionList
+
+ // stopCount is the number of active external stops applicable to all tasks
+ // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been
+ // paired with a call to TaskSet.EndExternalStop). stopCount is protected
+ // by mu.
+ //
+ // stopCount is not saved for the same reason as Task.stopCount; it is
+ // always reset to zero after restore.
+ stopCount int32 `state:"nosave"`
+
+ // liveGoroutines is the number of non-exited task goroutines in the
+ // TaskSet.
+ //
+ // liveGoroutines is not saved; it is reset as task goroutines are
+ // restarted by Task.Start.
+ liveGoroutines sync.WaitGroup `state:"nosave"`
+
+ // runningGoroutines is the number of running task goroutines in the
+ // TaskSet.
+ //
+ // runningGoroutines is not saved; its counter value is required to be zero
+ // at time of save (but note that this is not necessarily the same thing as
+ // sync.WaitGroup's zero value).
+ runningGoroutines sync.WaitGroup `state:"nosave"`
+}
+
+// newTaskSet returns a new, empty TaskSet.
+func newTaskSet() *TaskSet {
+ ts := &TaskSet{}
+ ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace())
+ return ts
+}
+
+// forEachThreadGroupLocked applies f to each thread group in ts.
+//
+// Preconditions: ts.mu must be locked (for reading or writing).
+func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) {
+ for t := range ts.Root.tids {
+ if t == t.tg.leader {
+ f(t.tg)
+ }
+ }
+}
+
+// A PIDNamespace represents a PID namespace, a bimap between thread IDs and
+// tasks. See the pid_namespaces(7) man page for further details.
+//
+// N.B. A task is said to be visible in a PID namespace if the PID namespace
+// contains a thread ID that maps to that task.
+type PIDNamespace struct {
+ // owner is the TaskSet that this PID namespace belongs to. The owner
+ // pointer is immutable.
+ owner *TaskSet
+
+ // parent is the PID namespace of the process that created this one. If
+ // this is the root PID namespace, parent is nil. The parent pointer is
+ // immutable.
+ //
+ // Invariant: All tasks that are visible in this namespace are also visible
+ // in all ancestor namespaces.
+ parent *PIDNamespace
+
+ // userns is the user namespace with which this PID namespace is
+ // associated. Privileged operations on this PID namespace must have
+ // appropriate capabilities in userns. The userns pointer is immutable.
+ userns *auth.UserNamespace
+
+ // The following fields are protected by owner.mu.
+
+ // last is the last ThreadID to be allocated in this namespace.
+ last ThreadID
+
+ // tasks is a mapping from ThreadIDs in this namespace to tasks visible in
+ // the namespace.
+ tasks map[ThreadID]*Task
+
+ // tids is a mapping from tasks visible in this namespace to their
+ // identifiers in this namespace.
+ tids map[*Task]ThreadID
+
+ // sessions is a mapping from SessionIDs in this namespace to sessions
+ // visible in the namespace.
+ sessions map[SessionID]*Session
+
+ // sids is a mapping from sessions visible in this namespace to their
+ // identifiers in this namespace.
+ sids map[*Session]SessionID
+
+ // processGroups is a mapping from ProcessGroupIDs in this namespace to
+ // process groups visible in the namespace.
+ processGroups map[ProcessGroupID]*ProcessGroup
+
+ // pgids is a mapping from process groups visible in this namespace to
+ // their identifiers in this namespace.
+ pgids map[*ProcessGroup]ProcessGroupID
+
+ // exiting indicates that the namespace's init process is exiting or has
+ // exited.
+ exiting bool
+}
+
+func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace {
+ return &PIDNamespace{
+ owner: ts,
+ parent: parent,
+ userns: userns,
+ tasks: make(map[ThreadID]*Task),
+ tids: make(map[*Task]ThreadID),
+ sessions: make(map[SessionID]*Session),
+ sids: make(map[*Session]SessionID),
+ processGroups: make(map[ProcessGroupID]*ProcessGroup),
+ pgids: make(map[*ProcessGroup]ProcessGroupID),
+ }
+}
+
+// NewChild returns a new, empty PID namespace that is a child of ns. Authority
+// over the new PID namespace is controlled by userns.
+func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace {
+ return newPIDNamespace(ns.owner, ns, userns)
+}
+
+// TaskWithID returns the task with thread ID tid in PID namespace ns. If no
+// task has that TID, TaskWithID returns nil.
+func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.tasks[tid]
+}
+
+// ThreadGroupWithID returns the thread group led by the task with thread ID
+// tid in PID namespace ns. If no task has that TID, or if the task with that
+// TID is not a thread group leader, ThreadGroupWithID returns nil.
+func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ t := ns.tasks[tid]
+ if t == nil {
+ return nil
+ }
+ if t != t.tg.leader {
+ return nil
+ }
+ return t.tg
+}
+
+// IDOfTask returns the TID assigned to the given task in PID namespace ns. If
+// the task is not visible in that namespace, IDOfTask returns 0. (This return
+// value is significant in some cases, e.g. getppid() is documented as
+// returning 0 if the caller's parent is in an ancestor namespace and
+// consequently not visible to the caller.) If the task is nil, IDOfTask returns
+// 0.
+func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.tids[t]
+}
+
+// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns.
+// If the task is not visible in that namespace, IDOfThreadGroup returns 0.
+func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.tids[tg.leader]
+}
+
+// Tasks returns a snapshot of the tasks in ns.
+func (ns *PIDNamespace) Tasks() []*Task {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ tasks := make([]*Task, 0, len(ns.tasks))
+ for t := range ns.tids {
+ tasks = append(tasks, t)
+ }
+ return tasks
+}
+
+// ThreadGroups returns a snapshot of the thread groups in ns.
+func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ var tgs []*ThreadGroup
+ for t := range ns.tids {
+ if t == t.tg.leader {
+ tgs = append(tgs, t.tg)
+ }
+ }
+ return tgs
+}
+
+// UserNamespace returns the user namespace associated with PID namespace ns.
+func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace {
+ return ns.userns
+}
+
+// A threadGroupNode defines the relationship between a thread group and the
+// rest of the system. Conceptually, threadGroupNode is data belonging to the
+// owning TaskSet, as if TaskSet contained a field `nodes
+// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons,
+// threadGroupNode is embedded in the ThreadGroup it represents.
+// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose
+// threadGroupEntry's methods on ThreadGroup to make it implement
+// threadGroupLinker.)
+type threadGroupNode struct {
+ // pidns is the PID namespace containing the thread group and all of its
+ // member tasks. The pidns pointer is immutable.
+ pidns *PIDNamespace
+
+ // eventQueue is notified whenever an event of interest to Task.Wait occurs
+ // in a child of this thread group, or a ptrace tracee of a task in this
+ // thread group. Events are defined in task_exit.go.
+ //
+ // Note that we cannot check and save this wait queue similarly to other
+ // wait queues, as the queue will not be empty by the time of saving, due
+ // to the wait sourced from Exec().
+ eventQueue waiter.Queue `state:"nosave"`
+
+ // leader is the thread group's leader, which is the oldest task in the
+ // thread group; usually the last task in the thread group to call
+ // execve(), or if no such task exists then the first task in the thread
+ // group, which was created by a call to fork() or clone() without
+ // CLONE_THREAD. Once a thread group has been made visible to the rest of
+ // the system by TaskSet.newTask, leader is never nil.
+ //
+ // Note that it's possible for the leader to exit without causing the rest
+ // of the thread group to exit; in such a case, leader will still be valid
+ // and non-nil, but leader will not be in tasks.
+ //
+ // leader is protected by the TaskSet mutex.
+ leader *Task
+
+ // If execing is not nil, it is a task in the thread group that has killed
+ // all other tasks so that it can become the thread group leader and
+ // perform an execve. (execing may already be the thread group leader.)
+ //
+ // execing is analogous to Linux's signal_struct::group_exit_task.
+ //
+ // execing is protected by the TaskSet mutex.
+ execing *Task
+
+ // tasks is all tasks in the thread group that have not yet been reaped.
+ //
+ // tasks is protected by both the TaskSet mutex and the signal mutex:
+ // Mutating tasks requires locking the TaskSet mutex for writing *and*
+ // locking the signal mutex. Reading tasks requires locking the TaskSet
+ // mutex *or* locking the signal mutex.
+ tasks taskList
+
+ // tasksCount is the number of tasks in the thread group that have not yet
+ // been reaped; equivalently, tasksCount is the number of tasks in tasks.
+ //
+ // tasksCount is protected by both the TaskSet mutex and the signal mutex,
+ // as with tasks.
+ tasksCount int
+
+ // liveTasks is the number of tasks in the thread group that have not yet
+ // reached TaskExitZombie.
+ //
+ // liveTasks is protected by the TaskSet mutex (NOT the signal mutex).
+ liveTasks int
+
+ // activeTasks is the number of tasks in the thread group that have not yet
+ // reached TaskExitInitiated.
+ //
+ // activeTasks is protected by both the TaskSet mutex and the signal mutex,
+ // as with tasks.
+ activeTasks int
+}
+
+// PIDNamespace returns the PID namespace containing tg. No lock is taken;
+// threadGroupNode.pidns is documented as immutable.
+func (tg *ThreadGroup) PIDNamespace() *PIDNamespace {
+ return tg.pidns
+}
+
+// TaskSet returns the TaskSet containing tg, i.e. the owner of tg's PID
+// namespace.
+func (tg *ThreadGroup) TaskSet() *TaskSet {
+ return tg.pidns.owner
+}
+
+// Leader returns tg's leader, read while holding the TaskSet mutex for
+// reading.
+func (tg *ThreadGroup) Leader() *Task {
+	ts := tg.pidns.owner
+	ts.mu.RLock()
+	leader := tg.leader
+	ts.mu.RUnlock()
+	return leader
+}
+
+// Count returns the number of non-exited threads in the group.
+func (tg *ThreadGroup) Count() int {
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+ var count int
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ count++
+ }
+ return count
+}
+
+// MemberIDs returns a snapshot of the ThreadIDs, as seen from PID namespace
+// pidns, of the tasks in tg. Tasks that have no ID in pidns are omitted.
+func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID {
+	ts := tg.pidns.owner
+	ts.mu.RLock()
+	defer ts.mu.RUnlock()
+
+	var ids []ThreadID
+	for task := tg.tasks.Front(); task != nil; task = task.Next() {
+		id, visible := pidns.tids[task]
+		if !visible {
+			continue
+		}
+		ids = append(ids, id)
+	}
+	return ids
+}
+
+// ID returns the thread ID of tg's leader in tg's own PID namespace. If the
+// leader has no entry in that namespace (e.g. because it is dead), ID returns
+// 0.
+func (tg *ThreadGroup) ID() ThreadID {
+	ts := tg.pidns.owner
+	ts.mu.RLock()
+	id := tg.pidns.tids[tg.leader]
+	ts.mu.RUnlock()
+	return id
+}
+
+// A taskNode defines the relationship between a task and the rest of the
+// system. The comments on threadGroupNode also apply to taskNode.
+type taskNode struct {
+ // tg is the thread group that this task belongs to. The tg pointer is
+ // immutable.
+ tg *ThreadGroup `state:"wait"`
+
+ // taskEntry links into tg.tasks. Note that this means that
+ // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread
+ // group. See threadGroupNode.tasks for synchronization info.
+ taskEntry
+
+ // parent is the task's parent. parent may be nil.
+ //
+ // parent is protected by the TaskSet mutex.
+ parent *Task
+
+ // children is the set of this task's children, keyed by child task.
+ //
+ // children is protected by the TaskSet mutex.
+ children map[*Task]struct{}
+
+ // If childPIDNamespace is not nil, all new tasks created by this task will
+ // be members of childPIDNamespace rather than this one. (As a corollary,
+ // this task becomes unable to create sibling tasks in the same thread
+ // group.)
+ //
+ // childPIDNamespace is exclusive to the task goroutine.
+ childPIDNamespace *PIDNamespace
+}
+
+// ThreadGroup returns the thread group containing t. No lock is taken;
+// taskNode.tg is documented as immutable.
+func (t *Task) ThreadGroup() *ThreadGroup {
+ return t.tg
+}
+
+// PIDNamespace returns the PID namespace containing t, which is the PID
+// namespace of t's thread group.
+func (t *Task) PIDNamespace() *PIDNamespace {
+ return t.tg.pidns
+}
+
+// TaskSet returns the TaskSet containing t, i.e. the owner of the PID
+// namespace of t's thread group.
+func (t *Task) TaskSet() *TaskSet {
+ return t.tg.pidns.owner
+}
+
+// Timekeeper returns the system Timekeeper, taken from t's kernel (t.k).
+func (t *Task) Timekeeper() *Timekeeper {
+ return t.k.timekeeper
+}
+
+// Parent returns t's parent, which may be nil.
+//
+// NOTE(review): taskNode.parent is documented as protected by the TaskSet
+// mutex, but this accessor reads it without locking (compare
+// ThreadGroup.Leader, which takes the mutex for reading). Confirm that callers
+// already hold the TaskSet mutex or that an unsynchronized read is acceptable.
+func (t *Task) Parent() *Task {
+ return t.parent
+}
+
+// ThreadID returns t's thread ID in its own PID namespace. If the task is
+// dead, ThreadID returns 0.
+//
+// The lookup is delegated to PIDNamespace.IDOfTask on the PID namespace of
+// t's thread group.
+func (t *Task) ThreadID() ThreadID {
+ return t.tg.pidns.IDOfTask(t)
+}
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
new file mode 100644
index 000000000..84f31b2dc
--- /dev/null
+++ b/pkg/sentry/kernel/time/BUILD
@@ -0,0 +1,32 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+# Generates time_state.go from time.go (presumably the save/restore support
+# code driven by the `state:"..."` struct tags in that file).
+go_stateify(
+ name = "time_state",
+ srcs = [
+ "time.go",
+ ],
+ out = "time_state.go",
+ package = "time",
+)
+
+# Library target for package time. It compiles the hand-written sources
+# together with time_state.go, the output of the go_stateify rule.
+go_library(
+ name = "time",
+ srcs = [
+ "context.go",
+ "time.go",
+ "time_state.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/log",
+ "//pkg/sentry/context",
+ "//pkg/state",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go
new file mode 100644
index 000000000..ac4dc01d8
--- /dev/null
+++ b/pkg/sentry/kernel/time/context.go
@@ -0,0 +1,44 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package time
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the time package's type for context.Context.Value keys. Using
+// a package-private type keeps these keys distinct from keys defined by other
+// packages.
+type contextID int
+
+const (
+ // CtxRealtimeClock is a Context.Value key for the current real time.
+ // RealtimeClockFromContext expects the associated value to be a Clock.
+ CtxRealtimeClock contextID = iota
+)
+
+// RealtimeClockFromContext returns the real time Clock stored in ctx under
+// CtxRealtimeClock, or nil if ctx carries no value for that key.
+func RealtimeClockFromContext(ctx context.Context) Clock {
+	v := ctx.Value(CtxRealtimeClock)
+	if v == nil {
+		return nil
+	}
+	return v.(Clock)
+}
+
+// NowFromContext returns the current real time according to the real time
+// Clock associated with ctx. It panics if ctx has no such Clock.
+func NowFromContext(ctx context.Context) Time {
+	clk := RealtimeClockFromContext(ctx)
+	if clk == nil {
+		panic("encountered context without RealtimeClock")
+	}
+	return clk.Now()
+}
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
new file mode 100644
index 000000000..c223c2f19
--- /dev/null
+++ b/pkg/sentry/kernel/time/time.go
@@ -0,0 +1,649 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package time defines the Timer type, which provides a periodic timer that
+// works by sampling a user-provided clock.
+package time
+
+import (
+ "fmt"
+ "math"
+ "sync"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Events that may be generated by a Clock. Each event is a distinct bit of a
+// waiter.EventMask, so events can be combined with bitwise OR.
+const (
+ // ClockEventSet occurs when a Clock undergoes a discontinuous change.
+ ClockEventSet waiter.EventMask = 1 << iota
+
+ // ClockEventRateIncrease occurs when the rate at which a Clock advances
+ // increases significantly, such that values returned by previous calls to
+ // Clock.WallTimeUntil may be too large.
+ ClockEventRateIncrease
+)
+
+// Time represents an instant in time with nanosecond precision.
+//
+// Time may represent time with respect to any clock and may not have any
+// meaning in the real world.
+type Time struct {
+ // ns is the number of nanoseconds since the zero time of the Clock domain
+ // that this Time belongs to. It may be negative.
+ ns int64
+}
+
+var (
+ // MinTime is the lowest possible time that can be represented by Time.
+ // It is not the zero time; see ZeroTime for that.
+ MinTime = Time{ns: math.MinInt64}
+
+ // MaxTime is the highest possible time that can be represented by
+ // Time.
+ MaxTime = Time{ns: math.MaxInt64}
+
+ // ZeroTime represents the zero time in an unspecified Clock's domain.
+ ZeroTime = Time{ns: 0}
+)
+
+const (
+ // MinDuration is the minimum duration representable by time.Duration.
+ // Time.Sub returns it when the true difference is too negative to
+ // represent.
+ MinDuration = time.Duration(math.MinInt64)
+
+ // MaxDuration is the maximum duration representable by time.Duration.
+ // Time.Sub returns it when the true difference is too large to represent.
+ MaxDuration = time.Duration(math.MaxInt64)
+)
+
+// FromNanoseconds builds the Time that lies ns nanoseconds after the zero
+// time of an unspecified Clock.
+func FromNanoseconds(ns int64) Time {
+	return Time{ns: ns}
+}
+
+// FromSeconds returns a Time representing the point s seconds after an
+// unspecified Clock's zero time.
+func FromSeconds(s int64) Time {
+ if s > math.MaxInt64/time.Second.Nanoseconds() {
+ return MaxTime
+ }
+ return Time{s * 1e9}
+}
+
+// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real
+// time Unix clock domain.
+func FromUnix(s int64, ns int64) Time {
+ if s > math.MaxInt64/time.Second.Nanoseconds() {
+ return MaxTime
+ }
+ t := s * 1e9
+ if t > math.MaxInt64-ns {
+ return MaxTime
+ }
+ return Time{t + ns}
+}
+
+// FromTimespec converts a Linux Timespec to Time, using the capped
+// nanosecond conversion provided by linux.Timespec.ToNsecCapped.
+func FromTimespec(ts linux.Timespec) Time {
+	return Time{ns: ts.ToNsecCapped()}
+}
+
+// FromTimeval converts a Linux Timeval to Time, using the capped nanosecond
+// conversion provided by linux.Timeval.ToNsecCapped.
+func FromTimeval(tv linux.Timeval) Time {
+	return Time{ns: tv.ToNsecCapped()}
+}
+
+// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock
+// domain. If t represents walltime, this is nanoseconds since the Unix epoch.
+// The value is negative for instants before the zero time.
+func (t Time) Nanoseconds() int64 {
+ return t.ns
+}
+
+// Seconds returns whole seconds elapsed since the zero time in t's Clock
+// domain, truncated toward zero. If t represents walltime, this is seconds
+// since the Unix epoch.
+func (t Time) Seconds() int64 {
+	return t.ns / int64(time.Second)
+}
+
+// Timespec returns t as a Linux timespec, via linux.NsecToTimespec.
+func (t Time) Timespec() linux.Timespec {
+	return linux.NsecToTimespec(t.ns)
+}
+
+// Unix splits t into a (seconds, nanoseconds) pair such that
+// seconds*1e9 + nanoseconds = t. Both components are truncated toward zero,
+// so they share t's sign.
+func (t Time) Unix() (s int64, ns int64) {
+	return t.ns / 1e9, t.ns % 1e9
+}
+
+// TimeT returns t as a Linux time_t, via linux.NsecToTimeT.
+func (t Time) TimeT() linux.TimeT {
+	return linux.NsecToTimeT(t.ns)
+}
+
+// Timeval returns t as a Linux timeval, via linux.NsecToTimeval.
+func (t Time) Timeval() linux.Timeval {
+	return linux.NsecToTimeval(t.ns)
+}
+
+// Add returns t advanced by d. The result saturates at MaxTime and MinTime
+// instead of wrapping around.
+func (t Time) Add(d time.Duration) Time {
+	delta := d.Nanoseconds()
+	switch {
+	case t.ns > 0 && delta > math.MaxInt64-t.ns:
+		return MaxTime
+	case t.ns < 0 && delta < math.MinInt64-t.ns:
+		return MinTime
+	}
+	return Time{ns: t.ns + delta}
+}
+
+// AddTime treats u as a duration measured from the zero time and adds it to
+// t, with the same saturation as Add.
+func (t Time) AddTime(u Time) Time {
+	delta := time.Duration(u.ns)
+	return t.Add(delta)
+}
+
+// Equal reports whether t and u denote the same instant.
+func (t Time) Equal(u Time) bool {
+	return u.ns == t.ns
+}
+
+// Before reports whether t is strictly earlier than u.
+func (t Time) Before(u Time) bool {
+	return u.ns > t.ns
+}
+
+// After reports whether t is strictly later than u.
+func (t Time) After(u Time) bool {
+	return u.ns < t.ns
+}
+
+// Sub returns the duration t - u, saturated to MinDuration or MaxDuration
+// when the true difference does not fit in a time.Duration.
+//
+// N.B. This measure may not make sense for every Time returned by ktime.Clock.
+// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to
+// estimate that wall time.
+func (t Time) Sub(u Time) time.Duration {
+	diff := time.Duration(t.ns - u.ns)
+	// The subtraction may have wrapped; it is only trusted if adding it back
+	// to u reproduces t.
+	if u.Add(diff).Equal(t) {
+		return diff
+	}
+	if t.Before(u) {
+		return MinDuration
+	}
+	return MaxDuration
+}
+
+// IsMin reports whether t is MinTime, the lowest representable instant.
+func (t Time) IsMin() bool {
+	return t.ns == MinTime.ns
+}
+
+// IsZero reports whether t is ZeroTime, the zero instant of t's Clock domain.
+func (t Time) IsZero() bool {
+	return t.ns == ZeroTime.ns
+}
+
+// String formats t as its nanosecond count followed by "ns".
+func (t Time) String() string {
+	return fmt.Sprintf("%dns", t.ns)
+}
+
+// A Clock is an abstract time source.
+//
+// This package provides WallRateClock, NoClockEvents and ClockEventsQueue as
+// embeddable helpers that implement parts of this interface.
+type Clock interface {
+ // Now returns the current time in nanoseconds according to the Clock.
+ Now() Time
+
+ // WallTimeUntil returns the estimated wall time until Now will return a
+ // value greater than or equal to t, given that a recent call to Now
+ // returned now. If t has already passed, WallTimeUntil may return 0 or a
+ // negative value.
+ //
+ // WallTimeUntil must be abstract to support Clocks that do not represent
+ // wall time (e.g. thread group execution timers). Clocks that represent
+ // wall times may embed the WallRateClock type to obtain an appropriate
+ // trivial implementation of WallTimeUntil.
+ //
+ // WallTimeUntil is used to determine when associated Timers should next
+ // check for expirations. Returning too small a value may result in
+ // spurious Timer goroutine wakeups, while returning too large a value may
+ // result in late expirations. Implementations should usually err on the
+ // side of underestimating.
+ WallTimeUntil(t, now Time) time.Duration
+
+ // Waitable methods may be used to subscribe to Clock events. Waiters will
+ // not be preserved by Save and must be re-established during restore.
+ //
+ // Since Clock events are transient, implementations of
+ // waiter.Waitable.Readiness should return 0.
+ waiter.Waitable
+}
+
+// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the
+// same rate as wall time. It is intended to be embedded in a Clock
+// implementation.
+type WallRateClock struct{}
+
+// WallTimeUntil implements Clock.WallTimeUntil. Because the clock advances at
+// wall rate, the estimate is simply t - now (saturated as in Time.Sub).
+func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
+ return t.Sub(now)
+}
+
+// NoClockEvents implements waiter.Waitable for Clocks that do not generate
+// events. All of its methods are no-ops.
+type NoClockEvents struct{}
+
+// Readiness implements waiter.Waitable.Readiness. It always returns 0.
+func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister. The entry is
+// ignored, since no events are ever generated.
+func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister. It does nothing.
+func (NoClockEvents) EventUnregister(e *waiter.Entry) {
+}
+
+// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and
+// defining waiter.Waitable.Readiness as required by Clock. The remaining
+// Waitable methods come from the embedded waiter.Queue.
+type ClockEventsQueue struct {
+ waiter.Queue
+}
+
+// Readiness implements waiter.Waitable.Readiness. It always returns 0, as the
+// Clock interface asks of transient events.
+func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return 0
+}
+
+// A TimerListener receives expirations from a Timer.
+type TimerListener interface {
+ // Notify is called when its associated Timer expires. exp is the number of
+ // expirations.
+ //
+ // Notify is called with the associated Timer's mutex locked, so Notify
+ // must not take any locks that precede Timer.mu in lock order.
+ //
+ // Preconditions: exp > 0.
+ Notify(exp uint64)
+
+ // Destroy is called when the timer is destroyed. Timer.Destroy invokes it
+ // as its last step.
+ Destroy()
+}
+
+// Setting contains user-controlled mutable Timer properties.
+type Setting struct {
+ // Enabled is true if the timer is running.
+ Enabled bool
+
+ // Next is the time in nanoseconds of the next expiration. It is compared
+ // against readings of the Timer's Clock and is only consulted while
+ // Enabled is true.
+ Next Time
+
+ // Period is the time in nanoseconds between expirations. If Period is
+ // zero, the timer will not automatically restart after expiring.
+ //
+ // Invariant: Period >= 0.
+ Period time.Duration
+}
+
+// SettingFromSpec converts a (value, interval) pair to a Setting based on a
+// reading from c. value is interpreted as a time relative to c.Now().
+//
+// A negative value yields syserror.EINVAL. A zero value yields a disabled
+// Setting that only carries the interval; c is not read in either case.
+func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) {
+	switch {
+	case value < 0:
+		return Setting{}, syserror.EINVAL
+	case value == 0:
+		return Setting{Period: interval}, nil
+	}
+	next := c.Now().Add(value)
+	return Setting{Enabled: true, Next: next, Period: interval}, nil
+}
+
+// SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is
+// interpreted as an absolute time; no clock is consulted.
+//
+// A value before ZeroTime yields syserror.EINVAL. A value equal to ZeroTime
+// yields a disabled Setting that only carries the interval.
+func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) {
+	switch {
+	case value.Before(ZeroTime):
+		return Setting{}, syserror.EINVAL
+	case value.IsZero():
+		return Setting{Period: interval}, nil
+	}
+	return Setting{Enabled: true, Next: value, Period: interval}, nil
+}
+
+// SpecFromSetting converts a timestamp and a Setting to a (relative value,
+// interval) pair, as used by most Linux syscalls that return a struct
+// itimerval or struct itimerspec. A disabled Setting reports a value of 0.
+func SpecFromSetting(now Time, s Setting) (value, period time.Duration) {
+	if s.Enabled {
+		return s.Next.Sub(now), s.Period
+	}
+	return 0, s.Period
+}
+
+// advancedTo returns an updated Setting and a number of expirations after
+// the associated Clock indicates a time of now.
+//
+// Settings may be created by successive calls to advancedTo with decreasing
+// values of now (i.e. time may appear to go backward). Supporting this is
+// required to support non-monotonic clocks, as well as allowing
+// Timer.clock.Now() to be called without holding Timer.mu.
+func (s Setting) advancedTo(now Time) (Setting, uint64) {
+ if !s.Enabled {
+ return s, 0
+ }
+ // The next expiration is still in the future: nothing has fired.
+ if s.Next.After(now) {
+ return s, 0
+ }
+ // One-shot timer: it fires exactly once and is then disabled.
+ if s.Period == 0 {
+ s.Enabled = false
+ return s, 1
+ }
+ // Periodic timer: count the expiration at s.Next plus one more for every
+ // whole Period between s.Next and now, then advance s.Next by that many
+ // periods.
+ // NOTE(review): the unsigned arithmetic relies on Period > 0 (see the
+ // Setting.Period invariant) and on Period*exp not overflowing; confirm for
+ // extreme values.
+ exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period)
+ s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp))
+ return s, exp
+}
+
+// Timer is an optionally-periodic timer driven by sampling a user-specified
+// Clock. Timer's semantics support the requirements of Linux's interval timers
+// (setitimer(2), timer_create(2), timerfd_create(2)).
+//
+// Timers should be created using NewTimer and must be cleaned up by calling
+// Timer.Destroy when no longer used.
+//
+// The fields tagged `state:"nosave"` (other than mu) are set up by init and
+// remain nil/zero until init has run.
+type Timer struct {
+ // clock is the time source. clock is immutable.
+ clock Clock
+
+ // listener is notified of expirations. listener is immutable.
+ listener TimerListener
+
+ // mu protects the following mutable fields.
+ mu sync.Mutex `state:"nosave"`
+
+ // setting is the timer setting. setting is protected by mu.
+ setting Setting
+
+ // paused is true if the Timer is paused. paused is protected by mu.
+ paused bool
+
+ // kicker is used to wake the Timer goroutine. The kicker pointer is
+ // immutable, but its state is protected by mu.
+ kicker *time.Timer `state:"nosave"`
+
+ // entry is registered with clock.EventRegister. entry is immutable.
+ //
+ // Per comment in Clock, entry must be re-registered after restore; per
+ // comment in Timer.Load, this is done in Timer.Resume.
+ entry waiter.Entry `state:"nosave"`
+
+ // events is the channel that will be notified whenever entry receives an
+ // event. It is also closed by Timer.Destroy to instruct the Timer
+ // goroutine to exit.
+ events chan struct{} `state:"nosave"`
+}
+
+// timerTickEvents are Clock events that require the Timer goroutine to Tick
+// prematurely. It is the mask that init registers with the Clock.
+const timerTickEvents = ClockEventSet | ClockEventRateIncrease
+
+// NewTimer creates a Timer that reads time from clock and reports expirations
+// to listener. The new Timer is stopped, with no first expiration or period
+// configured, and has already been initialized via init.
+func NewTimer(clock Clock, listener TimerListener) *Timer {
+	timer := &Timer{clock: clock, listener: listener}
+	timer.init()
+	return timer
+}
+
+// After starts a one-shot Timer on clock that fires once duration has elapsed
+// and signals the returned channel when it does. The second return value is
+// the clock reading that the duration was measured from.
+//
+// Callers must call Timer.Destroy.
+func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) {
+	listener, ch := NewChannelNotifier()
+	timer := NewTimer(clock, listener)
+	start := clock.Now()
+
+	// Period is left at zero so the timer does not re-arm after firing.
+	oneShot := Setting{Enabled: true, Next: start.Add(duration)}
+	timer.Swap(oneShot)
+	return timer, start, ch
+}
+
+// init initializes Timer state that is not preserved across save/restore. If
+// init has already been called, calling it again is a no-op.
+//
+// Preconditions: t.mu must be locked, or the caller must have exclusive access
+// to t.
+func (t *Timer) init() {
+ // A non-nil kicker marks a Timer that has already been initialized.
+ if t.kicker != nil {
+ return
+ }
+ // If t.kicker is nil, the Timer goroutine can't be running, so we can't
+ // race with it.
+ t.kicker = time.NewTimer(0)
+ // Subscribe to the Clock events that force an early Tick, then start the
+ // goroutine that services both the kicker and those events.
+ t.entry, t.events = waiter.NewChannelEntry(nil)
+ t.clock.EventRegister(&t.entry, timerTickEvents)
+ go t.runGoroutine() // S/R-SAFE: synchronized by t.mu
+}
+
+// Destroy releases resources owned by the Timer. A Destroyed Timer must not be
+// used again; in particular, a Destroyed Timer should not be Saved.
+//
+// Destroy tolerates a Timer whose lazily-initialized state (kicker, entry,
+// events) was never set up by init, e.g. one that was restored but never
+// resumed. In that case there is no goroutine or Clock registration to tear
+// down, and only the listener is destroyed.
+func (t *Timer) Destroy() {
+	// Stop the Timer, ensuring that the Timer goroutine will not call
+	// t.kicker.Reset, before calling t.kicker.Stop.
+	t.mu.Lock()
+	t.setting.Enabled = false
+	kicker := t.kicker
+	t.mu.Unlock()
+	if kicker != nil {
+		kicker.Stop()
+		// Unregister t.entry, ensuring that the Clock will not send to
+		// t.events, before closing t.events to instruct the Timer goroutine
+		// to exit.
+		t.clock.EventUnregister(&t.entry)
+		close(t.events)
+	}
+	t.listener.Destroy()
+}
+
+// runGoroutine is the body of the Timer goroutine started by init. It calls
+// Tick every time the kicker fires or a Clock event arrives on t.events, and
+// returns once t.events has been closed.
+func (t *Timer) runGoroutine() {
+ for {
+ select {
+ case <-t.kicker.C:
+ case _, ok := <-t.events:
+ if !ok {
+ // Channel closed by Destroy.
+ return
+ }
+ }
+ t.Tick()
+ }
+}
+
+// Tick makes the Timer check for expirations right away and re-arm its
+// kicker for the next check. It does nothing while the Timer is paused.
+func (t *Timer) Tick() {
+	// The clock is sampled before taking t.mu.
+	now := t.clock.Now()
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.paused {
+		return
+	}
+
+	updated, fired := t.setting.advancedTo(now)
+	t.setting = updated
+	if fired != 0 {
+		t.listener.Notify(fired)
+	}
+	t.resetKickerLocked(now)
+}
+
+// Pause stops the Timer from generating expirations until Resume is called.
+// Pausing an already-paused Timer has no further effect.
+func (t *Timer) Pause() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	t.paused = true
+	if t.kicker == nil {
+		// Restored but never resumed: there is no kicker to stop yet.
+		return
+	}
+	t.kicker.Stop()
+}
+
+// Resume ends the effect of Pause. If the Timer is not paused, Resume has no
+// effect.
+//
+// Resume also performs the Timer's lazy initialization (see init) if that has
+// not happened yet.
+func (t *Timer) Resume() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if !t.paused {
+ return
+ }
+ t.paused = false
+
+ // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume
+ // because save/restore will restore Timers before
+ // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed
+ // by a kernel.Timekeeper then the Timer goroutine will panic if it calls
+ // t.clock.Now().
+ t.init()
+
+ // Kick the Timer goroutine in case it was already initialized, but the
+ // Timer goroutine was sleeping.
+ t.kicker.Reset(0)
+}
+
+// Get returns the time (according to the Timer's Clock) at which it sampled
+// the Timer, together with the Timer's Setting advanced to that time. Any
+// expirations discovered while advancing are delivered to the listener.
+//
+// Preconditions: The Timer must not be paused (since its Setting cannot
+// be advanced to the current time while it is paused.)
+func (t *Timer) Get() (Time, Setting) {
+	now := t.clock.Now()
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.paused {
+		panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t))
+	}
+
+	current, fired := t.setting.advancedTo(now)
+	t.setting = current
+	if fired != 0 {
+		t.listener.Notify(fired)
+	}
+	t.resetKickerLocked(now)
+	return now, current
+}
+
+// Swap atomically changes the Timer's Setting and returns the Timer's previous
+// Setting and the time (according to the Timer's Clock) at which the snapshot
+// was taken. Setting s.Enabled to true starts the Timer, while setting
+// s.Enabled to false stops it.
+//
+// Swap is SwapAnd with no callback.
+//
+// Preconditions: The Timer must not be paused.
+func (t *Timer) Swap(s Setting) (Time, Setting) {
+ return t.SwapAnd(s, nil)
+}
+
+// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil,
+// and returns the Timer's previous Setting and the time (according to the
+// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true
+// starts the timer, while setting s.Enabled to false stops it.
+//
+// Preconditions: The Timer must not be paused. f cannot call any Timer methods
+// since it is called with the Timer mutex locked.
+func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
+ now := t.clock.Now()
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ if t.paused {
+ panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t))
+ }
+ // Bring the old Setting up to now first, so that expirations which have
+ // already occurred are delivered before the Setting is replaced.
+ oldS, oldExp := t.setting.advancedTo(now)
+ if oldExp > 0 {
+ t.listener.Notify(oldExp)
+ }
+ // f runs after the old expirations are delivered and before the new
+ // Setting takes effect.
+ if f != nil {
+ f()
+ }
+ // The new Setting may itself already be due at now.
+ newS, newExp := s.advancedTo(now)
+ t.setting = newS
+ if newExp > 0 {
+ t.listener.Notify(newExp)
+ }
+ t.resetKickerLocked(now)
+ return now, oldS
+}
+
+// resetKickerLocked re-arms t.kicker so that the Timer goroutine wakes up
+// when the Clock estimates t.setting.Next will be reached, measured from now.
+// It leaves the kicker untouched if the Timer is disabled.
+//
+// Preconditions: t.mu must be locked.
+func (t *Timer) resetKickerLocked(now Time) {
+	if !t.setting.Enabled {
+		// We don't call t.kicker.Stop here because in most cases
+		// resetKickerLocked will be called from the Timer goroutine itself,
+		// in which case t.kicker has already fired and t.kicker.Stop will be
+		// an expensive no-op (time.Timer.Stop => time.stopTimer =>
+		// runtime.stopTimer => runtime.deltimer).
+		return
+	}
+	// Clock.WallTimeUntil may return a negative value. This is fine;
+	// time.when treats negative Durations as 0.
+	wait := t.clock.WallTimeUntil(t.setting.Next, now)
+	t.kicker.Reset(wait)
+}
+
+// Clock returns the Clock used by t. No lock is taken; t.clock is documented
+// as immutable.
+func (t *Timer) Clock() Clock {
+ return t.clock
+}
+
+// ChannelNotifier is a TimerListener that sends a message on an empty struct
+// channel.
+//
+// ChannelNotifier cannot be saved or loaded.
+type ChannelNotifier struct {
+ // tchan must be a buffered channel. Notify sends without blocking, so the
+ // buffer is what lets a notification be retained until a receiver reads
+ // it.
+ tchan chan struct{}
+}
+
+// NewChannelNotifier creates a channel-backed TimerListener and returns it
+// together with the receive side of its channel, which is buffered with a
+// capacity of one.
+//
+// If the notifier is used with a timer, Timer.Destroy will close the channel
+// returned here.
+func NewChannelNotifier() (TimerListener, <-chan struct{}) {
+	ch := make(chan struct{}, 1)
+	notifier := &ChannelNotifier{tchan: ch}
+	return notifier, ch
+}
+
+// Notify implements ktime.TimerListener.Notify. The expiration count is
+// ignored. The send is non-blocking: if the channel's buffer is already full,
+// the notification is dropped.
+func (c *ChannelNotifier) Notify(uint64) {
+ select {
+ case c.tchan <- struct{}{}:
+ default:
+ }
+}
+
+// Destroy implements ktime.TimerListener.Destroy and will close the channel.
+// It must be called at most once, since closing an already-closed channel
+// panics.
+func (c *ChannelNotifier) Destroy() {
+ close(c.tchan)
+}
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
new file mode 100644
index 000000000..3f16c1676
--- /dev/null
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -0,0 +1,270 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// Timekeeper manages all of the kernel clocks.
+type Timekeeper struct {
+ // clocks are the clock sources.
+ //
+ // These are not saved directly, as the new machine's clock may behave
+ // differently.
+ //
+ // It is set only once, by SetClocks.
+ clocks sentrytime.Clocks `state:"nosave"`
+
+ // bootTime is the realtime when the system "booted". i.e., when
+ // SetClocks was called in the initial (not restored) run.
+ bootTime ktime.Time
+
+ // monotonicOffset is the offset to apply to the monotonic clock output
+ // from clocks.
+ //
+ // It is set only once, by SetClocks.
+ monotonicOffset int64 `state:"nosave"`
+
+ // restored indicates that this Timekeeper was restored from a state
+ // file.
+ restored bool `state:"nosave"`
+
+ // saveMonotonic is the (offset) value of the monotonic clock at the
+ // time of save.
+ //
+ // It is only valid if restored is true.
+ //
+ // It is only used in SetClocks after restore to compute the new
+ // monotonicOffset.
+ saveMonotonic int64
+
+ // saveRealtime is the value of the realtime clock at the time of save.
+ //
+ // It is only valid if restored is true.
+ //
+ // It is only used in SetClocks after restore to compute the new
+ // monotonicOffset.
+ saveRealtime int64
+
+ // params manages the parameter page.
+ params *VDSOParamPage
+
+ // mu protects destruction with stop and wg.
+ mu sync.Mutex `state:"nosave"`
+
+ // stop is used to tell the update goroutine to exit.
+ stop chan struct{} `state:"nosave"`
+
+ // wg is used to indicate that the update goroutine has exited.
+ wg sync.WaitGroup `state:"nosave"`
+}
+
+// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date.
+// NewTimekeeper does not take ownership of paramPage.
+//
+// SetClocks must be called on the returned Timekeeper before it is usable.
+func NewTimekeeper(platform platform.Platform, paramPage platform.FileRange) (*Timekeeper, error) {
+ return &Timekeeper{
+ params: NewVDSOParamPage(platform, paramPage),
+ }, nil
+}
+
+ // SetClocks sets the backing clock source.
+//
+// SetClocks must be called before the Timekeeper is used, and it may not be
+// called more than once, as changing the clock source without extra correction
+// could cause time discontinuities.
+//
+// It must also be called after Load.
+func (t *Timekeeper) SetClocks(c sentrytime.Clocks) {
+ // Update the params, marking them "not ready", as we may need to
+ // restart calibration on this new machine.
+ if t.restored {
+ if err := t.params.Write(func() vdsoParams {
+ return vdsoParams{}
+ }); err != nil {
+ panic("unable to reset VDSO params: " + err.Error())
+ }
+ }
+
+ if t.clocks != nil {
+ panic("SetClocks called on previously-initialized Timekeeper")
+ }
+
+ t.clocks = c
+
+ // Compute the offset of the monotonic clock from the base Clocks.
+ //
+ // In a fresh (not restored) sentry, monotonic time starts at zero.
+ //
+ // In a restored sentry, monotonic time jumps forward by approximately
+ // the same amount as real time. There are no guarantees here, we are
+ // just making a best-effort attempt to make it appear that the app
+ // was simply not scheduled for a long period, rather than that the
+ // real time clock was changed.
+ //
+ // If real time went backwards, monotonic time remains the same.
+ wantMonotonic := int64(0)
+
+ nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic)
+ if err != nil {
+ panic("Unable to get current monotonic time: " + err.Error())
+ }
+
+ nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime)
+ if err != nil {
+ panic("Unable to get current realtime: " + err.Error())
+ }
+
+ if t.restored {
+ wantMonotonic = t.saveMonotonic
+ elapsed := nowRealtime - t.saveRealtime
+ if elapsed > 0 {
+ wantMonotonic += elapsed
+ }
+ }
+
+ t.monotonicOffset = wantMonotonic - nowMonotonic
+
+ if !t.restored {
+ // Hold on to the initial "boot" time.
+ t.bootTime = ktime.FromNanoseconds(nowRealtime)
+ }
+
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.startUpdater()
+}
+
+// startUpdater starts an update goroutine that keeps the clocks updated.
+//
+// mu must be held.
+func (t *Timekeeper) startUpdater() {
+ if t.stop != nil {
+ // Timekeeper already started
+ return
+ }
+ t.stop = make(chan struct{})
+
+ // Keep the clocks up to date.
+ //
+ // Note that the Go runtime uses host CLOCK_MONOTONIC to service the
+ // timer, so it may run at a *slightly* different rate from the
+ // application CLOCK_MONOTONIC. That is fine, as we only need to update
+ // at approximately this rate.
+ timer := time.NewTicker(sentrytime.ApproxUpdateInterval)
+ t.wg.Add(1)
+ go func() { // S/R-SAFE: stopped during save.
+ for {
+ // Start with an update immediately, so the clocks are
+ // ready ASAP.
+
+ // Call Update within a Write block to prevent the VDSO
+ // from using the old params between Update and
+ // Write.
+ if err := t.params.Write(func() vdsoParams {
+ monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update()
+
+ var p vdsoParams
+ if monotonicOk {
+ p.monotonicReady = 1
+ p.monotonicBaseCycles = int64(monotonicParams.BaseCycles)
+ p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset
+ p.monotonicFrequency = monotonicParams.Frequency
+ }
+ if realtimeOk {
+ p.realtimeReady = 1
+ p.realtimeBaseCycles = int64(realtimeParams.BaseCycles)
+ p.realtimeBaseRef = int64(realtimeParams.BaseRef)
+ p.realtimeFrequency = realtimeParams.Frequency
+ }
+
+ log.Debugf("Updating VDSO parameters: %+v", p)
+
+ return p
+ }); err != nil {
+ log.Warningf("Unable to update VDSO parameter page: %v", err)
+ }
+
+ select {
+ case <-timer.C:
+ case <-t.stop:
+ t.wg.Done()
+ return
+ }
+ }
+ }()
+}
+
+// stopUpdater stops the update goroutine, blocking until it exits.
+//
+// mu must be held.
+func (t *Timekeeper) stopUpdater() {
+ if t.stop == nil {
+ // Updater not running.
+ return
+ }
+
+ close(t.stop)
+ t.wg.Wait()
+ t.stop = nil
+}
+
+// Destroy destroys the Timekeeper, freeing all associated resources.
+func (t *Timekeeper) Destroy() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+
+ t.stopUpdater()
+}
+
+// PauseUpdates stops clock parameter updates. This should only be used when
+// Tasks are not running and thus cannot access the clock.
+func (t *Timekeeper) PauseUpdates() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.stopUpdater()
+}
+
+// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates.
+func (t *Timekeeper) ResumeUpdates() {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.startUpdater()
+}
+
+// GetTime returns the current time in nanoseconds.
+func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) {
+ if t.clocks == nil {
+ panic("Timekeeper used before initialized with SetClocks")
+ }
+ now, err := t.clocks.GetTime(c)
+ if err == nil && c == sentrytime.Monotonic {
+ now += t.monotonicOffset
+ }
+ return now, err
+}
+
+// BootTime returns the system boot real time.
+func (t *Timekeeper) BootTime() ktime.Time {
+ return t.bootTime
+}
diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go
new file mode 100644
index 000000000..aee983ac7
--- /dev/null
+++ b/pkg/sentry/kernel/timekeeper_state.go
@@ -0,0 +1,41 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// beforeSave is invoked by stateify.
+func (t *Timekeeper) beforeSave() {
+ if t.stop != nil {
+ panic("pauseUpdates must be called before Save")
+ }
+
+ // N.B. we want the *offset* monotonic time.
+ var err error
+ if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil {
+ panic("unable to get current monotonic time: " + err.Error())
+ }
+
+ if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil {
+ panic("unable to get current realtime: " + err.Error())
+ }
+}
+
+// afterLoad is invoked by stateify.
+func (t *Timekeeper) afterLoad() {
+ t.restored = true
+}
diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go
new file mode 100644
index 000000000..08bacba4f
--- /dev/null
+++ b/pkg/sentry/kernel/timekeeper_test.go
@@ -0,0 +1,156 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// mockClocks is a sentrytime.Clocks that simply returns the times in the
+// struct.
+type mockClocks struct {
+ monotonic int64
+ realtime int64
+}
+
+// Update implements sentrytime.Clocks.Update. It does nothing.
+func (*mockClocks) Update() (monotonicParams sentrytime.Parameters, monotonicOk bool, realtimeParam sentrytime.Parameters, realtimeOk bool) {
+ return
+}
+
+ // GetTime implements sentrytime.Clocks.GetTime.
+func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) {
+ switch id {
+ case sentrytime.Monotonic:
+ return c.monotonic, nil
+ case sentrytime.Realtime:
+ return c.realtime, nil
+ default:
+ return 0, syserror.EINVAL
+ }
+}
+
+// stateTestClocklessTimekeeper returns a test Timekeeper which has not had
+// SetClocks called.
+func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper {
+ ctx := contexttest.Context(tb)
+ p := platform.FromContext(ctx)
+ fr, err := p.Memory().Allocate(usermem.PageSize, usage.Anonymous)
+ if err != nil {
+ tb.Fatalf("failed to allocate memory: %v", err)
+ }
+ return &Timekeeper{
+ params: NewVDSOParamPage(p, fr),
+ }
+}
+
+func stateTestTimekeeper(tb testing.TB) *Timekeeper {
+ t := stateTestClocklessTimekeeper(tb)
+ t.SetClocks(sentrytime.NewCalibratedClocks())
+ return t
+}
+
+// TestTimekeeperMonotonicZero tests that monotonic time starts at zero.
+func TestTimekeeperMonotonicZero(t *testing.T) {
+ c := &mockClocks{
+ monotonic: 100000,
+ }
+
+ tk := stateTestClocklessTimekeeper(t)
+ tk.SetClocks(c)
+ defer tk.Destroy()
+
+ now, err := tk.GetTime(sentrytime.Monotonic)
+ if err != nil {
+ t.Errorf("GetTime err got %v want nil", err)
+ }
+ if now != 0 {
+ t.Errorf("GetTime got %d want 0", now)
+ }
+
+ c.monotonic += 10
+
+ now, err = tk.GetTime(sentrytime.Monotonic)
+ if err != nil {
+ t.Errorf("GetTime err got %v want nil", err)
+ }
+ if now != 10 {
+ t.Errorf("GetTime got %d want 10", now)
+ }
+}
+
+ // TestTimekeeperMonotonicForward tests that monotonic time jumps forward
+// after restore.
+func TestTimekeeperMonotonicForward(t *testing.T) {
+ c := &mockClocks{
+ monotonic: 900000,
+ realtime: 600000,
+ }
+
+ tk := stateTestClocklessTimekeeper(t)
+ tk.restored = true
+ tk.saveMonotonic = 100000
+ tk.saveRealtime = 400000
+ tk.SetClocks(c)
+ defer tk.Destroy()
+
+ // The monotonic clock should jump ahead by 200000 to 300000.
+ //
+ // The new system monotonic time (900000) is irrelevant to what the app
+ // sees.
+ now, err := tk.GetTime(sentrytime.Monotonic)
+ if err != nil {
+ t.Errorf("GetTime err got %v want nil", err)
+ }
+ if now != 300000 {
+ t.Errorf("GetTime got %d want 300000", now)
+ }
+}
+
+// TestTimekeeperMonotonicJumpBackwards tests that monotonic time does not jump
+// backwards when realtime goes backwards.
+func TestTimekeeperMonotonicJumpBackwards(t *testing.T) {
+ c := &mockClocks{
+ monotonic: 900000,
+ realtime: 400000,
+ }
+
+ tk := stateTestClocklessTimekeeper(t)
+ tk.restored = true
+ tk.saveMonotonic = 100000
+ tk.saveRealtime = 600000
+ tk.SetClocks(c)
+ defer tk.Destroy()
+
+ // The monotonic clock should remain at 100000.
+ //
+ // The new system monotonic time (900000) is irrelevant to what the app
+ // sees and we don't want to jump the monotonic clock backwards like
+ // realtime did.
+ now, err := tk.GetTime(sentrytime.Monotonic)
+ if err != nil {
+ t.Errorf("GetTime err got %v want nil", err)
+ }
+ if now != 100000 {
+ t.Errorf("GetTime got %d want 100000", now)
+ }
+}
diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go
new file mode 100644
index 000000000..03a3310be
--- /dev/null
+++ b/pkg/sentry/kernel/timer.go
@@ -0,0 +1,282 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// timekeeperClock is a ktime.Clock that reads time from a
+// kernel.Timekeeper-managed clock.
+type timekeeperClock struct {
+ tk *Timekeeper
+ c sentrytime.ClockID
+
+ // Implements ktime.Clock.WallTimeUntil.
+ ktime.WallRateClock `state:"nosave"`
+
+ // Implements waiter.Waitable. (We have no ability to detect
+ // discontinuities from external changes to CLOCK_REALTIME).
+ ktime.NoClockEvents `state:"nosave"`
+}
+
+// Now implements ktime.Clock.Now.
+func (tc *timekeeperClock) Now() ktime.Time {
+ now, err := tc.tk.GetTime(tc.c)
+ if err != nil {
+ panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err))
+ }
+ return ktime.FromNanoseconds(now)
+}
+
+// tgClock is a ktime.Clock that measures the time a thread group has spent
+// executing.
+type tgClock struct {
+ tg *ThreadGroup
+
+ // If includeSys is true, the tgClock includes both time spent executing
+ // application code as well as time spent in the sentry. Otherwise, the
+ // tgClock includes only time spent executing application code.
+ includeSys bool
+
+ // Implements waiter.Waitable.
+ ktime.ClockEventsQueue `state:"nosave"`
+}
+
+// UserCPUClock returns a ktime.Clock that measures the time that a thread
+ // group has spent executing application code.
+func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
+ return tg.tm.virtClock
+}
+
+// CPUClock returns a ktime.Clock that measures the time that a thread group
+// has spent executing, including sentry time.
+func (tg *ThreadGroup) CPUClock() ktime.Clock {
+ return tg.tm.profClock
+}
+
+// Now implements ktime.Clock.Now.
+func (tgc *tgClock) Now() ktime.Time {
+ stats := tgc.tg.CPUStats()
+ if tgc.includeSys {
+ return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
+ }
+ return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
+}
+
+// WallTimeUntil implements ktime.Clock.WallTimeUntil.
+func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
+ // The assumption here is that the time spent in this process (no matter
+ // virtual or prof) should not exceed wall time * active tasks, since
+ // Task.exitThreadGroup stops accounting as it transitions to
+ // TaskExitInitiated.
+ tgc.tg.pidns.owner.mu.RLock()
+ n := tgc.tg.activeTasks
+ tgc.tg.pidns.owner.mu.RUnlock()
+ if n == 0 {
+ if t.Before(now) {
+ return 0
+ }
+ // The timer tick raced with thread group exit, after which no more
+ // tasks can enter the thread group. So tgc.Now() will never advance
+ // again. Return a large delay; the timer should be stopped long before
+ // it comes again anyway.
+ return time.Hour
+ }
+ // This is a lower bound on the amount of time that can elapse before an
+ // associated timer expires, so returning this value tends to result in a
+ // sequence of closely-spaced ticks just before timer expiry. To avoid
+ // this, round up to the nearest ClockTick; CPU usage measurements are
+ // limited to this resolution anyway.
+ remaining := time.Duration(int64(t.Sub(now))/int64(n)) * time.Nanosecond
+ return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
+}
+
+// taskClock is a ktime.Clock that measures the time that a task has spent
+// executing.
+type taskClock struct {
+ t *Task
+
+ // If includeSys is true, the taskClock includes both time spent executing
+ // application code as well as time spent in the sentry. Otherwise, the
+ // taskClock includes only time spent executing application code.
+ includeSys bool
+
+ // Implements waiter.Waitable. WallTimeUntil wouldn't change its estimation
+ // based on either of the clock events, so there's no event to be
+ // notified for.
+ ktime.NoClockEvents `state:"nosave"`
+
+ // Implements ktime.Clock.WallTimeUntil.
+ //
+ // As an upper bound, a task's clock cannot advance faster than CPU
+ // time. It would have to execute at a rate of more than 1 task-second
+ // per 1 CPU-second, which isn't possible.
+ ktime.WallRateClock `state:"nosave"`
+}
+
+// UserCPUClock returns a clock measuring the CPU time the task has spent
+// executing application code.
+func (t *Task) UserCPUClock() ktime.Clock {
+ return &taskClock{t: t, includeSys: false}
+}
+
+// CPUClock returns a clock measuring the CPU time the task has spent executing
+// application and "kernel" code.
+func (t *Task) CPUClock() ktime.Clock {
+ return &taskClock{t: t, includeSys: true}
+}
+
+// Now implements ktime.Clock.Now.
+func (tc *taskClock) Now() ktime.Time {
+ stats := tc.t.CPUStats()
+ if tc.includeSys {
+ return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
+ }
+ return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
+}
+
+ // signalNotifier is a ktime.TimerListener that sends signals to a ThreadGroup.
+type signalNotifier struct {
+ tg *ThreadGroup
+ signal linux.Signal
+ realTimer bool
+ includeSys bool
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (s *signalNotifier) Notify(exp uint64) {
+ // Since all signals sent using a signalNotifier are standard (not
+ // real-time) signals, we can ignore the number of expirations and send
+ // only a single signal.
+ if s.realTimer {
+ // real timer signal sent to leader. See kernel/time/itimer.c:it_real_fn
+ s.tg.SendSignal(sigPriv(s.signal))
+ } else {
+ s.tg.SendTimerSignal(sigPriv(s.signal), s.includeSys)
+ }
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (s *signalNotifier) Destroy() {}
+
+ // TimerManager is a collection of supported process CPU timers.
+type TimerManager struct {
+ // Clocks used to drive thread group execution time timers.
+ virtClock *tgClock
+ profClock *tgClock
+
+ RealTimer *ktime.Timer
+ VirtualTimer *ktime.Timer
+ ProfTimer *ktime.Timer
+ SoftLimitTimer *ktime.Timer
+ HardLimitTimer *ktime.Timer
+}
+
+// newTimerManager returns a new instance of TimerManager.
+func newTimerManager(tg *ThreadGroup, monotonicClock ktime.Clock) TimerManager {
+ virtClock := &tgClock{tg: tg, includeSys: false}
+ profClock := &tgClock{tg: tg, includeSys: true}
+ tm := TimerManager{
+ virtClock: virtClock,
+ profClock: profClock,
+ RealTimer: ktime.NewTimer(monotonicClock, &signalNotifier{
+ tg: tg,
+ signal: linux.SIGALRM,
+ realTimer: true,
+ includeSys: false,
+ }),
+ VirtualTimer: ktime.NewTimer(virtClock, &signalNotifier{
+ tg: tg,
+ signal: linux.SIGVTALRM,
+ realTimer: false,
+ includeSys: false,
+ }),
+ ProfTimer: ktime.NewTimer(profClock, &signalNotifier{
+ tg: tg,
+ signal: linux.SIGPROF,
+ realTimer: false,
+ includeSys: true,
+ }),
+ SoftLimitTimer: ktime.NewTimer(profClock, &signalNotifier{
+ tg: tg,
+ signal: linux.SIGXCPU,
+ realTimer: false,
+ includeSys: true,
+ }),
+ HardLimitTimer: ktime.NewTimer(profClock, &signalNotifier{
+ tg: tg,
+ signal: linux.SIGKILL,
+ realTimer: false,
+ includeSys: true,
+ }),
+ }
+ tm.applyCPULimits(tg.Limits().Get(limits.CPU))
+ return tm
+}
+
+ // Save saves this TimerManager.
+
+// destroy destroys all timers.
+func (tm *TimerManager) destroy() {
+ tm.RealTimer.Destroy()
+ tm.VirtualTimer.Destroy()
+ tm.ProfTimer.Destroy()
+ tm.SoftLimitTimer.Destroy()
+ tm.HardLimitTimer.Destroy()
+}
+
+func (tm *TimerManager) applyCPULimits(l limits.Limit) {
+ tm.SoftLimitTimer.Swap(ktime.Setting{
+ Enabled: l.Cur != limits.Infinity,
+ Next: ktime.FromNanoseconds((time.Duration(l.Cur) * time.Second).Nanoseconds()),
+ Period: time.Second,
+ })
+ tm.HardLimitTimer.Swap(ktime.Setting{
+ Enabled: l.Max != limits.Infinity,
+ Next: ktime.FromNanoseconds((time.Duration(l.Max) * time.Second).Nanoseconds()),
+ })
+}
+
+// kick is called when the number of threads in the thread group associated
+// with tm increases.
+func (tm *TimerManager) kick() {
+ tm.virtClock.Notify(ktime.ClockEventRateIncrease)
+ tm.profClock.Notify(ktime.ClockEventRateIncrease)
+}
+
+ // pause pauses the timers and stops timer signal delivery.
+func (tm *TimerManager) pause() {
+ tm.RealTimer.Pause()
+ tm.VirtualTimer.Pause()
+ tm.ProfTimer.Pause()
+ tm.SoftLimitTimer.Pause()
+ tm.HardLimitTimer.Pause()
+}
+
+ // resume resumes the timers and continues timer signal delivery.
+func (tm *TimerManager) resume() {
+ tm.RealTimer.Resume()
+ tm.VirtualTimer.Resume()
+ tm.ProfTimer.Resume()
+ tm.SoftLimitTimer.Resume()
+ tm.HardLimitTimer.Resume()
+}
diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go
new file mode 100644
index 000000000..58e9b4d1b
--- /dev/null
+++ b/pkg/sentry/kernel/uts_namespace.go
@@ -0,0 +1,100 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+)
+
+// UTSNamespace represents a UTS namespace, a holder of two system identifiers:
+// the hostname and domain name.
+type UTSNamespace struct {
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+ hostName string
+ domainName string
+
+ // userns is the user namespace associated with the UTSNamespace.
+ // Privileged operations on this UTSNamespace must have appropriate
+ // capabilities in userns.
+ //
+ // userns is immutable.
+ userns *auth.UserNamespace
+}
+
+// NewUTSNamespace creates a new UTS namespace.
+func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace {
+ return &UTSNamespace{
+ hostName: hostName,
+ domainName: domainName,
+ userns: userns,
+ }
+}
+
+// UTSNamespace returns the task's UTS namespace.
+func (t *Task) UTSNamespace() *UTSNamespace {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ return t.utsns
+}
+
+// HostName returns the host name of this UTS namespace.
+func (u *UTSNamespace) HostName() string {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return u.hostName
+}
+
+// SetHostName sets the host name of this UTS namespace.
+func (u *UTSNamespace) SetHostName(host string) {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ u.hostName = host
+}
+
+// DomainName returns the domain name of this UTS namespace.
+func (u *UTSNamespace) DomainName() string {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return u.domainName
+}
+
+// SetDomainName sets the domain name of this UTS namespace.
+func (u *UTSNamespace) SetDomainName(domain string) {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ u.domainName = domain
+}
+
+// UserNamespace returns the user namespace associated with this UTS namespace.
+func (u *UTSNamespace) UserNamespace() *auth.UserNamespace {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return u.userns
+}
+
+// Clone makes a copy of this UTS namespace, associating the given user
+// namespace.
+func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace {
+ u.mu.Lock()
+ defer u.mu.Unlock()
+ return &UTSNamespace{
+ hostName: u.hostName,
+ domainName: u.domainName,
+ userns: userns,
+ }
+}
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
new file mode 100644
index 000000000..0bacbea49
--- /dev/null
+++ b/pkg/sentry/kernel/vdso.go
@@ -0,0 +1,145 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/binary"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// vdsoParams are the parameters exposed to the VDSO.
+//
+// They are exposed to the VDSO via a parameter page managed by VDSOParamPage,
+// which also includes a sequence counter.
+type vdsoParams struct {
+ monotonicReady uint64
+ monotonicBaseCycles int64
+ monotonicBaseRef int64
+ monotonicFrequency uint64
+
+ realtimeReady uint64
+ realtimeBaseCycles int64
+ realtimeBaseRef int64
+ realtimeFrequency uint64
+}
+
+// VDSOParamPage manages a VDSO parameter page.
+//
+// Its memory layout looks like:
+//
+// type page struct {
+// // seq is a sequence counter that protects the fields below.
+// seq uint64
+// vdsoParams
+// }
+//
+// Everything in the struct is 8 bytes for easy alignment.
+//
+// It must be kept in sync with params in vdso/vdso_time.cc.
+type VDSOParamPage struct {
+ // The parameter page is fr, allocated from platform.Memory().
+ platform platform.Platform
+ fr platform.FileRange
+
+ // seq is the current sequence count written to the page.
+ //
+ // A write is in progress if the low bit of the counter is set (seq is odd).
+ //
+ // Timekeeper's updater goroutine may call Write before equality is
+ // checked in state_test_util tests, causing this field to change across
+ // save / restore.
+ seq uint64
+}
+
+// NewVDSOParamPage returns a VDSOParamPage.
+//
+// Preconditions:
+//
+// * fr is a single page allocated from platform.Memory(). VDSOParamPage does
+// not take ownership of fr; it must remain allocated for the lifetime of the
+// VDSOParamPage.
+//
+// * VDSOParamPage must be the only writer to fr.
+//
+// * platform.Memory().MapInternal(fr) must return a single safemem.Block.
+func NewVDSOParamPage(platform platform.Platform, fr platform.FileRange) *VDSOParamPage {
+ return &VDSOParamPage{platform: platform, fr: fr}
+}
+
+// access returns a mapping of the param page.
+func (v *VDSOParamPage) access() (safemem.Block, error) {
+ bs, err := v.platform.Memory().MapInternal(v.fr, usermem.ReadWrite)
+ if err != nil {
+ return safemem.Block{}, err
+ }
+ if bs.NumBlocks() != 1 {
+ panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks()))
+ }
+ return bs.Head(), nil
+}
+
+// incrementSeq increments the sequence counter in the param page.
+func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error {
+ next := v.seq + 1
+ old, err := safemem.SwapUint64(paramPage, next)
+ if err != nil {
+ return err
+ }
+
+ if old != v.seq {
+ return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq)
+ }
+
+ v.seq = next
+ return nil
+}
+
+// Write updates the VDSO parameters.
+//
+// Write starts a write block, calls f to get the new parameters, writes
+// out the new parameters, then ends the write block.
+func (v *VDSOParamPage) Write(f func() vdsoParams) error {
+ paramPage, err := v.access()
+ if err != nil {
+ return err
+ }
+
+ // Write begin.
+ next := v.seq + 1
+ if next%2 != 1 {
+ panic("Out-of-order sequence count")
+ }
+
+ err = v.incrementSeq(paramPage)
+ if err != nil {
+ return err
+ }
+
+ // Get the new params.
+ p := f()
+ buf := binary.Marshal(nil, usermem.ByteOrder, p)
+
+ // Skip the sequence counter.
+ if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {
+ panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err))
+ }
+
+ // Write end.
+ return v.incrementSeq(paramPage)
+}
diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go
new file mode 100644
index 000000000..a9e84673f
--- /dev/null
+++ b/pkg/sentry/kernel/version.go
@@ -0,0 +1,33 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Version defines the application-visible system version.
+type Version struct {
+ // Operating system name (e.g. "Linux").
+ Sysname string
+
+ // Operating system release (e.g. "3.11.10-amd64").
+ Release string
+
+ // Operating system version. On Linux this takes the shape
+ // "#VERSION CONFIG_FLAGS TIMESTAMP"
+ // where:
+ // - VERSION is a sequence counter incremented on every successful build
+ // - CONFIG_FLAGS is a space-separated list of major enabled kernel features
+ // (e.g. "SMP" and "PREEMPT")
+ // - TIMESTAMP is the build timestamp as returned by `date`
+ Version string
+}