Diffstat (limited to 'pkg/sentry/kernel')
93 files changed, 19474 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD new file mode 100644 index 000000000..62794cff5 --- /dev/null +++ b/pkg/sentry/kernel/BUILD @@ -0,0 +1,234 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "kernel_state", + srcs = [ + "abstract_socket_namespace.go", + "fd_map.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "pending_signals.go", + "pending_signals_list.go", + "process_group_list.go", + "ptrace.go", + "rseq.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_list.go", + "task_resources.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_syscall.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "timer.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + out = "kernel_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"], + package = "kernel", +) + +go_template_instance( + name = "pending_signals_list", + out = "pending_signals_list.go", + package = "kernel", + prefix = "pendingSignal", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*pendingSignal", + }, +) + +go_template_instance( + name = "process_group_list", + out = "process_group_list.go", + package = "kernel", + prefix = "processGroup", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*ProcessGroup", + }, +) + +go_template_instance( + name = "seqatomic_taskgoroutineschedinfo", + out = "seqatomic_taskgoroutineschedinfo.go", + package = "kernel", + suffix = "TaskGoroutineSchedInfo", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "TaskGoroutineSchedInfo", + }, +) + +go_template_instance( + name = "session_list", + out = "session_list.go", + package = "kernel", + prefix = "session", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Session", + }, +) + +go_template_instance( + name = "task_list", + out = "task_list.go", + package = "kernel", + prefix = "task", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Task", + }, +) + +go_library( + name = "kernel", + srcs = [ + "abstract_socket_namespace.go", + "context.go", + "fd_map.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "kernel_state.go", + "pending_signals.go", + "pending_signals_list.go", + "process_group_list.go", + "ptrace.go", + "rseq.go", + "seccomp.go", + "seqatomic_taskgoroutineschedinfo.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_acct.go", + "task_block.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_identity.go", + "task_list.go", + "task_log.go", + "task_net.go", + "task_resources.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_stop.go", + "task_syscall.go", + "task_usermem.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "timer.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel", + visibility = ["//:sandbox"], + deps = [ + 
"//pkg/abi", + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/binary", + "//pkg/bits", + "//pkg/bpf", + "//pkg/cpuid", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/hostcpu", + "//pkg/sentry/inet", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/time", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/state/statefile", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "kernel_test", + size = "small", + srcs = [ + "fd_map_test.go", + "table_test.go", + "task_test.go", + "timekeeper_test.go", + ], + embed = [":kernel"], + deps = [ + "//pkg/abi", + "//pkg/sentry/arch", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/platform", + "//pkg/sentry/time", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md new file mode 100644 index 000000000..3306780d6 --- /dev/null +++ b/pkg/sentry/kernel/README.md @@ -0,0 +1,106 @@ +This package contains: + +- A (partial) emulation of the "core Linux kernel", which governs task + execution and scheduling, system call dispatch, and signal handling. See + below for details. + +- The top-level interface for the sentry's Linux kernel emulation in general, + used by the `main` function of all versions of the sentry. This interface + revolves around the `Env` type (defined in `kernel.go`). + +# Background + +In Linux, each schedulable context is referred to interchangeably as a "task" or +"thread". Tasks can be divided into userspace and kernel tasks. In the sentry, +scheduling is managed by the Go runtime, so each schedulable context is a +goroutine; only "userspace" (application) contexts are referred to as tasks, and +represented by Task objects. (From this point forward, "task" refers to the +sentry's notion of a task unless otherwise specified.) + +At a high level, Linux application threads can be thought of as repeating a "run +loop": + +- Some amount of application code is executed in userspace. + +- A trap (explicit syscall invocation, hardware interrupt or exception, etc.) + causes control flow to switch to the kernel. + +- Some amount of kernel code is executed in kernelspace, e.g. to handle the + cause of the trap. + +- The kernel "returns from the trap" into application code. + +Analogously, each task in the sentry is associated with a *task goroutine* that +executes that task's run loop (`Task.run` in `task_run.go`). However, the +sentry's task run loop differs in structure in order to support saving execution +state to, and resuming execution from, checkpoints. + +While in kernelspace, a Linux thread can be descheduled (cease execution) in a +variety of ways: + +- It can yield or be preempted, becoming temporarily descheduled but still + runnable. 
At present, the sentry delegates scheduling of runnable threads to + the Go runtime. + +- It can exit, becoming permanently descheduled. The sentry's equivalent is + returning from `Task.run`, terminating the task goroutine. + +- It can enter interruptible sleep, a state in which it can be woken by a + caller-defined wakeup or the receipt of a signal. In the sentry, interruptible + sleep (which is ambiguously referred to as *blocking*) is implemented by + making all events that can end blocking (including signal notifications) + communicated via Go channels and using `select` to multiplex wakeup sources; + see `task_block.go`. + +- It can enter uninterruptible sleep, a state in which it can only be woken by a + caller-defined wakeup. Killable sleep is a closely related variant in which + the task can also be woken by SIGKILL. (These definitions also include Linux's + "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" (`TASK_TRACED`) states.) + +To maximize compatibility with Linux, sentry checkpointing appears as a spurious +signal-delivery interrupt on all tasks; interrupted system calls return `EINTR` +or are automatically restarted as usual. However, these semantics require that +uninterruptible and killable sleeps do not appear to be interrupted. In other +words, the state of the task, including its progress through the interrupted +operation, must be preserved by checkpointing. For many such sleeps, the wakeup +condition is application-controlled, making it infeasible to wait for the sleep +to end before checkpointing. Instead, we must support checkpointing progress +through sleeping operations. + +# Implementation + +We break the task's control flow graph into *states*, delimited by: + +1. Points where uninterruptible and killable sleeps may occur. For example, +there exists a state boundary between signal dequeueing and signal delivery +because there may be an intervening ptrace signal-delivery-stop. + +2. Points where sleep-induced branches may "rejoin" normal execution. For +example, the syscall exit state exists because it can be reached immediately +following a synchronous syscall, or after a task that is sleeping in `execve()` +or `vfork()` resumes execution. + +3. Points containing large branches. This is strictly for organizational +purposes. For example, the state that processes interrupt-signaled conditions is +kept separate from the main "app" state to reduce the size of the latter. + +4. `SyscallReinvoke`, which does not correspond to anything in Linux, and exists +solely to serve the autosave feature. + +![dot -Tsvg -Goverlap=false -orun_states.svg run_states.dot](g3doc/run_states.dot "Task control flow graph") + +States before which a stop may occur are represented as implementations of the +`taskRunState` interface named `run(state)`, allowing them to be saved and +restored. States that cannot be immediately preceded by a stop are simply `Task` +methods named `do(state)`. + +Conditions that can require task goroutines to cease execution for unknown +lengths of time are called *stops*. Stops are divided into *internal stops*, +which are stops whose start and end conditions are implemented within the +sentry, and *external stops*, which are stops whose start and end conditions are +not known to the sentry. Hence all uninterruptible and killable sleeps are +internal stops, and the existence of a pending checkpoint operation is an +external stop. Internal stops are reified into instances of the `TaskStop` type, +while external stops are merely counted. 
The task run loop alternates between +checking for stops and advancing the task's state. This allows checkpointing to +hold tasks in a stopped state while waiting for all tasks in the system to stop. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go new file mode 100644 index 000000000..014c4a3bf --- /dev/null +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type abstractEndpoint struct { + ep unix.BoundEndpoint + wr *refs.WeakRef + name string + ns *AbstractSocketNamespace +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (e *abstractEndpoint) WeakRefGone() { + e.ns.mu.Lock() + if e.ns.endpoints[e.name].ep == e.ep { + delete(e.ns.endpoints, e.name) + } + e.ns.mu.Unlock() +} + +// AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +type AbstractSocketNamespace struct { + mu sync.Mutex `state:"nosave"` + + // Keeps mapping from name to endpoint. + endpoints map[string]abstractEndpoint +} + +// NewAbstractSocketNamespace returns a new AbstractSocketNamespace. +func NewAbstractSocketNamespace() *AbstractSocketNamespace { + return &AbstractSocketNamespace{ + endpoints: make(map[string]abstractEndpoint), + } +} + +// A boundEndpoint wraps a unix.BoundEndpoint to maintain a reference on its +// backing object. +type boundEndpoint struct { + unix.BoundEndpoint + rc refs.RefCounter +} + +// Release implements unix.BoundEndpoint.Release. +func (e *boundEndpoint) Release() { + e.rc.DecRef() + e.BoundEndpoint.Release() +} + +// BoundEndpoint retrieves the endpoint bound to the given name. The return +// value is nil if no endpoint was bound. +func (a *AbstractSocketNamespace) BoundEndpoint(name string) unix.BoundEndpoint { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + return nil + } + + rc := ep.wr.Get() + if rc == nil { + delete(a.endpoints, name) + return nil + } + + return &boundEndpoint{ep.ep, rc} +} + +// Bind binds the given socket. +// +// When the last reference managed by rc is dropped, ep may be removed from the +// namespace. 
+func (a *AbstractSocketNamespace) Bind(name string, ep unix.BoundEndpoint, rc refs.RefCounter) error { + a.mu.Lock() + defer a.mu.Unlock() + + if ep, ok := a.endpoints[name]; ok { + if rc := ep.wr.Get(); rc != nil { + rc.DecRef() + return syscall.EADDRINUSE + } + } + + ae := abstractEndpoint{ep: ep, name: name, ns: a} + ae.wr = refs.NewWeakRef(rc, &ae) + a.endpoints[name] = ae + return nil +} diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD new file mode 100644 index 000000000..7f0680b88 --- /dev/null +++ b/pkg/sentry/kernel/auth/BUILD @@ -0,0 +1,73 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "auth_state", + srcs = [ + "credentials.go", + "id.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + out = "auth_state.go", + package = "auth", +) + +go_template_instance( + name = "id_map_range", + out = "id_map_range.go", + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_range", + types = { + "T": "uint32", + }, +) + +go_template_instance( + name = "id_map_set", + out = "id_map_set.go", + consts = { + "minDegree": "3", + }, + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint32", + "Range": "idMapRange", + "Value": "uint32", + "Functions": "idMapFunctions", + }, +) + +go_library( + name = "auth", + srcs = [ + "auth.go", + "auth_state.go", + "capability_set.go", + "context.go", + "credentials.go", + "id.go", + "id_map.go", + "id_map_functions.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/bits", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/state", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go new file mode 100644 index 000000000..c49a6b852 --- /dev/null +++ b/pkg/sentry/kernel/auth/auth.go @@ -0,0 +1,22 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package auth implements an access control model that is a subset of Linux's. +// +// The auth package supports two kinds of access controls: user/group IDs and +// capabilities. Each resource in the security model is associated with a user +// namespace; "privileged" operations check that the operator's credentials +// have the required user/group IDs or capabilities within the user namespace +// of accessed resources. +package auth diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go new file mode 100644 index 000000000..5b8164c49 --- /dev/null +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" +) + +// A CapabilitySet is a set of capabilities implemented as a bitset. The zero +// value of CapabilitySet is a set containing no capabilities. +type CapabilitySet uint64 + +// AllCapabilities is a CapabilitySet containing all valid capabilities. +var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1 + +// CapabilitySetOf returns a CapabilitySet containing only the given +// capability. +func CapabilitySetOf(cp linux.Capability) CapabilitySet { + return CapabilitySet(bits.MaskOf64(int(cp))) +} + +// CapabilitySetOfMany returns a CapabilitySet containing the given capabilities. +func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet { + var cs uint64 + for _, cp := range cps { + cs |= bits.MaskOf64(int(cp)) + } + return CapabilitySet(cs) +} + +// TaskCapabilities represents all the capability sets for a task. Each of these +// sets is explained in greater detail in capabilities(7). +type TaskCapabilities struct { + // Permitted is a limiting superset for the effective capabilities that + // the thread may assume. + PermittedCaps CapabilitySet + // Inheritable is a set of capabilities preserved across an execve(2). + InheritableCaps CapabilitySet + // Effective is the set of capabilities used by the kernel to perform + // permission checks for the thread. + EffectiveCaps CapabilitySet + // Bounding is a limiting superset for the capabilities that a thread + // can add to its inheritable set using capset(2). + BoundingCaps CapabilitySet + // Ambient is a set of capabilities that are preserved across an + // execve(2) of a program that is not privileged. + AmbientCaps CapabilitySet +} diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go new file mode 100644 index 000000000..914589b28 --- /dev/null +++ b/pkg/sentry/kernel/auth/context.go @@ -0,0 +1,36 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the auth package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCredentials is a Context.Value key for Credentials. 
+ CtxCredentials contextID = iota +) + +// CredentialsFromContext returns a copy of the Credentials used by ctx, or a +// set of Credentials with no capabilities if ctx does not have Credentials. +func CredentialsFromContext(ctx context.Context) *Credentials { + if v := ctx.Value(CtxCredentials); v != nil { + return v.(*Credentials) + } + return NewAnonymousCredentials() +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go new file mode 100644 index 000000000..b832b28fe --- /dev/null +++ b/pkg/sentry/kernel/auth/credentials.go @@ -0,0 +1,227 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials contains information required to authorize privileged operations +// in a user namespace. +type Credentials struct { + // Real/effective/saved user/group IDs in the root user namespace. None of + // these should ever be NoID. + RealKUID KUID + EffectiveKUID KUID + SavedKUID KUID + RealKGID KGID + EffectiveKGID KGID + SavedKGID KGID + + // Filesystem user/group IDs are not implemented. "... setfsuid() is + // nowadays unneeded and should be avoided in new applications (likewise + // for setfsgid(2))." - setfsuid(2) + + // Supplementary groups used by set/getgroups. + // + // ExtraKGIDs slices are immutable, allowing multiple Credentials with the + // same ExtraKGIDs to share the same slice. + ExtraKGIDs []KGID + + // The capability sets applicable to this set of credentials. + PermittedCaps CapabilitySet + InheritableCaps CapabilitySet + EffectiveCaps CapabilitySet + BoundingCaps CapabilitySet + // Ambient capabilities are not introduced until Linux 4.3. + + // KeepCaps is the flag for PR_SET_KEEPCAPS which allow capabilities to be + // maintained after a switch from root user to non-root user via setuid(). + KeepCaps bool + + // The user namespace associated with the owner of the credentials. + UserNamespace *UserNamespace +} + +// NewAnonymousCredentials returns a set of credentials with no capabilities in +// any user namespace. +func NewAnonymousCredentials() *Credentials { + // Create a new root user namespace. Since the new namespace's owner is + // KUID 0 and the returned credentials have non-zero KUID/KGID, the + // returned credentials do not have any capabilities in the new namespace. + // Since the new namespace is not part of any existing user namespace + // hierarchy, the returned credentials do not have any capabilities in any + // other namespace. + return &Credentials{ + RealKUID: NobodyKUID, + EffectiveKUID: NobodyKUID, + SavedKUID: NobodyKUID, + RealKGID: NobodyKGID, + EffectiveKGID: NobodyKGID, + SavedKGID: NobodyKGID, + UserNamespace: NewRootUserNamespace(), + } +} + +// NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. +// global root) in user namespace ns. 
+func NewRootCredentials(ns *UserNamespace) *Credentials { + // I can't find documentation for this anywhere, but it's correct for the + // inheritable capability set to be initially empty (the capabilities test + // checks for this property). + return &Credentials{ + RealKUID: RootKUID, + EffectiveKUID: RootKUID, + SavedKUID: RootKUID, + RealKGID: RootKGID, + EffectiveKGID: RootKGID, + SavedKGID: RootKGID, + PermittedCaps: AllCapabilities, + EffectiveCaps: AllCapabilities, + BoundingCaps: AllCapabilities, + UserNamespace: ns, + } +} + +// NewUserCredentials returns a set of credentials based on the given UID, GIDs, +// and capabilities in a given namespace. If all arguments are their zero +// values, this returns the same credentials as NewRootCredentials. +func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { + creds := NewRootCredentials(ns) + + // Set the UID. + uid := kuid + creds.RealKUID = uid + creds.EffectiveKUID = uid + creds.SavedKUID = uid + + // Set GID. + gid := kgid + creds.RealKGID = gid + creds.EffectiveKGID = gid + creds.SavedKGID = gid + + // Set additional GIDs. + creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) + + // Set capabilities. If capabilities aren't specified, we default to + // all capabilities. + if capabilities != nil { + creds.PermittedCaps = capabilities.PermittedCaps + creds.EffectiveCaps = capabilities.EffectiveCaps + creds.BoundingCaps = capabilities.BoundingCaps + creds.InheritableCaps = capabilities.InheritableCaps + // // TODO: Support ambient capabilities. + } else { + // If no capabilities are specified, grant the same capabilites + // that NewRootCredentials does. + creds.PermittedCaps = AllCapabilities + creds.EffectiveCaps = AllCapabilities + creds.BoundingCaps = AllCapabilities + } + + return creds +} + +// Fork generates an identical copy of a set of credentials. +func (c *Credentials) Fork() *Credentials { + nc := new(Credentials) + *nc = *c // Copy-by-value; this is legal for all fields. + return nc +} + +// InGroup returns true if c is in group kgid. Compare Linux's +// kernel/groups.c:in_group_p(). +func (c *Credentials) InGroup(kgid KGID) bool { + if c.EffectiveKGID == kgid { + return true + } + for _, extraKGID := range c.ExtraKGIDs { + if extraKGID == kgid { + return true + } + } + return false +} + +// HasCapabilityIn returns true if c has capability cp in ns. +func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { + for { + // "1. A process has a capability inside a user namespace if it is a member + // of that namespace and it has the capability in its effective capability + // set." - user_namespaces(7) + if c.UserNamespace == ns { + return CapabilitySetOf(cp)&c.EffectiveCaps != 0 + } + // "3. ... A process that resides in the parent of the user namespace and + // whose effective user ID matches the owner of the namespace has all + // capabilities in the namespace." + if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { + return true + } + // "2. If a process has a capability in a user namespace, then it has that + // capability in all child (and further removed descendant) namespaces as + // well." + if ns.parent == nil { + return false + } + ns = ns.parent + } +} + +// HasCapability returns true if c has capability cp in its user namespace. 
+func (c *Credentials) HasCapability(cp linux.Capability) bool { + return c.HasCapabilityIn(cp, c.UserNamespace) +} + +// UseUID checks that c can use uid in its user namespace, then translates it +// to the root user namespace. +// +// The checks UseUID does are common, but you should verify that it's doing +// exactly what you want. +func (c *Credentials) UseUID(uid UID) (KUID, error) { + // uid must be mapped. + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return NoID, syserror.EINVAL + } + // If c has CAP_SETUID, then it can use any UID in its user namespace. + if c.HasCapability(linux.CAP_SETUID) { + return kuid, nil + } + // Otherwise, c must already have the UID as its real, effective, or saved + // set-user-ID. + if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { + return kuid, nil + } + return NoID, syserror.EPERM +} + +// UseGID checks that c can use gid in its user namespace, then translates it +// to the root user namespace. +func (c *Credentials) UseGID(gid GID) (KGID, error) { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return NoID, syserror.EINVAL + } + if c.HasCapability(linux.CAP_SETGID) { + return kgid, nil + } + if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { + return kgid, nil + } + return NoID, syserror.EPERM +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go new file mode 100644 index 000000000..37522b018 --- /dev/null +++ b/pkg/sentry/kernel/auth/id.go @@ -0,0 +1,121 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" +) + +// UID is a user ID in an unspecified user namespace. +type UID uint32 + +// GID is a group ID in an unspecified user namespace. +type GID uint32 + +// In the root user namespace, user/group IDs have a 1-to-1 relationship with +// the users/groups they represent. In other user namespaces, this is not the +// case; for example, two different unmapped users may both "have" the overflow +// UID. This means that it is generally only valid to compare user and group +// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such +// IDs to emphasize this distinction. ("k" is for "key", as in "unique key". +// Linux also uses the prefix "k", but I think they mean "kernel".) + +// KUID is a user ID in the root user namespace. +type KUID uint32 + +// KGID is a group ID in the root user namespace. +type KGID uint32 + +const ( + // NoID is uint32(-1). -1 is consistently used as a special value, in Linux + // and by extension in the auth package, to mean "no ID": + // + // - ID mapping returns -1 if the ID is not mapped. + // + // - Most set*id() syscalls accept -1 to mean "do not change this ID". + NoID = math.MaxUint32 + + // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The + // "overflow UID" is usually [1] used when translating a user ID between + // namespaces fails because the ID is not mapped. 
(We don't implement this + // file, so the overflow UID is constant.) + // + // [1] "There is one notable case where unmapped user and group IDs are not + // converted to the corresponding overflow ID value. When viewing a uid_map + // or gid_map file in which there is no mapping for the second field, that + // field is displayed as 4294967295 (-1 as an unsigned integer);" - + // user_namespaces(7) + OverflowUID = UID(65534) + OverflowGID = GID(65534) + + // NobodyKUID is the user ID usually reserved for the least privileged user + // "nobody". + NobodyKUID = KUID(65534) + NobodyKGID = KGID(65534) + + // RootKUID is the user ID usually used for the most privileged user "root". + RootKUID = KUID(0) + RootKGID = KGID(0) + RootUID = UID(0) + RootGID = GID(0) +) + +// Ok returns true if uid is not -1. +func (uid UID) Ok() bool { + return uid != NoID +} + +// Ok returns true if gid is not -1. +func (gid GID) Ok() bool { + return gid != NoID +} + +// Ok returns true if kuid is not -1. +func (kuid KUID) Ok() bool { + return kuid != NoID +} + +// Ok returns true if kgid is not -1. +func (kgid KGID) Ok() bool { + return kgid != NoID +} + +// OrOverflow returns uid if it is valid and the overflow UID otherwise. +func (uid UID) OrOverflow() UID { + if uid.Ok() { + return uid + } + return OverflowUID +} + +// OrOverflow returns gid if it is valid and the overflow GID otherwise. +func (gid GID) OrOverflow() GID { + if gid.Ok() { + return gid + } + return OverflowGID +} + +// In translates kuid into user namespace ns. If kuid is not mapped in ns, In +// returns NoID. +func (kuid KUID) In(ns *UserNamespace) UID { + return ns.MapFromKUID(kuid) +} + +// In translates kgid into user namespace ns. If kgid is not mapped in ns, In +// returns NoID. +func (kgid KGID) In(ns *UserNamespace) GID { + return ns.MapFromKGID(kgid) +} diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go new file mode 100644 index 000000000..6adb33530 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map.go @@ -0,0 +1,283 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. +func (ns *UserNamespace) MapFromKUID(kuid KUID) UID { + if ns.parent == nil { + return UID(kuid) + } + return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid)))) +} + +// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns. +func (ns *UserNamespace) MapFromKGID(kgid KGID) GID { + if ns.parent == nil { + return GID(kgid) + } + return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid)))) +} + +// MapToKUID translates uid, a UID in ns, to a UID in the root namespace. 
+func (ns *UserNamespace) MapToKUID(uid UID) KUID { + if ns.parent == nil { + return KUID(uid) + } + return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))) +} + +// MapToKGID translates gid, a GID in ns, to a GID in the root namespace. +func (ns *UserNamespace) MapToKGID(gid GID) KGID { + if ns.parent == nil { + return KGID(gid) + } + return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))) +} + +func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 { + if id == NoID { + return NoID + } + ns.mu.Lock() + defer ns.mu.Unlock() + if it := m.FindSegment(id); it.Ok() { + return it.Value() + (id - it.Start()) + } + return NoID +} + +// allIDsMapped returns true if all IDs in the range [start, end) are mapped in +// m. +// +// Preconditions: end >= start. +func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { + ns.mu.Lock() + defer ns.mu.Unlock() + return m.SpanRange(idMapRange{start, end}) == end-start +} + +// An IDMapEntry represents a mapping from a range of contiguous IDs in a user +// namespace to an equally-sized range of contiguous IDs in the namespace's +// parent. +type IDMapEntry struct { + // FirstID is the first ID in the range in the namespace. + FirstID uint32 + + // FirstParentID is the first ID in the range in the parent namespace. + FirstParentID uint32 + + // Length is the number of IDs in the range. + Length uint32 +} + +// SetUIDMap instructs ns to translate UIDs as specified by entries. +// +// Note: SetUIDMap does not place an upper bound on the number of entries, but +// Linux does. This restriction is implemented in SetUIDMap's caller, the +// implementation of /proc/[pid]/uid_map. +func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + // "After the creation of a new user namespace, the uid_map file of *one* + // of the processes in the namespace may be written to *once* to define the + // mapping of user IDs in the new user namespace. An attempt to write more + // than once to a uid_map file in a user namespace fails with the error + // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) + if !ns.uidMapFromParent.IsEmpty() { + return syserror.EPERM + } + // "At least one line must be written to the file." + if len(entries) == 0 { + return syserror.EINVAL + } + // """ + // In order for a process to write to the /proc/[pid]/uid_map + // (/proc/[pid]/gid_map) file, all of the following requirements must be + // met: + // + // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability + // in the user namespace of the process pid. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { + return syserror.EPERM + } + // "2. The writing process must either be in the user namespace of the process + // pid or be in the parent user namespace of the process pid." + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + // """ + // 3. (see trySetUIDMap) + // + // 4. One of the following two cases applies: + // + // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability + // in the parent user namespace. 
+ // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { + // """ + // * Or otherwise all of the following restrictions apply: + // + // + The data written to uid_map (gid_map) must consist of a single line + // that maps the writing process' effective user ID (group ID) in the + // parent user namespace to a user ID (group ID) in the user namespace. + // """ + if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { + return syserror.EPERM + } + // """ + // + The writing process must have the same effective user ID as the + // process that created the user namespace. + // """ + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + } + // trySetUIDMap leaves data in maps if it fails. + if err := ns.trySetUIDMap(entries); err != nil { + ns.uidMapFromParent.RemoveAll() + ns.uidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { + for _, e := range entries { + // Determine upper bounds and check for overflow. This implicitly + // checks for NoID. + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + // "3. The mapped user IDs (group IDs) must in turn have a mapping in + // the parent user namespace." + // Only the root namespace has a nil parent, and root is assigned + // mappings when it's created, so SetUIDMap would have returned EPERM + // without reaching this point if ns is root. + if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + // If either of these Adds fail, we have an overlapping range. + if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// SetGIDMap instructs ns to translate GIDs as specified by entries. +func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + if !ns.gidMapFromParent.IsEmpty() { + return syserror.EPERM + } + if len(entries) == 0 { + return syserror.EINVAL + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { + return syserror.EPERM + } + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { + if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { + return syserror.EPERM + } + // It's correct for this to still be UID. + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + // "In the case of gid_map, use of the setgroups(2) system call must + // first be denied by writing "deny" to the /proc/[pid]/setgroups file + // (see below) before writing to gid_map." (This file isn't implemented + // in the version of Linux we're emulating; see comment in + // UserNamespace.) 
+ } + if err := ns.trySetGIDMap(entries); err != nil { + ns.gidMapFromParent.RemoveAll() + ns.gidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { + for _, e := range entries { + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// UIDMap returns the user ID mappings configured for ns. If no mappings +// have been configured, UIDMap returns nil. +func (ns *UserNamespace) UIDMap() []IDMapEntry { + return ns.getIDMap(&ns.uidMapToParent) +} + +// GIDMap returns the group ID mappings configured for ns. If no mappings +// have been configured, GIDMap returns nil. +func (ns *UserNamespace) GIDMap() []IDMapEntry { + return ns.getIDMap(&ns.gidMapToParent) +} + +func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { + ns.mu.Lock() + defer ns.mu.Unlock() + var entries []IDMapEntry + for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { + entries = append(entries, IDMapEntry{ + FirstID: it.Start(), + FirstParentID: it.Value(), + Length: it.Range().Length(), + }) + } + return entries +} diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go new file mode 100644 index 000000000..889291d96 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -0,0 +1,45 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +// idMapFunctions "implements" generic interface segment.Functions for +// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one +// user namespace to non-overlapping ranges of contiguous IDs in another user +// namespace. Each such ID mapping is implemented as a range-to-value mapping +// in the set such that [range.Start(), range.End()) => [value, value + +// range.Length()). +type idMapFunctions struct{} + +func (idMapFunctions) MinKey() uint32 { + return 0 +} + +func (idMapFunctions) MaxKey() uint32 { + return NoID +} + +func (idMapFunctions) ClearValue(*uint32) {} + +func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) { + // Mapped ranges have to be contiguous. 
+ if val1+r1.Length() != val2 { + return 0, false + } + return val1, true +} + +func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) { + return val, val + (split - r.Start) +} diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go new file mode 100644 index 000000000..0980aeadf --- /dev/null +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -0,0 +1,130 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// A UserNamespace represents a user namespace. See user_namespaces(7) for +// details. +type UserNamespace struct { + // parent is this namespace's parent. If this is the root namespace, parent + // is nil. The parent pointer is immutable. + parent *UserNamespace + + // owner is the effective UID of the namespace's creator in the root + // namespace. owner is immutable. + owner KUID + + // mu protects the following fields. + // + // If mu will be locked in multiple UserNamespaces, it must be locked in + // descendant namespaces before ancestors. + mu sync.Mutex `state:"nosave"` + + // Mappings of user/group IDs between this namespace and its parent. + // + // All ID maps, once set, cannot be changed. This means that successful + // UID/GID translations cannot be racy. + uidMapFromParent idMapSet + uidMapToParent idMapSet + gidMapFromParent idMapSet + gidMapToParent idMapSet + + // TODO: Consider supporting disabling setgroups(2), which "was + // added in Linux 3.19, but was backported to many earlier stable kernel + // series, because it addresses a security issue" - user_namespaces(7). (It + // was not backported to 3.11.10, which we are currently imitating.) +} + +// NewRootUserNamespace returns a UserNamespace that is appropriate for a +// system's root user namespace. +func NewRootUserNamespace() *UserNamespace { + var ns UserNamespace + // """ + // The initial user namespace has no parent namespace, but, for + // consistency, the kernel provides dummy user and group ID mapping files + // for this namespace. Looking at the uid_map file (gid_map is the same) + // from a shell in the initial namespace shows: + // + // $ cat /proc/$$/uid_map + // 0 0 4294967295 + // """ - user_namespaces(7) + for _, m := range []*idMapSet{ + &ns.uidMapFromParent, + &ns.uidMapToParent, + &ns.gidMapFromParent, + &ns.gidMapToParent, + } { + if !m.Add(idMapRange{0, math.MaxUint32}, 0) { + panic("Failed to insert into empty ID map") + } + } + return &ns +} + +// Root returns the root of the user namespace tree containing ns. +func (ns *UserNamespace) Root() *UserNamespace { + for ns.parent != nil { + ns = ns.parent + } + return ns +} + +// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user +// namespaces." 
- user_namespaces(7) +const maxUserNamespaceDepth = 32 + +func (ns *UserNamespace) depth() int { + var i int + for ns != nil { + i++ + ns = ns.parent + } + return i +} + +// NewChildUserNamespace returns a new user namespace created by a caller with +// credentials c. +func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { + if c.UserNamespace.depth() >= maxUserNamespaceDepth { + // "... Calls to unshare(2) or clone(2) that would cause this limit to + // be exceeded fail with the error EUSERS." - user_namespaces(7) + return nil, syserror.EUSERS + } + // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective + // user ID or the effective group ID of the caller does not have a mapping + // in the parent namespace (see user_namespaces(7))." - clone(2) + // "CLONE_NEWUSER requires that the user ID and group ID of the calling + // process are mapped to user IDs and group IDs in the user namespace of + // the calling process at the time of the call." - unshare(2) + if !c.EffectiveKUID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + if !c.EffectiveKGID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + return &UserNamespace{ + parent: c.UserNamespace, + owner: c.EffectiveKUID, + // "When a user namespace is created, it starts without a mapping of + // user IDs (group IDs) to the parent user namespace." - + // user_namespaces(7) + }, nil +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go new file mode 100644 index 000000000..261ca6f7a --- /dev/null +++ b/pkg/sentry/kernel/context.go @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCanTrace is a Context.Value key for a function with the same + // signature and semantics as kernel.Task.CanTrace. + CtxCanTrace contextID = iota + + // CtxKernel is a Context.Value key for a Kernel. + CtxKernel + + // CtxPIDNamespace is a Context.Value key for a PIDNamespace. + CtxPIDNamespace + + // CtxTask is a Context.Value key for a Task. + CtxTask + + // CtxUTSNamespace is a Context.Value key for a UTSNamespace. + CtxUTSNamespace + + // CtxIPCNamespace is a Context.Value key for a IPCNamespace. + CtxIPCNamespace +) + +// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense +// as kernel.Task.CanTrace. +func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool { + if v := ctx.Value(CtxCanTrace); v != nil { + return v.(func(*Task, bool) bool)(t, attach) + } + return false +} + +// KernelFromContext returns the Kernel in which ctx is executing, or nil if +// there is no such Kernel. 
+func KernelFromContext(ctx context.Context) *Kernel { + if v := ctx.Value(CtxKernel); v != nil { + return v.(*Kernel) + } + return nil +} + +// PIDNamespaceFromContext returns the PID namespace in which ctx is executing, +// or nil if there is no such PID namespace. +func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace { + if v := ctx.Value(CtxPIDNamespace); v != nil { + return v.(*PIDNamespace) + } + return nil +} + +// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, +// or nil if there is no such UTS namespace. +func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { + if v := ctx.Value(CtxUTSNamespace); v != nil { + return v.(*UTSNamespace) + } + return nil +} + +// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, +// or nil if there is no such IPC namespace. +func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { + if v := ctx.Value(CtxIPCNamespace); v != nil { + return v.(*IPCNamespace) + } + return nil +} + +// TaskFromContext returns the Task associated with ctx, or nil if there is no +// such Task. +func TaskFromContext(ctx context.Context) *Task { + if v := ctx.Value(CtxTask); v != nil { + return v.(*Task) + } + return nil +} + +// AsyncContext returns a context.Context that may be used by goroutines that +// do work on behalf of t and therefore share its contextual values, but are +// not t's task goroutine (e.g. asynchronous I/O). +func (t *Task) AsyncContext() context.Context { + return taskAsyncContext{t: t} +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Value implements context.Context.Value. 
+func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD new file mode 100644 index 000000000..04651d961 --- /dev/null +++ b/pkg/sentry/kernel/epoll/BUILD @@ -0,0 +1,52 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "epoll_autogen_state", + srcs = [ + "epoll.go", + "epoll_state.go", + ], + out = "epoll_autogen_state.go", + package = "epoll", +) + +go_library( + name = "epoll", + srcs = [ + "epoll.go", + "epoll_autogen_state.go", + "epoll_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/ilist", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/waiter", + ], +) + +go_test( + name = "epoll_test", + size = "small", + srcs = [ + "epoll_test.go", + ], + embed = [":epoll"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go new file mode 100644 index 000000000..b572fcd7e --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -0,0 +1,466 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package epoll provides an implementation of Linux's IO event notification +// facility. See epoll(7) for more details. +package epoll + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Event describes the event mask that was observed and the user data to be +// returned when one of the events occurs. It has this format to match the linux +// format to avoid extra copying/allocation when writing events to userspace. +type Event struct { + // Events is the event mask containing the set of events that have been + // observed on an entry. + Events uint32 + + // Data is an opaque 64-bit value provided by the caller when adding the + // entry, and returned to the caller when the entry reports an event. + Data [2]int32 +} + +// EntryFlags is a bitmask that holds an entry's flags. +type EntryFlags int + +// Valid entry flags. +const ( + OneShot EntryFlags = 1 << iota + EdgeTriggered +) + +// FileIdentifier identifies a file. 
We cannot use just the FD because it could +// potentially be reassigned. We also cannot use just the file pointer because +// it is possible to have multiple entries for the same file object as long as +// they are created with different FDs (i.e., the FDs point to the same file). +type FileIdentifier struct { + File *fs.File + Fd kdefs.FD +} + +// pollEntry holds all the state associated with an event poll entry, that is, +// a file being observed by an event poll object. +type pollEntry struct { + ilist.Entry + file *refs.WeakRef `state:"manual"` + id FileIdentifier `state:"wait"` + userData [2]int32 + waiter waiter.Entry `state:"manual"` + mask waiter.EventMask + flags EntryFlags + + epoll *EventPoll + + // We cannot save the current list pointer as it points into EventPoll + // struct, while state framework currently does not support such + // in-struct pointers. Instead, EventPoll will properly set this field + // in its loading logic. + curList *ilist.List `state:"nosave"` +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +// weakReferenceGone is called when the file in the weak reference is destroyed. +// The poll entry is removed in response to this. +func (p *pollEntry) WeakRefGone() { + p.epoll.RemoveEntry(p.id) +} + +// EventPoll holds all the state associated with an event poll object, that is, +// collection of files to observe and their current state. +type EventPoll struct { + fsutil.PipeSeek `state:"zerovalue"` + fsutil.NotDirReaddir `state:"zerovalue"` + fsutil.NoFsync `state:"zerovalue"` + fsutil.NoopFlush `state:"zerovalue"` + fsutil.NoMMap `state:"zerovalue"` + fsutil.NoIoctl `state:"zerovalue"` + + // Wait queue is used to notify interested parties when the event poll + // object itself becomes readable or writable. + waiter.Queue + + // files is the map of all the files currently being observed, it is + // protected by mu. + mu sync.Mutex `state:"nosave"` + files map[FileIdentifier]*pollEntry + + // listsMu protects manipulation of the lists below. It needs to be a + // different lock to avoid circular lock acquisition order involving + // the wait queue mutexes and mu. The full order is mu, observed file + // wait queue mutex, then listsMu; this allows listsMu to be acquired + // when readyCallback is called. + // + // An entry is always in one of the following lists: + // readyList -- when there's a chance that it's ready to have + // events delivered to epoll waiters. Given that being + // ready is a transient state, the Readiness() and + // readEvents() functions always call the entry's file + // Readiness() function to confirm it's ready. + // waitingList -- when there's no chance that the entry is ready, + // so it's waiting for the readyCallback to be called + // on it before it gets moved to the readyList. + // disabledList -- when the entry is disabled. This happens when + // a one-shot entry gets delivered via readEvents(). + listsMu sync.Mutex `state:"nosave"` + readyList ilist.List + waitingList ilist.List + disabledList ilist.List +} + +// cycleMu is used to serialize all the cycle checks. This is only used when +// an event poll file is added as an entry to another event poll. Such checks +// are serialized to avoid lock acquisition order inversion: if a thread is +// adding A to B, and another thread is adding B to A, each would acquire A's +// and B's mutexes in reverse order, and could cause deadlocks. Having this +// lock prevents this by allowing only one check at a time to happen. 
+// +// We do the cycle check to prevent callers from introducing potentially +// infinite recursions. If a caller were to add A to B and then B to A, for +// event poll A to know if it's readable, it would need to check event poll B, +// which in turn would need event poll A and so on indefinitely. +var cycleMu sync.Mutex + +// NewEventPoll allocates and initializes a new event poll object. +func NewEventPoll(ctx context.Context) *fs.File { + // name matches fs/eventpoll.c:epoll_create1. + dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ + files: make(map[FileIdentifier]*pollEntry), + }) +} + +// Release implements fs.FileOperations.Release. +func (e *EventPoll) Release() { + // We need to take the lock now because files may be attempting to + // remove entries in parallel if they get destroyed. + e.mu.Lock() + defer e.mu.Unlock() + + // Go through all entries and clean up. + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + entry.file.Drop() + } +} + +// Read implements fs.FileOperations.Read. +func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// Write implements fs.FileOperations.Write. +func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// eventsAvailable determines if 'e' has events available for delivery. +func (e *EventPoll) eventsAvailable() bool { + e.listsMu.Lock() + + for it := e.readyList.Front(); it != nil; { + entry := it.(*pollEntry) + it = it.Next() + + // If the entry is ready, we know 'e' has at least one entry + // ready for delivery. + ready := entry.id.File.Readiness(entry.mask) + if ready != 0 { + e.listsMu.Unlock() + return true + } + + // Entry is not ready, so move it to waiting list. + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } + + e.listsMu.Unlock() + + return false +} + +// Readiness determines if the event poll object is currently readable (i.e., +// if there are pending events for delivery). +func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + if (mask&waiter.EventIn) != 0 && e.eventsAvailable() { + ready |= waiter.EventIn + } + + return ready +} + +// ReadEvents returns up to max available events. +func (e *EventPoll) ReadEvents(max int) []Event { + var local ilist.List + var ret []Event + + e.listsMu.Lock() + + // Go through all entries we believe may be ready. + for it := e.readyList.Front(); it != nil && len(ret) < max; { + entry := it.(*pollEntry) + it = it.Next() + + // Check the entry's readiness. It it's not really ready, we + // just put it back in the waiting list and move on to the next + // entry. + ready := entry.id.File.Readiness(entry.mask) & entry.mask + if ready == 0 { + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + + continue + } + + // Add event to the array that will be returned to caller. + ret = append(ret, Event{ + Events: uint32(ready), + Data: entry.userData, + }) + + // The entry is consumed, so we must move it to the disabled + // list in case it's one-shot, or back to the wait list if it's + // edge-triggered. 
If it's neither, we leave it in the ready + // list so that its readiness can be checked the next time + // around; however, we must move it to the end of the list so + // that other events can be delivered as well. + e.readyList.Remove(entry) + if entry.flags&OneShot != 0 { + e.disabledList.PushBack(entry) + entry.curList = &e.disabledList + } else if entry.flags&EdgeTriggered != 0 { + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } else { + local.PushBack(entry) + } + } + + e.readyList.PushBackList(&local) + + e.listsMu.Unlock() + + return ret +} + +// readyCallback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +type readyCallback struct{} + +// Callback implements waiter.EntryCallback.Callback. +func (*readyCallback) Callback(w *waiter.Entry) { + entry := w.Context.(*pollEntry) + e := entry.epoll + + e.listsMu.Lock() + + if entry.curList == &e.waitingList { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList + + e.Notify(waiter.EventIn) + } + + e.listsMu.Unlock() +} + +// initEntryReadiness initializes the entry's state with regards to its +// readiness by placing it in the appropriate list and registering for +// notifications. +func (e *EventPoll) initEntryReadiness(entry *pollEntry) { + // A new entry starts off in the waiting list. + e.listsMu.Lock() + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + e.listsMu.Unlock() + + // Register for event notifications. + f := entry.id.File + f.EventRegister(&entry.waiter, entry.mask) + + // Check if the file happens to already be in a ready state. + ready := f.Readiness(entry.mask) & entry.mask + if ready != 0 { + (*readyCallback).Callback(nil, &entry.waiter) + } +} + +// observes checks if event poll object e is directly or indirectly observing +// event poll object ep. It uses a bounded recursive depth-first search. +func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool { + // If we reached the maximum depth, we'll consider that we found it + // because we don't want to allow chains that are too long. + if depthLeft <= 0 { + return true + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Go through each observed file and check if it is or observes ep. + for id := range e.files { + f, ok := id.File.FileOperations.(*EventPoll) + if !ok { + continue + } + + if f == ep || f.observes(ep, depthLeft-1) { + return true + } + } + + return false +} + +// AddEntry adds a new file to the collection of files observed by e. +func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + // Acquire cycle check lock if another event poll is being added. + ep, ok := id.File.FileOperations.(*EventPoll) + if ok { + cycleMu.Lock() + defer cycleMu.Unlock() + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file already has an entry. + if _, ok := e.files[id]; ok { + return syscall.EEXIST + } + + // Check if a cycle would be created. We use 4 as the limit because + // that's the value used by linux and we want to emulate it. + if ep != nil { + if e == ep { + return syscall.EINVAL + } + + if ep.observes(e, 4) { + return syscall.ELOOP + } + } + + // Create new entry and add it to map. + // + // N.B. Even though we are creating a weak reference here, we know it + // won't trigger a callback because we hold a reference to the file + // throughout the execution of this function. 
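+	//
+	// If the file is later destroyed (its last strong reference dropped),
+	// the weak reference fires WeakRefGone, which removes this entry via
+	// RemoveEntry.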
+ entry := &pollEntry{ + id: id, + userData: data, + epoll: e, + flags: flags, + waiter: waiter.Entry{Callback: &readyCallback{}}, + mask: mask, + } + entry.waiter.Context = entry + e.files[id] = entry + entry.file = refs.NewWeakRef(id.File, entry) + + // Initialize the readiness state of the new entry. + e.initEntryReadiness(entry) + + return nil +} + +// UpdateEntry updates the flags, mask and user data associated with a file that +// is already part of the collection of observed files. +func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister the old mask and remove entry from the list it's in, so + // readyCallback is guaranteed to not be called on this entry anymore. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove entry from whatever list it's in. This ensure that no other + // threads have access to this entry as the only way left to find it + // is via e.files, but we hold e.mu, which prevents that. + e.listsMu.Lock() + entry.curList.Remove(entry) + e.listsMu.Unlock() + + // Initialize new readiness state. + entry.flags = flags + entry.mask = mask + entry.userData = data + e.initEntryReadiness(entry) + + return nil +} + +// RemoveEntry a files from the collection of observed files. +func (e *EventPoll) RemoveEntry(id FileIdentifier) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister from file first so that no concurrent attempts will be + // made to manipulate the file. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove from the current list. + e.listsMu.Lock() + entry.curList.Remove(entry) + entry.curList = nil + e.listsMu.Unlock() + + // Remove file from map, and drop weak reference. + delete(e.files, id) + entry.file.Drop() + + return nil +} + +// UnregisterEpollWaiters removes the epoll waiter objects from the waiting +// queues. This is different from Release() as the file is not dereferenced. +func (e *EventPoll) UnregisterEpollWaiters() { + e.mu.Lock() + defer e.mu.Unlock() + + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go new file mode 100644 index 000000000..dabb32f49 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -0,0 +1,51 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// afterLoad is invoked by stateify. 
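+// It rebuilds the fields marked state:"manual": the waiter entry and its
+// readyCallback, the weak reference to the file, and the registration for
+// event notifications on the file.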
+func (p *pollEntry) afterLoad() { + p.waiter = waiter.Entry{Callback: &readyCallback{}} + p.waiter.Context = p + p.file = refs.NewWeakRef(p.id.File, p) + p.id.File.EventRegister(&p.waiter, p.mask) +} + +// afterLoad is invoked by stateify. +func (e *EventPoll) afterLoad() { + e.listsMu.Lock() + defer e.listsMu.Unlock() + + for _, ls := range []*ilist.List{&e.waitingList, &e.readyList, &e.disabledList} { + for it := ls.Front(); it != nil; it = it.Next() { + it.(*pollEntry).curList = ls + } + } + + for it := e.waitingList.Front(); it != nil; it = it.Next() { + p := it.(*pollEntry) + if p.id.File.Readiness(p.mask) != 0 { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList + e.Notify(waiter.EventIn) + } + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go new file mode 100644 index 000000000..bc869fc13 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -0,0 +1,54 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestFileDestroyed(t *testing.T) { + f := filetest.NewTestFile(t) + id := FileIdentifier{f, 12} + + efile := NewEventPoll(contexttest.Context(t)) + e := efile.FileOperations.(*EventPoll) + if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { + t.Fatalf("addEntry failed: %v", err) + } + + // Check that we get an event reported twice in a row. + evt := e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + evt = e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + // Destroy the file. Check that we get no more events. 
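+	// Dropping the last reference destroys the file, which fires the entry's
+	// WeakRefGone callback and removes it from the poll object.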
+ f.DecRef() + + evt = e.ReadEvents(1) + if len(evt) != 0 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 0, len(evt)) + } + +} diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD new file mode 100644 index 000000000..2d5a3c693 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -0,0 +1,46 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "eventfd_state", + srcs = [ + "eventfd.go", + ], + out = "eventfd_state.go", + package = "eventfd", +) + +go_library( + name = "eventfd", + srcs = [ + "eventfd.go", + "eventfd_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/eventfd", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + embed = [":eventfd"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go new file mode 100644 index 000000000..c9333719e --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd provides an implementation of Linux's file-based event +// notification. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// EventOperations represents an event with the semantics of Linux's file-based event +// notification (eventfd). +type EventOperations struct { + fsutil.NoopRelease `state:"nosave"` + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + // Mutex that protects accesses to the fields of this event. + mu sync.Mutex `state:"nosave"` + + // Queue is used to notify interested parties when the event object + // becomes readable or writable. + waiter.Queue `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool +} + +// New creates a new event object with the supplied initial value and mode. 
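+//
+// A minimal usage sketch (illustrative only; ctx is any context.Context the
+// caller already holds):
+//
+//	ef := New(ctx, 0 /* initVal */, false /* semMode */)
+//	defer ef.DecRef()
+//	ev := ef.FileOperations.(*EventOperations)
+//	ev.Signal(1) // the eventfd is now readable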
+func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { + // name matches fs/eventfd.c:eventfd_file_create. + dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ + val: initVal, + semMode: semMode, + }) +} + +// Read implements fs.FileOperations.Read. +func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements fs.FileOperations.Write. +func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error { + e.mu.Lock() + + // We can't complete the read if the value is currently zero. + if e.val == 0 { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if e.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + e.val-- + } else { + val = e.val + e.val = 0 + } + + e.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + e.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return e.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (e *EventOperations) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + e.mu.Lock() + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-e.val { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + e.val += val + e.mu.Unlock() + + // Always trigger a notification. + e.Notify(waiter.EventIn) + + return nil +} + +// Readiness returns the ready events for the event fd. +func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + e.mu.Lock() + if e.val > 0 { + ready |= waiter.EventIn + } + + if e.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + e.mu.Unlock() + + return mask & ready +} diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go new file mode 100644 index 000000000..71326b62f --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestEventfd(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, initVal, false) + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + event.EventRegister(&w, waiter.EventIn) + defer event.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := event.Writev(ctx, usermem.BytesIOSequence(data)) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventfdStat(t *testing.T) { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, 0, false) + + // Create and submit an stat request. + uattr, err := event.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + t.Fatalf("eventfd stat request failed: %v", err) + } + if uattr.Size != 0 { + t.Fatal("EventFD size should be 0") + } +} diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go new file mode 100644 index 000000000..ef73125fd --- /dev/null +++ b/pkg/sentry/kernel/fd_map.go @@ -0,0 +1,340 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// FDs is an ordering of FD's that can be made stable. +type FDs []kdefs.FD + +func (f FDs) Len() int { + return len(f) +} + +func (f FDs) Swap(i, j int) { + f[i], f[j] = f[j], f[i] +} + +func (f FDs) Less(i, j int) bool { + return f[i] < f[j] +} + +// FDFlags define flags for an individual descriptor. +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// descriptor holds the details about a file descriptor, namely a pointer the +// file itself and the descriptor flags. +type descriptor struct { + file *fs.File + flags FDFlags +} + +// FDMap is used to manage File references and flags. 
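+//
+// A minimal usage sketch (illustrative only; k, file and limitSet stand for
+// an existing *Kernel, *fs.File and *limits.LimitSet):
+//
+//	f := k.NewFDMap()
+//	fd, err := f.NewFDFrom(0, file, FDFlags{CloseOnExec: true}, limitSet)
+//	if err == nil {
+//		// use fd ...
+//	}
+//	f.DecRef() // drops the map and, with it, its descriptors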
+type FDMap struct { + refs.AtomicRefCount + k *Kernel + files map[kdefs.FD]descriptor + mu sync.RWMutex `state:"nosave"` + uid uint64 +} + +// ID returns a unique identifier for this FDMap. +func (f *FDMap) ID() uint64 { + return f.uid +} + +// NewFDMap allocates a new FDMap that may be used by tasks in k. +func (k *Kernel) NewFDMap() *FDMap { + return &FDMap{ + k: k, + files: make(map[kdefs.FD]descriptor), + uid: atomic.AddUint64(&k.fdMapUids, 1), + } +} + +// destroy removes all of the file descriptors from the map. +func (f *FDMap) destroy() { + f.RemoveIf(func(*fs.File, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDMap) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDMap) Size() int { + f.mu.RLock() + defer f.mu.RUnlock() + + return len(f.files) +} + +// String is a stringer for FDMap. +func (f *FDMap) String() string { + f.mu.RLock() + defer f.mu.RUnlock() + + var b bytes.Buffer + for k, v := range f.files { + n, _ := v.file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n)) + } + return b.String() +} + +// NewFDFrom allocates a new FD guaranteed to be the lowest number available +// greater than or equal to from. This property is important as Unix programs +// tend to count on this allocation order. +func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) { + if fd < 0 { + // Don't accept negative FDs. + return 0, syscall.EINVAL + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Finds the lowest fd not in the handles map. + lim := limitSet.Get(limits.NumberOfFiles) + for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ { + if _, ok := f.files[i]; !ok { + file.IncRef() + f.files[i] = descriptor{file, flags} + return i, nil + } + } + + return -1, syscall.EMFILE +} + +// NewFDAt sets the file reference for the given FD. If there is an +// active reference for that FD, the ref count for that existing reference +// is decremented. +func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + // In this one case we do not do a defer of the Unlock. The + // reason is that we must have done all the work needed for + // discarding any old open file before we return to the + // caller. In other words, the DecRef(), below, must have + // completed by the time we return to the caller to ensure + // side effects are, in fact, effected. A classic example is + // dup2(fd1, fd2); if fd2 was already open, it must be closed, + // and we don't want to resume the caller until it is; we have + // to block on the DecRef(). Hence we can not just do a 'go + // oldfile.DecRef()', since there would be no guarantee that + // it would be done before we the caller resumed. Since we + // must wait for the DecRef() to finish, and that could take + // time, it's best to first call f.muUnlock beore so we are + // not blocking other uses of this FDMap on the DecRef() call. + f.mu.Lock() + oldDesc, oldExists := f.files[fd] + lim := limitSet.Get(limits.NumberOfFiles).Cur + // if we're closing one then the effective limit is one + // more than the actual limit. 
+ if oldExists && lim != limits.Infinity { + lim++ + } + if lim != limits.Infinity && fd >= kdefs.FD(lim) { + f.mu.Unlock() + return syscall.EMFILE + } + + file.IncRef() + f.files[fd] = descriptor{file, flags} + f.mu.Unlock() + + if oldExists { + oldDesc.file.DecRef() + } + return nil +} + +// SetFlags sets the flags for the given file descriptor, if it is valid. +func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) { + f.mu.Lock() + defer f.mu.Unlock() + + desc, ok := f.files[fd] + if !ok { + return + } + + f.files[fd] = descriptor{desc.file, flags} +} + +// GetDescriptor returns a reference to the file and the flags for the FD. It +// bumps its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. +func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) { + f.mu.RLock() + defer f.mu.RUnlock() + + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + return desc.file, desc.flags + } + return nil, FDFlags{} +} + +// GetFile returns a reference to the File for the FD and bumps +// its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. +func (f *FDMap) GetFile(fd kdefs.FD) *fs.File { + f.mu.RLock() + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + f.mu.RUnlock() + return desc.file + } + f.mu.RUnlock() + return nil +} + +// fds returns an ordering of FDs. +func (f *FDMap) fds() FDs { + fds := make(FDs, 0, len(f.files)) + for fd := range f.files { + fds = append(fds, fd) + } + sort.Sort(fds) + return fds +} + +// GetFDs returns a list of valid fds. +func (f *FDMap) GetFDs() FDs { + f.mu.RLock() + defer f.mu.RUnlock() + return f.fds() +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDMap) GetRefs() []*fs.File { + f.mu.RLock() + defer f.mu.RUnlock() + + fds := f.fds() + fs := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + desc := f.files[fd] + desc.file.IncRef() + fs = append(fs, desc.file) + } + return fs +} + +// Fork returns an independent FDMap pointing to the same descriptors. +func (f *FDMap) Fork() *FDMap { + f.mu.RLock() + defer f.mu.RUnlock() + + clone := f.k.NewFDMap() + + // Grab a extra reference for every file. + for fd, desc := range f.files { + desc.file.IncRef() + clone.files[fd] = desc + } + + // That's it! + return clone +} + +// unlock releases all file locks held by this FDMap's uid. Must only be +// called on a non-nil *fs.File. +func (f *FDMap) unlock(file *fs.File) { + id := lock.UniqueID(f.ID()) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF}) +} + +// inotifyFileClose generates the appropriate inotify events for f being closed. +func inotifyFileClose(f *fs.File) { + var ev uint32 + d := f.Dirent + + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + + if f.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + + d.InotifyEvent(ev, 0) +} + +// Remove removes an FD from the FDMap, and returns (File, true) if a File +// one was found. Callers are expected to decrement the reference count on +// the File. Otherwise returns (nil, false). 
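+//
+// For example (illustrative):
+//
+//	if file, ok := f.Remove(fd); ok {
+//		file.DecRef()
+//	}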
+func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) { + f.mu.Lock() + desc := f.files[fd] + delete(f.files, fd) + f.mu.Unlock() + if desc.file != nil { + f.unlock(desc.file) + inotifyFileClose(desc.file) + return desc.file, true + } + return nil, false +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) { + var removed []*fs.File + f.mu.Lock() + for fd, desc := range f.files { + if desc.file != nil && cond(desc.file, desc.flags) { + delete(f.files, fd) + removed = append(removed, desc.file) + } + } + f.mu.Unlock() + + for _, file := range removed { + f.unlock(file) + inotifyFileClose(file) + file.DecRef() + } +} diff --git a/pkg/sentry/kernel/fd_map_test.go b/pkg/sentry/kernel/fd_map_test.go new file mode 100644 index 000000000..e1ac900e8 --- /dev/null +++ b/pkg/sentry/kernel/fd_map_test.go @@ -0,0 +1,134 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +const ( + // maxFD is the maximum FD to try to create in the map. + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func newTestFDMap() *FDMap { + return &FDMap{ + files: make(map[kdefs.FD]descriptor), + } +} + +// TestFDMapMany allocates maxFD FDs, i.e. maxes out the FDMap, +// until there is no room, then makes sure that NewFDAt works +// and also that if we remove one and add one that works too. +func TestFDMapMany(t *testing.T) { + file := filetest.NewTestFile(t) + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + + f := newTestFDMap() + for i := 0; i < maxFD; i++ { + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("f.NewFDFrom(0, r) in full map: got nil, wanted error") + } + + if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } +} + +// TestFDMap does a set of simple tests to make sure simple adds, +// removes, GetRefs, and DecRefs work. The ordering is just weird +// enough that a table-driven approach seemed clumsy. 
+func TestFDMap(t *testing.T) { + file := filetest.NewTestFile(t) + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}) + + f := newTestFDMap() + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + largeLimit := limits.Limit{maxFD, maxFD} + limitSet.Set(limits.NumberOfFiles, largeLimit) + + if fd, err := f.NewFDFrom(0, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if fd != kdefs.FD(1) { + t.Fatalf("Added an FD to a resized map: got %v, want 1", fd) + } + + if err := f.NewFDAt(1, file, FDFlags{}, limitSet); err != nil { + t.Fatalf("Replacing FD 1 via f.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := f.NewFDAt(maxFD+1, file, FDFlags{}, limitSet); err == nil { + t.Fatalf("Using an FD that was too large via f.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref := f.GetFile(1); ref == nil { + t.Fatalf("f.GetFile(1): got nil, wanted %v", file) + } + + if ref := f.GetFile(2); ref != nil { + t.Fatalf("f.GetFile(2): got a %v, wanted nil", ref) + } + + ref, ok := f.Remove(1) + if !ok { + t.Fatalf("f.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref, ok := f.Remove(1); ok { + ref.DecRef() + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + +} + +func TestDescriptorFlags(t *testing.T) { + file := filetest.NewTestFile(t) + f := newTestFDMap() + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}) + + if err := f.NewFDAt(2, file, FDFlags{CloseOnExec: true}, limitSet); err != nil { + t.Fatalf("f.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := f.GetDescriptor(2) + if newFile == nil { + t.Fatalf("f.GetFile(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %d don't match original %d\n", flags, 0) + } +} diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go new file mode 100644 index 000000000..9aa6fa951 --- /dev/null +++ b/pkg/sentry/kernel/fs_context.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FSContext contains filesystem context. +// +// This includes umask and working directory. +type FSContext struct { + refs.AtomicRefCount + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // root is the filesystem root. Will be nil iff the FSContext has been + // destroyed. + root *fs.Dirent + + // cwd is the current working directory. Will be nil iff the FSContext + // has been destroyed. 
+ cwd *fs.Dirent + + // umask is the current file mode creation mask. When a thread using this + // context invokes a syscall that creates a file, bits set in umask are + // removed from the permissions that the file is created with. + umask uint +} + +// newFSContext returns a new filesystem context. +func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + return &FSContext{ + root: root, + cwd: cwd, + umask: umask, + } +} + +// destroy is the destructor for an FSContext. +// +// This will call DecRef on both root and cwd Dirents. If either call to +// DecRef returns an error, then it will be propigated. If both calls to +// DecRef return an error, then the one from root.DecRef will be propigated. +// +// Note that there may still be calls to WorkingDirectory() or RootDirectory() +// (that return nil). This is because valid references may still be held via +// proc files or other mechanisms. +func (f *FSContext) destroy() { + f.root.DecRef() + f.root = nil + + f.cwd.DecRef() + f.cwd = nil +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FSContext) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Fork forks this FSContext. +// +// This is not a valid call after destroy. +func (f *FSContext) Fork() *FSContext { + f.mu.Lock() + defer f.mu.Unlock() + f.cwd.IncRef() + f.root.IncRef() + return &FSContext{ + cwd: f.cwd, + root: f.root, + umask: f.umask, + } +} + +// WorkingDirectory returns the current working directory. +// You should call DecRef on the returned Dirent when finished. +// +// This will return nil if called after destroy(). +func (f *FSContext) WorkingDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + if f.cwd != nil { + f.cwd.IncRef() + } + return f.cwd +} + +// SetWorkingDirectory sets the current working directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetWorkingDirectory called with nil dirent") + } + if f.cwd == nil { + panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) + } + f.mu.Lock() + defer f.mu.Unlock() + old := f.cwd + f.cwd = d + d.IncRef() + old.DecRef() +} + +// RootDirectory returns the current filesystem root. +// You should call DecRef on the returned Dirent when finished. +// +// This will return nil if called after destroy(). +func (f *FSContext) RootDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + f.root.IncRef() + return f.root +} + +// SetRootDirectory sets the root directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetRootDirectory called with nil dirent") + } + if f.root == nil { + panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d)) + } + f.mu.Lock() + defer f.mu.Unlock() + old := f.root + f.root = d + d.IncRef() + old.DecRef() +} + +// Umask returns the current umask. +func (f *FSContext) Umask() uint { + f.mu.Lock() + defer f.mu.Unlock() + return f.umask +} + +// SwapUmask atomically sets the current umask and returns the old umask. 
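+// This mirrors umask(2), which installs a new mask and reports the previous
+// one, e.g. (illustrative):
+//
+//	old := fsCtx.SwapUmask(0022)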
+func (f *FSContext) SwapUmask(mask uint) uint { + f.mu.Lock() + defer f.mu.Unlock() + old := f.umask + f.umask = mask + return old +} diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD new file mode 100644 index 000000000..de9897c58 --- /dev/null +++ b/pkg/sentry/kernel/futex/BUILD @@ -0,0 +1,48 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "futex", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Waiter", + }, +) + +go_stateify( + name = "futex_state", + srcs = [ + "futex.go", + "waiter_list.go", + ], + out = "futex_state.go", + package = "futex", +) + +go_library( + name = "futex", + srcs = [ + "futex.go", + "futex_state.go", + "waiter_list.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/state", + "//pkg/syserror", + ], +) + +go_test( + name = "futex_test", + size = "small", + srcs = ["futex_test.go"], + embed = [":futex"], +) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go new file mode 100644 index 000000000..b3ba57a2c --- /dev/null +++ b/pkg/sentry/kernel/futex/futex.go @@ -0,0 +1,405 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package futex provides an implementation of the futex interface as found in +// the Linux kernel. It allows one to easily transform Wait() calls into waits +// on a channel, which is useful in a Go-based kernel, for example. +package futex + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Checker abstracts memory accesses. This is useful because the "addresses" +// used in this package may not be real addresses (they could be indices of an +// array, for example), or they could be mapped via some special mechanism. +// +// TODO: Replace this with usermem.IO. +type Checker interface { + // Check should validate that given address contains the given value. + // If it does not contain the value, syserror.EAGAIN must be returned. + // Any other error may be returned, which will be propagated. + Check(addr uintptr, val uint32) error + + // Op should atomically perform the operation encoded in op on the data + // pointed to by addr, then apply the comparison encoded in op to the + // original value at addr, returning the result. + // Note that op is an opaque operation whose behaviour is defined + // outside of the futex manager. + Op(addr uintptr, op uint32) (bool, error) +} + +// Waiter is the struct which gets enqueued into buckets for wake up routines +// and requeue routines to scan and notify. Once a Waiter has been enqueued by +// WaitPrepare(), callers may listen on C for wake up events. 
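+//
+// A typical wait, sketched (illustrative only; m is a *Manager, c a Checker,
+// and addr/val the caller's futex word and expected value):
+//
+//	w := NewWaiter()
+//	if err := m.WaitPrepare(w, c, addr, val, ^uint32(0)); err == nil {
+//		<-w.C             // block until woken
+//		m.WaitComplete(w) // always dequeue, even after a wakeup
+//	}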
+type Waiter struct { + // Synchronization: + // + // - A Waiter that is not enqueued in a bucket is exclusively owned (no + // synchronization applies). + // + // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, + // waiterEntry, complete, and addr are protected by the bucket.mu ("bucket + // lock") of the containing bucket, and bitmask is immutable. complete and + // addr are additionally mutated using atomic memory operations, ensuring + // that they can be read using atomic memory operations without holding the + // bucket lock. + // + // - A Waiter is only guaranteed to be no longer queued after calling + // WaitComplete(). + + // waiterEntry links Waiter into bucket.waiters. + waiterEntry + + // complete is 1 if the Waiter was removed from its bucket by a wakeup and + // 0 otherwise. + complete int32 + + // C is sent to when the Waiter is woken. + C chan struct{} + + // addr is the address being waited on. + addr uintptr + + // The bitmask we're waiting on. + // This is used the case of a FUTEX_WAKE_BITSET. + bitmask uint32 +} + +// NewWaiter returns a new unqueued Waiter. +func NewWaiter() *Waiter { + return &Waiter{ + C: make(chan struct{}, 1), + } +} + +// bucket holds a list of waiters for a given address hash. +type bucket struct { + // mu protects waiters and contained Waiter state. See comment in Waiter. + mu sync.Mutex `state:"nosave"` + + waiters waiterList `state:"zerovalue"` +} + +// wakeLocked wakes up to n waiters matching the bitmask at the addr for this +// bucket and returns the number of waiters woken. +// +// Preconditions: b.mu must be locked. +func (b *bucket) wakeLocked(addr uintptr, bitmask uint32, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if w.addr != addr || w.bitmask&bitmask == 0 { + // Not matching. + w = w.Next() + continue + } + + // Remove from the bucket and wake the waiter. + woke := w + w = w.Next() // Next iteration. + b.waiters.Remove(woke) + woke.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier + // according to the memory model, so nothing may be ordered + // around it. Since we've dequeued w and will never touch it + // again, we can safely store 1 to w.complete here and allow + // the WaitComplete() to short-circuit grabbing the bucket + // lock. If they somehow miss the w.complete, we are still + // holding the lock, so we can know that they won't dequeue w, + // assume it's free and have the below operation afterwards. + atomic.StoreInt32(&woke.complete, 1) + done++ + } + return done +} + +// requeueLocked takes n waiters from the bucket and moves them to naddr on the +// bucket "to". +// +// Preconditions: b and to must be locked. +func (b *bucket) requeueLocked(to *bucket, addr, naddr uintptr, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if w.addr != addr { + // Not matching. + w = w.Next() + continue + } + + requeued := w + w = w.Next() // Next iteration. + b.waiters.Remove(requeued) + atomic.StoreUintptr(&requeued.addr, naddr) + to.waiters.PushBack(requeued) + done++ + } + return done +} + +const ( + // bucketCount is the number of buckets per Manager. By having many of + // these we reduce contention when concurrent yet unrelated calls are made. + bucketCount = 1 << bucketCountBits + bucketCountBits = 10 +) + +func checkAddr(addr uintptr) error { + // Ensure the address is aligned. + // It must be a DWORD boundary. 
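+	// (That is, 4-byte aligned: futex words are 32 bits wide.)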
+ if addr&0x3 != 0 { + return syserror.EINVAL + } + + return nil +} + +// bucketIndexForAddr returns the index into Manager.buckets for addr. +func bucketIndexForAddr(addr uintptr) uintptr { + // - The bottom 2 bits of addr must be 0, per checkAddr. + // + // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 + // for a canonical address, and (on all existing platforms) bit 47 must be + // 0 for an application address. + // + // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" + // bits. We choose one of the simplest possible hash functions that at + // least uses all 45 useful bits in the output, given that bucketCountBits + // == 10. This hash function also has the property that it will usually map + // adjacent addresses to adjacent buckets, slightly improving memory + // locality when an application synchronization structure uses multiple + // nearby futexes. + // + // Note that despite the large number of arithmetic operations in the + // function, many components can be computed in parallel, such that the + // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This + // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + + // (addr >> 42)" without any additional grouping, the compiler puts all 4 + // additions in the critical path. + h1 := (addr >> 2) + (addr >> 12) + (addr >> 22) + h2 := (addr >> 32) + (addr >> 42) + return (h1 + h2) % bucketCount +} + +// Manager holds futex state for a single virtual address space. +type Manager struct { + buckets [bucketCount]bucket +} + +// NewManager returns an initialized futex manager. +// N.B. we use virtual address to tag futexes, so it only works for private +// (within a single process) futex. +func NewManager() *Manager { + return &Manager{} +} + +// lockBucket returns a locked bucket for the given addr. +// +// Preconditions: checkAddr(addr) == nil. +func (m *Manager) lockBucket(addr uintptr) *bucket { + b := &m.buckets[bucketIndexForAddr(addr)] + b.mu.Lock() + return b +} + +// lockBuckets returns locked buckets for the given addrs. +// +// Preconditions: checkAddr(addr1) == checkAddr(addr2) == nil. +func (m *Manager) lockBuckets(addr1 uintptr, addr2 uintptr) (*bucket, *bucket) { + i1 := bucketIndexForAddr(addr1) + i2 := bucketIndexForAddr(addr2) + b1 := &m.buckets[i1] + b2 := &m.buckets[i2] + + // Ensure that buckets are locked in a consistent order (lowest index + // first) to avoid circular locking. + switch { + case i1 < i2: + b1.mu.Lock() + b2.mu.Lock() + case i2 < i1: + b2.mu.Lock() + b1.mu.Lock() + default: + b1.mu.Lock() + } + + return b1, b2 +} + +// Wake wakes up to n waiters matching the bitmask on the given addr. +// The number of waiters woken is returned. +func (m *Manager) Wake(addr uintptr, bitmask uint32, n int) (int, error) { + if err := checkAddr(addr); err != nil { + return 0, err + } + + b := m.lockBucket(addr) + // This function is very hot; avoid defer. + r := b.wakeLocked(addr, bitmask, n) + b.mu.Unlock() + return r, nil +} + +func (m *Manager) doRequeue(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { + if err := checkAddr(addr); err != nil { + return 0, err + } + if err := checkAddr(naddr); err != nil { + return 0, err + } + + b1, b2 := m.lockBuckets(addr, naddr) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + // Check our value. + // This only applied for RequeueCmp(). 
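+	// Requeue() passes a nil Checker, so the check is skipped in that case.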
+ if c != nil { + if err := c.Check(addr, val); err != nil { + return 0, err + } + } + + // Wake the number required. + done := b1.wakeLocked(addr, ^uint32(0), nwake) + + // Requeue the number required. + b1.requeueLocked(b2, addr, naddr, nreq) + + return done, nil +} + +// Requeue wakes up to nwake waiters on the given addr, and unconditionally +// requeues up to nreq waiters on naddr. +func (m *Manager) Requeue(addr uintptr, naddr uintptr, nwake int, nreq int) (int, error) { + return m.doRequeue(nil, addr, 0, naddr, nwake, nreq) +} + +// RequeueCmp atomically checks that the addr contains val (via the Checker), +// wakes up to nwake waiters on addr and then unconditionally requeues nreq +// waiters on naddr. +func (m *Manager) RequeueCmp(c Checker, addr uintptr, val uint32, naddr uintptr, nwake int, nreq int) (int, error) { + return m.doRequeue(c, addr, val, naddr, nwake, nreq) +} + +// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 +// waiters unconditionally from addr1, and, based on the original value at addr2 +// and a comparison encoded in op, wakes up to nwake2 waiters from addr2. +// It returns the total number of waiters woken. +func (m *Manager) WakeOp(c Checker, addr1 uintptr, addr2 uintptr, nwake1 int, nwake2 int, op uint32) (int, error) { + if err := checkAddr(addr1); err != nil { + return 0, err + } + if err := checkAddr(addr2); err != nil { + return 0, err + } + + b1, b2 := m.lockBuckets(addr1, addr2) + + done := 0 + cond, err := c.Op(addr2, op) + if err == nil { + // Wake up up to nwake1 entries from the first bucket. + done = b1.wakeLocked(addr1, ^uint32(0), nwake1) + + // Wake up up to nwake2 entries from the second bucket if the + // operation yielded true. + if cond { + done += b2.wakeLocked(addr2, ^uint32(0), nwake2) + } + } + + b1.mu.Unlock() + if b2 != b1 { + b2.mu.Unlock() + } + return done, err +} + +// WaitPrepare atomically checks that addr contains val (via the Checker), then +// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the +// Waiter must be subsequently removed by calling WaitComplete, whether or not +// a wakeup is received on w.C. +func (m *Manager) WaitPrepare(w *Waiter, c Checker, addr uintptr, val uint32, bitmask uint32) error { + if err := checkAddr(addr); err != nil { + return err + } + + // Prepare the Waiter before taking the bucket lock. + w.complete = 0 + select { + case <-w.C: + default: + } + w.addr = addr + w.bitmask = bitmask + + b := m.lockBucket(addr) + // This function is very hot; avoid defer. + + // Perform our atomic check. + if err := c.Check(addr, val); err != nil { + b.mu.Unlock() + return err + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + + b.mu.Unlock() + return nil +} + +// WaitComplete must be called when a Waiter previously added by WaitPrepare is +// no longer eligible to be woken. +func (m *Manager) WaitComplete(w *Waiter) { + // Can we short-circuit acquiring the lock? + // This is the happy path where a notification + // was received and we don't need to dequeue this + // waiter from any list (or take any locks). + if atomic.LoadInt32(&w.complete) != 0 { + return + } + + // Take the bucket lock. 
Note that without holding the bucket lock, the + // waiter is not guaranteed to stay in that bucket, so after we take the + // bucket lock, we must ensure that the bucket hasn't changed: if it + // happens to have changed, we release the old bucket lock and try again + // with the new bucket; if it hasn't changed, we know it won't change now + // because we hold the lock. + var b *bucket + for { + addr := atomic.LoadUintptr(&w.addr) + b = m.lockBucket(addr) + // We still have to use an atomic load here, because if w was racily + // requeued then w.addr is not protected by b.mu. + if addr == atomic.LoadUintptr(&w.addr) { + break + } + b.mu.Unlock() + } + + // Remove waiter from the bucket. w.complete can only be stored with b.mu + // locked, so this load doesn't need to use sync/atomic. + if w.complete == 0 { + b.waiters.Remove(w) + } + b.mu.Unlock() +} diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go new file mode 100644 index 000000000..7b81358ec --- /dev/null +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -0,0 +1,500 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package futex + +import ( + "math" + "runtime" + "sync" + "sync/atomic" + "syscall" + "testing" + "unsafe" +) + +const ( + testMutexSize = 4 + testMutexLocked uint32 = 1 + testMutexUnlocked uint32 = 0 +) + +// testData implements the Checker interface, and allows us to +// treat the address passed for futex operations as an index in +// a byte slice for testing simplicity. +type testData []byte + +func newTestData(size uint) testData { + return make([]byte, size) +} + +func (t testData) Check(addr uintptr, val uint32) error { + if val != atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))) { + return syscall.EAGAIN + } + return nil +} + +func (t testData) Op(addr uintptr, val uint32) (bool, error) { + return val == 0, nil +} + +// testMutex ties together a testData slice, an address, and a +// futex manager in order to implement the sync.Locker interface. +// Beyond being used as a Locker, this is a simple mechanism for +// changing the underlying values for simpler tests. +type testMutex struct { + a uintptr + d testData + m *Manager +} + +func newTestMutex(addr uintptr, d testData, m *Manager) *testMutex { + return &testMutex{a: addr, d: d, m: m} +} + +// Lock acquires the testMutex. +// This may wait for it to be available via the futex manager. +func (t *testMutex) Lock() { + for { + // Attempt to grab the lock. + if atomic.CompareAndSwapUint32( + ((*uint32)(unsafe.Pointer(&t.d[t.a]))), + testMutexUnlocked, + testMutexLocked) { + // Lock held. + return + } + + // Wait for it to be "not locked". + w := NewWaiter() + err := t.m.WaitPrepare(w, t.d, t.a, testMutexLocked, ^uint32(0)) + if err == syscall.EAGAIN { + continue + } + if err != nil { + // Should never happen. + panic("WaitPrepare returned unexpected error: " + err.Error()) + } + <-w.C + t.m.WaitComplete(w) + } +} + +// Unlock releases the testMutex. 
+// This will notify any waiters via the futex manager. +func (t *testMutex) Unlock() { + // Unlock. + atomic.StoreUint32(((*uint32)(unsafe.Pointer(&t.d[t.a]))), testMutexUnlocked) + + // Notify all waiters. + t.m.Wake(t.a, ^uint32(0), math.MaxInt32) +} + +func TestFutexWake(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w := NewWaiter() + m.WaitPrepare(w, d, 0, testMutexUnlocked, ^uint32(0)) + + // Wake the single thread. + if _, err := m.Wake(0, ^uint32(0), 1); err != nil { + t.Error("wake error:", err) + } + + <-w.C + m.WaitComplete(w) +} + +func TestFutexWakeBitmask(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w := NewWaiter() + m.WaitPrepare(w, d, 0, testMutexUnlocked, 0x0000ffff) + + // Wake the single thread, not using the bitmask. + if _, err := m.Wake(0, 0xffff0000, 1); err != nil { + t.Error("wake non-matching bitmask error:", err) + } + + select { + case <-w.C: + t.Error("w is alive?") + default: + } + + // Now use a matching bitmask. + if _, err := m.Wake(0, 0x00000001, 1); err != nil { + t.Error("wake matching bitmask error:", err) + } + + <-w.C + m.WaitComplete(w) +} + +func TestFutexWakeTwo(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + + // Wait for it to be locked. + // (This won't trigger the wake in testMutex) + w1 := NewWaiter() + w2 := NewWaiter() + w3 := NewWaiter() + m.WaitPrepare(w1, d, 0, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w2, d, 0, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w3, d, 0, testMutexUnlocked, ^uint32(0)) + + // Wake exactly two threads. + if _, err := m.Wake(0, ^uint32(0), 2); err != nil { + t.Error("wake error:", err) + } + + // Ensure exactly two are alive. + // We don't get guarantees about exactly which two, + // (although we expect them to be w1 and w2). + awake := 0 + for { + select { + case <-w1.C: + awake++ + case <-w2.C: + awake++ + case <-w3.C: + awake++ + default: + if awake != 2 { + t.Error("awake != 2?") + } + + // Success. + return + } + } +} + +func TestFutexWakeUnrelated(t *testing.T) { + m := NewManager() + d := newTestData(2 * testMutexSize) + + // Wait for it to be locked. + w1 := NewWaiter() + w2 := NewWaiter() + m.WaitPrepare(w1, d, 0*testMutexSize, testMutexUnlocked, ^uint32(0)) + m.WaitPrepare(w2, d, 1*testMutexSize, testMutexUnlocked, ^uint32(0)) + + // Wake only the second one. + if _, err := m.Wake(1*testMutexSize, ^uint32(0), 2); err != nil { + t.Error("wake error:", err) + } + + // Ensure only r2 is alive. + select { + case <-w1.C: + t.Error("w1 is alive?") + default: + } + <-w2.C +} + +// This function was shamelessly stolen from mutex_test.go. 
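As a quick orientation before the stress test, here is a minimal sketch of the WaitPrepare/Wake/WaitComplete round trip that the tests above exercise. It is illustrative only: the futexexample package, the wordChecker type, and the waitThenWake function are hypothetical, and the import path is assumed from this package's location; only Manager, Waiter, NewManager, NewWaiter, WaitPrepare, Wake, and WaitComplete come from the code above. The HammerMutex stress helper that the preceding comment introduces follows the sketch.

package futexexample // hypothetical package, for illustration only

import (
	"sync/atomic"
	"syscall"
	"unsafe"

	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
)

// wordChecker is a hypothetical Checker that treats addr as an index into a
// byte slice, mirroring the testData helper above.
type wordChecker []byte

// Check returns EAGAIN if the word at addr no longer contains val.
func (c wordChecker) Check(addr uintptr, val uint32) error {
	if atomic.LoadUint32((*uint32)(unsafe.Pointer(&c[addr]))) != val {
		return syscall.EAGAIN
	}
	return nil
}

// Op trivially reports whether val is zero; a real Checker would decode a
// FUTEX_WAKE_OP operation here.
func (c wordChecker) Op(addr uintptr, val uint32) (bool, error) {
	return val == 0, nil
}

// waitThenWake queues one waiter on address 0 and then wakes it.
func waitThenWake() error {
	m := futex.NewManager()
	c := make(wordChecker, 4)

	w := futex.NewWaiter()
	// The wait is only enqueued if the word still contains 0.
	if err := m.WaitPrepare(w, c, 0, 0, ^uint32(0)); err != nil {
		return err // e.g. EAGAIN if the value changed; callers typically retry.
	}

	// Wake at most one waiter on address 0 with a full bitmask.
	if _, err := m.Wake(0, ^uint32(0), 1); err != nil {
		return err
	}
	<-w.C             // the wakeup is delivered on the waiter's channel.
	m.WaitComplete(w) // always pair WaitPrepare with WaitComplete.
	return nil
}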
+func HammerMutex(l sync.Locker, loops int, cdone chan bool) { + for i := 0; i < loops; i++ { + l.Lock() + runtime.Gosched() + l.Unlock() + } + cdone <- true +} + +func TestFutexStress(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + tm := newTestMutex(0*testMutexSize, d, m) + c := make(chan bool) + + for i := 0; i < 10; i++ { + go HammerMutex(tm, 1000, c) + } + + for i := 0; i < 10; i++ { + <-c + } +} + +func TestWakeOpEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 0 { + t.Fatalf("Invalid number of wakes: want 0, got %d", n) + } +} + +func TestWakeOpFirstNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 0. + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSecondNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 4. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 4. + n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 4. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Wake up all waiters on address 4. + n, err := m.WakeOp(d, 0, 4, 10, 10, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 0 { + t.Fatalf("Invalid number of wakes: want 0, got %d", n) + } +} + +func TestWakeOpAllNonEmpty(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Add two waiters on address 4. + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Wake up all waiters on both addresses. 
+ n, err := m.WakeOp(d, 0, 4, 10, 10, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 4 { + t.Fatalf("Invalid number of wakes: want 4, got %d", n) + } +} + +func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + // Add two waiters on address 4. + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 4, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Wake up all waiters on both addresses. + n, err := m.WakeOp(d, 0, 4, 10, 10, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSameAddress(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Use the same address, with one at most one waiter from each. + n, err := m.WakeOp(d, 0, 0, 1, 1, 0) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 2 { + t.Fatalf("Invalid number of wakes: want 2, got %d", n) + } +} + +func TestWakeOpSameAddressFailingOp(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + w1 := NewWaiter() + if err := m.WaitPrepare(w1, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w1) + + w2 := NewWaiter() + if err := m.WaitPrepare(w2, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w2) + + w3 := NewWaiter() + if err := m.WaitPrepare(w3, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w3) + + w4 := NewWaiter() + if err := m.WaitPrepare(w4, d, 0, 0, ^uint32(0)); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + defer m.WaitComplete(w4) + + // Use the same address, with one at most one waiter from each. 
+ n, err := m.WakeOp(d, 0, 0, 1, 1, 1) + if err != nil { + t.Fatalf("WakeOp failed: %v", err) + } + + if n != 1 { + t.Fatalf("Invalid number of wakes: want 1, got %d", n) + } +} diff --git a/pkg/sentry/kernel/g3doc/run_states.dot b/pkg/sentry/kernel/g3doc/run_states.dot new file mode 100644 index 000000000..7861fe1f5 --- /dev/null +++ b/pkg/sentry/kernel/g3doc/run_states.dot @@ -0,0 +1,99 @@ +digraph { + subgraph { + App; + } + subgraph { + Interrupt; + InterruptAfterSignalDeliveryStop; + } + subgraph { + Syscall; + SyscallAfterPtraceEventSeccomp; + SyscallEnter; + SyscallAfterSyscallEnterStop; + SyscallAfterSysemuStop; + SyscallInvoke; + SyscallAfterPtraceEventClone; + SyscallAfterExecStop; + SyscallAfterVforkStop; + SyscallReinvoke; + SyscallExit; + } + subgraph { + Vsyscall; + VsyscallAfterPtraceEventSeccomp; + VsyscallInvoke; + } + subgraph { + Exit; + ExitMain; // leave thread group, release resources, reparent children, kill PID namespace and wait if TGID 1 + ExitNotify; // signal parent/tracer, become waitable + ExitDone; // represented by t.runState == nil + } + + // Task exit + Exit -> ExitMain; + ExitMain -> ExitNotify; + ExitNotify -> ExitDone; + + // Execution of untrusted application code + App -> App; + + // Interrupts (usually signal delivery) + App -> Interrupt; + Interrupt -> Interrupt; // if other interrupt conditions may still apply + Interrupt -> Exit; // if killed + + // Syscalls + App -> Syscall; + Syscall -> SyscallEnter; + SyscallEnter -> SyscallInvoke; + SyscallInvoke -> SyscallExit; + SyscallExit -> App; + + // exit, exit_group + SyscallInvoke -> Exit; + + // execve + SyscallInvoke -> SyscallAfterExecStop; + SyscallAfterExecStop -> SyscallExit; + SyscallAfterExecStop -> App; // fatal signal pending + + // vfork + SyscallInvoke -> SyscallAfterVforkStop; + SyscallAfterVforkStop -> SyscallExit; + + // Vsyscalls + App -> Vsyscall; + Vsyscall -> VsyscallInvoke; + Vsyscall -> App; // fault while reading return address from stack + VsyscallInvoke -> App; + + // ptrace-specific branches + Interrupt -> InterruptAfterSignalDeliveryStop; + InterruptAfterSignalDeliveryStop -> Interrupt; + SyscallEnter -> SyscallAfterSyscallEnterStop; + SyscallAfterSyscallEnterStop -> SyscallInvoke; + SyscallAfterSyscallEnterStop -> SyscallExit; // skipped by tracer + SyscallAfterSyscallEnterStop -> App; // fatal signal pending + SyscallEnter -> SyscallAfterSysemuStop; + SyscallAfterSysemuStop -> SyscallExit; + SyscallAfterSysemuStop -> App; // fatal signal pending + SyscallInvoke -> SyscallAfterPtraceEventClone; + SyscallAfterPtraceEventClone -> SyscallExit; + SyscallAfterPtraceEventClone -> SyscallAfterVforkStop; + + // seccomp + Syscall -> App; // SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_KILL, SECCOMP_RET_TRACE without tracer + Syscall -> SyscallAfterPtraceEventSeccomp; // SECCOMP_RET_TRACE + SyscallAfterPtraceEventSeccomp -> SyscallEnter; + SyscallAfterPtraceEventSeccomp -> SyscallExit; // skipped by tracer + SyscallAfterPtraceEventSeccomp -> App; // fatal signal pending + Vsyscall -> VsyscallAfterPtraceEventSeccomp; + VsyscallAfterPtraceEventSeccomp -> VsyscallInvoke; + VsyscallAfterPtraceEventSeccomp -> App; + + // Autosave + SyscallInvoke -> SyscallReinvoke; + SyscallReinvoke -> SyscallInvoke; +} diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go new file mode 100644 index 000000000..78737f58f --- /dev/null +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore" +) + +// IPCNamespace represents an IPC namespace. +type IPCNamespace struct { + semaphores *semaphore.Registry +} + +// NewIPCNamespace creates a new IPC namespace. +func NewIPCNamespace() *IPCNamespace { + return &IPCNamespace{ + semaphores: semaphore.NewRegistry(), + } +} + +// SemaphoreRegistry returns the semanphore set registry for this namespace. +func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { + return i.semaphores +} + +// IPCNamespace returns the task's IPC namespace. +func (t *Task) IPCNamespace() *IPCNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.ipcns +} diff --git a/pkg/sentry/kernel/kdefs/BUILD b/pkg/sentry/kernel/kdefs/BUILD new file mode 100644 index 000000000..b6c00042a --- /dev/null +++ b/pkg/sentry/kernel/kdefs/BUILD @@ -0,0 +1,10 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "kdefs", + srcs = ["kdefs.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs", + visibility = ["//:sandbox"], +) diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go new file mode 100644 index 000000000..bbb476544 --- /dev/null +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kdefs defines common kernel definitions. +// +package kdefs + +// FD is a File Descriptor. +type FD int32 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go new file mode 100644 index 000000000..0932965e0 --- /dev/null +++ b/pkg/sentry/kernel/kernel.go @@ -0,0 +1,957 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernel provides an emulation of the Linux kernel. +// +// See README.md for a detailed overview. 
+// +// Lock order (outermost locks must be taken first): +// +// Kernel.extMu +// TaskSet.mu +// SignalHandlers.mu +// Task.mu +// +// Locking SignalHandlers.mu in multiple SignalHandlers requires locking +// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same +// time requires locking all of their signal mutexes first. +package kernel + +import ( + "fmt" + "io" + "path/filepath" + "sync" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/state" +) + +// Kernel represents an emulated Linux kernel. It must be initialized by calling +// Init() or LoadFrom(). +type Kernel struct { + // extMu serializes external changes to the Kernel with calls to + // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel + // remains frozen for the duration of the call; it requires that the Kernel + // is paused as a precondition, which ensures that none of the tasks + // running within the Kernel can affect its state, but extMu is required to + // ensure that concurrent users of the Kernel *outside* the Kernel's + // control cannot affect its state by calling e.g. + // Kernel.SendExternalSignal.) + extMu sync.Mutex `state:"nosave"` + + // started is true if Start has been called. Unless otherwise specified, + // all Kernel fields become immutable once started becomes true. + started bool `state:"nosave"` + + // All of the following fields are immutable unless otherwise specified. + + // Platform is the platform that is used to execute tasks in the + // created Kernel. It is embedded so that Kernel can directly serve as + // Platform in mm logic and also serve as platform.MemoryProvider in + // filemem S/R logic. + platform.Platform `state:"nosave"` + + // See InitKernelArgs for the meaning of these fields. + featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + networkStack inet.Stack `state:"nosave"` + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + + // mounts holds the state of the virtual filesystem. mounts is initially + // nil, and must be set by calling Kernel.SetRootMountNamespace before + // Kernel.CreateProcess can succeed. + mounts *fs.MountNamespace + + // globalInit is the thread group whose leader has ID 1 in the root PID + // namespace. 
globalInit is stored separately so that it is accessible even + // after all tasks in the thread group have exited, such that ID 1 is no + // longer mapped. + // + // globalInit is mutable until it is assigned by the first successful call + // to CreateProcess, and is protected by extMu. + globalInit *ThreadGroup + + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + + // syslog is the kernel log. + syslog syslog + + // cpuClock is incremented every linux.ClockTick. cpuClock is used to + // measure task CPU usage, since sampling monotonicClock twice on every + // syscall turns out to be unreasonably expensive. This is similar to how + // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), + // although Linux also uses scheduler timing information to improve + // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do + // since "preeemptive" scheduling is managed by the Go runtime, which + // doesn't provide this information. + // + // cpuClock is mutable, and is accessed using atomic memory operations. + cpuClock uint64 + + // cpuClockTicker increments cpuClock. + cpuClockTicker *ktime.Timer `state:"nosave"` + + // fdMapUids is an ever-increasing counter for generating FDMap uids. + // + // fdMapUids is mutable, and is accessed using atomic memory operations. + fdMapUids uint64 + + // uniqueID is used to generate unique identifiers. + // + // uniqueID is mutable, and is accessed using atomic memory operations. + uniqueID uint64 + + // nextInotifyCookie is a monotonically increasing counter used for + // generating unique inotify event cookies. + // + // nextInotifyCookie is mutable, and is accesed using atomic memory + // operations. + nextInotifyCookie uint32 + + // netlinkPorts manages allocation of netlink socket port IDs. + netlinkPorts *port.Manager + + // exitErr is the error causing the sandbox to exit, if any. It is + // protected by extMu. + exitErr error +} + +// InitKernelArgs holds arguments to Init. +type InitKernelArgs struct { + // FeatureSet is the emulated CPU feature set. + FeatureSet *cpuid.FeatureSet + + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + + // RootUserNamespace is the root user namespace. + RootUserNamespace *auth.UserNamespace + + // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. + NetworkStack inet.Stack + + // ApplicationCores is the number of logical CPUs visible to sandboxed + // applications. The set of logical CPU IDs is [0, ApplicationCores); thus + // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the + // most significant bit in cpu_possible_mask + 1. + ApplicationCores uint + + // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU + // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a + // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it + // will be overridden. + UseHostCores bool + + // ExtraAuxv contains additional auxiliary vector entries that are added to + // each process by the ELF loader. + ExtraAuxv []arch.AuxEntry + + // Vdso holds the VDSO and its parameter page. + Vdso *loader.VDSO + + // RootUTSNamespace is the root UTS namepsace. + RootUTSNamespace *UTSNamespace + + // RootIPCNamespace is the root IPC namepsace. + RootIPCNamespace *IPCNamespace +} + +// Init initialize the Kernel with no tasks. 
+// +// Callers must manually set Kernel.Platform before caling Init. +func (k *Kernel) Init(args InitKernelArgs) error { + if args.FeatureSet == nil { + return fmt.Errorf("FeatureSet is nil") + } + if args.Timekeeper == nil { + return fmt.Errorf("Timekeeper is nil") + } + if args.RootUserNamespace == nil { + return fmt.Errorf("RootUserNamespace is nil") + } + if args.ApplicationCores == 0 { + return fmt.Errorf("ApplicationCores is 0") + } + + k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper + k.tasks = newTaskSet() + k.rootUserNamespace = args.RootUserNamespace + k.rootUTSNamespace = args.RootUTSNamespace + k.rootIPCNamespace = args.RootIPCNamespace + k.networkStack = args.NetworkStack + k.applicationCores = args.ApplicationCores + if args.UseHostCores { + k.useHostCores = true + maxCPU, err := hostcpu.MaxPossibleCPU() + if err != nil { + return fmt.Errorf("Failed to get maximum CPU number: %v", err) + } + minAppCores := uint(maxCPU) + 1 + if k.applicationCores < minAppCores { + log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) + k.applicationCores = minAppCores + } + } + k.extraAuxv = args.ExtraAuxv + k.vdso = args.Vdso + k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.netlinkPorts = port.New() + + return nil +} + +// SaveTo saves the state of k to w. +// +// Preconditions: The kernel must be paused throughout the call to SaveTo. +func (k *Kernel) SaveTo(w io.Writer) error { + saveStart := time.Now() + ctx := k.SupervisorContext() + + // Do not allow other Kernel methods to affect it while it's being saved. + k.extMu.Lock() + defer k.extMu.Unlock() + + // Stop time. + k.pauseTimeLocked() + defer k.resumeTimeLocked() + + // Flush write operations on open files so data reaches backing storage. + if err := k.tasks.flushWritesToFiles(ctx); err != nil { + return err + } + + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters() + + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + k.mounts.FlushMountSourceRefs() + + // Ensure that all pending asynchronous work is complete: + // - inode and mount release + // - asynchronuous IO + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Save the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Save(w, k, &stats); err != nil { + return err + } + log.Infof("Kernel save stats: %s", &stats) + log.Infof("Kernel save took [%s].", time.Since(kernelStart)) + + // Save the memory state. 
+ // + // FIXME: In the future, this should not be dispatched via + // an abstract memory type. This should be dispatched to a single + // memory implementation that belongs to the kernel. (There is + // currently a single implementation anyways, it just needs to be + // "unabstracted" and reparented appropriately.) + memoryStart := time.Now() + if err := k.Platform.Memory().SaveTo(w); err != nil { + return err + } + log.Infof("Memory save took [%s].", time.Since(memoryStart)) + + log.Infof("Overall save took [%s].", time.Since(saveStart)) + + return nil +} + +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + if fdmap := t.FDMap(); fdmap != nil { + for _, desc := range fdmap.files { + if flags := desc.file.Flags(); !flags.Write { + continue + } + if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + continue + } + // Here we need all metadata synced. + syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := desc.file.Dirent.FullName(nil /* root */) + return fmt.Errorf("%q was not sufficiently synced: %v", name, err) + } + } + } + } + return nil +} + +// Preconditions: The kernel must be paused. +func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.tc.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) unregisterEpollWaiters() { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + if fdmap := t.FDMap(); fdmap != nil { + for _, desc := range fdmap.files { + if desc.file != nil { + if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + } + } + } + } +} + +// LoadFrom returns a new Kernel loaded from args. +func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error { + loadStart := time.Now() + if p == nil { + return fmt.Errorf("Platform is nil") + } + + k.Platform = p + k.networkStack = net + + initAppCores := k.applicationCores + + // Load the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Load(r, k, &stats); err != nil { + return err + } + log.Infof("Kernel load stats: %s", &stats) + log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + + // Load the memory state. + // + // See the note in SaveTo. + memoryStart := time.Now() + if err := k.Platform.Memory().LoadFrom(r); err != nil { + return err + } + log.Infof("Memory load took [%s].", time.Since(memoryStart)) + + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + fs.AsyncBarrier() + + log.Infof("Overall load took [%s]", time.Since(loadStart)) + + // Applications may size per-cpu structures based on k.applicationCores, so + // it can't change across save/restore. When we are virtualizing CPU + // numbers, this isn't a problem. 
However, when we are exposing host CPU + // assignments, we can't tolerate an increase in the number of host CPUs, + // which could result in getcpu(2) returning CPUs that applications expect + // not to exist. + if k.useHostCores && initAppCores > k.applicationCores { + return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) + } + + return nil +} + +// Destroy releases resources owned by k. +// +// Preconditions: There must be no task goroutines running in k. +func (k *Kernel) Destroy() { + if k.mounts != nil { + k.mounts.DecRef() + k.mounts = nil + } +} + +// UniqueID returns a unique identifier. +func (k *Kernel) UniqueID() uint64 { + id := atomic.AddUint64(&k.uniqueID, 1) + if id == 0 { + panic("unique identifier generator wrapped around") + } + return id +} + +// CreateProcessArgs holds arguments to kernel.CreateProcess. +type CreateProcessArgs struct { + // Filename is the filename to load. + // + // If this is provided as "", then the file will be guessed via Argv[0]. + Filename string + + // Argvv is a list of arguments. + Argv []string + + // Envv is a list of environment variables. + Envv []string + + // WorkingDirectory is the initial working directory. + // + // This defaults to the root if empty. + WorkingDirectory string + + // Credentials is the initial credentials. + Credentials *auth.Credentials + + // FDMap is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDMap. + FDMap *FDMap + + // Umask is the initial umask. + Umask uint + + // Limits is the initial resource limits. + Limits *limits.LimitSet + + // MaxSymlinkTraversals is the maximum number of symlinks to follow + // during resolution. + MaxSymlinkTraversals uint + + // UTSNamespace is the initial UTS namespace. + UTSNamespace *UTSNamespace + + // IPCNamespace is the initial IPC namespace. + IPCNamespace *IPCNamespace +} + +// NewContext returns a context.Context that represents the task that will be +// created by args.NewContext(k). +func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { + return &createProcessContext{ + Logger: log.Log(), + k: k, + args: args, + } +} + +// createProcessContext is a context.Context that represents the context +// associated with a task that is being created. +type createProcessContext struct { + context.NoopSleeper + log.Logger + k *Kernel + args *CreateProcessArgs +} + +// Value implements context.Context.Value. +func (ctx *createProcessContext) Value(key interface{}) interface{} { + switch key { + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + // "The new task ... is in the root PID namespace." - + // Kernel.CreateProcess + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.args.UTSNamespace + case CtxIPCNamespace: + return ctx.args.IPCNamespace + case auth.CtxCredentials: + return ctx.args.Credentials + case fs.CtxRoot: + if ctx.k.mounts == nil { + return nil + } + return ctx.k.mounts.Root() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + return ctx.args.Limits + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + default: + return nil + } +} + +// CreateProcess creates a new task in a new thread group with the given +// options. The new task has no parent and is in the root PID namespace. 
+// +// If k.Start() has already been called, the created task will begin running +// immediately. Otherwise, it will be started when k.Start() is called. +// +// CreateProcess has no analogue in Linux; it is used to create the initial +// application task, as well as processes started by the control server. +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { + k.extMu.Lock() + defer k.extMu.Unlock() + log.Infof("EXEC: %v", args.Argv) + + if k.mounts == nil { + return nil, fmt.Errorf("no kernel MountNamespace") + } + + tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + ctx := args.NewContext(k) + + // Grab the root directory. + root := fs.RootFromContext(ctx) + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, args.MaxSymlinkTraversals) + if err != nil { + return nil, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + + if args.Filename == "" { + // Was anything provided? + if len(args.Argv) == 0 { + return nil, fmt.Errorf("no filename or command provided") + } + if !filepath.IsAbs(args.Argv[0]) { + return nil, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + } + args.Filename = args.Argv[0] + } + + // Create a fresh task context. + tc, err := k.LoadTaskImage(ctx, k.mounts, root, wd, args.MaxSymlinkTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + if err != nil { + return nil, err + } + tr := newTaskResources(args.FDMap, newFSContext(root, wd, args.Umask)) + // NewTask unconditionally takes ownership of tr, so we never have to call + // tr.release. + + // Create the task. + config := &TaskConfig{ + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + TaskResources: tr, + Credentials: args.Credentials, + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + } + t, err := k.tasks.NewTask(config) + if err != nil { + return nil, err + } + + // Success. + if k.started { + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) + } else if k.globalInit == nil { + k.globalInit = tg + } + return tg, nil +} + +// Start starts execution of all tasks in k. +// +// Preconditions: Start may be called exactly once. +func (k *Kernel) Start() error { + k.extMu.Lock() + defer k.extMu.Unlock() + + if k.globalInit == nil { + return fmt.Errorf("kernel contains no tasks") + } + if k.started { + return fmt.Errorf("kernel already started") + } + + k.started = true + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k}) + k.cpuClockTicker.Swap(ktime.Setting{ + Enabled: true, + Period: linux.ClockTick, + }) + // If k was created by LoadKernelFrom, timers were stopped during + // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, + // this is a no-op. + k.resumeTimeLocked() + // Start task goroutines. + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t, tid := range k.tasks.Root.tids { + t.Start(tid) + } + return nil +} + +// pauseTimeLocked pauses all Timers and Timekeeper updates. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) pauseTimeLocked() { + // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before + // Kernel.Start(). 
+ if k.cpuClockTicker != nil { + k.cpuClockTicker.Pause() + } + + // By precondition, nothing else can be interacting with PIDNamespace.tids + // or FDMap.files, so we can iterate them without synchronization. (We + // can't hold the TaskSet mutex when pausing thread group timers because + // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet + // mutex, while holding the Timer mutex.) + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.tm.pause() + } + // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. + if fdm := t.tr.FDMap; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + } + } + } + k.timekeeper.PauseUpdates() +} + +// resumeTimeLocked resumes all Timers and Timekeeper updates. If +// pauseTimeLocked has not been previously called, resumeTimeLocked has no +// effect. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) resumeTimeLocked() { + if k.cpuClockTicker != nil { + k.cpuClockTicker.Resume() + } + + k.timekeeper.ResumeUpdates() + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.tm.resume() + } + if fdm := t.tr.FDMap; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + } + } + } +} + +// WaitExited blocks until all tasks in k have exited. +func (k *Kernel) WaitExited() { + k.tasks.liveGoroutines.Wait() +} + +// Kill requests that all tasks in k immediately exit as if group exiting with +// status es. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(es ExitStatus) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.Kill(es) +} + +// Pause requests that all tasks in k temporarily stop executing, and blocks +// until all tasks in k have stopped. Multiple calls to Pause nest and require +// an equal number of calls to Unpause to resume execution. +func (k *Kernel) Pause() { + k.extMu.Lock() + k.tasks.BeginExternalStop() + k.extMu.Unlock() + k.tasks.runningGoroutines.Wait() +} + +// Unpause ends the effect of a previous call to Pause. If Unpause is called +// without a matching preceding call to Pause, Unpause may panic. +func (k *Kernel) Unpause() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.EndExternalStop() +} + +// SendExternalSignal injects a signal into the kernel. +// +// context is used only for debugging to describe how the signal was received. +// +// Returns false if signal could not be sent because the Kernel is not fully +// initialized yet. +func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.sendExternalSignal(info, context) +} + +// FeatureSet returns the FeatureSet. +func (k *Kernel) FeatureSet() *cpuid.FeatureSet { + return k.featureSet +} + +// Timekeeper returns the Timekeeper. +func (k *Kernel) Timekeeper() *Timekeeper { + return k.timekeeper +} + +// TaskSet returns the TaskSet. +func (k *Kernel) TaskSet() *TaskSet { + return k.tasks +} + +// RootUserNamespace returns the root UserNamespace. +func (k *Kernel) RootUserNamespace() *auth.UserNamespace { + return k.rootUserNamespace +} + +// RootUTSNamespace returns the root UTSNamespace. +func (k *Kernel) RootUTSNamespace() *UTSNamespace { + return k.rootUTSNamespace +} + +// RootIPCNamespace returns the root IPCNamespace. 
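Taken together, Init, SetRootMountNamespace, CreateProcess, Start, WaitExited, and ExitError define the Kernel's external lifecycle. The sketch below wires them together in the order their preconditions require; it is illustrative only — the boot package and bootAndRun function are hypothetical, and building InitKernelArgs, the MountNamespace, and CreateProcessArgs is assumed to happen elsewhere. The RootIPCNamespace accessor documented immediately above follows the sketch.

package boot // hypothetical package, for illustration only

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
)

// bootAndRun brings up k and runs a single initial process to completion.
func bootAndRun(k *kernel.Kernel, args kernel.InitKernelArgs, mounts *fs.MountNamespace, procArgs kernel.CreateProcessArgs) error {
	// Init requires FeatureSet, Timekeeper, RootUserNamespace and a nonzero
	// ApplicationCores; k.Platform must have been set beforehand.
	if err := k.Init(args); err != nil {
		return err
	}

	// CreateProcess fails until a root mount namespace is installed.
	k.SetRootMountNamespace(mounts)

	// The first successful CreateProcess becomes globalInit (ID 1 in the
	// root PID namespace).
	if _, err := k.CreateProcess(procArgs); err != nil {
		return err
	}

	// Start launches task goroutines for everything created so far; tasks
	// created afterwards start immediately.
	if err := k.Start(); err != nil {
		return err
	}

	// Block until every task has exited, then report any sandbox error.
	k.WaitExited()
	return k.ExitError()
}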
+func (k *Kernel) RootIPCNamespace() *IPCNamespace { + return k.rootIPCNamespace +} + +// RootMountNamespace returns the MountNamespace. +func (k *Kernel) RootMountNamespace() *fs.MountNamespace { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.mounts +} + +// SetRootMountNamespace sets the MountNamespace. +func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.mounts = mounts +} + +// NetworkStack returns the network stack. NetworkStack may return nil if no +// network stack is available. +func (k *Kernel) NetworkStack() inet.Stack { + return k.networkStack +} + +// GlobalInit returns the thread group with ID 1 in the root PID namespace, or +// nil if no such thread group exists. GlobalInit may return a thread group +// containing no tasks if the thread group has already exited. +func (k *Kernel) GlobalInit() *ThreadGroup { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.globalInit +} + +// ApplicationCores returns the number of CPUs visible to sandboxed +// applications. +func (k *Kernel) ApplicationCores() uint { + return k.applicationCores +} + +// RealtimeClock returns the application CLOCK_REALTIME clock. +func (k *Kernel) RealtimeClock() ktime.Clock { + return k.realtimeClock +} + +// MonotonicClock returns the application CLOCK_MONOTONIC clock. +func (k *Kernel) MonotonicClock() ktime.Clock { + return k.monotonicClock +} + +// CPUClockNow returns the current value of k.cpuClock. +func (k *Kernel) CPUClockNow() uint64 { + return atomic.LoadUint64(&k.cpuClock) +} + +// Syslog returns the syslog. +func (k *Kernel) Syslog() *syslog { + return &k.syslog +} + +// GenerateInotifyCookie generates a unique inotify event cookie. +// +// Returned values may overlap with previously returned values if the value +// space is exhausted. 0 is not a valid cookie value, all other values +// representable in a uint32 are allowed. +func (k *Kernel) GenerateInotifyCookie() uint32 { + id := atomic.AddUint32(&k.nextInotifyCookie, 1) + // Wrap-around is explicitly allowed for inotify event cookies. + if id == 0 { + id = atomic.AddUint32(&k.nextInotifyCookie, 1) + } + return id +} + +// NetlinkPorts returns the netlink port manager. +func (k *Kernel) NetlinkPorts() *port.Manager { + return k.netlinkPorts +} + +// ExitError returns the sandbox error that caused the kernel to exit. +func (k *Kernel) ExitError() error { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.exitErr +} + +// SetExitError sets the sandbox error that caused the kernel to exit, if one is +// not already set. +func (k *Kernel) SetExitError(err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.exitErr == nil { + k.exitErr = err + } +} + +// SupervisorContext returns a Context with maximum privileges in k. It should +// only be used by goroutines outside the control of the emulated kernel +// defined by e. +// +// Callers are responsible for ensuring that the returned Context is not used +// concurrently with changes to the Kernel. +func (k *Kernel) SupervisorContext() context.Context { + return supervisorContext{ + Logger: log.Log(), + k: k, + } +} + +type supervisorContext struct { + context.NoopSleeper + log.Logger + k *Kernel +} + +// Value implements context.Context. +func (ctx supervisorContext) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + // The supervisor context can trace anything. (None of + // supervisorContext's users are expected to invoke ptrace, but ptrace + // permissions are required for certain file accesses.) 
+ return func(*Task, bool) bool { return true } + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.k.rootUTSNamespace + case CtxIPCNamespace: + return ctx.k.rootIPCNamespace + case auth.CtxCredentials: + // The supervisor context is global root. + return auth.NewRootCredentials(ctx.k.rootUserNamespace) + case fs.CtxRoot: + return ctx.k.mounts.Root() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + // No limits apply. + return limits.NewLimitSet() + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + default: + return nil + } +} + +type kernelCPUClockListener struct { + k *Kernel +} + +// Notify implements ktime.TimerListener.Notify. +func (l kernelCPUClockListener) Notify(exp uint64) { + atomic.AddUint64(&l.k.cpuClock, exp) +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l kernelCPUClockListener) Destroy() { +} diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD new file mode 100644 index 000000000..c7779e1d5 --- /dev/null +++ b/pkg/sentry/kernel/memevent/BUILD @@ -0,0 +1,31 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "memevent", + srcs = ["memory_events.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent", + visibility = ["//:sandbox"], + deps = [ + ":memory_events_go_proto", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/sentry/kernel", + "//pkg/sentry/usage", + ], +) + +proto_library( + name = "memory_events_proto", + srcs = ["memory_events.proto"], + visibility = ["//visibility:public"], +) + +go_proto_library( + name = "memory_events_go_proto", + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto", + proto = ":memory_events_proto", + visibility = ["//visibility:public"], +) diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go new file mode 100644 index 000000000..ecc9151de --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memevent implements the memory usage events controller, which +// periodically emits events via the eventchannel. +package memevent + +import ( + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + pb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// MemoryEvents describes the configuration for the global memory event emitter. 
+type MemoryEvents struct { + k *kernel.Kernel + + // The period is how often to emit an event. The memory events goroutine + // will ensure a minimum of one event is emitted per this period, regardless + // how of much memory usage has changed. + period time.Duration + + // Writing to this channel indicates the memory goroutine should stop. + stop chan struct{} + + // done is used to signal when the memory event goroutine has exited. + done sync.WaitGroup +} + +// New creates a new MemoryEvents. +func New(k *kernel.Kernel, period time.Duration) *MemoryEvents { + return &MemoryEvents{ + k: k, + period: period, + stop: make(chan struct{}), + } +} + +// Stop stops the memory usage events emitter goroutine. Stop must not be called +// concurrently with Start and may only be called once. +func (m *MemoryEvents) Stop() { + close(m.stop) + m.done.Wait() +} + +// Start starts the memory usage events emitter goroutine. Start must not be +// called concurrently with Stop and may only be called once. +func (m *MemoryEvents) Start() { + if m.period == 0 { + return + } + go m.run() // S/R-SAFE: doesn't interact with saved state. +} + +func (m *MemoryEvents) run() { + m.done.Add(1) + + ticker := time.NewTicker(m.period) + defer ticker.Stop() + + for { + select { + case <-m.stop: + m.done.Done() + return + case <-ticker.C: + m.emit() + } + } +} + +func (m *MemoryEvents) emit() { + totalPlatform, err := m.k.Platform.Memory().TotalUsage() + if err != nil { + log.Warningf("Failed to fetch memory usage for memory events: %v", err) + return + } + snapshot, _ := usage.MemoryAccounting.Copy() + total := totalPlatform + snapshot.Mapped + + eventchannel.Emit(&pb.MemoryUsageEvent{Total: total}) +} diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto new file mode 100644 index 000000000..e6e0bd628 --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +// MemoryUsageEvent describes the memory usage of the sandbox at a single +// instant in time. These messages are emitted periodically on the eventchannel. +message MemoryUsageEvent { + // The total memory usage of the sandboxed application in bytes, calculated + // using the 'fast' method. + uint64 total = 1; +} diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go new file mode 100644 index 000000000..d8701f47a --- /dev/null +++ b/pkg/sentry/kernel/pending_signals.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + // stdSignalCap is the maximum number of instances of a given standard + // signal that may be pending. ("[If] multiple instances of a standard + // signal are delivered while that signal is currently blocked, then only + // one instance is queued.") - signal(7) + stdSignalCap = 1 + + // rtSignalCap is the maximum number of instances of a given realtime + // signal that may be pending. + // + // TODO: In Linux, the minimum signal queue size is + // RLIMIT_SIGPENDING, which is by default max_threads/2. + rtSignalCap = 32 +) + +// pendingSignals holds a collection of pending signals. The zero value of +// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; +// users must provide synchronization. +type pendingSignals struct { + // signals contains all pending signals. + // + // Note that signals is zero-indexed, but signal 1 is the first valid + // signal, so signals[0] contains signals with signo 1 etc. This offset is + // usually handled by using Signal.index(). + signals [linux.SignalMaximum]pendingSignalQueue + + // Bit i of pendingSet is set iff there is at least one signal with signo + // i+1 pending. + pendingSet linux.SignalSet +} + +// pendingSignalQueue holds a pendingSignalList for a single signal number. +type pendingSignalQueue struct { + pendingSignalList + length int +} + +type pendingSignal struct { + // pendingSignalEntry links into a pendingSignalList. + pendingSignalEntry + *arch.SignalInfo +} + +// enqueue enqueues the given signal. enqueue returns true on success and false +// on failure (if the given signal's queue is full). +// +// Preconditions: info represents a valid signal. +func (p *pendingSignals) enqueue(info *arch.SignalInfo) bool { + sig := linux.Signal(info.Signo) + q := &p.signals[sig.Index()] + if sig.IsStandard() { + if q.length >= stdSignalCap { + return false + } + } else if q.length >= rtSignalCap { + return false + } + q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info}) + q.length++ + p.pendingSet |= linux.SignalSetOf(sig) + return true +} + +// dequeue dequeues and returns any pending signal not masked by mask. If no +// unmasked signals are pending, dequeue returns nil. +func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { + // "Real-time signals are delivered in a guaranteed order. Multiple + // real-time signals of the same type are delivered in the order they were + // sent. If different real-time signals are sent to a process, they are + // delivered starting with the lowest-numbered signal. (I.e., low-numbered + // signals have highest priority.) By contrast, if multiple standard + // signals are pending for a process, the order in which they are delivered + // is unspecified. If both standard and real-time signals are pending for a + // process, POSIX leaves it unspecified which is delivered first. 
Linux, + // like many other implementations, gives priority to standard signals in + // this case." - signal(7) + lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask)) + if lowestPendingUnblockedBit >= linux.SignalMaximum { + return nil + } + return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) +} + +func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { + q := &p.signals[sig.Index()] + ps := q.pendingSignalList.Front() + if ps == nil { + return nil + } + q.pendingSignalList.Remove(ps) + q.length-- + if q.length == 0 { + p.pendingSet &^= linux.SignalSetOf(sig) + } + return ps.SignalInfo +} + +// discardSpecific causes all pending signals with number sig to be discarded. +func (p *pendingSignals) discardSpecific(sig linux.Signal) { + q := &p.signals[sig.Index()] + q.pendingSignalList.Reset() + q.length = 0 + p.pendingSet &^= linux.SignalSetOf(sig) +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD new file mode 100644 index 000000000..ca9825f9d --- /dev/null +++ b/pkg/sentry/kernel/pipe/BUILD @@ -0,0 +1,68 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "pipe_state", + srcs = [ + "buffers.go", + "node.go", + "pipe.go", + "reader.go", + "reader_writer.go", + "writer.go", + ], + out = "pipe_state.go", + package = "pipe", +) + +go_library( + name = "pipe", + srcs = [ + "buffers.go", + "device.go", + "node.go", + "pipe.go", + "pipe_state.go", + "reader.go", + "reader_writer.go", + "writer.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/ilist", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) + +go_test( + name = "pipe_test", + size = "small", + srcs = [ + "node_test.go", + "pipe_test.go", + ], + embed = [":pipe"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/pipe/buffers.go b/pkg/sentry/kernel/pipe/buffers.go new file mode 100644 index 000000000..f300537c5 --- /dev/null +++ b/pkg/sentry/kernel/pipe/buffers.go @@ -0,0 +1,50 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/ilist" +) + +// Buffer encapsulates a queueable byte buffer that can +// easily be truncated. It is designed only for use with pipes. +type Buffer struct { + ilist.Entry + data []byte +} + +// newBuffer initializes a Buffer. 
+func newBuffer(buf []byte) *Buffer { + return &Buffer{data: buf} +} + +// bytes returns the bytes contained in the buffer. +func (b *Buffer) bytes() []byte { + return b.data +} + +// size returns the number of bytes contained in the buffer. +func (b *Buffer) size() int { + return len(b.data) +} + +// truncate removes the first n bytes from the buffer. +func (b *Buffer) truncate(n int) int { + if n > len(b.data) { + panic("Trying to truncate past end of array.") + } + b.data = b.data[n:] + return len(b.data) +} diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go new file mode 100644 index 000000000..8d383577a --- /dev/null +++ b/pkg/sentry/kernel/pipe/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// pipeDevice is used for all pipe files. +var pipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go new file mode 100644 index 000000000..5b47427ef --- /dev/null +++ b/pkg/sentry/kernel/pipe/node.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/amutex" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations wraps fs.InodeOperations operations with common pipe opening semantics. +type inodeOperations struct { + fs.InodeOperations + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // p is the underlying Pipe object representing this fifo. + p *Pipe + + // Channels for synchronizing the creation of new readers and writers of + // this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on save, + // and either automatically restart (via ERESTARTSYS) or return EINTR on + // resume. On restarts via ERESTARTSYS, the appropriate channel will be + // recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +// NewInodeOperations creates a new pipe fs.InodeOperations. +func NewInodeOperations(base fs.InodeOperations, p *Pipe) fs.InodeOperations { + return &inodeOperations{ + InodeOperations: base, + p: p, + } +} + +// GetFile implements fs.InodeOperations.GetFile. 
Named pipes have special blocking
+// semantics during open:
+//
+// "Normally, opening the FIFO blocks until the other end is opened also. A
+// process can open a FIFO in nonblocking mode. In this case, opening for
+// read-only will succeed even if no-one has opened on the write side yet,
+// opening for write-only will fail with ENXIO (no such device or address)
+// unless the other end has already been opened. Under Linux, opening a FIFO
+// for read and write will succeed both in blocking and nonblocking mode. POSIX
+// leaves this behavior undefined. This can be used to open a FIFO for writing
+// while there are no readers available." - fifo(7)
+func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	switch {
+	case flags.Read && !flags.Write: // O_RDONLY.
+		r := i.p.ROpen(ctx)
+		i.newHandleLocked(&i.rWakeup)
+
+		if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
+			if !i.waitFor(&i.wWakeup, ctx) {
+				r.DecRef()
+				return nil, syserror.ErrInterrupted
+			}
+		}
+
+		// By now, either we're doing a nonblocking open or we have a writer. On
+		// a nonblocking read-only open, the open succeeds even if no-one has
+		// opened the write side yet.
+		return r, nil
+
+	case flags.Write && !flags.Read: // O_WRONLY.
+		w := i.p.WOpen(ctx)
+		i.newHandleLocked(&i.wWakeup)
+
+		if i.p.isNamed && !i.p.HasReaders() {
+			// On a nonblocking, write-only open, the open fails with ENXIO if the
+			// read side isn't open yet.
+			if flags.NonBlocking {
+				w.DecRef()
+				return nil, syserror.ENXIO
+			}
+
+			if !i.waitFor(&i.rWakeup, ctx) {
+				w.DecRef()
+				return nil, syserror.ErrInterrupted
+			}
+		}
+		return w, nil
+
+	case flags.Read && flags.Write: // O_RDWR.
+		// Pipes opened for read-write always succeed without blocking.
+		rw := i.p.RWOpen(ctx)
+		i.newHandleLocked(&i.rWakeup)
+		i.newHandleLocked(&i.wWakeup)
+		return rw, nil
+
+	default:
+		return nil, syserror.EINVAL
+	}
+}
+
+// waitFor blocks until a new reader or writer of the underlying pipe is
+// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this
+// function will block for either readers or writers, depending on where
+// 'wakeupChan' points.
+//
+// i.mu must be held by the caller. waitFor returns with i.mu held, but it will
+// drop i.mu before blocking for any readers or writers.
+func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool {
+	// Ideally this function would simply use a condition variable. However, the
+	// wait needs to be interruptible via 'sleeper', so we must synchronize via a
+	// channel. The synchronization below relies on the fact that closing a
+	// channel unblocks all receives on the channel.
+
+	// Does an appropriate wakeup channel already exist? If not, create a new
+	// one. This is all done under i.mu to avoid races.
+	if *wakeupChan == nil {
+		*wakeupChan = make(chan struct{})
+	}
+
+	// Grab a local reference to the wakeup channel since it may disappear as
+	// soon as we drop i.mu.
+	wakeup := *wakeupChan
+
+	// Drop the lock and prepare to sleep.
+	i.mu.Unlock()
+	cancel := sleeper.SleepStart()
+
+	// Wait for either a new reader/writer to be signalled via 'wakeup', or
+	// for the sleep to be cancelled.
+	select {
+	case <-wakeup:
+		sleeper.SleepFinish(true)
+	case <-cancel:
+		sleeper.SleepFinish(false)
+	}
+
+	// Take the lock and check if we were woken. If we were woken and
+	// interrupted, the former takes priority.
+ i.mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the channel to be opened, see Fifo.waitFor. +// +// i.mu must be held. +func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go new file mode 100644 index 000000000..cc1ebf4f6 --- /dev/null +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -0,0 +1,308 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type sleeper struct { + context.Context + ch chan struct{} +} + +func newSleeperContext(t *testing.T) context.Context { + return &sleeper{ + Context: contexttest.Context(t), + ch: make(chan struct{}), + } +} + +func (s *sleeper) SleepStart() <-chan struct{} { + return s.ch +} + +func (s *sleeper) SleepFinish(bool) { +} + +func (s *sleeper) Cancel() { + s.ch <- struct{}{} +} + +type openResult struct { + *fs.File + error +} + +func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { + file, err := n.GetFile(ctx, nil, flags) + if err != nil { + t.Fatalf("open with flags %+v failed: %v", flags, err) + } + if doneChan != nil { + doneChan <- struct{}{} + } + return file, err +} + +func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) { + file, err := n.GetFile(ctx, nil, flags) + if resChan != nil { + resChan <- openResult{file, err} + } + return file, err +} + +func newNamedPipe(t *testing.T) *Pipe { + return NewPipe(contexttest.Context(t), true, DefaultPipeSize, usermem.PageSize) +} + +func newAnonPipe(t *testing.T) *Pipe { + return NewPipe(contexttest.Context(t), false, DefaultPipeSize, usermem.PageSize) +} + +// assertRecvBlocks ensures that a recv attempt on c blocks for at least +// blockDuration. This is useful for checking that a goroutine that is supposed +// to be executing a blocking operation is actually blocking. +func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Duration, failMsg string) { + select { + case <-c: + t.Fatalf(failMsg) + case <-time.After(blockDuration): + // Ok, blocked for the required duration. 
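+		// Note that this check is necessarily heuristic: it only shows that
+		// the receive has not completed within blockDuration, not that it
+		// would block indefinitely.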
+ } +} + +func TestReadOpenBlocksForWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Verify that the open for read is blocking. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "open for read not blocking with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone +} + +func TestWriteOpenBlocksForReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Verify that the open for write is blocking + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write not blocking with no readers") + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone1 := make(chan struct{}) + rDone2 := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone1) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone2) + + assertRecvBlocks(t, rDone1, time.Millisecond*100, + "open for read didn't block with no writers") + assertRecvBlocks(t, rDone2, time.Millisecond*100, + "open for read didn't block with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone2 + <-rDone1 +} + +func TestClosedReaderBlocksWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) + rFile.DecRef() + + wDone := make(chan struct{}) + // This open for write should block because the reader is now gone. + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write didn't block with no concurrent readers") + + // Open for read again. This should unblock the open for write. + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestReadWriteOpenNeverBlocks(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rwDone := make(chan struct{}) + // Open for read-write never wait for a reader or writer, even if the + // nonblocking flag is not set. 
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true, NonBlocking: false}, rwDone) + <-rwDone +} + +func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-rDone +} + +func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-wDone +} + +func TestBlockedOpenIsCancellable(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + done := make(chan openResult) + go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done) + select { + case <-done: + t.Fatalf("open for read didn't block with no writers") + case <-time.After(time.Millisecond * 100): + // Ok. + } + + ctx.(*sleeper).Cancel() + // If the cancel on the sleeper didn't work, the open for read would never + // return. + res := <-done + if res.error != syserror.ErrInterrupted { + t.Fatalf("Cancellation didn't cause GetFile to return fs.ErrInterrupted, got %v.", + res.error) + } +} + +func TestNonblockingReadOpenNoWriters(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } +} + +func TestNonblockingWriteOpenNoReaders(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { + t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) + } +} + +func TestNonBlockingReadOpenWithWriter(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Open for write blocks since there are no readers yet. + assertRecvBlocks(t, wDone, time.Millisecond*100, + "Open for write didn't block with no reader.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } + + // Open for write should now be unblocked. + <-wDone +} + +func TestNonBlockingWriteOpenWithReader(t *testing.T) { + f := NewInodeOperations(nil, newNamedPipe(t)) + ctx := newSleeperContext(t) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Open for write blocked, since no reader yet. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "Open for reader didn't block with no writer.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for write failed with error %v.", err) + } + + // Open for write should now be unblocked. 
+ <-rDone +} + +func TestAnonReadOpen(t *testing.T) { + f := NewInodeOperations(nil, newAnonPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil { + t.Fatalf("open anon pipe for read failed: %v", err) + } +} + +func TestAnonWriteOpen(t *testing.T) { + f := NewInodeOperations(nil, newAnonPipe(t)) + ctx := newSleeperContext(t) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true}, nil); err != nil { + t.Fatalf("open anon pipe for write failed: %v", err) + } +} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go new file mode 100644 index 000000000..1656c6ff3 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -0,0 +1,335 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipe provides an in-memory implementation of a unidirectional +// pipe. +// +// The goal of this pipe is to emulate the pipe syscall in all of its +// edge cases and guarantees of atomic IO. +package pipe + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// DefaultPipeSize is the system-wide default size of a pipe in bytes. +const DefaultPipeSize = 65536 + +// Pipe is an encapsulation of a platform-independent pipe. +// It manages a buffered byte queue shared between a reader/writer +// pair. +type Pipe struct { + waiter.Queue `state:"nosave"` + + // Whether this is a named or anonymous pipe. + isNamed bool + + // The dirent backing this pipe. Shared by all readers and writers. + dirent *fs.Dirent + + // The buffered byte queue. + data ilist.List + + // Max size of the pipe in bytes. When this max has been reached, + // writers will get EWOULDBLOCK. + max int + + // Current size of the pipe in bytes. + size int + + // Max number of bytes the pipe can guarantee to read or write + // atomically. + atomicIOBytes int + + // The number of active readers for this pipe. Load/store atomically. + readers int32 + + // The number of active writes for this pipe. Load/store atomically. + writers int32 + + // This flag indicates if this pipe ever had a writer. Note that this does + // not necessarily indicate there is *currently* a writer, just that there + // has been a writer at some point since the pipe was created. + // + // Protected by mu. + hadWriter bool + + // Lock protecting all pipe internal state. + mu sync.Mutex `state:"nosave"` +} + +// NewPipe initializes and returns a pipe. A pipe created by this function is +// persistent, and will remain valid even without any open fds to it. Named +// pipes for mknod(2) are created via this function. 
Note that the +// implementation of blocking semantics for opening the read and write ends of a +// named pipe are left to filesystems. +func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int) *Pipe { + p := &Pipe{ + isNamed: isNamed, + max: sizeBytes, + atomicIOBytes: atomicIOBytes, + } + + // Build the fs.Dirent of this pipe, shared by all fs.Files associated + // with this pipe. + ino := pipeDevice.NextIno() + base := fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ + FSType: linux.PIPEFS_MAGIC, + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }) + sattr := fs.StableAttr{ + Type: fs.Pipe, + DeviceID: pipeDevice.DeviceID(), + InodeID: ino, + BlockSize: int64(atomicIOBytes), + } + // There is no real filesystem backing this pipe, so we pass in a nil + // Filesystem. + sb := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + p.dirent = fs.NewDirent(fs.NewInode(NewInodeOperations(base, p), sb, sattr), fmt.Sprintf("pipe:[%d]", ino)) + + return p +} + +// NewConnectedPipe initializes a pipe and returns a pair of objects (which +// implement kio.File) representing the read and write ends of the pipe. A pipe +// created by this function becomes invalid as soon as either the read or write +// end is closed, and errors on subsequent operations on either end. Pipes +// for pipe(2) and pipe2(2) are generally created this way. +func NewConnectedPipe(ctx context.Context, sizeBytes int, atomicIOBytes int) (*fs.File, *fs.File) { + p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) + return p.ROpen(ctx), p.WOpen(ctx) +} + +// ROpen opens the pipe for reading. +func (p *Pipe) ROpen(ctx context.Context) *fs.File { + p.rOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true}, &Reader{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) +} + +// WOpen opens the pipe for writing. +func (p *Pipe) WOpen(ctx context.Context) *fs.File { + p.wOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Write: true}, &Writer{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) +} + +// RWOpen opens the pipe for both reading and writing. +func (p *Pipe) RWOpen(ctx context.Context) *fs.File { + p.rOpen() + p.wOpen() + return fs.NewFile(ctx, p.dirent, fs.FileFlags{Read: true, Write: true}, &ReaderWriter{ + Pipe: p, + }) +} + +// read reads data from the pipe into dst and returns the number of bytes +// read, or returns ErrWouldBlock if the pipe is empty. +func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + if !p.HasReaders() { + return 0, syscall.EBADF + } + + // Don't block for a zero-length read even if the pipe is empty. + if dst.NumBytes() == 0 { + return 0, nil + } + + p.mu.Lock() + defer p.mu.Unlock() + // If there is nothing to read at the moment but there is a writer, tell the + // caller to block. + if p.size == 0 { + if !p.HasWriters() { + // There are no writers, return EOF. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + var n int64 + for b := p.data.Front(); b != nil; b = p.data.Front() { + buffer := b.(*Buffer) + n0, err := dst.CopyOut(ctx, buffer.bytes()) + n += int64(n0) + p.size -= n0 + if buffer.truncate(n0) == 0 { + p.data.Remove(b) + } + dst = dst.DropFirst(n0) + if dst.NumBytes() == 0 || err != nil { + return n, err + } + } + return n, nil +} + +// write writes data from sv into the pipe and returns the number of bytes +// written. 
If no bytes are written because the pipe is full (or has less than +// atomicIOBytes free capacity), write returns ErrWouldBlock. +func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + + if !p.HasWriters() { + return 0, syscall.EBADF + } + if !p.HasReaders() { + return 0, syscall.EPIPE + } + + // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be + // atomic, but requires no atomicity for writes larger than this. However, + // Linux appears to provide stronger semantics than this in practice: + // unmerged writes are done one PAGE_SIZE buffer at a time, so for larger + // writes, the writing of each PIPE_BUF-sized chunk is atomic. We implement + // this by writing at most atomicIOBytes at a time if we can't service the + // write in its entirety. + canWrite := src.NumBytes() + if canWrite > int64(p.max-p.size) { + if p.max-p.size >= p.atomicIOBytes { + canWrite = int64(p.atomicIOBytes) + } else { + return 0, syserror.ErrWouldBlock + } + } + + // Copy data from user memory into a pipe-owned buffer. + buf := make([]byte, canWrite) + n, err := src.CopyIn(ctx, buf) + if n > 0 { + p.data.PushBack(newBuffer(buf[:n])) + p.size += n + } + if int64(n) < src.NumBytes() && err == nil { + // Partial write due to full pipe. + err = syserror.ErrWouldBlock + } + return int64(n), err +} + +// rOpen signals a new reader of the pipe. +func (p *Pipe) rOpen() { + atomic.AddInt32(&p.readers, 1) +} + +// wOpen signals a new writer of the pipe. +func (p *Pipe) wOpen() { + p.mu.Lock() + defer p.mu.Unlock() + p.hadWriter = true + atomic.AddInt32(&p.writers, 1) +} + +// rClose signals that a reader has closed their end of the pipe. +func (p *Pipe) rClose() { + newReaders := atomic.AddInt32(&p.readers, -1) + if newReaders < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders)) + } +} + +// wClose signals that a writer has closed their end of the pipe. +func (p *Pipe) wClose() { + newWriters := atomic.AddInt32(&p.writers, -1) + if newWriters < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters)) + } +} + +// HasReaders returns whether the pipe has any active readers. +func (p *Pipe) HasReaders() bool { + return atomic.LoadInt32(&p.readers) > 0 +} + +// HasWriters returns whether the pipe has any active writers. +func (p *Pipe) HasWriters() bool { + return atomic.LoadInt32(&p.writers) > 0 +} + +func (p *Pipe) rReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasReaders() && p.data.Front() != nil { + ready |= waiter.EventIn + } + if !p.HasWriters() && p.hadWriter { + // POLLHUP must be supressed until the pipe has had at least one writer + // at some point. Otherwise a reader thread may poll and immediately get + // a POLLHUP before the writer ever opens the pipe, which the reader may + // interpret as the writer opening then closing the pipe. + ready |= waiter.EventHUp + } + return ready +} + +// rReadiness returns a mask that states whether the read end of the pipe is +// ready for reading. +func (p *Pipe) rReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() +} + +func (p *Pipe) wReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasWriters() && p.size < p.max { + ready |= waiter.EventOut + } + if !p.HasReaders() { + ready |= waiter.EventErr + } + return ready +} + +// wReadiness returns a mask that states whether the write end of the pipe +// is ready for writing. 
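+//
+// As a sketch of a typical caller (hypothetical, but mirroring
+// Writer.Readiness in writer.go), a poller interested only in writability
+// masks the result:
+//
+//	if p.wReadiness()&waiter.EventOut != 0 {
+//		// The pipe currently reports itself writable.
+//	}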
+func (p *Pipe) wReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.wReadinessLocked() +} + +// rwReadiness returns a mask that states whether a read-write handle to the +// pipe is ready for IO. +func (p *Pipe) rwReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() | p.wReadinessLocked() +} + +func (p *Pipe) queuedSize() int { + p.mu.Lock() + defer p.mu.Unlock() + return p.size +} diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go new file mode 100644 index 000000000..49ef8c8ac --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -0,0 +1,138 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "bytes" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestPipeRW(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + wantN := int64(len(msg)) + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if n != wantN || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (%d, nil)", n, err, wantN) + } + + buf := make([]byte, len(msg)) + n, err = r.Readv(ctx, usermem.BytesIOSequence(buf)) + if n != wantN || err != nil || !bytes.Equal(buf, msg) { + t.Fatalf("Readv: got (%d, %v) %q, wanted (%d, nil) %q", n, err, buf, wantN, msg) + } +} + +func TestPipeReadBlock(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) + if n != 0 || err != syserror.ErrWouldBlock { + t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock) + } +} + +func TestPipeWriteBlock(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 10, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if wantN, wantErr := int64(atomicIOBytes), syserror.ErrWouldBlock; n != wantN || err != wantErr { + t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) + } +} + +func TestPipeWriteUntilEnd(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + + wDone := make(chan struct{}, 0) + rDone := make(chan struct{}, 0) + defer func() { + // Signal the reader to stop and wait until it does so. + close(wDone) + <-rDone + }() + + go func() { + defer close(rDone) + // Read from r until done is closed. 
+ ctx := contexttest.Context(t) + buf := make([]byte, len(msg)+1) + dst := usermem.BytesIOSequence(buf) + e, ch := waiter.NewChannelEntry(nil) + r.EventRegister(&e, waiter.EventIn) + defer r.EventUnregister(&e) + for { + n, err := r.Readv(ctx, dst) + dst = dst.DropFirst64(n) + if err == syserror.ErrWouldBlock { + select { + case <-ch: + continue + case <-wDone: + // We expect to have 1 byte left in dst since len(buf) == + // len(msg)+1. + if dst.NumBytes() != 1 || !bytes.Equal(buf[:len(msg)], msg) { + t.Errorf("Reader: got %q (%d bytes remaining), wanted %q", buf, dst.NumBytes(), msg) + } + return + } + } + if err != nil { + t.Fatalf("Readv: got unexpected error %v", err) + } + } + }() + + src := usermem.BytesIOSequence(msg) + e, ch := waiter.NewChannelEntry(nil) + w.EventRegister(&e, waiter.EventOut) + defer w.EventUnregister(&e) + for src.NumBytes() != 0 { + n, err := w.Writev(ctx, src) + src = src.DropFirst64(n) + if err == syserror.ErrWouldBlock { + <-ch + continue + } + if err != nil { + t.Fatalf("Writev: got (%d, %v)", n, err) + } + } +} diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go new file mode 100644 index 000000000..40d5e4943 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Reader satisfies the fs.FileOperations interface for read-only pipes. +// Reader should be used with !fs.FileFlags.Write to reject writes. +type Reader struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +func (r *Reader) Release() { + r.Pipe.rClose() + // Wake up writers. + r.Pipe.Notify(waiter.EventOut) +} + +// Readiness returns the ready events in the underlying pipe. +func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask { + return r.Pipe.rReadiness() & mask +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go new file mode 100644 index 000000000..dc642a3a6 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -0,0 +1,91 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
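+
+// The split between Reader, Writer, and ReaderWriter mirrors the three ways a
+// pipe end can be opened. As a rough end-to-end sketch (hypothetical usage
+// modeled on TestPipeRW in pipe_test.go; ctx is any valid context.Context),
+// the connected pair returned by NewConnectedPipe is driven through the
+// fs.File layer like so:
+//
+//	r, w := NewConnectedPipe(ctx, DefaultPipeSize, usermem.PageSize)
+//	defer r.DecRef()
+//	defer w.DecRef()
+//	if _, err := w.Writev(ctx, usermem.BytesIOSequence([]byte("hi"))); err != nil {
+//		// handle error
+//	}
+//	buf := make([]byte, 2)
+//	if _, err := r.Readv(ctx, usermem.BytesIOSequence(buf)); err != nil {
+//		// handle error
+//	}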
+ +package pipe + +import ( + "fmt" + "math" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// ReaderWriter satisfies the FileOperations interface and services both +// read and write requests. This should only be used directly for named pipes. +// pipe(2) and pipe2(2) only support unidirectional pipes and should use +// either pipe.Reader or pipe.Writer. +type ReaderWriter struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + *Pipe +} + +// Release implements fs.FileOperations.Release. +func (rw *ReaderWriter) Release() { + rw.Pipe.rClose() + rw.Pipe.wClose() + // Wake up readers and writers. + rw.Pipe.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read implements fs.FileOperations.Read. +func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.read(ctx, dst) + if n > 0 { + rw.Pipe.Notify(waiter.EventOut) + } + return n, err +} + +// Write implements fs.FileOperations.Write. +func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.write(ctx, src) + if n > 0 { + rw.Pipe.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. +func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { + return rw.Pipe.rwReadiness() & mask +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case syscall.TIOCINQ: + v := rw.queuedSize() + if v > math.MaxInt32 { + panic(fmt.Sprintf("Impossibly large pipe queued size: %d", v)) + } + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go new file mode 100644 index 000000000..fd13008ac --- /dev/null +++ b/pkg/sentry/kernel/pipe/writer.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Writer satisfies the fs.FileOperations interface for write-only pipes. +// Writer should be used with !fs.FileFlags.Read to reject reads. +type Writer struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +func (w *Writer) Release() { + w.Pipe.wClose() + // Wake up readers. 
+ w.Pipe.Notify(waiter.EventHUp) +} + +// Readiness returns the ready events in the underlying pipe. +func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask { + return w.Pipe.wReadiness() & mask +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go new file mode 100644 index 000000000..20b1c4cd4 --- /dev/null +++ b/pkg/sentry/kernel/ptrace.go @@ -0,0 +1,1054 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptrace constants from Linux's include/uapi/linux/ptrace.h. +const ( + _PTRACE_EVENT_SECCOMP = 7 + PTRACE_SEIZE = 0x4206 + PTRACE_INTERRUPT = 0x4207 + PTRACE_LISTEN = 0x4208 + PTRACE_PEEKSIGINFO = 0x4209 + PTRACE_GETSIGMASK = 0x420a + PTRACE_SETSIGMASK = 0x420b + _PTRACE_O_EXITKILL = 1 << 20 + _PTRACE_O_TRACESECCOMP = 1 << _PTRACE_EVENT_SECCOMP +) + +// ptraceOptions are the subset of options controlling a task's ptrace behavior +// that are set by ptrace(PTRACE_SETOPTIONS). +type ptraceOptions struct { + // ExitKill is true if the tracee should be sent SIGKILL when the tracer + // exits. + ExitKill bool + + // If SysGood is true, set bit 7 in the signal number for + // syscall-entry-stop and syscall-exit-stop traps delivered to this task's + // tracer. + SysGood bool + + // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE + // events. + TraceClone bool + + // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC + // events. + TraceExec bool + + // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT + // events. + TraceExit bool + + // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK + // events. + TraceFork bool + + // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP + // events. + TraceSeccomp bool + + // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK + // events. + TraceVfork bool + + // TraceVforkDone is true if the tracer wants to receive + // PTRACE_EVENT_VFORK_DONE events. + TraceVforkDone bool +} + +// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry +// and exit. +type ptraceSyscallMode int + +const ( + // ptraceSyscallNone indicates that the task has never ptrace-stopped, or + // that it was resumed from its last ptrace-stop by PTRACE_CONT or + // PTRACE_DETACH. The task's syscalls will not be intercepted. + ptraceSyscallNone ptraceSyscallMode = iota + + // ptraceSyscallIntercept indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a + // syscall, a ptrace-stop will occur. + ptraceSyscallIntercept + + // ptraceSyscallEmu indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. 
The next time + // the task enters a syscall, the syscall will be skipped, and a + // ptrace-stop will occur. + ptraceSyscallEmu +) + +// CanTrace checks that t is permitted to access target's state, as defined by +// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it +// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access +// mode PTRACE_MODE_READ. +func (t *Task) CanTrace(target *Task, attach bool) bool { + // "1. If the calling thread and the target thread are in the same thread + // group, access is always allowed." - ptrace(2) + // + // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() + // should not deny sub-threads", first released in Linux 3.12), the rule + // only applies if t and target are the same task. But, as that commit + // message puts it, "[any] security check is pointless when the tasks share + // the same ->mm." + if t.tg == target.tg { + return true + } + + // """ + // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, + // doesn't exist until Linux 4.5). + // + // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the + // caller's real UID and GID for the checks in the next step. (Most APIs + // that check the caller's UID and GID use the effective IDs. For + // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs + // instead.) + // + // 3. Deny access if neither of the following is true: + // + // - The real, effective, and saved-set user IDs of the target match the + // caller's user ID, *and* the real, effective, and saved-set group IDs of + // the target match the caller's group ID. + // + // - The caller has the CAP_SYS_PTRACE capability in the user namespace of + // the target. + // + // 4. Deny access if the target process "dumpable" attribute has a value + // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in + // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in + // the user namespace of the target process. + // + // 5. The kernel LSM security_ptrace_access_check() interface is invoked to + // see if ptrace access is permitted. The results depend on the LSM(s). The + // implementation of this interface in the commoncap LSM performs the + // following steps: + // + // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the + // caller's effective capability set; otherwise (the access mode specifies + // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. + // + // b) Deny access if neither of the following is true: + // + // - The caller and the target process are in the same user namespace, and + // the caller's capabilities are a proper superset of the target process's + // permitted capabilities. + // + // - The caller has the CAP_SYS_PTRACE capability in the target process's + // user namespace. + // + // Note that the commoncap LSM does not distinguish between + // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this + // section: "the commoncap LSM ... is always invoked".) 
+ // """ + callerCreds := t.Credentials() + targetCreds := target.Credentials() + if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { + return true + } + if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { + return false + } + if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { + return false + } + // TODO: dumpability check + if callerCreds.UserNamespace != targetCreds.UserNamespace { + return false + } + if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { + return false + } + // TODO: Yama LSM + return true +} + +// Tracer returns t's ptrace Tracer. +func (t *Task) Tracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +// hasTracer returns true if t has a ptrace tracer attached. +func (t *Task) hasTracer() bool { + // This isn't just inlined into callers so that if Task.Tracer() turns out + // to be too expensive because of e.g. interface conversion, we can switch + // to having a separate atomic flag more easily. + return t.Tracer() != nil +} + +// ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +type ptraceStop struct { + // If frozen is true, the stopped task's tracer is currently operating on + // it, so Task.Kill should not remove the stop. + frozen bool +} + +// Killable implements TaskStop.Killable. +func (s *ptraceStop) Killable() bool { + return !s.frozen +} + +// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been +// killed, the stop is skipped, and beginPtraceStopLocked returns false. +// +// beginPtraceStopLocked does not signal t's tracer or wake it if it is +// waiting. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) beginPtraceStopLocked() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => + // kernel/sched/core.c:__schedule() => signal_pending_state() check, which + // is what prevents tasks from entering ptrace-stops after being killed. + // Note that if t was SIGKILLed and beingPtraceStopLocked is being called + // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before + // entering the exit path, so t.killable() will no longer return true. This + // is consistent with Linux: "Bugs: ... A SIGKILL signal may still cause a + // PTRACE_EVENT_EXIT stop before actual signal death. This may be changed + // in the future; SIGKILL is meant to always immediately kill tasks even + // under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + if t.killedLocked() { + return false + } + t.beginInternalStopLocked(&ptraceStop{}) + return true +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceTrapLocked(code int32) { + t.ptraceCode = code + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: code, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + if t.beginPtraceStopLocked() { + tracer := t.Tracer() + tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } +} + +// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the +// ptraceStop, temporarily preventing it from being removed by a concurrent +// Task.Kill, and returns true. 
Otherwise it returns false. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine of t's tracer. +func (t *Task) ptraceFreeze() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return false + } + s, ok := t.stop.(*ptraceStop) + if !ok { + return false + } + s.frozen = true + return true +} + +// ptraceUnfreeze ends the effect of a previous successful call to +// ptraceFreeze. +// +// Preconditions: t must be in a frozen ptraceStop. +func (t *Task) ptraceUnfreeze() { + // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, + // preventing its thread group from completing execve. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Do this even if the task has been killed to ensure a panic if t.stop is + // nil or not a ptraceStop. + t.stop.(*ptraceStop).frozen = false + if t.killedLocked() { + t.endInternalStopLocked() + } +} + +// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, +// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on +// mode and singlestep. +// +// Preconditions: t must be in a frozen ptrace stop. +// +// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace +// stop. +func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.ptraceCode = int32(sig) + t.ptraceSyscallMode = mode + t.ptraceSinglestep = singlestep + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceTraceme() error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if t.hasTracer() { + return syserror.EPERM + } + if t.parent == nil { + // In Linux, only init can not have a parent, and init is assumed never + // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user + // application that may invoke PTRACE_TRACEME; having no parent can + // also occur if all tasks in the parent thread group have exited, and + // failed to find a living thread group to reparent to. The former case + // is treated as if TGID 1 has an exited parent in an invisible + // ancestor PID namespace that is an owner of the root user namespace + // (and consequently has CAP_SYS_PTRACE), and the latter case is a + // special form of the exited parent case below. In either case, + // returning nil here is correct. + return nil + } + if !t.parent.CanTrace(t, true) { + return syserror.EPERM + } + if t.parent.exitState != TaskExitNone { + // Fail silently, as if we were successfully attached but then + // immediately detached. This is consistent with Linux. + return nil + } + t.ptraceTracer.Store(t.parent) + t.parent.ptraceTracees[t] = struct{}{} + return nil +} + +// ptraceAttach implements ptrace(PTRACE_ATTACH, target). t is the caller. +func (t *Task) ptraceAttach(target *Task) error { + if t.tg == target.tg { + return syserror.EPERM + } + if !t.CanTrace(target, true) { + return syserror.EPERM + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.hasTracer() { + return syserror.EPERM + } + // Attaching to zombies and dead tasks is not permitted; the exit + // notification logic relies on this. Linux allows attaching to PF_EXITING + // tasks, though. 
+ if target.exitState >= TaskExitZombie { + return syserror.EPERM + } + target.ptraceTracer.Store(t) + t.ptraceTracees[target] = struct{}{} + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + // Undocumented Linux feature: If the tracee is already group-stopped (and + // consequently will not report the SIGSTOP just sent), force it to leave + // and re-enter the stop so that it will switch to a ptrace-stop. + if target.stop == (*groupStop)(nil) { + target.groupStopRequired = true + target.endInternalStopLocked() + } + target.tg.signalHandlers.mu.Unlock() + return nil +} + +// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the +// caller. +// +// Preconditions: target must be a tracee of t in a frozen ptrace stop. +// +// Postconditions: If ptraceDetach returns nil, target will no longer be in a +// ptrace stop. +func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + target.ptraceCode = int32(sig) + target.forgetTracerLocked() + delete(t.ptraceTracees, target) + return nil +} + +// exitPtrace is called in the exit path to detach all of t's tracees. +func (t *Task) exitPtrace() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + for target := range t.ptraceTracees { + if target.ptraceOpts.ExitKill { + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, false /* group */) + target.tg.signalHandlers.mu.Unlock() + } + // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it + // observes the ptraceCode it set before it entered the stop. I believe + // this is consistent with Linux. + target.forgetTracerLocked() + } + // "nil maps cannot be saved" + t.ptraceTracees = make(map[*Task]struct{}) +} + +// forgetTracerLocked detaches t's tracer and ensures that t is no longer +// ptrace-stopped. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) forgetTracerLocked() { + t.ptraceOpts = ptraceOptions{} + t.ptraceSyscallMode = ptraceSyscallNone + t.ptraceSinglestep = false + t.ptraceTracer.Store((*Task)(nil)) + if t.exitTracerNotified && !t.exitTracerAcked { + t.exitTracerAcked = true + t.exitNotifyLocked(true) + } + // If t is ptrace-stopped, but its thread group is in a group stop and t is + // eligible to participate, make it do so. This is essentially the reverse + // of the special case in ptraceAttach, which converts a group stop to a + // ptrace stop. ("Handling of restart from group-stop is currently buggy, + // but the "as planned" behavior is to leave tracee stopped and waiting for + // SIGCONT." - ptrace(2)) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return + } + if _, ok := t.stop.(*ptraceStop); ok { + if t.exitState < TaskExitInitiated && t.tg.groupStopPhase >= groupStopInitiated { + t.groupStopRequired = true + } + t.endInternalStopLocked() + } +} + +// ptraceSignalLocked is called after signal dequeueing to check if t should +// enter ptrace signal-delivery-stop. +// +// Preconditions: The signal mutex must be locked. The caller must be running +// on the task goroutine. 
+func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { + if linux.Signal(info.Signo) == linux.SIGKILL { + return false + } + if !t.hasTracer() { + return false + } + // The tracer might change this signal into a stop signal, in which case + // any SIGCONT received after the signal was originally dequeued should + // cancel it. This is consistent with Linux. + if t.tg.groupStopPhase == groupStopNone { + t.tg.groupStopPhase = groupStopDequeued + } + // Can't lock the TaskSet mutex while holding a signal mutex. + t.tg.signalHandlers.mu.Unlock() + defer t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + tracer := t.Tracer() + if tracer == nil { + return false + } + t.ptraceCode = info.Signo + t.ptraceSiginfo = info + t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + return true +} + +// ptraceSeccomp is called when a seccomp-bpf filter returns action +// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data +// is the lower 16 bits of the filter's return value. +func (t *Task) ptraceSeccomp(data uint16) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceSeccomp { + return false + } + t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") + t.ptraceEventLocked(_PTRACE_EVENT_SECCOMP, uint64(data)) + return true +} + +// ptraceSyscallEnter is called immediately before entering a syscall to check +// if t should enter ptrace syscall-enter-stop. +func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { + if !t.hasTracer() { + return nil, false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.ptraceSyscallMode { + case ptraceSyscallNone: + return nil, false + case ptraceSyscallIntercept: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSyscallEnterStop)(nil), true + case ptraceSyscallEmu: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSysemuStop)(nil), true + } + panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) +} + +// ptraceSyscallExit is called immediately after leaving a syscall to check if +// t should enter ptrace syscall-exit-stop. +func (t *Task) ptraceSyscallExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if t.ptraceSyscallMode != ptraceSyscallIntercept { + return + } + t.Debugf("Entering syscall-exit-stop") + t.ptraceSyscallStopLocked() +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceSyscallStopLocked() { + code := int32(linux.SIGTRAP) + if t.ptraceOpts.SysGood { + code |= 0x80 + } + t.ptraceTrapLocked(code) +} + +type ptraceCloneKind int32 + +const ( + // ptraceCloneKindClone represents a call to Task.Clone where + // TerminationSignal is not SIGCHLD and Vfork is false. + ptraceCloneKindClone ptraceCloneKind = iota + + // ptraceCloneKindFork represents a call to Task.Clone where + // TerminationSignal is SIGCHLD and Vfork is false. + ptraceCloneKindFork + + // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is + // true. 
+ ptraceCloneKindVfork +) + +// ptraceClone is called at the end of a clone or fork syscall to check if t +// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK +// stop. child is the new task. +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + event := false + if !opts.Untraced { + switch kind { + case ptraceCloneKindClone: + if t.ptraceOpts.TraceClone { + t.Debugf("Entering PTRACE_EVENT_CLONE stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindFork: + if t.ptraceOpts.TraceFork { + t.Debugf("Entering PTRACE_EVENT_FORK stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindVfork: + if t.ptraceOpts.TraceVfork { + t.Debugf("Entering PTRACE_EVENT_VFORK stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + event = true + } + default: + panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) + } + } + // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE + // options are in effect, then children created by, respectively, vfork(2) + // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit + // signal set to SIGCHLD, and other kinds of clone(2), are automatically + // attached to the same tracer which traced their parent. SIGSTOP is + // delivered to the children, causing them to enter signal-delivery-stop + // after they exit the system call which created them." - ptrace(2) + // + // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is + // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => + // include/linux/ptrace.h:ptrace_init_task(). + if event || opts.InheritTracer { + tracer := t.Tracer() + if tracer != nil { + child.ptraceTracer.Store(tracer) + tracer.ptraceTracees[child] = struct{}{} + // "Flags are inherited by new tracees created and "auto-attached" + // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or + // PTRACE_O_TRACECLONE options." + child.ptraceOpts = t.ptraceOpts + child.tg.signalHandlers.mu.Lock() + // If the child is PT_SEIZED (currently not possible in the sentry + // because PTRACE_SEIZE is unimplemented, but for future + // reference), Linux just sets JOBCTL_TRAP_STOP instead, so the + // child skips signal-delivery-stop and goes directly to + // group-stop. + // + // The child will self-t.interrupt() when its task goroutine starts + // running, so we don't have to. + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }) + child.tg.signalHandlers.mu.Unlock() + } + } + return event +} + +// ptraceVforkDone is called after the end of a vfork stop to check if t should +// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's +// PID namespace. +func (t *Task) ptraceVforkDone(child ThreadID) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceVforkDone { + return false + } + t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_VFORK_DONE, uint64(child)) + return true +} + +// ptraceExec is called at the end of an execve syscall to check if t should +// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID +// namespace, prior to the execve. 
(If t did not have a tracer at the time +// oldTID was read, oldTID may be 0. This is consistent with Linux.) +func (t *Task) ptraceExec(oldTID ThreadID) { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + // Recheck with the TaskSet mutex locked. Most ptrace points don't need to + // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC + // is special because both TraceExec and !TraceExec do something if a + // tracer is attached. + if !t.hasTracer() { + return + } + if t.ptraceOpts.TraceExec { + t.Debugf("Entering PTRACE_EVENT_EXEC stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_EXEC, uint64(oldTID)) + return + } + // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing + // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] + // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after + // execve(2) returns. This is an ordinary signal (similar to one which can + // be generated by `kill -TRAP`, not a special kind of ptrace-stop. + // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 + // (SI_USER). This signal may be blocked by signal mask, and thus may be + // delivered (much) later." - ptrace(2) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: arch.SignalInfoUser, + }, false /* group */) +} + +// ptraceExit is called early in the task exit path to check if t should enter +// PTRACE_EVENT_EXIT stop. +func (t *Task) ptraceExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceExit { + return + } + t.tg.signalHandlers.mu.Lock() + status := t.exitStatus.Status() + t.tg.signalHandlers.mu.Unlock() + t.Debugf("Entering PTRACE_EVENT_EXIT stop") + t.ptraceEventLocked(syscall.PTRACE_EVENT_EXIT, uint64(status)) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceEventLocked(event int32, msg uint64) { + t.ptraceEventMsg = msg + // """ + // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning + // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An + // additional bit is set in the higher byte of the status word: the value + // status>>8 will be + // + // (SIGTRAP | PTRACE_EVENT_foo << 8). + // + // ... + // + // """ - ptrace(2) + t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) +} + +// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. +func (t *Task) ptraceKill(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + // "This operation is deprecated; do not use it! Instead, send a SIGKILL + // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is + // that it requires the tracee to be in signal-delivery-stop, otherwise it + // may not work (i.e., may complete successfully but won't kill the + // tracee)." - ptrace(2) + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + target.ptraceCode = int32(linux.SIGKILL) + target.endInternalStopLocked() + return nil +} + +// Ptrace implements the ptrace system call. +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { + // PTRACE_TRACEME ignores all other arguments. 
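+ //
+ // Illustrative note (not part of the original change): a tracee typically
+ // arranges for PTRACE_TRACEME before exec; in Go userspace this is normally
+ // done through exec.Cmd rather than by calling ptrace directly, e.g.:
+ //
+ //	cmd.SysProcAttr = &syscall.SysProcAttr{Ptrace: true}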
+ if req == syscall.PTRACE_TRACEME { + return t.ptraceTraceme() + } + // All other ptrace requests operate on a current or future tracee + // specified by pid. + target := t.tg.pidns.TaskWithID(pid) + if target == nil { + return syserror.ESRCH + } + + // PTRACE_ATTACH (and PTRACE_SEIZE, which is unimplemented) do not require + // that target is not already a tracee. + if req == syscall.PTRACE_ATTACH { + return t.ptraceAttach(target) + } + // PTRACE_KILL (and PTRACE_INTERRUPT, which is unimplemented) require that + // the target is a tracee, but does not require that it is ptrace-stopped. + if req == syscall.PTRACE_KILL { + return t.ptraceKill(target) + } + // All other ptrace requests require that the target is a ptrace-stopped + // tracee, and freeze the ptrace-stop so the tracee can be operated on. + t.tg.pidns.owner.mu.RLock() + if target.Tracer() != t { + t.tg.pidns.owner.mu.RUnlock() + return syserror.ESRCH + } + if !target.ptraceFreeze() { + t.tg.pidns.owner.mu.RUnlock() + // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, + // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the + // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - + // ptrace(2) + return syserror.ESRCH + } + t.tg.pidns.owner.mu.RUnlock() + // Even if the target has a ptrace-stop active, the tracee's task goroutine + // may not yet have reached Task.doStop; wait for it to do so. This is safe + // because there's no way for target to initiate a ptrace-stop and then + // block (by calling Task.block) before entering it. + // + // Caveat: If tasks were just restored, the tracee's first call to + // Task.Activate (in Task.run) occurs before its first call to Task.doStop, + // which may block if the tracer's address space is active. + t.UninterruptibleSleepStart(true) + target.waitGoroutineStoppedOrExited() + t.UninterruptibleSleepFinish(true) + + // Resuming commands end the ptrace stop, but only if successful. + switch req { + case syscall.PTRACE_DETACH: + if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_CONT: + if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSCALL: + if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSEMU: + if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + case syscall.PTRACE_SYSEMU_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + } + // All other ptrace requests expect us to unfreeze the stop. + defer target.ptraceUnfreeze() + + switch req { + case syscall.PTRACE_PEEKTEXT, syscall.PTRACE_PEEKDATA: + // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and + // PTRACE_PEEKUSER requests have a different API: they store the result + // at the address specified by the data parameter, and the return value + // is the error flag." 
- ptrace(2) + word := t.Arch().Native(0) + if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ + IgnorePermissions: true, + }); err != nil { + return err + } + _, err := t.CopyOut(data, word) + return err + + case syscall.PTRACE_POKETEXT, syscall.PTRACE_POKEDATA: + _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ + IgnorePermissions: true, + }) + return err + + case syscall.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + n, err := target.Arch().PtracePeekUser(uintptr(addr)) + if err != nil { + return err + } + _, err = t.CopyOut(data, n) + return err + + case syscall.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) + + case syscall.PTRACE_GETREGS: + // "Copy the tracee's general-purpose ... registers ... to the address + // data in the tracer. ... (addr is ignored.) Note that SPARC systems + // have the meaning of data and addr reversed ..." + _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_GETFPREGS: + _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_GETREGSET: + // "Read the tracee's registers. addr specifies, in an + // architecture-dependent way, the type of registers to be read. ... + // data points to a struct iovec, which describes the destination + // buffer's location and length. On return, the kernel modifies iov.len + // to indicate the actual number of bytes returned." - ptrace(2) + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case syscall.PTRACE_SETREGS: + _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_SETFPREGS: + _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case syscall.PTRACE_SETREGSET: + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case syscall.PTRACE_GETSIGINFO: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.ptraceSiginfo) + return err + + case syscall.PTRACE_SETSIGINFO: + var info arch.SignalInfo + if _, err := t.CopyIn(data, &info); err != nil { + return err + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + 
if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + target.ptraceSiginfo = &info + return nil + + case PTRACE_GETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + target.mu.Lock() + defer target.mu.Unlock() + _, err := t.CopyOut(data, target.tr.SignalMask) + return err + + case PTRACE_SETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := t.CopyIn(data, &mask); err != nil { + return err + } + // The target's task goroutine is stopped, so this is safe: + target.SetSignalMask(mask &^ UnblockableSignals) + return nil + + case syscall.PTRACE_SETOPTIONS: + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + validOpts := uintptr(_PTRACE_O_EXITKILL | syscall.PTRACE_O_TRACESYSGOOD | syscall.PTRACE_O_TRACECLONE | + syscall.PTRACE_O_TRACEEXEC | syscall.PTRACE_O_TRACEEXIT | syscall.PTRACE_O_TRACEFORK | + _PTRACE_O_TRACESECCOMP | syscall.PTRACE_O_TRACEVFORK | syscall.PTRACE_O_TRACEVFORKDONE) + if uintptr(data)&^validOpts != 0 { + return syserror.EINVAL + } + target.ptraceOpts = ptraceOptions{ + ExitKill: data&_PTRACE_O_EXITKILL != 0, + SysGood: data&syscall.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: data&syscall.PTRACE_O_TRACECLONE != 0, + TraceExec: data&syscall.PTRACE_O_TRACEEXEC != 0, + TraceExit: data&syscall.PTRACE_O_TRACEEXIT != 0, + TraceFork: data&syscall.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: data&_PTRACE_O_TRACESECCOMP != 0, + TraceVfork: data&syscall.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: data&syscall.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil + + case syscall.PTRACE_GETEVENTMSG: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + return err + + default: + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + return syserror.EIO + } +} diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go new file mode 100644 index 000000000..635372993 --- /dev/null +++ b/pkg/sentry/kernel/rseq.go @@ -0,0 +1,118 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Restartable sequences, as described in https://lwn.net/Articles/650333/. + +// RSEQCriticalRegion describes a restartable sequence critical region. +type RSEQCriticalRegion struct { + // When a task in this thread group has its CPU preempted (as defined by + // platform.ErrContextCPUPreempted) or has a signal delivered to an + // application handler while its instruction pointer is in CriticalSection, + // set the instruction pointer to Restart and application register r10 (on + // amd64) to the former instruction pointer. + CriticalSection usermem.AddrRange + Restart usermem.Addr +} + +// RSEQAvailable returns true if t supports restartable sequences. 
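+//
+// Illustrative note (not part of the original change): the setters below
+// (SetRSEQCriticalRegion, SetRSEQCPUAddr) require this to be true, so a
+// hypothetical caller would gate on it first, e.g.:
+//
+//	if !t.RSEQAvailable() {
+//		return syserror.ENOSYS
+//	}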
+func (t *Task) RSEQAvailable() bool { + return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() +} + +// RSEQCriticalRegion returns a copy of t's thread group's current restartable +// sequence. +func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion { + return *t.tg.rscr.Load().(*RSEQCriticalRegion) +} + +// SetRSEQCriticalRegion replaces t's thread group's restartable sequence. +// +// Preconditions: t.RSEQAvailable() == true. +func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error { + // These checks are somewhat more lenient than in Linux, which (bizarrely) + // requires rscr.CriticalSection to be non-empty and rscr.Restart to be + // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0 + // (which disables the critical region). + if rscr.CriticalSection.Start == 0 { + rscr.CriticalSection.End = 0 + rscr.Restart = 0 + t.tg.rscr.Store(&rscr) + return nil + } + if rscr.CriticalSection.Start >= rscr.CriticalSection.End { + return syserror.EINVAL + } + if rscr.CriticalSection.Contains(rscr.Restart) { + return syserror.EINVAL + } + // TODO: check that rscr.CriticalSection and rscr.Restart are in + // the application address range, for consistency with Linux + t.tg.rscr.Store(&rscr) + return nil +} + +// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU +// number. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) RSEQCPUAddr() usermem.Addr { + return t.rseqCPUAddr +} + +// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU +// number. +// +// Preconditions: t.RSEQAvailable() == true. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error { + t.rseqCPUAddr = addr + if addr != 0 { + if err := t.rseqCopyOutCPU(); err != nil { + t.rseqCPUAddr = 0 + t.rseqCPU = -1 + return syserror.EINVAL // yes, EINVAL, not err or EFAULT + } + } else { + t.rseqCPU = -1 + } + return nil +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqCopyOutCPU() error { + t.rseqCPU = int32(hostcpu.GetCPU()) + buf := t.CopyScratchBuffer(4) + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) + _, err := t.CopyOutBytes(t.rseqCPUAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. 
+func (t *Task) rseqInterrupt() { + rscr := t.tg.rscr.Load().(*RSEQCriticalRegion) + if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) { + t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart) + t.Arch().SetIP(uintptr(rscr.Restart)) + t.Arch().SetRSEQInterruptedIP(ip) + } +} diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD new file mode 100644 index 000000000..b533c51c4 --- /dev/null +++ b/pkg/sentry/kernel/sched/BUILD @@ -0,0 +1,20 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "sched", + srcs = [ + "cpuset.go", + "sched.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched", + visibility = ["//pkg/sentry:internal"], +) + +go_test( + name = "sched_test", + size = "small", + srcs = ["cpuset_test.go"], + embed = [":sched"], +) diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go new file mode 100644 index 000000000..0a97603f0 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -0,0 +1,105 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import "math/bits" + +const ( + bitsPerByte = 8 + bytesPerLong = 8 // only for 64-bit architectures +) + +// CPUSet contains a bitmap to record CPU information. +// +// Note that this definition is only correct for little-endian architectures, +// since Linux's cpumask_t uses unsigned long. +type CPUSet []byte + +// CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. +func CPUSetSize(num uint) uint { + // NOTE: Applications may expect that the size of a CPUSet in + // bytes is always a multiple of sizeof(unsigned long), since this is true + // in Linux. Thus we always round up. + bytes := (num + bitsPerByte - 1) / bitsPerByte + longs := (bytes + bytesPerLong - 1) / bytesPerLong + return longs * bytesPerLong +} + +// NewCPUSet returns a CPUSet for the given number of CPUs which initially +// contains no CPUs. +func NewCPUSet(num uint) CPUSet { + return CPUSet(make([]byte, CPUSetSize(num))) +} + +// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which +// are present in the set. +func NewFullCPUSet(num uint) CPUSet { + c := NewCPUSet(num) + var i uint + for ; i < num/bitsPerByte; i++ { + c[i] = 0xff + } + if rem := num % bitsPerByte; rem != 0 { + c[i] = (1 << rem) - 1 + } + return c +} + +// Size returns the size of 'c' in bytes. +func (c CPUSet) Size() uint { + return uint(len(c)) +} + +// NumCPUs returns how many cpus are set in the CPUSet. +func (c CPUSet) NumCPUs() uint { + var n int + for _, b := range c { + n += bits.OnesCount8(b) + } + return uint(n) +} + +// Copy returns a copy of the CPUSet. +func (c CPUSet) Copy() CPUSet { + return append(CPUSet(nil), c...) +} + +// Set sets the bit corresponding to cpu. 
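+//
+// For example (illustrative, not part of the original change):
+//
+//	c := NewCPUSet(8) // sized for CPUs 0-7, rounded up to one unsigned long
+//	c.Set(0)
+//	c.Set(3)
+//	// c.NumCPUs() == 2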
+func (c *CPUSet) Set(cpu uint) { + (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) +} + +// ClearAbove clears bits corresponding to cpu and all higher cpus. +func (c *CPUSet) ClearAbove(cpu uint) { + i := cpu / bitsPerByte + if i >= c.Size() { + return + } + (*c)[i] &^= 0xff << (cpu % bitsPerByte) + for i++; i < c.Size(); i++ { + (*c)[i] = 0 + } +} + +// ForEachCPU iterates over the CPUSet and calls fn with the cpu index if +// it's set. +func (c CPUSet) ForEachCPU(fn func(uint)) { + for i := uint(0); i < c.Size()*bitsPerByte; i++ { + bit := uint(1) << (i & (bitsPerByte - 1)) + if uint(c[i/bitsPerByte])&bit == bit { + fn(i) + } + } +} diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go new file mode 100644 index 000000000..8a6e12958 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import ( + "testing" +) + +func TestNumCPUs(t *testing.T) { + for i := uint(0); i < 1024; i++ { + c := NewCPUSet(i) + for j := uint(0); j < i; j++ { + c.Set(j) + } + n := c.NumCPUs() + if n != i { + t.Errorf("got wrong number of cpus %d, want %d", n, i) + } + } +} + +func TestClearAbove(t *testing.T) { + const n = 1024 + c := NewFullCPUSet(n) + for i := uint(0); i < n; i++ { + cpu := n - i + c.ClearAbove(cpu) + if got := c.NumCPUs(); got != cpu { + t.Errorf("iteration %d: got %d cpus, wanted %d", i, got, cpu) + } + } +} diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go new file mode 100644 index 000000000..f1de1da60 --- /dev/null +++ b/pkg/sentry/kernel/sched/sched.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sched implements scheduler related features. +package sched diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go new file mode 100644 index 000000000..b7c4a507f --- /dev/null +++ b/pkg/sentry/kernel/seccomp.go @@ -0,0 +1,205 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const maxSyscallFilterInstructions = 1 << 15 + +type seccompResult int + +const ( + // seccompResultDeny indicates that a syscall should not be executed. + seccompResultDeny seccompResult = iota + + // seccompResultAllow indicates that a syscall should be executed. + seccompResultAllow + + // seccompResultKill indicates that the task should be killed immediately, + // with the exit status indicating that the task was killed by SIGSYS. + seccompResultKill + + // seccompResultTrace indicates that a ptracer was successfully notified as + // a result of a SECCOMP_RET_TRACE. + seccompResultTrace +) + +// seccompData is equivalent to struct seccomp_data, which contains the data +// passed to seccomp-bpf filters. +type seccompData struct { + // nr is the system call number. + nr int32 + + // arch is an AUDIT_ARCH_* value indicating the system call convention. + arch uint32 + + // instructionPointer is the value of the instruction pointer at the time + // of the system call. + instructionPointer uint64 + + // args contains the first 6 system call arguments. + args [6]uint64 +} + +func (d *seccompData) asBPFInput() bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +} + +func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { + si := &arch.SignalInfo{ + Signo: int32(linux.SIGSYS), + Errno: errno, + Code: arch.SYS_SECCOMP, + } + si.SetCallAddr(uint64(ip)) + si.SetSyscall(sysno) + si.SetArch(t.SyscallTable().AuditNumber) + return si +} + +// checkSeccompSyscall applies the task's seccomp filters before the execution +// of syscall sysno at instruction pointer ip. (These parameters must be passed +// in because vsyscalls do not use the values in t.Arch().) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) seccompResult { + result := t.evaluateSyscallFilters(sysno, args, ip) + switch result & linux.SECCOMP_RET_ACTION { + case linux.SECCOMP_RET_TRAP: + // "Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. ... The SECCOMP_RET_DATA + // portion of the return value will be passed as si_errno." - + // Documentation/prctl/seccomp_filter.txt + t.SendSignal(seccompSiginfo(t, int32(result&linux.SECCOMP_RET_DATA), sysno, ip)) + return seccompResultDeny + + case linux.SECCOMP_RET_ERRNO: + // "Results in the lower 16-bits of the return value being passed to + // userland as the errno without executing the system call." + t.Arch().SetReturn(-uintptr(result & linux.SECCOMP_RET_DATA)) + return seccompResultDeny + + case linux.SECCOMP_RET_TRACE: + // "When returned, this value will cause the kernel to attempt to + // notify a ptrace()-based tracer prior to executing the system call. + // If there is no tracer present, -ENOSYS is returned to userland and + // the system call is not executed." 
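+ //
+ // Illustrative note (not part of the original change): a filter requests
+ // this by returning, e.g.:
+ //
+ //	SECCOMP_RET_TRACE | 0x1234
+ //
+ // after which the tracer can retrieve 0x1234 with PTRACE_GETEVENTMSG.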
+ if t.ptraceSeccomp(uint16(result & linux.SECCOMP_RET_DATA)) { + return seccompResultTrace + } + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + return seccompResultDeny + + case linux.SECCOMP_RET_ALLOW: + // "Results in the system call being executed." + return seccompResultAllow + + case linux.SECCOMP_RET_KILL: + // "Results in the task exiting immediately without executing the + // system call. The exit status of the task will be SIGSYS, not + // SIGKILL." + fallthrough + default: // consistent with Linux + return seccompResultKill + } +} + +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { + data := seccompData{ + nr: sysno, + arch: t.tc.st.AuditNumber, + instructionPointer: uint64(ip), + } + // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so + // we can't do any slicing tricks or even use copy/append here. + for i, arg := range args { + if i >= len(data.args) { + break + } + data.args[i] = arg.Uint64() + } + input := data.asBPFInput() + + ret := uint32(linux.SECCOMP_RET_ALLOW) + // "Every filter successfully installed will be evaluated (in reverse + // order) for each system call the task makes." - kernel/seccomp.c + for i := len(t.syscallFilters) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(t.syscallFilters[i], input) + if err != nil { + t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) + thisRet = linux.SECCOMP_RET_KILL + } + // "If multiple filters exist, the return value for the evaluation of a + // given system call will always use the highest precedent value." - + // Documentation/prctl/seccomp_filter.txt + // + // (Note that this contradicts prctl(2): "If the filters permit prctl() + // calls, then additional filters can be added; they are run in order + // until the first non-allow result is seen." prctl(2) is incorrect.) + // + // "The ordering ensures that a min_t() over composed return values + // always selects the least permissive choice." - + // include/uapi/linux/seccomp.h + if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { + ret = thisRet + } + } + + return ret +} + +// AppendSyscallFilter adds BPF program p as a system call filter. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) AppendSyscallFilter(p bpf.Program) error { + // Cap the combined length of all syscall filters (plus a penalty of 4 + // instructions per filter beyond the first) to + // maxSyscallFilterInstructions. (This restriction is inherited from + // Linux.) + totalLength := p.Length() + for _, f := range t.syscallFilters { + totalLength += f.Length() + 4 + } + if totalLength > maxSyscallFilterInstructions { + return syserror.ENOMEM + } + t.mu.Lock() + defer t.mu.Unlock() + t.syscallFilters = append(t.syscallFilters, p) + return nil +} + +// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current +// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) +// and /proc/[pid]/status. 
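+//
+// Illustrative note (not part of the original change): with at least one
+// filter installed, userspace observes SECCOMP_MODE_FILTER, e.g.:
+//
+//	prctl(PR_GET_SECCOMP) // returns 2 (SECCOMP_MODE_FILTER)
+//	// /proc/[pid]/status reports "Seccomp: 2"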
+func (t *Task) SeccompMode() int { + t.mu.Lock() + defer t.mu.Unlock() + if len(t.syscallFilters) > 0 { + return linux.SECCOMP_MODE_FILTER + } + return linux.SECCOMP_MODE_NONE +} diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD new file mode 100644 index 000000000..1656ad126 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -0,0 +1,62 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "semaphore", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*waiter", + }, +) + +go_stateify( + name = "semaphore_state", + srcs = [ + "semaphore.go", + "waiter_list.go", + ], + out = "semaphore_autogen_state.go", + package = "semaphore", +) + +go_library( + name = "semaphore", + srcs = [ + "semaphore.go", + "semaphore_autogen_state.go", + "waiter_list.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/state", + "//pkg/state/statefile", + "//pkg/syserror", + ], +) + +go_test( + name = "semaphore_test", + size = "small", + srcs = ["semaphore_test.go"], + embed = [":semaphore"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go new file mode 100644 index 000000000..19ad5d537 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -0,0 +1,473 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package semaphore implements System V semaphores. +package semaphore + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + valueMax = 32767 // SEMVMX + + // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). + semaphoresMax = 32000 + + // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). + setsMax = 32000 + + // semaphoresTotalMax is "system-wide limit on the number of semaphores" + // (SEMMNS = SEMMNI*SEMMSL). + semaphoresTotalMax = 1024000000 +) + +// Registry maintains a set of semaphores that can be found by key or ID. +type Registry struct { + // mu protects all fields below. 
+ mu sync.Mutex `state:"nosave"` + semaphores map[int32]*Set + lastIDUsed int32 +} + +// Set represents a set of semaphores that can be operated atomically. +type Set struct { + // Id is a handle that identifies the set. + ID int32 + + // key is an user provided key that can be shared between processes. + key int32 + + // creator is the user that created the set. Immutable. + creator fs.FileOwner + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + owner fs.FileOwner + perms fs.FilePermissions + opTime ktime.Time + changeTime ktime.Time + sems []sem + + // dead is set to true when the set is removed and can't be reached anymore. + // All waiters must wake up and fail when set is dead. + dead bool +} + +// sem represents a single semanphore from a set. +type sem struct { + value int16 + waiters waiterList `state:"zerovalue"` +} + +// waiter represents a caller that is waiting for the semaphore value to +// become positive or zero. +type waiter struct { + waiterEntry + + // value represents how much resource the waiter needs to wake up. + value int16 + ch chan struct{} +} + +// NewRegistry creates a new semaphore set registry. +func NewRegistry() *Registry { + return &Registry{semaphores: make(map[int32]*Set)} +} + +// FindOrCreate searches for a semaphore set that matches 'key'. If not found, +// it may create a new one if requested. If private is true, key is ignored and +// a new set is always created. If create is false, it fails if a set cannot +// be found. If exclusive is true, it fails if a set with the same key already +// exists. +func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { + if nsems < 0 || nsems > semaphoresMax { + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if !private { + // Look up an existing semaphore. + if set := r.findByKey(key); set != nil { + // Check that caller can access semaphore set. + creds := auth.CredentialsFromContext(ctx) + if !set.checkPerms(creds, fs.PermsFromMode(mode)) { + return nil, syserror.EACCES + } + + // Validate parameters. + if nsems > int32(set.size()) { + return nil, syserror.EINVAL + } + if create && exclusive { + return nil, syserror.EEXIST + } + return set, nil + } + + if !create { + // Semaphore not found and should not be created. + return nil, syserror.ENOENT + } + } + + // Zero is only valid if an existing set is found. + if nsems == 0 { + return nil, syserror.EINVAL + } + + // Apply system limits. + if len(r.semaphores) >= setsMax { + return nil, syserror.EINVAL + } + if r.totalSems() > int(semaphoresTotalMax-nsems) { + return nil, syserror.EINVAL + } + + // Finally create a new set. + owner := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newSet(ctx, key, owner, owner, perms, nsems) +} + +// RemoveID removes set with give 'id' from the registry and marks the set as +// dead. All waiters will be awakened and fail. +func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { + r.mu.Lock() + defer r.mu.Unlock() + + set := r.semaphores[id] + if set == nil { + return syserror.EINVAL + } + + // "The effective user ID of the calling process must match the creator or + // owner of the semaphore set, or the caller must be privileged." 
+ if !set.checkCredentials(creds) && !set.checkCapability(creds) { + return syserror.EACCES + } + + delete(r.semaphores, set.ID) + set.destroy() + return nil +} + +func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { + set := &Set{ + key: key, + owner: owner, + creator: owner, + perms: perms, + changeTime: ktime.NowFromContext(ctx), + sems: make([]sem, nsems), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.semaphores[id] == nil { + r.lastIDUsed = id + r.semaphores[id] = set + set.ID = id + return set, nil + } + } + + log.Warningf("Semaphore map is full, they must be leaking") + return nil, syserror.ENOMEM +} + +// FindByID looks up a set given an ID. +func (r *Registry) FindByID(id int32) *Set { + r.mu.Lock() + defer r.mu.Unlock() + return r.semaphores[id] +} + +func (r *Registry) findByKey(key int32) *Set { + for _, v := range r.semaphores { + if v.key == key { + return v + } + } + return nil +} + +func (r *Registry) totalSems() int { + totalSems := 0 + for _, v := range r.semaphores { + totalSems += v.size() + } + return totalSems +} + +func (s *Set) findSem(num int32) *sem { + if num < 0 || int(num) >= s.size() { + return nil + } + return &s.sems[num] +} + +func (s *Set) size() int { + return len(s.sems) +} + +// Change changes some fields from the set atomically. +func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error { + s.mu.Lock() + defer s.mu.Unlock() + + // "The effective UID of the calling process must match the owner or creator + // of the semaphore set, or the caller must be privileged." + if !s.checkCredentials(creds) && !s.checkCapability(creds) { + return syserror.EACCES + } + + s.owner = owner + s.perms = perms + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// SetVal overrides a semaphore value, waking up waiters as needed. +func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials) error { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return syserror.ERANGE + } + + // TODO: Clear undo entries in all processes + sem.value = val + s.changeTime = ktime.NowFromContext(ctx) + sem.wakeWaiters() + return nil +} + +// GetVal returns a semaphore value. +func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.value, nil +} + +// ExecuteOps attempts to execute a list of operations to the set. It only +// suceeds when all operations can be applied. No changes are made if it fails. +// +// On failure, it may return an error (retries are hopeless) or it may return +// a channel that can be waited on before attempting again. 
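+//
+// Illustrative caller sketch (not part of the original change; cancel is a
+// hypothetical interruption channel):
+//
+//	for {
+//		ch, num, err := set.ExecuteOps(ctx, ops, creds)
+//		if err != nil || ch == nil {
+//			return err // all ops applied, or failed for good
+//		}
+//		select {
+//		case <-ch:
+//			// Woken by a peer; retry the operations.
+//		case <-cancel:
+//			set.AbortWait(num, ch)
+//			return syserror.EINTR
+//		}
+//	}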
+func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials) (chan struct{}, int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // Did it race with a removal operation? + if s.dead { + return nil, 0, syserror.EIDRM + } + + // Validate the operations. + readOnly := true + for _, op := range ops { + if s.findSem(int32(op.SemNum)) == nil { + return nil, 0, syserror.EFBIG + } + if op.SemOp != 0 { + readOnly = false + } + } + + if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { + return nil, 0, syserror.EACCES + } + + ch, num, err := s.executeOps(ctx, ops) + if err != nil { + return nil, 0, err + } + return ch, num, nil +} + +func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf) (chan struct{}, int32, error) { + // Changes to semaphores go to this slice temporarily until they all succeed. + tmpVals := make([]int16, len(s.sems)) + for i := range s.sems { + tmpVals[i] = s.sems[i].value + } + + for _, op := range ops { + sem := &s.sems[op.SemNum] + if op.SemOp == 0 { + // Handle 'wait for zero' operation. + if tmpVals[op.SemNum] != 0 { + // Semaphore isn't 0, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + if op.SemOp < 0 { + // Handle 'wait' operation. + if -op.SemOp > valueMax { + return nil, 0, syserror.ERANGE + } + if -op.SemOp > tmpVals[op.SemNum] { + // Not enough resources, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + // op.SemOp > 0: Handle 'signal' operation. + if tmpVals[op.SemNum] > valueMax-op.SemOp { + return nil, 0, syserror.ERANGE + } + } + + tmpVals[op.SemNum] += op.SemOp + } + } + + // All operations succeeded, apply them. + // TODO: handle undo operations. + for i, v := range tmpVals { + s.sems[i].value = v + s.sems[i].wakeWaiters() + } + s.opTime = ktime.NowFromContext(ctx) + return nil, 0, nil +} + +// AbortWait notifies that a waiter is giving up and will not wait on the +// channel anymore. +func (s *Set) AbortWait(num int32, ch chan struct{}) { + s.mu.Lock() + defer s.mu.Unlock() + + sem := &s.sems[num] + for w := sem.waiters.Front(); w != nil; w = w.Next() { + if w.ch == ch { + sem.waiters.Remove(w) + return + } + } + // Waiter may not be found in case it raced with wakeWaiters(). +} + +func (s *Set) checkCredentials(creds *auth.Credentials) bool { + return s.owner.UID == creds.EffectiveKUID || + s.owner.GID == creds.EffectiveKGID || + s.creator.UID == creds.EffectiveKUID || + s.creator.GID == creds.EffectiveKGID +} + +func (s *Set) checkCapability(creds *auth.Credentials) bool { + return creds.HasCapability(linux.CAP_IPC_OWNER) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() +} + +func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { + // Are we owner, or in group, or other? + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + + // Are permissions satisfied without capability checks? + if p.SupersetOf(reqPerms) { + return true + } + + return s.checkCapability(creds) +} + +func (s *Set) destroy() { + s.mu.Lock() + defer s.mu.Unlock() + + // Notify all waiters. Tney will fail on the next attempt to execute + // operations and return error. 
+ s.dead = true + for _, s := range s.sems { + for w := s.waiters.Front(); w != nil; w = w.Next() { + w.ch <- struct{}{} + } + s.waiters.Reset() + } +} + +// wakeWaiters goes over all waiters and checks which of them can be notified. +func (s *sem) wakeWaiters() { + // Note that this will release all waiters waiting for 0 too. + for w := s.waiters.Front(); w != nil; { + if s.value < w.value { + // Still blocked, skip it. + continue + } + w.ch <- struct{}{} + old := w + w = w.Next() + s.waiters.Remove(old) + } +} + +func newWaiter(val int16) *waiter { + return &waiter{ + value: val, + ch: make(chan struct{}, 1), + } +} diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go new file mode 100644 index 000000000..0386586ab --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -0,0 +1,172 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package semaphore + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { + ch, _, err := set.executeOps(ctx, ops) + if err != nil { + t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops) + } + if block { + if ch == nil { + t.Fatalf("ExecuteOps(ops) got: nil, expected: !nil, ops: %+v", ops) + } + if signalled(ch) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } + } else { + if ch != nil { + t.Fatalf("ExecuteOps(ops) got: %v, expected: nil, ops: %+v", ch, ops) + } + } + return ch +} + +func signalled(ch chan struct{}) bool { + select { + case <-ch: + return true + default: + return false + } +} + +func TestBasic(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } +} + +func TestWaitForZero(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 0}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 0 + chZero1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + chZero2 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, 
set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v, set: %+v", ops, set) + } + + ops[0].SemOp = -2 + executeOps(ctx, t, set, ops, false) + if !signalled(chZero1) { + t.Fatalf("ExecuteOps(ops) channel zero 1 should have been signalled, ops: %+v, set: %+v", ops, set) + } + if !signalled(chZero2) { + t.Fatalf("ExecuteOps(ops) channel zero 2 should have been signalled, ops: %+v, set: %+v", ops, set) + } +} + +func TestNoWait(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } + + ops[0].SemOp = 0 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } +} + +func TestUnregister(t *testing.T) { + ctx := contexttest.Context(t) + r := NewRegistry() + set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) + if err != nil { + t.Fatalf("FindOrCreate() failed, err: %v", err) + } + if got := r.FindByID(set.ID); got.ID != set.ID { + t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set) + } + + ops := []linux.Sembuf{ + linux.Sembuf{SemOp: -1}, + } + chs := make([]chan struct{}, 0, 5) + for i := 0; i < 5; i++ { + ch := executeOps(ctx, t, set, ops, true) + chs = append(chs, ch) + } + + creds := auth.CredentialsFromContext(ctx) + if err := r.RemoveID(set.ID, creds); err != nil { + t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err) + } + if !set.dead { + t.Fatalf("set is not dead: %+v", set) + } + if got := r.FindByID(set.ID); got != nil { + t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got) + } + for i, ch := range chs { + if !signalled(ch) { + t.Fatalf("channel %d should have been signalled", i) + } + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go new file mode 100644 index 000000000..53d8fb844 --- /dev/null +++ b/pkg/sentry/kernel/sessions.go @@ -0,0 +1,462 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SessionID is the public identifier. +type SessionID ThreadID + +// ProcessGroupID is the public identifier. +type ProcessGroupID ThreadID + +// Session contains a leader threadgroup and a list of ProcessGroups. +type Session struct { + refs refs.AtomicRefCount + + // leader is the originator of the Session. 
+ // + // Note that this may no longer be running (and may be reaped), so the + // ID is cached upon initial creation. The leader is still required + // however, since its PIDNamespace defines the scope of the Session. + // + // The leader is immutable. + leader *ThreadGroup + + // id is the cached identifier in the leader's namespace. + // + // The id is immutable. + id SessionID + + // ProcessGroups is a list of process groups in this Session. This is + // protected by TaskSet.mu. + processGroups processGroupList + + // sessionEntry is the embed for TaskSet.sessions. This is protected by + // TaskSet.mu. + sessionEntry +} + +// incRef grabs a reference. +func (s *Session) incRef() { + s.refs.IncRef() +} + +// decRef drops a reference. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (s *Session) decRef() { + s.refs.DecRefWithDestructor(func() { + // Remove translations from the leader. + for ns := s.leader.pidns; ns != nil; ns = ns.parent { + id := ns.sids[s] + delete(ns.sids, s) + delete(ns.sessions, id) + } + + // Remove from the list of global Sessions. + s.leader.pidns.owner.sessions.Remove(s) + }) +} + +// ProcessGroup contains an originator threadgroup and a parent Session. +type ProcessGroup struct { + refs refs.AtomicRefCount // not exported. + + // originator is the originator of the group. + // + // See note re: leader in Session. The same applies here. + // + // The originator is immutable. + originator *ThreadGroup + + // id is the cached identifier in the originator's namespace. + // + // The id is immutable. + id ProcessGroupID + + // Session is the parent Session. + // + // The session is immutable. + session *Session + + // ancestors is the number of thread groups in this process group whose + // parent is in a different process group in the same session. + // + // The name is derived from the fact that process groups where + // ancestors is zero are considered "orphans". + // + // ancestors is protected by TaskSet.mu. + ancestors uint32 + + // processGroupEntry is the embedded entry for Sessions.groups. This is + // protected by TaskSet.mu. + processGroupEntry +} + +// incRefWithParent grabs a reference. +// +// This function is called when this ProcessGroup is being associated with some +// new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent +// ThreadGroup. If tg is init, then parentPG may be nil. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) { + // We acquire an "ancestor" reference in the case of a nil parent. + // This is because the process being associated is init, and init can + // never be orphaned (we count it as always having an ancestor). + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors++ + } + + pg.refs.IncRef() +} + +// decRefWithParent drops a reference. +// +// parentPG is per incRefWithParent. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { + // See incRefWithParent regarding parent == nil. + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors-- + } + + alive := true + pg.refs.DecRefWithDestructor(func() { + alive = false // don't bother with handleOrphan. + + // Remove translations from the originator. 
+ for ns := pg.originator.pidns; ns != nil; ns = ns.parent { + id := ns.pgids[pg] + delete(ns.pgids, pg) + delete(ns.processGroups, id) + } + + // Remove the list of process groups. + pg.session.processGroups.Remove(pg) + pg.session.decRef() + }) + if alive { + pg.handleOrphan() + } +} + +// parentPG returns the parent process group. +// +// Precondition: callers must hold TaskSet.mu. +func (tg *ThreadGroup) parentPG() *ProcessGroup { + if tg.leader.parent != nil { + return tg.leader.parent.tg.processGroup + } + return nil +} + +// handleOrphan checks whether the process group is an orphan and has any +// stopped jobs. If yes, then appropriate signals are delivered to each thread +// group within the process group. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) handleOrphan() { + // Check if this process is an orphan. + if pg.ancestors != 0 { + return + } + + // See if there are any stopped jobs. + hasStopped := false + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + if tg.groupStopPhase == groupStopComplete { + hasStopped = true + } + tg.signalHandlers.mu.Unlock() + }) + if !hasStopped { + return + } + + // Deliver appropriate signals to all thread groups. + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + tg.leader.sendSignalLocked(sigPriv(linux.SIGHUP), true /* group */) + tg.leader.sendSignalLocked(sigPriv(linux.SIGCONT), true /* group */) + tg.signalHandlers.mu.Unlock() + }) + + return +} + +// CreateSession creates a new Session, with the ThreadGroup as the leader. +// +// EPERM may be returned if either the given ThreadGroup is already a Session +// leader, or a ProcessGroup already exists for the ThreadGroup's ID. +func (tg *ThreadGroup) CreateSession() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + return tg.createSession() +} + +// createSession creates a new session for a threadgroup. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (tg *ThreadGroup) createSession() error { + // Get the ID for this thread in the current namespace. + id := tg.pidns.tids[tg.leader] + + // Check if this ThreadGroup already leads a Session, or + // if the proposed group is already taken. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + if s.id == SessionID(id) { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new Session, with a single reference. + s := &Session{ + id: SessionID(id), + leader: tg, + } + + // Create a new ProcessGroup, belonging to that Session. + // This also has a single reference (assigned below). + // + // Note that since this is a new session and a new process group, there + // will be zero ancestors for this process group. (It is an orphan at + // this point.) + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: s, + ancestors: 0, + } + + // Tie them and return the result. + s.processGroups.PushBack(pg) + tg.pidns.owner.sessions.PushBack(s) + + // Leave the current group, and assign the new one. 
+ if tg.processGroup != nil { + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = pg + } else { + // The current process group may be nil only in the case of an + // unparented thread group (i.e. the init process). This would + // not normally occur, but we allow it for the convenience of + // CreateSession working from that point. There will be no + // child processes. We always say that the very first group + // created has ancestors (avoids checks elsewhere). + // + // Note that this mirrors the parent == nil logic in + // incRef/decRef/reparent, which counts nil as an ancestor. + tg.processGroup = pg + tg.processGroup.ancestors++ + } + + // Ensure a translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tids[tg.leader] + ns.sids[s] = SessionID(local) + ns.sessions[SessionID(local)] = s + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + return nil +} + +// CreateProcessGroup creates a new process group. +// +// An EPERM error will be returned if the ThreadGroup belongs to a different +// Session, is a Session leader or the group already exists. +func (tg *ThreadGroup) CreateProcessGroup() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + + // Get the ID for this thread in the current namespace. + id := tg.pidns.tids[tg.leader] + + // Per above, check for a Session leader or existing group. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new ProcessGroup, belonging to the current Session. + // + // We manually adjust the ancestors if the parent is in the same + // session. + tg.processGroup.session.incRef() + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: tg.processGroup.session, + } + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { + pg.ancestors++ + } + + // Assign the new process group; adjust children. + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = pg + + // Ensure this translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tids[tg.leader] + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + return nil +} + +// JoinProcessGroup joins an existing process group. +// +// This function will return EACCES if an exec has been performed since fork +// by the given ThreadGroup, and EPERM if the Sessions are not the same or the +// group does not exist. +// +// If checkExec is set, then the join is not permitted after the process has +// executed exec at least once. +func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error { + pidns.owner.mu.Lock() + defer pidns.owner.mu.Unlock() + + // Lookup the ProcessGroup. 
+	pg := pidns.processGroups[pgid]
+	if pg == nil {
+		return syserror.EPERM
+	}
+
+	// Disallow the join if an execve has been performed, per POSIX.
+	if checkExec && tg.execed {
+		return syserror.EACCES
+	}
+
+	// See if it's in the same session as ours.
+	if pg.session != tg.processGroup.session {
+		return syserror.EPERM
+	}
+
+	// Join the group; adjust children.
+	parentPG := tg.parentPG()
+	pg.incRefWithParent(parentPG)
+	tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
+		childTG.processGroup.incRefWithParent(pg)
+		childTG.processGroup.decRefWithParent(tg.processGroup)
+	})
+	tg.processGroup.decRefWithParent(parentPG)
+	tg.processGroup = pg
+
+	return nil
+}
+
+// Session returns the ThreadGroup's Session.
+//
+// A reference is not taken on the session.
+func (tg *ThreadGroup) Session() *Session {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	return tg.processGroup.session
+}
+
+// IDOfSession returns the SessionID assigned to s in PID namespace ns.
+//
+// If the session isn't visible in this namespace, zero will be returned. It is
+// the caller's responsibility to check that before using this function.
+func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.sids[s]
+}
+
+// SessionWithID returns the Session with the given ID in the PID namespace ns,
+// or nil if that given ID is not defined in this namespace.
+//
+// A reference is not taken on the session.
+func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.sessions[id]
+}
+
+// ProcessGroup returns the ThreadGroup's ProcessGroup.
+//
+// A reference is not taken on the process group.
+func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	return tg.processGroup
+}
+
+// IDOfProcessGroup returns the ProcessGroupID assigned to pg in PID namespace ns.
+//
+// The same constraints apply as IDOfSession.
+func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.pgids[pg]
+}
+
+// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID
+// namespace ns, or nil if that given ID is not defined in this namespace.
+//
+// A reference is not taken on the process group.
+func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
+	pidns.owner.mu.RLock()
+	defer pidns.owner.mu.RUnlock()
+	return pidns.processGroups[id]
+}
diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go
new file mode 100644
index 000000000..8edd05cdf
--- /dev/null
+++ b/pkg/sentry/kernel/signal.go
@@ -0,0 +1,69 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
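The ThreadGroup methods above (CreateSession, CreateProcessGroup, JoinProcessGroup) are the building blocks for the setsid(2)/setpgid(2) family. As a rough, hypothetical sketch of how a syscall layer might drive them (the helper names are illustrative and not part of this package; error handling is trimmed, and tg and pidns stand for the calling task's thread group and PID namespace):

// setsid is a hypothetical wrapper sketching the intended call pattern.
func setsid(tg *ThreadGroup, pidns *PIDNamespace) (SessionID, error) {
	// Becoming a session leader also creates a new process group led by tg.
	if err := tg.CreateSession(); err != nil {
		return 0, err // EPERM: already a leader, or the IDs are taken.
	}
	return pidns.IDOfSession(tg.Session()), nil
}

// setpgid is a hypothetical wrapper; the real syscall layer handles more
// cases (e.g. operating on a child's thread group rather than the caller's).
func setpgid(tg *ThreadGroup, pidns *PIDNamespace, pgid ProcessGroupID) error {
	if pgid == 0 {
		// A zero pgid conventionally means "a group named after the caller".
		return tg.CreateProcessGroup()
	}
	// Otherwise join an existing group in the same session; checkExec
	// enforces the POSIX post-exec restriction described above.
	return tg.JoinProcessGroup(pidns, pgid, true /* checkExec */)
}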
+ +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +// SignalPanic is used to panic the running threads. It is a signal which +// cannot be used by the application: it must be caught and ignored by the +// runtime (in order to catch possible races). +const SignalPanic = linux.SIGUSR2 + +// sendExternalSignal is called when an asynchronous signal is sent to the +// sentry ("in sentry context"). On some platforms, it may also be called when +// an asynchronous signal is sent to sandboxed application threads ("in +// application context"). +// +// context is used only for debugging to differentiate these cases. +// +// Returns false if signal could not be sent because the Kernel is not fully +// initialized yet. +func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) bool { + switch linux.Signal(info.Signo) { + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + return true + + case SignalPanic: + // SignalPanic is also specially handled in sentry setup to ensure that + // it causes a panic even after tasks exit, but SignalPanic may also + // be sent here if it is received while in app context. + panic("Signal-induced panic") + + default: + log.Infof("Received external signal %d in %s context", info.Signo, context) + if k.globalInit == nil { + log.Warningf("Received external signal %d before init created", info.Signo) + return false + } + k.globalInit.SendSignal(info) + } + + return true +} + +// sigPriv returns a SignalInfo representing a signal sent by the sentry. (The +// name reflects its equivalence to Linux's SEND_SIG_PRIV.) +func sigPriv(sig linux.Signal) *arch.SignalInfo { + return &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoKernel, + } +} diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go new file mode 100644 index 000000000..21ba4ee70 --- /dev/null +++ b/pkg/sentry/kernel/signal_handlers.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// SignalHandlers holds information about signal actions. +type SignalHandlers struct { + // mu protects actions, as well as the signal state of all tasks and thread + // groups using this SignalHandlers object. (See comment on + // ThreadGroup.signalHandlers.) + mu sync.Mutex `state:"nosave"` + + // actions is the action to be taken upon receiving each signal. + actions map[linux.Signal]arch.SignalAct +} + +// NewSignalHandlers returns a new SignalHandlers specifying all default +// actions. 
+func NewSignalHandlers() *SignalHandlers {
+	return &SignalHandlers{
+		actions: make(map[linux.Signal]arch.SignalAct),
+	}
+}
+
+// Fork returns a copy of sh for a new thread group.
+func (sh *SignalHandlers) Fork() *SignalHandlers {
+	sh2 := NewSignalHandlers()
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	for sig, act := range sh.actions {
+		sh2.actions[sig] = act
+	}
+	return sh2
+}
+
+// CopyForExec returns a copy of sh for a thread group that is undergoing an
+// execve. (See comments in Task.finishExec.)
+func (sh *SignalHandlers) CopyForExec() *SignalHandlers {
+	sh2 := NewSignalHandlers()
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	for sig, act := range sh.actions {
+		if act.Handler == arch.SignalActIgnore {
+			sh2.actions[sig] = arch.SignalAct{
+				Handler: arch.SignalActIgnore,
+			}
+		}
+	}
+	return sh2
+}
+
+// dequeueAction returns the SignalAct that should be used to handle sig.
+//
+// Preconditions: sh.mu must be locked.
+func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct {
+	act := sh.actions[sig]
+	if act.IsResetHandler() {
+		delete(sh.actions, sig)
+	}
+	return act
+}
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
new file mode 100644
index 000000000..e20fa3eb6
--- /dev/null
+++ b/pkg/sentry/kernel/syscalls.go
@@ -0,0 +1,305 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi"
+	"gvisor.googlesource.com/gvisor/pkg/bits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// maxSyscallNum is the highest supported syscall number.
+//
+// The types below create fast lookup slices for all syscalls. This maximum
+// serves as a sanity check that we don't allocate huge slices for a very large
+// syscall number.
+const maxSyscallNum = 2000
+
+// SyscallFn is a syscall implementation.
+type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
+
+// MissingFn is a function to be called when a syscall implementation is missing.
+type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error)
+
+// Possible flags for SyscallFlagsTable.enable.
+const (
+	// syscallPresent indicates that this is not a missing syscall.
+	//
+	// This flag is used internally in SyscallFlagsTable.
+	syscallPresent = 1 << iota
+
+	// StraceEnableLog enables syscall log tracing.
+	StraceEnableLog
+
+	// StraceEnableEvent enables syscall event tracing.
+	StraceEnableEvent
+
+	// ExternalBeforeEnable enables the external hook before syscall execution.
+	ExternalBeforeEnable
+
+	// ExternalAfterEnable enables the external hook after syscall execution.
+	ExternalAfterEnable
+)
+
+// StraceEnableBits combines both strace log and event flags.
+const StraceEnableBits = StraceEnableLog | StraceEnableEvent
+
+// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall
+// basis.
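Fork and CopyForExec above encode the usual POSIX inheritance rules: fork(2) copies every disposition verbatim, while execve(2) preserves only ignored dispositions and lets handled signals fall back to their defaults. A small in-package sketch of the resulting behaviour (the function is hypothetical, the signal choices are arbitrary, 0x400000 stands in for some user handler address, and locking is elided because no other goroutine can see sh yet):

// signalDispositionsAcrossForkExec is an illustrative sketch only.
func signalDispositionsAcrossForkExec() {
	sh := NewSignalHandlers()
	sh.actions[linux.SIGUSR1] = arch.SignalAct{Handler: arch.SignalActIgnore}
	sh.actions[linux.SIGTERM] = arch.SignalAct{Handler: 0x400000} // user handler

	forked := sh.Fork()        // fork(2): both dispositions are copied
	execed := sh.CopyForExec() // execve(2): only the ignored SIGUSR1 entry survives
	_, _ = forked, execed
}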
+type SyscallFlagsTable struct { + // mu protects writes to the fields below. + // + // Atomic loads are always allowed. Atomic stores are allowed only + // while mu is held. + mu sync.Mutex + + // enable contains the enable bits for each syscall. + // + // missing syscalls have the same value in enable as missingEnable to + // avoid an extra branch in Word. + enable []uint32 + + // missingEnable contains the enable bits for missing syscalls. + missingEnable uint32 +} + +// Init initializes the struct, with all syscalls in table set to enable. +// +// max is the largest syscall number in table. +func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) { + e.enable = make([]uint32, max+1) + for num := range table { + e.enable[num] = syscallPresent + } +} + +// Word returns the enable bitfield for sysno. +func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { + if sysno < uintptr(len(e.enable)) { + return atomic.LoadUint32(&e.enable[sysno]) + } + + return atomic.LoadUint32(&e.missingEnable) +} + +// Enable sets enable bit bit for all syscalls based on s. +// +// Syscalls missing from s are disabled. +// +// Syscalls missing from the initial table passed to Init cannot be added as +// individual syscalls. If present in s they will be ignored. +// +// Callers to Word may see either the old or new value while this function +// is executing. +func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + if missingEnable { + missingVal |= bit + } else { + missingVal &^= bit + } + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + if s[uintptr(num)] { + val |= bit + } else { + val &^= bit + } + atomic.StoreUint32(&e.enable[num], val) + } +} + +// EnableAll sets enable bit bit for all syscalls, present and missing. +func (e *SyscallFlagsTable) EnableAll(bit uint32) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + missingVal |= bit + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + val |= bit + atomic.StoreUint32(&e.enable[num], val) + } +} + +// Stracer traces syscall execution. +type Stracer interface { + // SyscallEnter is called on syscall entry. + // + // The returned private data is passed to SyscallExit. + // + // TODO: remove kernel imports from the strace package so + // that the type can be used directly. + SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} + + // SyscallExit is called on syscall exit. + SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) +} + +// SyscallTable is a lookup table of system calls. Critically, a SyscallTable +// is *immutable*. In order to make supporting suspend and resume sane, they +// must be uniquely registered and may not change during operation. +type SyscallTable struct { + // OS is the operating system that this syscall table implements. + OS abi.OS `state:"wait"` + + // Arch is the architecture that this syscall table targets. + Arch arch.Arch `state:"wait"` + + // The OS version that this syscall table implements. 
+	Version Version `state:"manual"`
+
+	// AuditNumber is a numeric constant that represents the syscall table. If
+	// non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by
+	// linux/audit.h.
+	AuditNumber uint32 `state:"manual"`
+
+	// Table is the collection of functions.
+	Table map[uintptr]SyscallFn `state:"manual"`
+
+	// lookup is a fixed-size array that holds the syscalls (indexed by
+	// their numbers). It is used for fast lookups.
+	lookup []SyscallFn `state:"manual"`
+
+	// Emulate is a collection of instruction addresses to emulate. The
+	// keys are addresses, and the values are system call numbers.
+	Emulate map[usermem.Addr]uintptr `state:"manual"`
+
+	// The function to call in case of a missing system call.
+	Missing MissingFn `state:"manual"`
+
+	// Stracer traces this syscall table.
+	Stracer Stracer `state:"manual"`
+
+	// External is used to handle an external callback.
+	External func(*Kernel) `state:"manual"`
+
+	// ExternalFilterBefore is called, before the syscall is executed, to
+	// decide whether External should be called. External is not called if
+	// it returns false.
+	ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+	// ExternalFilterAfter is called, after the syscall is executed, to
+	// decide whether External should be called. External is not called if
+	// it returns false.
+	ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+
+	// FeatureEnable stores the strace and one-shot enable bits.
+	FeatureEnable SyscallFlagsTable `state:"manual"`
+}
+
+// allSyscallTables contains all known tables.
+var allSyscallTables []*SyscallTable
+
+// SyscallTables returns a read-only slice of registered SyscallTables.
+func SyscallTables() []*SyscallTable {
+	return allSyscallTables
+}
+
+// LookupSyscallTable returns the SyscallTable for the OS/Arch combination.
+func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
+	for _, s := range allSyscallTables {
+		if s.OS == os && s.Arch == a {
+			return s, true
+		}
+	}
+	return nil, false
+}
+
+// RegisterSyscallTable registers a new syscall table for use by a Kernel.
+func RegisterSyscallTable(s *SyscallTable) {
+	if s.Table == nil {
+		// Ensure non-nil lookup table.
+		s.Table = make(map[uintptr]SyscallFn)
+	}
+	if s.Emulate == nil {
+		// Ensure non-nil emulate table.
+		s.Emulate = make(map[usermem.Addr]uintptr)
+	}
+
+	var max uintptr
+	for num := range s.Table {
+		if num > max {
+			max = num
+		}
+	}
+
+	if max > maxSyscallNum {
+		panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max))
+	}
+
+	s.lookup = make([]SyscallFn, max+1)
+
+	// Initialize the fast-lookup table.
+	for num, fn := range s.Table {
+		s.lookup[num] = fn
+	}
+
+	s.FeatureEnable.init(s.Table, max)
+
+	if _, ok := LookupSyscallTable(s.OS, s.Arch); ok {
+		panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch))
+	}
+
+	// Save a reference to this table.
+	//
+	// This is required for a Kernel to find the table and for save/restore
+	// operations below.
+	allSyscallTables = append(allSyscallTables, s)
+}
+
+// Lookup returns the syscall implementation, if one exists.
+func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
+	if sysno < uintptr(len(s.lookup)) {
+		return s.lookup[sysno]
+	}
+
+	return nil
+}
+
+// LookupEmulate looks up an emulation syscall number.
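Because the enable words are read and written with atomic operations, strace-style tracing can be toggled on a registered table at runtime without pausing tasks; callers of Word simply observe either the old or the new bits. A hypothetical sketch of flipping those bits (the helper name is illustrative; syscall numbers 0 and 1 are read(2) and write(2) on amd64):

// enableStraceLogging is a hypothetical helper, sketched here to show the
// intended use of FeatureEnable; it is not part of this file.
func enableStraceLogging(all bool) {
	table, ok := LookupSyscallTable(abi.Linux, arch.AMD64)
	if !ok {
		return
	}
	if all {
		// Log every syscall, including ones missing from the table.
		table.FeatureEnable.EnableAll(StraceEnableLog)
		return
	}
	// Otherwise log only read(2) and write(2); every other syscall,
	// including missing ones, has the log bit cleared.
	table.FeatureEnable.Enable(StraceEnableLog, map[uintptr]bool{0: true, 1: true}, false)
}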
+func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { + sysno, ok := s.Emulate[addr] + return sysno, ok +} + +// mapLookup is similar to Lookup, except that it only uses the syscall table, +// that is, it skips the fast look array. This is available for benchmarking. +func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { + return s.Table[sysno] +} diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go new file mode 100644 index 000000000..826809a70 --- /dev/null +++ b/pkg/sentry/kernel/syscalls_state.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "fmt" + +// afterLoad is invoked by stateify. +func (s *SyscallTable) afterLoad() { + otherTable, ok := LookupSyscallTable(s.OS, s.Arch) + if !ok { + // Couldn't find a reference? + panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch)) + } + + // Copy the table. + *s = *otherTable +} diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go new file mode 100644 index 000000000..31541749e --- /dev/null +++ b/pkg/sentry/kernel/syslog.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math/rand" + "sync" +) + +// syslog represents a sentry-global kernel log. +// +// Currently, it contains only fun messages for a dmesg easter egg. +type syslog struct { + // mu protects the below. + mu sync.Mutex `state:"nosave"` + + // msg is the syslog message buffer. It is lazily initialized. + msg []byte +} + +// Log returns a copy of the syslog. +func (s *syslog) Log() []byte { + s.mu.Lock() + defer s.mu.Unlock() + + if s.msg != nil { + // Already initialized, just return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o + } + + // Not initialized, create message. 
+ allMessages := []string{ + "Synthesizing system calls...", + "Mounting deweydecimalfs...", + "Moving files to filing cabinet...", + "Digging up root...", + "Constructing home...", + "Segmenting fault lines...", + "Creating bureaucratic processes...", + "Searching for needles in stacks...", + "Preparing for the zombie uprising...", + "Feeding the init monster...", + "Creating cloned children...", + "Daemonizing children...", + "Waiting for children...", + "Gathering forks...", + "Committing treasure map to memory...", + "Reading process obituaries...", + "Searching for socket adapter...", + "Creating process schedule...", + "Generating random numbers by fair dice roll...", + "Rewriting operating system in Javascript...", + "Consulting tar man page...", + "Forking spaghetti code...", + "Checking naughty and nice process list...", + "Checking naughty and nice process list...", // Check it up to twice. + "Granting licence to kill(2)...", // British spelling for British movie. + "Letting the watchdogs out...", + } + + selectMessage := func() string { + i := rand.Intn(len(allMessages)) + m := allMessages[i] + + // Delete the selected message. + allMessages[i] = allMessages[len(allMessages)-1] + allMessages = allMessages[:len(allMessages)-1] + + return m + } + + time := 0.0 + for i := 0; i < 10; i++ { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] %s\n", time, selectMessage()))...) + } + + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf("<6>[%11.6f] Ready!\n", time))...) + + // Return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o +} diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go new file mode 100644 index 000000000..71ca75555 --- /dev/null +++ b/pkg/sentry/kernel/table_test.go @@ -0,0 +1,108 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + maxTestSyscall = 1000 +) + +func createSyscallTable() *SyscallTable { + m := make(map[uintptr]SyscallFn) + for i := uintptr(0); i <= maxTestSyscall; i++ { + j := i + m[i] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { + return j, nil, nil + } + } + + s := &SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Table: m, + } + + RegisterSyscallTable(s) + return s +} + +func TestTable(t *testing.T) { + table := createSyscallTable() + defer func() { + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} + }() + + // Go through all functions and check that they return the right value. 
+ for i := uintptr(0); i < maxTestSyscall; i++ { + fn := table.Lookup(i) + if fn == nil { + t.Errorf("Syscall %v is set to nil", i) + continue + } + + v, _, _ := fn(nil, arch.SyscallArguments{}) + if v != i { + t.Errorf("Wrong return value for syscall %v: expected %v, got %v", i, i, v) + } + } + + // Check that values outside the range return nil. + for i := uintptr(maxTestSyscall + 1); i < maxTestSyscall+100; i++ { + fn := table.Lookup(i) + if fn != nil { + t.Errorf("Syscall %v is not nil: %v", i, fn) + continue + } + } +} + +func BenchmarkTableLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.Lookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} + +func BenchmarkTableMapLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.mapLookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go new file mode 100644 index 000000000..3d2e035e9 --- /dev/null +++ b/pkg/sentry/kernel/task.go @@ -0,0 +1,606 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + ssync "gvisor.googlesource.com/gvisor/pkg/sync" +) + +// Task represents a thread of execution in the untrusted app. It +// includes registers and any thread-specific state that you would +// normally expect. +// +// Each task is associated with a goroutine, called the task goroutine, that +// executes code (application code, system calls, etc.) on behalf of that task. +// See Task.run (task_run.go). +// +// All fields that are "owned by the task goroutine" can only be mutated by the +// task goroutine while it is running. The task goroutine does not require +// synchronization to read these fields, although it still requires +// synchronization as described for those fields to mutate them. +// +// All fields that are "exclusive to the task goroutine" can only be accessed +// by the task goroutine while it is running. 
The task goroutine does not +// require synchronization to read or write these fields. +type Task struct { + taskNode + + // runState is what the task goroutine is executing if it is not stopped. + // If runState is nil, the task goroutine should exit or has exited. + // runState is exclusive to the task goroutine. + runState taskRunState + + // haveSyscallReturn is true if tc.Arch().Return() represents a value + // returned by a syscall (or set by ptrace after a syscall). + // + // haveSyscallReturn is exclusive to the task goroutine. + haveSyscallReturn bool + + // interruptChan is notified whenever the task goroutine is interrupted + // (usually by a pending signal). interruptChan is effectively a condition + // variable that can be used in select statements. + // + // interruptChan is not saved; because saving interrupts all tasks, + // interruptChan is always notified after restore (see Task.run). + interruptChan chan struct{} `state:"nosave"` + + // gosched contains the current scheduling state of the task goroutine. + // + // gosched is protected by goschedSeq. gosched is owned by the task + // goroutine. + goschedSeq ssync.SeqCount `state:"nosave"` + gosched TaskGoroutineSchedInfo + + // yieldCount is the number of times the task goroutine has called + // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or + // Task.Yield(), voluntarily ceasing execution. + // + // yieldCount is accessed using atomic memory operations. yieldCount is + // owned by the task goroutine. + yieldCount uint64 + + // pendingSignals is the set of pending signals that may be handled only by + // this task. + // + // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu + // (hereafter "the signal mutex"); see comment on + // ThreadGroup.signalHandlers. + pendingSignals pendingSignals + + // If haveSavedSignalMask is true, savedSignalMask is the signal mask that + // should be applied after the task has either delivered one signal to a + // user handler or is about to resume execution in the untrusted + // application. + // + // Both haveSavedSignalMask and savedSignalMask are exclusive to the task + // goroutine. + haveSavedSignalMask bool + savedSignalMask linux.SignalSet + + // signalStack is the alternate signal stack used by signal handlers for + // which the SA_ONSTACK flag is set. + // + // signalStack is exclusive to the task goroutine. + signalStack arch.SignalStack + + // If groupStopRequired is true, the task should enter a group stop in the + // interrupt path. groupStopRequired is not redundant with + // tg.groupStopPhase != groupStopNone, because ptrace allows tracers to + // resume individual tasks from a group stop without ending the group stop + // as a whole. + // + // groupStopRequired is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux only uses that flag for ptraced tasks. + // + // groupStopRequired is protected by the signal mutex. + groupStopRequired bool + + // If groupStopAcknowledged is true, the task has already acknowledged that + // it is entering the most recent group stop that has been initiated on its + // thread group. groupStopAcknowledged is only meaningful if + // tg.groupStopPhase == groupStopInitiated. + // + // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. + // + // groupStopAcknowledged is protected by the signal mutex. + groupStopAcknowledged bool + + // If stop is not nil, it is the internally-initiated condition that + // currently prevents the task goroutine from running. 
+ // + // stop is protected by the signal mutex. + stop TaskStop + + // stopCount is the number of active external stops (calls to + // Task.BeginExternalStop that have not been paired with a call to + // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is + // non-zero if the task goroutine should stop. + // + // Mutating stopCount requires both locking the signal mutex and using + // atomic memory operations. Reading stopCount requires either locking the + // signal mutex or using atomic memory operations. This allows Task.doStop + // to require only a single atomic read in the common case where stopCount + // is 0. + // + // stopCount is not saved, because external stops cannot be retained across + // a save/restore cycle. (Suppose a sentryctl command issues an external + // stop; after a save/restore cycle, the restored sentry has no knowledge + // of the pre-save sentryctl command, and the stopped task would remain + // stopped forever.) + stopCount int32 `state:"nosave"` + + // endStopCond is signaled when stopCount transitions to 0. The combination + // of stopCount and endStopCond effectively form a sync.WaitGroup, but + // WaitGroup provides no way to read its counter value. + // + // Invariant: endStopCond.L is the signal mutex. (This is not racy because + // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine + // calls sync.Cond.Wait; and only the task goroutine can change the + // identity of the signal mutex, in Task.finishExec.) + endStopCond sync.Cond `state:"nosave"` + + // exitStatus is the task's exit status. + // + // exitStatus is protected by the signal mutex. + exitStatus ExitStatus + + // syscallRestartBlock represents a custom restart function to run in + // restart_syscall(2) to resume an interrupted syscall. + // + // syscallRestartBlock is exclusive to the task goroutine. + syscallRestartBlock SyscallRestartBlock + + // mu protects some of the following fields. + mu sync.Mutex `state:"nosave"` + + // tc and tr form the majority of the task's data. + // + // tc and tr are protected by mu. tc and tr are owned by the task + // goroutine. tr.signalMask is protected by the signal mutex and must be + // written using atomic memory operations (such that reading tr.signalMask + // is safe if the signal mutex is locked or if atomic memory operations are + // used), but is also owned by the task goroutine. + tc TaskContext + tr TaskResources + + // p provides the mechanism by which the task runs code in userspace. The p + // interface object is immutable. + p platform.Context `state:"nosave"` + + // k is the Kernel that this task belongs to. The k pointer is immutable. + k *Kernel + + // If vforkParent is not nil, it is the task that created this task with + // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when + // this TaskContext is released. + // + // vforkParent is protected by the TaskSet mutex. + vforkParent *Task + + // exitState is the task's progress through the exit path. + // + // exitState is protected by the TaskSet mutex. exitState is owned by the + // task goroutine. + exitState TaskExitState + + // exitTracerNotified is true if the exit path has either signaled the + // task's tracer to indicate the exit, or determined that no such signal is + // needed. exitTracerNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitTracerNotified is protected by the TaskSet mutex. 
+ exitTracerNotified bool + + // exitTracerAcked is true if exitTracerNotified is true and either the + // task's tracer has acknowledged the exit notification, or the exit path + // has determined that no such notification is needed. + // + // exitTracerAcked is protected by the TaskSet mutex. + exitTracerAcked bool + + // exitParentNotified is true if the exit path has either signaled the + // task's parent to indicate the exit, or determined that no such signal is + // needed. exitParentNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitParentNotified is protected by the TaskSet mutex. + exitParentNotified bool + + // exitParentAcked is true if exitParentNotified is true and either the + // task's parent has acknowledged the exit notification, or the exit path + // has determined that no such acknowledgment is needed. + // + // exitParentAcked is protected by the TaskSet mutex. + exitParentAcked bool + + // goroutineStopped is a WaitGroup whose counter value is 1 when the task + // goroutine is running and 0 when the task goroutine is stopped or has + // exited. + goroutineStopped sync.WaitGroup `state:"nosave"` + + // ptraceTracer is the task that is ptrace-attached to this one. If + // ptraceTracer is nil, this task is not being traced. Note that due to + // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil + // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). + // + // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic + // operations. This allows paths that wouldn't otherwise lock the TaskSet + // mutex, notably the syscall path, to check if ptraceTracer is nil without + // additional synchronization. + ptraceTracer atomic.Value `state:".(*Task)"` + + // ptraceTracees is the set of tasks that this task is ptrace-attached to. + // + // ptraceTracees is protected by the TaskSet mutex. + ptraceTracees map[*Task]struct{} + + // ptraceOpts contains ptrace options explicitly set by the tracer. If + // ptraceTracer is nil, ptraceOpts is expected to be the zero value. + // + // ptraceOpts is protected by the TaskSet mutex. + ptraceOpts ptraceOptions + + // ptraceSyscallMode controls ptrace behavior around syscall entry and + // exit. + // + // ptraceSyscallMode is protected by the TaskSet mutex. + ptraceSyscallMode ptraceSyscallMode + + // If ptraceSinglestep is true, the next time the task executes application + // code, single-stepping should be enabled. ptraceSinglestep is stored + // independently of the architecture-specific trap flag because tracer + // detaching (which can happen concurrently with the tracee's execution if + // the tracer exits) must disable single-stepping, and the task's + // architectural state is implicitly exclusive to the task goroutine (no + // synchronization occurs before passing registers to SwitchToApp). + // + // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. + // + // ptraceSinglestep is protected by the TaskSet mutex. + ptraceSinglestep bool + + // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the + // time that t entered the ptrace stop, reset to 0 when the tracer + // acknowledges the stop with a wait*() syscall. Otherwise, it is the + // signal number passed to the ptrace operation that ended the last ptrace + // stop on this task. 
In the latter case, the effect of ptraceCode depends + // on the nature of the ptrace stop; signal-delivery-stop uses it to + // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the + // signal to the task after leaving the stop, and PTRACE_EVENT stops and + // traced group stops ignore it entirely. + // + // Linux contextually stores the equivalent of ptraceCode in + // task_struct::exit_code. + // + // ptraceCode is protected by the TaskSet mutex. + ptraceCode int32 + + // ptraceSiginfo is the value returned to the tracer by + // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). + // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) + // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is + // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which + // is in turn required to distinguish group stops from other ptrace stops, + // per subsection "Group-stop" in ptrace(2)). + // + // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. + // + // ptraceSiginfo is protected by the TaskSet mutex. + ptraceSiginfo *arch.SignalInfo + + // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to + // the tracer by ptrace(PTRACE_GETEVENTMSG). + // + // ptraceEventMsg is protected by the TaskSet mutex. + ptraceEventMsg uint64 + + // The struct that holds the IO-related usage. The ioUsage pointer is + // immutable. + ioUsage *usage.IO + + // logPrefix is a string containing the task's thread ID in the root PID + // namespace, and is prepended to log messages emitted by Task.Infof etc. + logPrefix atomic.Value `state:".(string)"` + + // creds is the task's credentials. + // + // creds is protected by mu. + creds *auth.Credentials + + // utsns is the task's UTS namespace. + // + // utsns is protected by mu. + utsns *UTSNamespace + + // ipcns is the task's IPC namespace. + // + // ipcns is protected by mu. + ipcns *IPCNamespace + + // parentDeathSignal is sent to this task's thread group when its parent exits. + // + // parentDeathSignal is protected by mu. + parentDeathSignal linux.Signal + + // syscallFilters is all seccomp-bpf syscall filters applicable to the + // task, in the order in which they were installed. + // + // syscallFilters is protected by mu. syscallFilters is owned by the task + // goroutine. + syscallFilters []bpf.Program + + // If cleartid is non-zero, treat it as a pointer to a ThreadID in the + // task's virtual address space; when the task exits, set the pointed-to + // ThreadID to 0, and wake any futex waiters. + // + // cleartid is exclusive to the task goroutine. + cleartid usermem.Addr + + // This is mostly a fake cpumask just for sched_set/getaffinity as we + // don't really control the affinity. + // + // Invariant: allowedCPUMask.Size() == + // sched.CPUMaskSize(Kernel.applicationCores). + // + // allowedCPUMask is protected by mu. + allowedCPUMask sched.CPUSet + + // cpu is the fake cpu number returned by getcpu(2). cpu is ignored + // entirely if Kernel.useHostCores is true. + // + // cpu is accessed using atomic memory operations. + cpu int32 + + // This is used to keep track of changes made to a process' priority/niceness. + // It is mostly used to provide some reasonable return value from + // getpriority(2) after a call to setpriority(2) has been made. + // We currently do not actually modify a process' scheduling priority. + // NOTE: This represents the userspace view of priority (nice). + // This means that the value should be in the range [-20, 19]. 
+ // + // niceness is protected by mu. + niceness int + + // This is used to track the numa policy for the current thread. This can be + // modified through a set_mempolicy(2) syscall. Since we always report a + // single numa node, all policies are no-ops. We only track this information + // so that we can return reasonable values if the application calls + // get_mempolicy(2) after setting a non-default policy. Note that in the + // real syscall, nodemask can be longer than 4 bytes, but we always report a + // single node so never need to save more than a single bit. + // + // numaPolicy and numaNodeMask are protected by mu. + numaPolicy int32 + numaNodeMask uint32 + + // If netns is true, the task is in a non-root network namespace. Network + // namespaces aren't currently implemented in full; being in a network + // namespace simply prevents the task from observing any network devices + // (including loopback) or using abstract socket addresses (see unix(7)). + // + // netns is protected by mu. netns is owned by the task goroutine. + netns bool + + // If rseqPreempted is true, before the next call to p.Switch(), interrupt + // RSEQ critical regions as defined by tg.rseq and write the task + // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number + // written to rseqCPUAddr. + // + // If rseqCPUAddr is 0, rseqCPU is -1. + // + // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task + // goroutine. + rseqPreempted bool `state:"nosave"` + rseqCPUAddr usermem.Addr + rseqCPU int32 + + // copyScratchBuffer is a buffer available to CopyIn/CopyOut + // implementations that require an intermediate buffer to copy data + // into/out of. It prevents these buffers from being allocated/zeroed in + // each syscall and eventually garbage collected. + // + // copyScratchBuffer is exclusive to the task goroutine. + copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` + + // blockingTimer is used for blocking timeouts. blockingTimerChan is the + // channel that is sent to when blockingTimer fires. + // + // blockingTimer is exclusive to the task goroutine. + blockingTimer *ktime.Timer `state:"nosave"` + blockingTimerChan <-chan struct{} `state:"nosave"` + + // futexWaiter is used for futex(FUTEX_WAIT) syscalls. + // + // futexWaiter is exclusive to the task goroutine. + futexWaiter *futex.Waiter `state:"nosave"` + + // startTime is the real time at which the task started. It is set when + // a Task is created or invokes execve(2). + // + // startTime is protected by mu. + startTime ktime.Time +} + +func (t *Task) savePtraceTracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +func (t *Task) loadPtraceTracer(tracer *Task) { + t.ptraceTracer.Store(tracer) +} + +func (t *Task) saveLogPrefix() string { + return t.logPrefix.Load().(string) +} + +func (t *Task) loadLogPrefix(prefix string) { + t.logPrefix.Store(prefix) +} + +// afterLoad is invoked by stateify. +func (t *Task) afterLoad() { + t.interruptChan = make(chan struct{}, 1) + t.gosched.State = TaskGoroutineNonexistent + if t.stop != nil { + t.stopCount = 1 + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.p = t.k.Platform.NewContext() + t.rseqPreempted = true + t.futexWaiter = futex.NewWaiter() +} + +// copyScratchBufferLen is the length of the copyScratchBuffer field of the Task +// struct. +const copyScratchBufferLen = 52 + +// TaskMaybe is the interface for extracting Tasks out of things which may be +// or contain Task objects. +type TaskMaybe interface { + // ExtractTask returns the Task. 
+ ExtractTask() *Task +} + +// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut +// functions. It must only be used within those functions and can only be used +// by the task goroutine; it exists to improve performance and thus +// intentionally lacks any synchronization. +// +// Callers should pass a constant value as an argument, which will allow the +// compiler to inline and optimize out the if statement below. +func (t *Task) CopyScratchBuffer(size int) []byte { + if size > copyScratchBufferLen { + return make([]byte, size) + } + return t.copyScratchBuffer[:size] +} + +// FutexWaiter returns the Task's futex.Waiter. +func (t *Task) FutexWaiter() *futex.Waiter { + return t.futexWaiter +} + +// ExtractTask implements TaskMaybe.ExtractTask. +func (t *Task) ExtractTask() *Task { + return t +} + +// TaskContext returns t's TaskContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskContext() *TaskContext { + return &t.tc +} + +// TaskResources returns t's TaskResources. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskResources() *TaskResources { + return &t.tr +} + +// WithMuLocked executes f with t.mu locked. +func (t *Task) WithMuLocked(f func(*Task)) { + t.mu.Lock() + defer t.mu.Unlock() + f(t) +} + +// Kernel returns the Kernel containing t. +func (t *Task) Kernel() *Kernel { + return t.k +} + +// Value implements context.Context.Value. +func (t *Task) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + return t.utsns + case CtxIPCNamespace: + return t.ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.creds + case fs.CtxRoot: + return t.FSContext().RootDirectory() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + default: + return nil + } +} + +// SetClearTID sets t's cleartid. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetClearTID(addr usermem.Addr) { + t.cleartid = addr +} + +// SetSyscallRestartBlock sets the restart block for use in +// restart_syscall(2). After registering a restart block, a syscall should +// return ERESTART_RESTARTBLOCK to request a restart using the block. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { + t.syscallRestartBlock = r +} + +// SyscallRestartBlock returns the currently registered restart block for use in +// restart_syscall(2). This function is *not* idempotent and may be called once +// per syscall. This function must not be called if a restart block has not been +// registered for the current syscall. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SyscallRestartBlock() SyscallRestartBlock { + r := t.syscallRestartBlock + // Explicitly set the restart block to nil so that a future syscall can't + // accidentally reuse it. 
+	t.syscallRestartBlock = nil
+	return r
+}
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
new file mode 100644
index 000000000..ce12cdb64
--- /dev/null
+++ b/pkg/sentry/kernel/task_acct.go
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Accounting, limits, timers.
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// IOUsage returns the I/O usage of the thread.
+func (t *Task) IOUsage() *usage.IO {
+	return t.ioUsage
+}
+
+// IOUsage returns the total I/O usage of all dead and live threads in the group.
+func (tg *ThreadGroup) IOUsage() *usage.IO {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+
+	io := *tg.ioUsage
+	// Account for active tasks.
+	for t := tg.tasks.Front(); t != nil; t = t.Next() {
+		io.Accumulate(t.IOUsage())
+	}
+	return &io
+}
+
+// Name returns t's name.
+func (t *Task) Name() string {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.tc.Name
+}
+
+// SetName changes t's name.
+func (t *Task) SetName(name string) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.tc.Name = name
+	t.Debugf("Set thread name to %q", name)
+}
+
+// SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft
+// limits on CPU time used by this process.
+func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit) {
+	tg.Timer().applyCPULimits(*l)
+}
+
+// Limits implements context.Context.Limits.
+func (t *Task) Limits() *limits.LimitSet {
+	return t.ThreadGroup().Limits()
+}
+
+// StartTime returns t's start time.
+func (t *Task) StartTime() ktime.Time {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.startTime
+}
+
+// MaxRSS returns the maximum resident set size of the task in bytes, for the
+// scope selected by which, which should be one of RUSAGE_SELF,
+// RUSAGE_CHILDREN, RUSAGE_THREAD, or RUSAGE_BOTH. See getrusage(2) for
+// documentation on the behavior of these flags.
+func (t *Task) MaxRSS(which int32) uint64 {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+
+	switch which {
+	case linux.RUSAGE_SELF, linux.RUSAGE_THREAD:
+		// If there's an active mm we can use its value.
+		if mm := t.MemoryManager(); mm != nil {
+			if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS {
+				return mmMaxRSS
+			}
+		}
+		return t.tg.maxRSS
+	case linux.RUSAGE_CHILDREN:
+		return t.tg.childMaxRSS
+	case linux.RUSAGE_BOTH:
+		maxRSS := t.tg.maxRSS
+		if maxRSS < t.tg.childMaxRSS {
+			maxRSS = t.tg.childMaxRSS
+		}
+		if mm := t.MemoryManager(); mm != nil {
+			if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS {
+				return mmMaxRSS
+			}
+		}
+		return maxRSS
+	default:
+		// We'll only get here if which is invalid.
+ return 0 + } +} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go new file mode 100644 index 000000000..9fd24f134 --- /dev/null +++ b/pkg/sentry/kernel/task_block.go @@ -0,0 +1,207 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// BlockWithTimeout blocks t until an event is received from C, the application +// monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), +// or t is interrupted. It returns: +// +// - The remaining timeout, which is guaranteed to be 0 if the timeout expired, +// and is unspecified if haveTimeout is false. +// +// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout +// expired, and syserror.ErrInterrupted if t is interrupted. +func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { + if !haveTimeout { + return timeout, t.block(C, nil) + } + + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(timeout) + err := t.BlockWithDeadline(C, true, deadline) + + // Timeout, explicitly return a remaining duration of 0. + if err == syserror.ETIMEDOUT { + return 0, err + } + + // Compute the remaining timeout. Note that even if block() above didn't + // return due to a timeout, we may have used up any of the remaining time + // since then. We cap the remaining timeout to 0 to make it easier to + // directly use the returned duration. + end := t.Kernel().MonotonicClock().Now() + remainingTimeout := timeout - end.Sub(start) + if remainingTimeout < 0 { + remainingTimeout = 0 + } + + return remainingTimeout, err +} + +// BlockWithDeadline blocks t until an event is received from C, the +// application monotonic clock indicates a time of deadline (only if +// haveDeadline is true), or t is interrupted. It returns nil if an event is +// received from C, ETIMEDOUT if the deadline expired, and +// syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { + if !haveDeadline { + return t.block(C, nil) + } + + // Start the timeout timer. + t.blockingTimer.Swap(ktime.Setting{ + Enabled: true, + Next: deadline, + }) + + err := t.block(C, t.blockingTimerChan) + + // Stop the timeout timer and drain the channel. + t.blockingTimer.Swap(ktime.Setting{}) + select { + case <-t.blockingTimerChan: + default: + } + + return err +} + +// BlockWithTimer blocks t until an event is received from C or tchan, or t is +// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an +// event is received from tchan, and syserror.ErrInterrupted if t is +// interrupted. +// +// Most clients should use BlockWithDeadline or BlockWithTimeout instead. 
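// Editor's illustrative sketch (not part of the original change): a typical
// caller of these blocking helpers might look roughly like
//
//	ch := make(chan struct{}, 1) // signalled when the awaited event occurs
//	if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
//		return err // syserror.ETIMEDOUT or syserror.ErrInterrupted
//	}
//
// where ch, haveDeadline, and deadline are the caller's own state.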
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithTimer(C chan struct{}, tchan <-chan struct{}) error {
+	return t.block(C, tchan)
+}
+
+// Block blocks t until an event is received from C or t is interrupted. It
+// returns nil if an event is received from C and syserror.ErrInterrupted if t
+// is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Block(C chan struct{}) error {
+	return t.block(C, nil)
+}
+
+// block blocks a task on one of many events.
+// N.B. defer is too expensive to be used here.
+func (t *Task) block(C chan struct{}, timerChan <-chan struct{}) error {
+	// Fast path if the request is already done.
+	select {
+	case <-C:
+		return nil
+	default:
+	}
+
+	// Deactivate our address space; we don't need it.
+	interrupt := t.SleepStart()
+
+	select {
+	case <-C:
+		t.SleepFinish(true)
+		return nil
+
+	case <-interrupt:
+		t.SleepFinish(false)
+		// Return the indicated error on interrupt.
+		return syserror.ErrInterrupted
+
+	case <-timerChan:
+		// We've timed out.
+		t.SleepFinish(true)
+		return syserror.ETIMEDOUT
+	}
+}
+
+// SleepStart implements amutex.Sleeper.SleepStart.
+func (t *Task) SleepStart() <-chan struct{} {
+	t.Deactivate()
+	t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
+	return t.interruptChan
+}
+
+// SleepFinish implements amutex.Sleeper.SleepFinish.
+func (t *Task) SleepFinish(success bool) {
+	if !success {
+		// The interrupted notification is consumed only at the top-level
+		// (Run). Therefore we attempt to reset the pending notification.
+		// This will also elide our next entry back into the task, so we
+		// will process signals, state changes, etc.
+		t.interruptSelf()
+	}
+	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
+	t.Activate()
+}
+
+// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
+func (t *Task) UninterruptibleSleepStart(deactivate bool) {
+	if deactivate {
+		t.Deactivate()
+	}
+	t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible)
+}
+
+// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish.
+func (t *Task) UninterruptibleSleepFinish(activate bool) {
+	t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible)
+	if activate {
+		t.Activate()
+	}
+}
+
+// interrupted returns true if interrupt or interruptSelf has been called at
+// least once since the last call to interrupted.
+func (t *Task) interrupted() bool {
+	select {
+	case <-t.interruptChan:
+		return true
+	default:
+		return false
+	}
+}
+
+// interrupt unblocks the task and interrupts it if it's currently running in
+// userspace.
+func (t *Task) interrupt() {
+	t.interruptSelf()
+	t.p.Interrupt()
+}
+
+// interruptSelf is like interrupt, but can only be called by the task
+// goroutine.
+func (t *Task) interruptSelf() {
+	select {
+	case t.interruptChan <- struct{}{}:
+		t.Debugf("Interrupt queued")
+	default:
+		t.Debugf("Dropping duplicate interrupt")
+	}
+	// platform.Context.Interrupt() is unnecessary since a task goroutine
+	// calling interruptSelf() cannot also be blocked in
+	// platform.Context.Switch().
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
new file mode 100644
index 000000000..3a74abdfb
--- /dev/null
+++ b/pkg/sentry/kernel/task_clone.go
@@ -0,0 +1,475 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SharingOptions controls what resources are shared by a new task created by +// Task.Clone, or an existing task affected by Task.Unshare. +type SharingOptions struct { + // If NewAddressSpace is true, the task should have an independent virtual + // address space. + NewAddressSpace bool + + // If NewSignalHandlers is true, the task should use an independent set of + // signal handlers. + NewSignalHandlers bool + + // If NewThreadGroup is true, the task should be the leader of its own + // thread group. TerminationSignal is the signal that the thread group + // will send to its parent when it exits. If NewThreadGroup is false, + // TerminationSignal is ignored. + NewThreadGroup bool + TerminationSignal linux.Signal + + // If NewPIDNamespace is true: + // + // - In the context of Task.Clone, the new task should be the init task + // (TID 1) in a new PID namespace. + // + // - In the context of Task.Unshare, the task should create a new PID + // namespace, and all subsequent clones of the task should be members of + // the new PID namespace. + NewPIDNamespace bool + + // If NewUserNamespace is true, the task should have an independent user + // namespace. + NewUserNamespace bool + + // If NewNetworkNamespace is true, the task should have an independent + // network namespace. (Note that network namespaces are not really + // implemented; see comment on Task.netns for details.) + NewNetworkNamespace bool + + // If NewFiles is true, the task should use an independent file descriptor + // table. + NewFiles bool + + // If NewFSContext is true, the task should have an independent FSContext. + NewFSContext bool + + // If NewUTSNamespace is true, the task should have an independent UTS + // namespace. + NewUTSNamespace bool + + // If NewIPCNamespace is true, the task should have an independent IPC + // namespace. + NewIPCNamespace bool +} + +// CloneOptions controls the behavior of Task.Clone. +type CloneOptions struct { + // SharingOptions defines the set of resources that the new task will share + // with its parent. + SharingOptions + + // Stack is the initial stack pointer of the new task. If Stack is 0, the + // new task will start with the same stack pointer as its parent. + Stack usermem.Addr + + // If SetTLS is true, set the new task's TLS (thread-local storage) + // descriptor to TLS. If SetTLS is false, TLS is ignored. + SetTLS bool + TLS usermem.Addr + + // If ChildClearTID is true, when the child exits, 0 is written to the + // address ChildTID in the child's memory, and if the write is successful a + // futex wake on the same address is performed. + // + // If ChildSetTID is true, the child's thread ID (in the child's PID + // namespace) is written to address ChildTID in the child's memory. (As in + // Linux, failed writes are silently ignored.) 
+ ChildClearTID bool + ChildSetTID bool + ChildTID usermem.Addr + + // If ParentSetTID is true, the child's thread ID (in the parent's PID + // namespace) is written to address ParentTID in the parent's memory. (As + // in Linux, failed writes are silently ignored.) + // + // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID + // causes the child's thread ID to be written to ptid in both the parent + // and child's memory, but this is a documentation error fixed by + // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). + ParentSetTID bool + ParentTID usermem.Addr + + // If Vfork is true, place the parent in vforkStop until the cloned task + // releases its TaskContext. + Vfork bool + + // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for + // this clone(), and do not ptrace-attach the caller's tracer to the new + // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). + Untraced bool + + // If InheritTracer is true, ptrace-attach the caller's tracer to the new + // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported + // for it. If both Untraced and InheritTracer are true, no event will be + // reported, but tracer inheritance will still occur. + InheritTracer bool +} + +// Clone implements the clone(2) syscall and returns the thread ID of the new +// task in t's PID namespace. Clone may return both a non-zero thread ID and a +// non-nil error. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { + // Since signal actions may refer to application signal handlers by virtual + // address, any set of signal handlers must refer to the same address + // space. + if !opts.NewSignalHandlers && opts.NewAddressSpace { + return 0, nil, syserror.EINVAL + } + // In order for the behavior of thread-group-directed signals to be sane, + // all tasks in a thread group must share signal handlers. + if !opts.NewThreadGroup && opts.NewSignalHandlers { + return 0, nil, syserror.EINVAL + } + // All tasks in a thread group must be in the same PID namespace. + if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + return 0, nil, syserror.EINVAL + } + // The two different ways of specifying a new PID namespace are + // incompatible. + if opts.NewPIDNamespace && t.childPIDNamespace != nil { + return 0, nil, syserror.EINVAL + } + // Thread groups and FS contexts cannot span user namespaces. + if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + return 0, nil, syserror.EINVAL + } + + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to + // be created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call." - + // user_namespaces(7) + creds := t.Credentials() + var userns *auth.UserNamespace + if opts.NewUserNamespace { + var err error + // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and + // the caller is in a chroot environment (i.e., the caller's root + // directory does not match the root directory of the mount namespace + // in which it resides)." - clone(2). Neither chroot(2) nor + // user_namespaces(7) document this. 
+ if t.IsChrooted() { + return 0, nil, syserror.EPERM + } + userns, err = creds.NewChildUserNamespace() + if err != nil { + return 0, nil, err + } + } + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapability(linux.CAP_SYS_ADMIN) { + return 0, nil, syserror.EPERM + } + + utsns := t.UTSNamespace() + if opts.NewUTSNamespace { + // Note that this must happen after NewUserNamespace so we get + // the new userns if there is one. + utsns = t.UTSNamespace().Clone(userns) + } + + ipcns := t.IPCNamespace() + if opts.NewIPCNamespace { + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + ipcns = NewIPCNamespace() + } + + tc, err := t.tc.Fork(t, !opts.NewAddressSpace) + if err != nil { + return 0, nil, err + } + // clone() returns 0 in the child. + tc.Arch.SetReturn(0) + if opts.Stack != 0 { + tc.Arch.SetStack(uintptr(opts.Stack)) + } + if opts.SetTLS { + tc.Arch.StateData().Regs.Fs_base = uint64(opts.TLS) + } + + pidns := t.tg.pidns + if t.childPIDNamespace != nil { + pidns = t.childPIDNamespace + } else if opts.NewPIDNamespace { + pidns = pidns.NewChild(userns) + } + tg := t.tg + parent := t.parent + if opts.NewThreadGroup { + sh := t.tg.signalHandlers + if opts.NewSignalHandlers { + sh = sh.Fork() + } + tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + parent = t + } + cfg := &TaskConfig{ + Kernel: t.k, + Parent: parent, + ThreadGroup: tg, + TaskContext: tc, + TaskResources: t.tr.Fork(!opts.NewFiles, !opts.NewFSContext), + Niceness: t.Niceness(), + Credentials: creds.Fork(), + NetworkNamespaced: t.netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + } + if opts.NewNetworkNamespace { + cfg.NetworkNamespaced = true + } + nt, err := t.tg.pidns.owner.NewTask(cfg) + if err != nil { + if opts.NewThreadGroup { + tg.release() + } + return 0, nil, err + } + + // "A child process created via fork(2) inherits a copy of its parent's + // alternate signal stack settings" - sigaltstack(2). + // + // However kernel/fork.c:copy_process() adds a limitation to this: + // "sigaltstack should be cleared when sharing the same VM". + if opts.NewAddressSpace || opts.Vfork { + nt.SetSignalStack(t.SignalStack()) + } + + if userns != nil { + if err := nt.SetUserNamespace(userns); err != nil { + // This shouldn't be possible: userns was created from nt.creds, so + // nt should have CAP_SYS_ADMIN in userns. + panic("Task.Clone: SetUserNamespace failed: " + err.Error()) + } + } + + // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to + // nt that it must receive before its task goroutine starts running. + tid := nt.k.tasks.Root.IDOfTask(nt) + defer nt.Start(tid) + + // "If fork/clone and execve are allowed by @prog, any child processes will + // be constrained to the same filters and system call ABI as the parent." - + // Documentation/prctl/seccomp_filter.txt + nt.syscallFilters = append([]bpf.Program(nil), t.syscallFilters...) + if opts.Vfork { + nt.vforkParent = t + } + + if opts.ChildClearTID { + nt.SetClearTID(opts.ChildTID) + } + if opts.ChildSetTID { + // Can't use Task.CopyOut, which assumes AddressSpaceActive. 
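		// Editor's note (not part of the original change): the write below
		// targets the child's memory, whose address space is not the one
		// active on this goroutine, so it goes through usermem.CopyObjectOut
		// against nt.MemoryManager() with default IOOpts rather than the
		// Task copy helpers.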
+ usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + } + ntid := t.tg.pidns.IDOfTask(nt) + if opts.ParentSetTID { + t.CopyOut(opts.ParentTID, ntid) + } + + kind := ptraceCloneKindClone + if opts.Vfork { + kind = ptraceCloneKindVfork + } else if opts.TerminationSignal == linux.SIGCHLD { + kind = ptraceCloneKindFork + } + if t.ptraceClone(kind, nt, opts) { + if opts.Vfork { + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil + } + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil + } + if opts.Vfork { + t.maybeBeginVforkStop(nt) + return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil + } + return ntid, nil, nil +} + +// maybeBeginVforkStop checks if a previously-started vfork child is still +// running and has not yet released its MM, such that its parent t should enter +// a vforkStop. +// +// Preconditions: The caller must be running on t's task goroutine. +func (t *Task) maybeBeginVforkStop(child *Task) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.killedLocked() { + child.vforkParent = nil + return + } + if child.vforkParent == t { + t.beginInternalStopLocked((*vforkStop)(nil)) + } +} + +func (t *Task) unstopVforkParent() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if p := t.vforkParent; p != nil { + p.tg.signalHandlers.mu.Lock() + defer p.tg.signalHandlers.mu.Unlock() + if _, ok := p.stop.(*vforkStop); ok { + p.endInternalStopLocked() + } + // Parent no longer needs to be unstopped. + t.vforkParent = nil + } +} + +type runSyscallAfterPtraceEventClone struct { + vforkChild *Task + + // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's + // PID namespace. vforkChildTID must be stored since the child may exit and + // release its TID before the PTRACE_EVENT stop ends. + vforkChildTID ThreadID +} + +func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { + if r.vforkChild != nil { + t.maybeBeginVforkStop(r.vforkChild) + return &runSyscallAfterVforkStop{r.vforkChildTID} + } + return (*runSyscallExit)(nil) +} + +type runSyscallAfterVforkStop struct { + // childTID has the same meaning as + // runSyscallAfterPtraceEventClone.vforkChildTID. + childTID ThreadID +} + +func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { + t.ptraceVforkDone(r.childTID) + return (*runSyscallExit)(nil) +} + +// Unshare changes the set of resources t shares with other tasks, as specified +// by opts. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Unshare(opts *SharingOptions) error { + // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and + // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if + // t is the only task using its MM, which due to clone(2)'s rules imply + // that it is also the only task using its signal handlers / in its thread + // group, and cause EINVAL to be returned otherwise. + // + // Since we don't count the number of tasks using each address space or set + // of signal handlers, we reject NewSignalHandlers and NewAddressSpace + // altogether, and interpret NewThreadGroup as requiring that t be the only + // member of its thread group. 
This seems to be logically coherent, in the + // sense that clone(2) allows a task to share signal handlers and address + // spaces with tasks in other thread groups. + if opts.NewAddressSpace || opts.NewSignalHandlers { + return syserror.EINVAL + } + if opts.NewThreadGroup { + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return syserror.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + // This isn't racy because we're the only living task, and therefore + // the only task capable of creating new ones, in our thread group. + } + if opts.NewUserNamespace { + if t.IsChrooted() { + return syserror.EPERM + } + // This temporary is needed because Go. + creds := t.Credentials() + newUserNS, err := creds.NewChildUserNamespace() + if err != nil { + return err + } + err = t.SetUserNamespace(newUserNS) + if err != nil { + return err + } + } + haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) + if opts.NewPIDNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) + } + t.mu.Lock() + defer t.mu.Unlock() + if opts.NewNetworkNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.netns = true + } + if opts.NewUTSNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that this must happen after NewUserNamespace, so the + // new user namespace is used if there is one. + t.utsns = t.utsns.Clone(t.creds.UserNamespace) + } + if opts.NewIPCNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + t.ipcns = NewIPCNamespace() + } + if opts.NewFiles { + oldFDMap := t.tr.FDMap + t.tr.FDMap = oldFDMap.Fork() + oldFDMap.DecRef() + } + if opts.NewFSContext { + oldFS := t.tr.FSContext + t.tr.FSContext = oldFS.Fork() + oldFS.DecRef() + } + return nil +} + +// vforkStop is a TaskStop imposed on a task that creates a child with +// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its +// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so +// that the child and parent share mappings until the child execve()s into a +// new process image or exits.) +type vforkStop struct{} + +// StopIgnoresKill implements TaskStop.Killable. +func (*vforkStop) Killable() bool { return true } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go new file mode 100644 index 000000000..5c563ba08 --- /dev/null +++ b/pkg/sentry/kernel/task_context.go @@ -0,0 +1,179 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
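// Editor's illustrative sketch for the CloneOptions and SharingOptions types
// defined in task_clone.go above (not part of the original change): a clone(2)
// front end might translate raw flags roughly as follows, where flags, stack,
// and tls stand in for the caller's syscall arguments:
//
//	opts := kernel.CloneOptions{
//		SharingOptions: kernel.SharingOptions{
//			NewAddressSpace:   flags&linux.CLONE_VM == 0,
//			NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0,
//			NewThreadGroup:    flags&linux.CLONE_THREAD == 0,
//			NewFiles:          flags&linux.CLONE_FILES == 0,
//		},
//		Stack:  usermem.Addr(stack),
//		SetTLS: flags&linux.CLONE_SETTLS != 0,
//		TLS:    usermem.Addr(tls),
//		Vfork:  flags&linux.CLONE_VFORK != 0,
//	}
//	ntid, ctrl, err := t.Clone(&opts)
//
// The full flag-to-option mapping belongs to the syscall layer and is not part
// of this file.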
+ +package kernel + +import ( + "errors" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ErrNoSyscalls is returned if there is no syscall table. +var ErrNoSyscalls = errors.New("no syscall table found") + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskContext is the subset of a task's data that is provided by the loader. +type TaskContext struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable +} + +// release releases all resources held by the TaskContext. release is called by +// the task when it execs into a new TaskContext or exits. +func (tc *TaskContext) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if tc.MemoryManager != nil { + // TODO + tc.MemoryManager.DecUsers(context.Background()) + tc.MemoryManager = nil + } + tc.fu = nil +} + +// Fork returns a duplicate of tc. The copied TaskContext always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskContext shares an address space with the original; otherwise, the copied +// TaskContext has an independent address space that is initially a duplicate +// of the original's. +func (tc *TaskContext) Fork(ctx context.Context, shareAddressSpace bool) (*TaskContext, error) { + newTC := &TaskContext{ + Arch: tc.Arch.Fork(), + st: tc.st, + } + if shareAddressSpace { + newTC.MemoryManager = tc.MemoryManager + if newTC.MemoryManager != nil { + if !newTC.MemoryManager.IncUsers() { + // Shouldn't be possible since tc.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) + } + } + newTC.fu = tc.fu + } else { + newMM, err := tc.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newTC.MemoryManager = newMM + // TODO: revisit when shmem is supported. + newTC.fu = futex.NewManager() + } + return newTC, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.tc.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.tc.MemoryManager +} + +// Futex returns t's futex manager. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Futex() *futex.Manager { + return t.tc.fu +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. 
+func (t *Task) SyscallTable() *SyscallTable { + return t.tc.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Stack() *arch.Stack { + return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} +} + +// LoadTaskImage loads filename into a new TaskContext. +// +// It takes several arguments: +// * mounts: MountNamespace to lookup filename in +// * root: Root to lookup filename under +// * wd: Working directory to lookup filename under +// * maxTraversals: maximum number of symlinks to follow +// * filename: path to binary to load +// * argv: Binary argv +// * envv: Binary envv +// * fs: Binary FeatureSet +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error) { + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k) + defer m.DecUsers(ctx) + + os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. Yikes. + return nil, ErrNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskContext{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: futex.NewManager(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go new file mode 100644 index 000000000..2285847a2 --- /dev/null +++ b/pkg/sentry/kernel/task_exec.go @@ -0,0 +1,240 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the machinery behind the execve() syscall. In brief, a +// thread executes an execve() by killing all other threads in its thread +// group, assuming the leader's identity, and then switching process images. +// +// This design is effectively mandated by Linux. From ptrace(2): +// +// """ +// execve(2) under ptrace +// When one thread in a multithreaded process calls execve(2), the +// kernel destroys all other threads in the process, and resets the +// thread ID of the execing thread to the thread group ID (process ID). +// (Or, to put things another way, when a multithreaded process does an +// execve(2), at completion of the call, it appears as though the +// execve(2) occurred in the thread group leader, regardless of which +// thread did the execve(2).) This resetting of the thread ID looks +// very confusing to tracers: +// +// * All other threads stop in PTRACE_EVENT_EXIT stop, if the +// PTRACE_O_TRACEEXIT option was turned on. Then all other threads +// except the thread group leader report death as if they exited via +// _exit(2) with exit code 0. 
+// +// * The execing tracee changes its thread ID while it is in the +// execve(2). (Remember, under ptrace, the "pid" returned from +// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.) +// That is, the tracee's thread ID is reset to be the same as its +// process ID, which is the same as the thread group leader's thread +// ID. +// +// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC +// option was turned on. +// +// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop +// by this time, it appears to the tracer that the dead thread leader +// "reappears from nowhere". (Note: the thread group leader does not +// report death via WIFEXITED(status) until there is at least one +// other live thread. This eliminates the possibility that the +// tracer will see it dying and then reappearing.) If the thread +// group leader was still alive, for the tracer this may look as if +// thread group leader returns from a different system call than it +// entered, or even "returned from a system call even though it was +// not in any system call". If the thread group leader was not +// traced (or was traced by a different tracer), then during +// execve(2) it will appear as if it has become a tracee of the +// tracer of the execing tracee. +// +// All of the above effects are the artifacts of the thread ID change in +// the tracee. +// """ + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// execStop is a TaskStop that a task sets on itself when it wants to execve +// and is waiting for the other tasks in its thread group to exit first. +type execStop struct{} + +// Killable implements TaskStop.Killable. +func (*execStop) Killable() bool { return true } + +// Execve implements the execve(2) syscall by killing all other tasks in its +// thread group and switching to newTC. Execve always takes ownership of newTC. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + if t.tg.exiting || t.tg.execing != nil { + // We lost to a racing group-exit, kill, or exec from another thread + // and should just exit. + newTC.release() + return nil, syserror.EINTR + } + + // Cancel any racing group stops. + t.tg.endGroupStopLocked(false) + + // If the task has any siblings, they have to exit before the exec can + // continue. + t.tg.execing = t + if t.tg.tasks.Front() != t.tg.tasks.Back() { + // "[All] other threads except the thread group leader report death as + // if they exited via _exit(2) with exit code 0." - ptrace(2) + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if t != sibling { + sibling.killLocked() + } + } + // The last sibling to exit will wake t. + t.beginInternalStopLocked((*execStop)(nil)) + } + + return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil +} + +// The runSyscallAfterExecStop state continues execve(2) after all siblings of +// a thread in the execve syscall have exited. 
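// Editor's note (not part of the original change): run states like the one
// below form a small state machine driven by the task goroutine; each state's
// execute method does some work and returns the next state, or nil when the
// goroutine should stop. A minimal state, for illustration, assuming the
// taskRunState interface used throughout this file:
//
//	type runExample struct{}
//
//	func (*runExample) execute(t *Task) taskRunState {
//		// Perform work on the task goroutine here.
//		return (*runSyscallExit)(nil) // chain into syscall exit handling
//	}
//
// runSyscallAfterExecStop follows the same pattern.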
+type runSyscallAfterExecStop struct { + tc *TaskContext +} + +func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + t.tg.execing = nil + if t.killed() { + t.tg.pidns.owner.mu.Unlock() + r.tc.release() + return (*runInterrupt)(nil) + } + // We are the thread group leader now. Save our old thread ID for + // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this + // point it will get a PID of 0, but this is consistent with Linux. + oldTID := ThreadID(0) + if tracer := t.Tracer(); tracer != nil { + oldTID = tracer.tg.pidns.tids[t] + } + t.promoteLocked() + // "During an execve(2), the dispositions of handled signals are reset to + // the default; the dispositions of ignored signals are left unchanged. ... + // [The] signal mask is preserved across execve(2). ... [The] pending + // signal set is preserved across an execve(2)." - signal(7) + // + // Details: + // + // - If the thread group is sharing its signal handlers with another thread + // group via CLONE_SIGHAND, execve forces the signal handlers to be copied + // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal + // handlers, so we always make a copy. + // + // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags, + // restorer (if present), and mask are always reset. (See Linux's + // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.) + t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() + t.endStopCond.L = &t.tg.signalHandlers.mu + // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) + t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable} + // "The termination signal is reset to SIGCHLD (see clone(2))." + t.tg.terminationSignal = linux.SIGCHLD + // execed indicates that the process can no longer join a process group + // in some scenarios (namely, the parent call setpgid(2) on the child). + // See the JoinProcessGroup function in sessions.go for more context. + t.tg.execed = true + // Maximum RSS is preserved across execve(2). + t.updateRSSLocked() + // Restartable sequence state is discarded. + t.rseqPreempted = false + t.rseqCPUAddr = 0 + t.rseqCPU = -1 + t.tg.rscr.Store(&RSEQCriticalRegion{}) + t.tg.pidns.owner.mu.Unlock() + + // Remove FDs with the CloseOnExec flag set. + t.FDMap().RemoveIf(func(file *fs.File, flags FDFlags) bool { + return flags.CloseOnExec + }) + + // Switch to the new process. + t.MemoryManager().Deactivate() + t.mu.Lock() + // Update credentials to reflect the execve. This should precede switching + // MMs to ensure that dumpability has been reset first, if needed. + t.updateCredsForExecLocked() + t.tc.release() + t.tc = *r.tc + t.mu.Unlock() + t.unstopVforkParent() + // NOTE: All locks must be dropped prior to calling Activate. + t.MemoryManager().Activate() + + t.ptraceExec(oldTID) + return (*runSyscallExit)(nil) +} + +// promoteLocked makes t the leader of its thread group. If t is already the +// thread group leader, promoteLocked is a no-op. +// +// Preconditions: All other tasks in t's thread group, including the existing +// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must +// be locked for writing. +func (t *Task) promoteLocked() { + oldLeader := t.tg.leader + if t == oldLeader { + return + } + // Swap the leader's TIDs with the execing task's. The latter will be + // released when the old leader is reaped below. 
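	// Editor's note (not part of the original change): concretely, if the old
	// leader had TID 100 and the execing task had TID 103 in a given
	// namespace, then after the swap below the execing task becomes TID 100,
	// the process ID, while the old leader holds TID 103 until it is reaped.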
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent { + oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] + ns.tids[oldLeader] = oldTID + ns.tids[t] = leaderTID + ns.tasks[oldTID] = oldLeader + ns.tasks[leaderTID] = t + } + + // Inherit the old leader's start time. + oldStartTime := oldLeader.StartTime() + t.mu.Lock() + t.startTime = oldStartTime + t.mu.Unlock() + + t.tg.leader = t + t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) + t.updateLogPrefixLocked() + // Reap the original leader. If it has a tracer, detach it instead of + // waiting for it to acknowledge the original leader's death. + oldLeader.exitParentNotified = true + oldLeader.exitParentAcked = true + if tracer := oldLeader.Tracer(); tracer != nil { + delete(tracer.ptraceTracees, oldLeader) + oldLeader.forgetTracerLocked() + // Notify the tracer that it will no longer be receiving these events + // from the tracee. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) + } + oldLeader.exitNotifyLocked(false) +} diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go new file mode 100644 index 000000000..3d49ae350 --- /dev/null +++ b/pkg/sentry/kernel/task_exit.go @@ -0,0 +1,1139 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the task exit cycle: +// +// - Tasks are asynchronously requested to exit with Task.Kill. +// +// - When able, the task goroutine enters the exit path starting from state +// runExit. +// +// - Other tasks observe completed exits with Task.Wait (which implements the +// wait*() family of syscalls). + +import ( + "errors" + "fmt" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// An ExitStatus is a value communicated from an exiting task or thread group +// to the party that reaps it. +type ExitStatus struct { + // Code is the numeric value passed to the call to exit or exit_group that + // caused the exit. If the exit was not caused by such a call, Code is 0. + Code int + + // Signo is the signal that caused the exit. If the exit was not caused by + // a signal, Signo is 0. + Signo int +} + +// Signaled returns true if the ExitStatus indicates that the exiting task or +// thread group was killed by a signal. +func (es ExitStatus) Signaled() bool { + return es.Signo != 0 +} + +// Status returns the numeric representation of the ExitStatus returned by e.g. +// the wait4() system call. +func (es ExitStatus) Status() uint32 { + return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) +} + +// ShellExitCode returns the numeric exit code that Bash would return for an +// exit status of es. 
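// Editor's illustrative examples (not part of the original change), using the
// wait-status encoding implemented by Status above:
//
//	ExitStatus{Code: 1}.Status()         // 0x0100: WEXITSTATUS(status) == 1
//	ExitStatus{Signo: 9}.Status()        // 0x0009: WTERMSIG(status) == SIGKILL
//	ExitStatus{Code: 1}.ShellExitCode()  // 1
//	ExitStatus{Signo: 9}.ShellExitCode() // 137, i.e. 128 + SIGKILL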
+func (es ExitStatus) ShellExitCode() int {
+	if es.Signaled() {
+		return 128 + es.Signo
+	}
+	return es.Code
+}
+
+// TaskExitState represents a step in the task exit path.
+//
+// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
+type TaskExitState int
+
+const (
+	// TaskExitNone indicates that the task has not begun exiting.
+	TaskExitNone TaskExitState = iota
+
+	// TaskExitInitiated indicates that the task goroutine has entered the exit
+	// path, and the task is no longer eligible to participate in group stops
+	// or group signal handling. TaskExitInitiated is analogous to Linux's
+	// PF_EXITING.
+	TaskExitInitiated
+
+	// TaskExitZombie indicates that the task has released its resources, and
+	// the task no longer prevents a sibling thread from completing execve.
+	TaskExitZombie
+
+	// TaskExitDead indicates that the task's thread IDs have been released,
+	// and the task no longer prevents its thread group leader from being
+	// reaped. ("Reaping" refers to the transitioning of a task from
+	// TaskExitZombie to TaskExitDead.)
+	TaskExitDead
+)
+
+// String implements fmt.Stringer.
+func (t TaskExitState) String() string {
+	switch t {
+	case TaskExitNone:
+		return "TaskExitNone"
+	case TaskExitInitiated:
+		return "TaskExitInitiated"
+	case TaskExitZombie:
+		return "TaskExitZombie"
+	case TaskExitDead:
+		return "TaskExitDead"
+	default:
+		return strconv.Itoa(int(t))
+	}
+}
+
+// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
+// thread-group-affecting side effects SIGKILL usually has.
+//
+// Preconditions: The signal mutex must be locked.
+func (t *Task) killLocked() {
+	// Clear killable stops.
+	if t.stop != nil && t.stop.Killable() {
+		t.endInternalStopLocked()
+	}
+	t.groupStopRequired = false
+	t.pendingSignals.enqueue(&arch.SignalInfo{
+		Signo: int32(linux.SIGKILL),
+		// Linux just sets SIGKILL in the pending signal bitmask without
+		// enqueueing an actual siginfo, such that
+		// kernel/signal.c:collect_signal() initializes si_code to SI_USER.
+		Code: arch.SignalInfoUser,
+	})
+	t.interrupt()
+}
+
+// killed returns true if t has a SIGKILL pending. killed is analogous to
+// Linux's fatal_signal_pending().
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) killed() bool {
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	return t.killedLocked()
+}
+
+func (t *Task) killedLocked() bool {
+	return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
+}
+
+// PrepareExit indicates an exit with status es.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareExit(es ExitStatus) {
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	t.exitStatus = es
+}
+
+// PrepareGroupExit indicates a group exit with status es to t's thread group.
+//
+// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it
+// does not tail-call do_exit(), and that it *does* set Task.exitStatus.
+// (Linux does not do so until within do_exit(), since it reuses exit_code for
+// ptrace.)
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) PrepareGroupExit(es ExitStatus) {
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	if t.tg.exiting || t.tg.execing != nil {
+		// Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
+ // this "group exit" is being executed by the killed sibling of an + // execing task, then Task.Execve never set t.tg.exitStatus, so it's + // still the zero value. This is consistent with Linux, both in intent + // ("all other threads ... report death as if they exited via _exit(2) + // with exit code 0" - ptrace(2), "execve under ptrace") and in + // implementation (compare fs/exec.c:de_thread() => + // kernel/signal.c:zap_other_threads() and + // kernel/exit.c:do_group_exit() => + // include/linux/sched.h:signal_group_exit()). + t.exitStatus = t.tg.exitStatus + return + } + t.tg.exiting = true + t.tg.exitStatus = es + t.exitStatus = es + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if sibling != t { + sibling.killLocked() + } + } +} + +// Kill requests that all tasks in ts exit as if group exiting with status es. +// Kill does not wait for tasks to exit. +// +// Kill has no analogue in Linux; it's provided for save/restore only. +func (ts *TaskSet) Kill(es ExitStatus) { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.Root.exiting = true + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + if !t.tg.exiting { + t.tg.exiting = true + t.tg.exitStatus = es + } + t.killLocked() + t.tg.signalHandlers.mu.Unlock() + } +} + +// advanceExitStateLocked checks that t's current exit state is oldExit, then +// sets it to newExit. If t's current exit state is not oldExit, +// advanceExitStateLocked panics. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { + if t.exitState != oldExit { + panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState)) + } + t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit) + t.exitState = newExit +} + +// runExit is the entry point into the task exit path. +type runExit struct{} + +func (*runExit) execute(t *Task) taskRunState { + t.ptraceExit() + return (*runExitMain)(nil) +} + +type runExitMain struct{} + +func (*runExitMain) execute(t *Task) taskRunState { + lastExiter := t.exitThreadGroup() + + // If the task has a cleartid, and the thread group wasn't killed by a + // signal, handle that before releasing the MM. + if t.cleartid != 0 { + t.tg.signalHandlers.mu.Lock() + signaled := t.tg.exiting && t.tg.exitStatus.Signaled() + t.tg.signalHandlers.mu.Unlock() + if !signaled { + if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + t.Futex().Wake(uintptr(t.cleartid), ^uint32(0), 1) + } + // If the CopyOut fails, there's nothing we can do. + } + } + + // Deactivate the address space before releasing the MM. + t.Deactivate() + + // Update the max resident set size before releasing t.tc.mm. + t.tg.pidns.owner.mu.Lock() + t.updateRSSLocked() + t.tg.pidns.owner.mu.Unlock() + + // Release all of the task's resources. + t.mu.Lock() + t.tc.release() + t.tr.release() + t.mu.Unlock() + t.unstopVforkParent() + + // If this is the last task to exit from the thread group, release the + // thread group's resources. + if lastExiter { + t.tg.release() + } + + // Detach tracees. + t.exitPtrace() + + // Reparent the task's children. + t.exitChildren() + + // Don't tail-call runExitNotify, as exitChildren may have initiated a stop + // to wait for a PID namespace to die. + return (*runExitNotify)(nil) +} + +// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread +// group that it is no longer eligible to participate in group activities. 
It
+// returns true if t is the last task in its thread group to call
+// exitThreadGroup.
+func (t *Task) exitThreadGroup() bool {
+	t.tg.pidns.owner.mu.Lock()
+	defer t.tg.pidns.owner.mu.Unlock()
+	t.tg.signalHandlers.mu.Lock()
+	// Can't defer unlock: see below.
+
+	t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
+	t.tg.activeTasks--
+	last := t.tg.activeTasks == 0
+
+	// Ensure that someone will handle the signals we can't.
+	t.setSignalMaskLocked(^linux.SignalSet(0))
+
+	// Check if this task's exit interacts with an initiated group stop.
+	if t.tg.groupStopPhase != groupStopInitiated {
+		t.tg.signalHandlers.mu.Unlock()
+		return last
+	}
+	if t.groupStopAcknowledged {
+		// Un-acknowledge the group stop.
+		t.tg.groupStopCount--
+		t.groupStopAcknowledged = false
+		// If the group stop wasn't complete before, then there is still at
+		// least one other task that hasn't acknowledged the group stop, so
+		// it is still not complete now.
+		t.tg.signalHandlers.mu.Unlock()
+		return last
+	}
+	if t.tg.groupStopCount != t.tg.activeTasks {
+		t.tg.signalHandlers.mu.Unlock()
+		return last
+	}
+	t.Debugf("Completing group stop")
+	t.tg.groupStopPhase = groupStopComplete
+	t.tg.groupStopWaitable = true
+	sig := t.tg.groupStopSignal
+	t.tg.groupContNotify = false
+	t.tg.groupContWaitable = false
+	// signalStop must be called with t's signal mutex unlocked.
+	t.tg.signalHandlers.mu.Unlock()
+	if t.tg.leader.parent != nil {
+		t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig))
+		t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
+	}
+	return last
+}
+
+func (t *Task) exitChildren() {
+	t.tg.pidns.owner.mu.Lock()
+	defer t.tg.pidns.owner.mu.Unlock()
+	newParent := t.findReparentTargetLocked()
+	if newParent == nil {
+		// "If the init process of a PID namespace terminates, the kernel
+		// terminates all of the processes in the namespace via a SIGKILL
+		// signal." - pid_namespaces(7)
+		t.Debugf("Init process terminating, killing namespace")
+		t.tg.pidns.exiting = true
+		for other := range t.tg.pidns.tids {
+			if other.tg != t.tg {
+				other.tg.signalHandlers.mu.Lock()
+				other.sendSignalLocked(&arch.SignalInfo{
+					Signo: int32(linux.SIGKILL),
+				}, false /* group */)
+				other.tg.signalHandlers.mu.Unlock()
+			}
+		}
+		// TODO: The init process waits for all processes in the
+		// namespace to exit before completing its own exit
+		// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
+		// other tasks in the namespace are dead, except possibly for this
+		// thread group's leader (which can't be reaped until this task exits).
+	}
+	// This is correct even if newParent is nil (it ensures that children don't
+	// wait for a parent to reap them).
+	for c := range t.children {
+		if sig := c.ParentDeathSignal(); sig != 0 {
+			siginfo := &arch.SignalInfo{
+				Signo: int32(sig),
+				Code:  arch.SignalInfoUser,
+			}
+			siginfo.SetPid(int32(c.tg.pidns.tids[t]))
+			siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
+			c.tg.signalHandlers.mu.Lock()
+			c.sendSignalLocked(siginfo, true /* group */)
+			c.tg.signalHandlers.mu.Unlock()
+		}
+		c.reparentLocked(newParent)
+		if newParent != nil {
+			newParent.children[c] = struct{}{}
+		}
+	}
+}
+
+// findReparentTargetLocked returns the task to which t's children should be
+// reparented. If no such task exists, findReparentTargetLocked returns nil.
+//
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) findReparentTargetLocked() *Task { + // Reparent to any sibling in the same thread group that hasn't begun + // exiting. + if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil { + return t2 + } + // "A child process that is orphaned within the namespace will be + // reparented to [the init process for the namespace] ..." - + // pid_namespaces(7) + if init := t.tg.pidns.tasks[InitTID]; init != nil { + return init.tg.anyNonExitingTaskLocked() + } + return nil +} + +func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.exitState == TaskExitNone { + return t + } + } + return nil +} + +// reparentLocked changes t's parent. The new parent may be nil. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) reparentLocked(parent *Task) { + oldParent := t.parent + t.parent = parent + // If a thread group leader's parent changes, reset the thread group's + // termination signal to SIGCHLD and re-check exit notification. (Compare + // kernel/exit.c:reparent_leader().) + if t != t.tg.leader { + return + } + if oldParent == nil && parent == nil { + return + } + if oldParent != nil && parent != nil && oldParent.tg == parent.tg { + return + } + t.tg.terminationSignal = linux.SIGCHLD + if t.exitParentNotified && !t.exitParentAcked { + t.exitParentNotified = false + t.exitNotifyLocked(false) + } +} + +// When a task exits, other tasks in the system, notably the task's parent and +// ptracer, may want to be notified. The exit notification system ensures that +// interested tasks receive signals and/or are woken from blocking calls to +// wait*() syscalls; these notifications must be resolved before exiting tasks +// can be reaped and disappear from the system. +// +// Each task may have a parent task and/or a tracer task. If both a parent and +// a tracer exist, they may be the same task, different tasks in the same +// thread group, or tasks in different thread groups. (In the last case, Linux +// refers to the task as being ptrace-reparented due to an implementation +// detail; we avoid this terminology to avoid confusion.) +// +// A thread group is *empty* if all non-leader tasks in the thread group are +// dead, and the leader is either a zombie or dead. The exit of a thread group +// leader is never waitable - by either the parent or tracer - until the thread +// group is empty. +// +// There are a few ways for an exit notification to be resolved: +// +// - The exit notification may be acknowledged by a call to Task.Wait with +// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall). +// +// - If the notified party is the parent, and the parent thread group is not +// also the tracer thread group, and the notification signal is SIGCHLD, the +// parent may explicitly ignore the notification (see quote in exitNotify). +// Note that it's possible for the notified party to ignore the signal in other +// cases, but the notification is only resolved under the above conditions. +// (Actually, there is one exception; see the last paragraph of the "leader, +// has tracer, tracer thread group is parent thread group" case below.) +// +// - If the notified party is the parent, and the parent does not exist, the +// notification is resolved as if ignored. (This is only possible in the +// sentry. In Linux, the only task / thread group without a parent is global +// init, and killing global init causes a kernel panic.) +// +// - If the notified party is a tracer, the tracer may detach the traced task. 
+// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) +// +// In addition, if the notified party is the parent, the parent may exit and +// cause the notifying task to be reparented to another thread group. This does +// not resolve the notification; instead, the notification must be resent to +// the new parent. +// +// The series of notifications generated for a given task's exit depend on +// whether it is a thread group leader; whether the task is ptraced; and, if +// so, whether the tracer thread group is the same as the parent thread group. +// +// - Non-leader, no tracer: No notification is generated; the task is reaped +// immediately. +// +// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer +// notification is resolved (by waiting or detaching), the task is reaped. (For +// non-leaders, whether the tracer and parent thread groups are the same is +// irrelevant.) +// +// - Leader, no tracer: The task remains a zombie, with no notification sent, +// until all other tasks in the thread group are dead. (In Linux terms, this +// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks +// are removed from their thread_group list in kernel/exit.c:release_task() => +// __exit_signal() => __unhash_process().) Then the thread group's termination +// signal is sent to the parent. When the parent notification is resolved (by +// waiting or ignoring), the task is reaped. +// +// - Leader, has tracer, tracer thread group is not parent thread group: +// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by +// waiting or detaching), and all other tasks in the thread group are dead, the +// thread group's termination signal is sent to the parent. (Note that the +// tracer cannot resolve the exit notification by waiting until the thread +// group is empty.) When the parent notification is resolved, the task is +// reaped. +// +// - Leader, has tracer, tracer thread group is parent thread group: +// +// If all other tasks in the thread group are dead, the thread group's +// termination signal is sent to the parent. At this point, the notification +// can only be resolved by waiting. If the parent detaches from the task as a +// tracer, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// If at least one task in the thread group is not dead, SIGCHLD is sent to the +// parent. At this point, the notification cannot be resolved at all; once the +// thread group becomes empty, it can be resolved only by waiting. If the +// parent detaches from the task as a tracer before all remaining tasks die, +// then exit notification proceeds as in the case where the leader never had a +// tracer. If the parent detaches from the task as a tracer after all remaining +// tasks die, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// In both of the above cases, when the parent detaches from the task as a +// tracer while the thread group is empty, whether or not the parent resolves +// the notification by ignoring it is based on the parent's SIGCHLD signal +// action, whether or not the thread group's termination signal is SIGCHLD +// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). 
+// There is one final wrinkle: A leader can become a non-leader due to a
+// sibling execve. In this case, the execing thread detaches the leader's
+// tracer (if one exists) and reaps the leader immediately. In Linux, this is
+// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().
+
+type runExitNotify struct{}
+
+func (*runExitNotify) execute(t *Task) taskRunState {
+	t.tg.pidns.owner.mu.Lock()
+	defer t.tg.pidns.owner.mu.Unlock()
+	t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
+	t.tg.liveTasks--
+	// Check if this completes a sibling's execve.
+	if t.tg.execing != nil && t.tg.liveTasks == 1 {
+		// execing blocks the addition of new tasks to the thread group, so
+		// the sole living task must be the execing one.
+		e := t.tg.execing
+		e.tg.signalHandlers.mu.Lock()
+		if _, ok := e.stop.(*execStop); ok {
+			e.endInternalStopLocked()
+		}
+		e.tg.signalHandlers.mu.Unlock()
+	}
+	t.exitNotifyLocked(false)
+	// The task goroutine will now exit.
+	return nil
+}
+
+// exitNotifyLocked is called after changes to t's state that affect exit
+// notification.
+//
+// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
+// thanks to Linux's haphazard implementation of this functionality, such cases
+// determine whether parent notifications are ignored based on the parent's
+// handling of SIGCHLD, regardless of what the exited task's thread group's
+// termination signal is.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
+	if t.exitState != TaskExitZombie {
+		return
+	}
+	if !t.exitTracerNotified {
+		t.exitTracerNotified = true
+		tracer := t.Tracer()
+		if tracer == nil {
+			t.exitTracerAcked = true
+		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
+			// Don't set exitParentNotified if t is non-leader, even if the
+			// tracer is in the parent thread group, so that if the parent
+			// detaches the following call to exitNotifyLocked passes through
+			// the !exitParentNotified case below and causes t to be reaped
+			// immediately.
+			//
+			// Tracer notification doesn't care about SIG_IGN/SA_NOCLDWAIT.
+			tracer.tg.signalHandlers.mu.Lock()
+			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
+			tracer.tg.signalHandlers.mu.Unlock()
+			// Wake EventTraceeStop waiters as well since this task will never
+			// ptrace-stop again.
+			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
+		} else {
+			// t is a leader and the tracer is in the parent thread group.
+			t.exitParentNotified = true
+			sig := linux.SIGCHLD
+			if t.tg.tasksCount == 1 {
+				sig = t.tg.terminationSignal
+			}
+			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
+			// (in Linux, the check in do_notify_parent() is gated by
+			// !tsk->ptrace.)
+			t.parent.tg.signalHandlers.mu.Lock()
+			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
+			t.parent.tg.signalHandlers.mu.Unlock()
+			// See below for rationale for this event mask.
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + if t.exitTracerAcked && !t.exitParentNotified { + if t != t.tg.leader { + t.exitParentNotified = true + t.exitParentAcked = true + } else if t.tg.tasksCount == 1 { + t.exitParentNotified = true + if t.parent == nil { + t.exitParentAcked = true + } else { + // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is + // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see + // sigaction(2)), then children that terminate do not become + // zombies and a call to wait() or waitpid() will block until all + // children have terminated, and then fail with errno set to + // ECHILD. (The original POSIX standard left the behavior of + // setting SIGCHLD to SIG_IGN unspecified. Note that even though + // the default disposition of SIGCHLD is "ignore", explicitly + // setting the disposition to SIG_IGN results in different + // treatment of zombie process children.) Linux 2.6 conforms to + // this specification." - wait(2) + // + // Some undocumented Linux-specific details: + // + // - All of the above is ignored if the termination signal isn't + // SIGCHLD. + // + // - SA_NOCLDWAIT causes the leader to be immediately reaped, but + // does not suppress the SIGCHLD. + signalParent := t.tg.terminationSignal.IsValid() + t.parent.tg.signalHandlers.mu.Lock() + if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach { + if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok { + if act.Handler == arch.SignalActIgnore { + t.exitParentAcked = true + signalParent = false + } else if act.Flags&arch.SignalFlagNoCldWait != 0 { + t.exitParentAcked = true + } + } + } + if signalParent { + t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */) + } + t.parent.tg.signalHandlers.mu.Unlock() + // If a task in the parent was waiting for a child group stop + // or continue, it needs to be notified of the exit, because + // there may be no remaining eligible tasks (so that wait + // should return ECHILD). + t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + } + if t.exitTracerAcked && t.exitParentAcked { + t.advanceExitStateLocked(TaskExitZombie, TaskExitDead) + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid := ns.tids[t] + delete(ns.tasks, tid) + delete(ns.tids, t) + } + t.tg.exitedCPUStats.Accumulate(t.CPUStats()) + t.tg.ioUsage.Accumulate(t.ioUsage) + t.tg.signalHandlers.mu.Lock() + t.tg.tasks.Remove(t) + if t.tg.lastTimerSignalTask == t { + t.tg.lastTimerSignalTask = nil + } + t.tg.tasksCount-- + tc := t.tg.tasksCount + t.tg.signalHandlers.mu.Unlock() + if tc == 1 && t != t.tg.leader { + // Our fromPtraceDetach doesn't matter here (in Linux terms, this + // is via a call to release_task()). + t.tg.leader.exitNotifyLocked(false) + } else if tc == 0 { + t.tg.processGroup.decRefWithParent(t.tg.parentPG()) + } + if t.parent != nil { + delete(t.parent.children, t) + t.parent = nil + } + } +} + +// Preconditions: The TaskSet mutex must be locked. 
+func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + } + info.SetPid(int32(receiver.tg.pidns.tids[t])) + info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + if t.exitStatus.Signaled() { + info.Code = arch.CLD_KILLED + info.SetStatus(int32(t.exitStatus.Signo)) + } else { + info.Code = arch.CLD_EXITED + info.SetStatus(int32(t.exitStatus.Code)) + } + // TODO: Set utime, stime. + return info +} + +// ExitStatus returns t's exit status, which is only guaranteed to be +// meaningful if t.ExitState() != TaskExitNone. +func (t *Task) ExitStatus() ExitStatus { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.exitStatus +} + +// ExitStatus returns the exit status that would be returned by a consuming +// wait*() on tg. +func (tg *ThreadGroup) ExitStatus() ExitStatus { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting { + return tg.exitStatus + } + return tg.leader.exitStatus +} + +// TerminationSignal returns the thread group's termination signal. +func (tg *ThreadGroup) TerminationSignal() linux.Signal { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.terminationSignal +} + +// Task events that can be waited for. +const ( + // EventExit represents an exit notification generated for a child thread + // group leader or a tracee under the conditions specified in the comment + // above runExitNotify. + EventExit waiter.EventMask = 1 << iota + + // EventChildGroupStop occurs when a child thread group completes a group + // stop (i.e. all tasks in the child thread group have entered a stopped + // state as a result of a group stop). + EventChildGroupStop + + // EventTraceeStop occurs when a task that is ptraced by a task in the + // notified thread group enters a ptrace stop (see ptrace(2)). + EventTraceeStop + + // EventGroupContinue occurs when a child thread group, or a thread group + // whose leader is ptraced by a task in the notified thread group, that had + // initiated or completed a group stop leaves the group stop, due to the + // child thread group or any task in the child thread group being sent + // SIGCONT. + EventGroupContinue +) + +// WaitOptions controls the behavior of Task.Wait. +type WaitOptions struct { + // If SpecificTID is non-zero, only events from the task with thread ID + // SpecificTID are eligible to be waited for. SpecificTID is resolved in + // the PID namespace of the waiter (the method receiver of Task.Wait). If + // no such task exists, or that task would not otherwise be eligible to be + // waited for by the waiting task, then there are no waitable tasks and + // Wait will return ECHILD. + SpecificTID ThreadID + + // If SpecificPGID is non-zero, only events from ThreadGroups with a + // matching ProcessGroupID are eligible to be waited for. (Same + // constraints as SpecificTID apply.) + SpecificPGID ProcessGroupID + + // Terminology note: Per waitpid(2), "a clone child is one which delivers + // no signal, or a signal other than SIGCHLD to its parent upon + // termination." In Linux, termination signal is technically a per-task + // property rather than a per-thread-group property. 
However, clone() + // forces no termination signal for tasks created with CLONE_THREAD, and + // execve() resets the termination signal to SIGCHLD, so all + // non-group-leader threads have no termination signal and are therefore + // "clone tasks". + + // If NonCloneTasks is true, events from non-clone tasks are eligible to be + // waited for. + NonCloneTasks bool + + // If CloneTasks is true, events from clone tasks are eligible to be waited + // for. + CloneTasks bool + + // Events is a bitwise combination of the events defined above that specify + // what events are of interest to the call to Wait. + Events waiter.EventMask + + // If ConsumeEvent is true, the Wait should consume the event such that it + // cannot be returned by a future Wait. Note that if a task exit is + // consumed in this way, in most cases the task will be reaped. + ConsumeEvent bool + + // If BlockInterruptErr is not nil, Wait will block until either an event + // is available or there are no tasks that could produce a waitable event; + // if that blocking is interrupted, Wait returns BlockInterruptErr. If + // BlockInterruptErr is nil, Wait will not block. + BlockInterruptErr error +} + +// Preconditions: The TaskSet mutex must be locked (for reading or writing). +func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace) bool { + if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { + return false + } + if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { + return false + } + if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { + return o.NonCloneTasks + } + return o.CloneTasks +} + +// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. +// waitpid(WNOHANG)) that find no waitable events, but determine that waitable +// events may exist in the future. (In contrast, if a non-blocking or blocking +// Wait determines that there are no tasks that can produce a waitable event, +// Task.Wait returns ECHILD.) +var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events") + +// WaitResult contains information about a waited-for event. +type WaitResult struct { + // Task is the task that reported the event. + Task *Task + + // TID is the thread ID of Task in the PID namespace of the task that + // called Wait (that is, the method receiver of the call to Task.Wait). TID + // is provided because consuming exit waits cause the thread ID to be + // deallocated. + TID ThreadID + + // UID is the real UID of Task in the user namespace of the task that + // called Wait. + UID auth.UID + + // Event is exactly one of the events defined above. + Event waiter.EventMask + + // Status is the numeric status associated with the event. + Status uint32 +} + +// Wait waits for an event from a thread group that is a child of t's thread +// group, or a task in such a thread group, or a task that is ptraced by t, +// subject to the options specified in opts. +func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { + if opts.BlockInterruptErr == nil { + return t.waitOnce(opts) + } + w, ch := waiter.NewChannelEntry(nil) + t.tg.eventQueue.EventRegister(&w, opts.Events) + defer t.tg.eventQueue.EventUnregister(&w) + for { + wr, err := t.waitOnce(opts) + if err != ErrNoWaitableEvent { + // This includes err == nil. 
+ return wr, err + } + if err := t.Block(ch); err != nil { + return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) + } + } +} + +func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { + anyWaitableTasks := false + + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + + // Without the (unimplemented) __WNOTHREAD flag, a task can wait on the + // children and tracees of any task in the same thread group. + for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { + for child := range parent.children { + if !opts.matchesTask(child, parent.tg.pidns) { + continue + } + // Non-leaders don't notify parents on exit and aren't eligible to + // be waited on. + if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { + return wr, nil + } + } + // Check for group stops and continues. Tasks that have passed + // TaskExitInitiated can no longer participate in group stops. + if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { + continue + } + if child.exitState >= TaskExitInitiated { + continue + } + // If the waiter is in the same thread group as the task's + // tracer, do not report its group stops; they will be reported + // as ptrace stops instead. This also skips checking for group + // continues, but they'll be checked for when scanning tracees + // below. (Per kernel/exit.c:wait_consider_task(): "If a + // ptracer wants to distinguish the two events for its own + // children, it should create a separate process which takes + // the role of real parent.") + if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { + continue + } + anyWaitableTasks = true + if opts.Events&EventChildGroupStop != 0 { + if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { + return wr, nil + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { + return wr, nil + } + } + } + for tracee := range parent.ptraceTracees { + if !opts.matchesTask(tracee, parent.tg.pidns) { + continue + } + // Non-leaders do notify tracers on exit. + if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { + return wr, nil + } + } + if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { + continue + } + if tracee.exitState >= TaskExitInitiated { + continue + } + anyWaitableTasks = true + if opts.Events&EventTraceeStop != 0 { + if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { + return wr, nil + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { + return wr, nil + } + } + } + } + + if anyWaitableTasks { + return nil, ErrNoWaitableEvent + } + return nil, syserror.ECHILD +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { + if asPtracer && !target.exitTracerNotified { + return nil + } + if !asPtracer && !target.exitParentNotified { + return nil + } + // Zombied thread group leaders are never waitable until their thread group + // is otherwise empty. Usually this is caught by the + // target.exitParentNotified check above, but if t is both (in the thread + // group of) target's tracer and parent, asPtracer may be true. 
+ if target == target.tg.leader && target.tg.tasksCount != 1 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + status := target.exitStatus.Status() + if !opts.ConsumeEvent { + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } + } + // Surprisingly, the exit status reported by a non-consuming wait can + // differ from that reported by a consuming wait; the latter will return + // the group exit code if one is available. + if target.tg.exiting { + status = target.tg.exitStatus.Status() + } + // t may be (in the thread group of) target's parent, tracer, or both. We + // don't need to check for !exitTracerAcked because tracees are detached + // here, and we don't need to check for !exitParentAcked because zombies + // will be reaped here. + if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { + target.exitTracerAcked = true + target.ptraceTracer.Store((*Task)(nil)) + delete(t.ptraceTracees, target) + } + if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { + target.exitParentAcked = true + if target == target.tg.leader { + // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, + // and won't until after target.exitNotifyLocked() (maybe). Include + // target.CPUStats() explicitly. This is consistent with Linux, + // which accounts an exited task's cputime to its thread group in + // kernel/exit.c:release_task() => __exit_signal(), and uses + // thread_group_cputime_adjusted() in wait_task_zombie(). + t.tg.childCPUStats.Accumulate(target.CPUStats()) + t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) + t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) + // Update t's child max resident set size. The size will be the maximum + // of this thread's size and all its childrens' sizes. + if t.tg.childMaxRSS < target.tg.maxRSS { + t.tg.childMaxRSS = target.tg.maxRSS + } + if t.tg.childMaxRSS < target.tg.childMaxRSS { + t.tg.childMaxRSS = target.tg.childMaxRSS + } + } + } + target.exitNotifyLocked(false) + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } +} + +// updateRSSLocked updates t.tg.maxRSS. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) updateRSSLocked() { + if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { + t.tg.maxRSS = mmMaxRSS + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupStopWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + sig := target.tg.groupStopSignal + if opts.ConsumeEvent { + target.tg.groupStopWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventChildGroupStop, + // There is no name for these status constants. + Status: (uint32(sig)&0xff)<<8 | 0x7f, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. 
+func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupContWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + if opts.ConsumeEvent { + target.tg.groupContWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventGroupContinue, + Status: 0xffff, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + if target.ptraceCode == 0 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + code := target.ptraceCode + if opts.ConsumeEvent { + target.ptraceCode = 0 + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventTraceeStop, + Status: uint32(code)<<8 | 0x7f, + } +} + +// ExitState returns t's current progress through the exit path. +func (t *Task) ExitState() TaskExitState { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.exitState +} + +// ParentDeathSignal returns t's parent death signal. +func (t *Task) ParentDeathSignal() linux.Signal { + t.mu.Lock() + defer t.mu.Unlock() + return t.parentDeathSignal +} + +// SetParentDeathSignal sets t's parent death signal. +func (t *Task) SetParentDeathSignal(sig linux.Signal) { + t.mu.Lock() + defer t.mu.Unlock() + t.parentDeathSignal = sig +} diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go new file mode 100644 index 000000000..a51fa9d7e --- /dev/null +++ b/pkg/sentry/kernel/task_identity.go @@ -0,0 +1,557 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials returns t's credentials by value. +func (t *Task) Credentials() auth.Credentials { + t.mu.Lock() + defer t.mu.Unlock() + return *t.creds // Copy out with lock held. +} + +// UserNamespace returns the user namespace associated with the task. +func (t *Task) UserNamespace() *auth.UserNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.UserNamespace +} + +// HasCapabilityIn checks if the task has capability cp in user namespace ns. +func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.HasCapabilityIn(cp, ns) +} + +// HasCapability checks if the task has capability cp in its user namespace. 
+func (t *Task) HasCapability(cp linux.Capability) bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.HasCapability(cp) +} + +// SetUID implements the semantics of setuid(2). +func (t *Task) SetUID(uid auth.UID) error { + // setuid considers -1 to be invalid. + if !uid.Ok() { + return syserror.EINVAL + } + t.mu.Lock() + defer t.mu.Unlock() + kuid := t.creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + // "setuid() sets the effective user ID of the calling process. If the + // effective UID of the caller is root (more precisely: if the caller has + // the CAP_SETUID capability), the real UID and saved set-user-ID are also + // set." - setuid(2) + if t.creds.HasCapability(linux.CAP_SETUID) { + t.setKUIDsUncheckedLocked(kuid, kuid, kuid) + return nil + } + // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID + // capability) and uid does not match the real UID or saved set-user-ID of + // the calling process." + if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID { + return syserror.EPERM + } + t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID) + return nil +} + +// SetREUID implements the semantics of setreuid(2). +func (t *Task) SetREUID(r, e auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Supplying a value of -1 for either the real or effective user ID forces + // the system to leave that ID unchanged." - setreuid(2) + newR := t.creds.RealKUID + if r.Ok() { + newR = t.creds.UserNamespace.MapToKUID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := t.creds.EffectiveKUID + if e.Ok() { + newE = t.creds.UserNamespace.MapToKUID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !t.creds.HasCapability(linux.CAP_SETUID) { + // "Unprivileged processes may only set the effective user ID to the + // real user ID, the effective user ID, or the saved set-user-ID." + if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID { + return syserror.EPERM + } + // "Unprivileged users may only set the real user ID to the real user + // ID or the effective user ID." + if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID { + return syserror.EPERM + } + } + // "If the real user ID is set (i.e., ruid is not -1) or the effective user + // ID is set to a value not equal to the previous real user ID, the saved + // set-user-ID will be set to the new effective user ID." + newS := t.creds.SavedKUID + if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) { + newS = newE + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESUID implements the semantics of the setresuid(2) syscall. +func (t *Task) SetRESUID(r, e, s auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Unprivileged user processes may change the real UID, effective UID, and + // saved set-user-ID, each to one of: the current real UID, the current + // effective UID or the current saved set-user-ID. Privileged processes (on + // Linux, those having the CAP_SETUID capability) may set the real UID, + // effective UID, and saved set-user-ID to arbitrary values. If one of the + // arguments equals -1, the corresponding value is not changed." 
- + // setresuid(2) + var err error + newR := t.creds.RealKUID + if r.Ok() { + newR, err = t.creds.UseUID(r) + if err != nil { + return err + } + } + newE := t.creds.EffectiveKUID + if e.Ok() { + newE, err = t.creds.UseUID(e) + if err != nil { + return err + } + } + newS := t.creds.SavedKUID + if s.Ok() { + newS, err = t.creds.UseUID(s) + if err != nil { + return err + } + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// Preconditions: t.mu must be locked. +func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { + root := t.creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID + t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS + + // "1. If one or more of the real, effective or saved set user IDs was + // previously 0, and as a result of the UID changes all of these IDs have a + // nonzero value, then all capabilities are cleared from the permitted and + // effective capability sets." - capabilities(7) + if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { + // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's + // "keep capabilities" flag, which determines whether the thread's permitted + // capability set is cleared when a change is made to the + // thread's user IDs such that the thread's real UID, effective + // UID, and saved set-user-ID all become nonzero when at least + // one of them previously had the value 0. By default, the + // permitted capability set is cleared when such a change is + // made; setting the "keep capabilities" flag prevents it from + // being cleared." (A thread's effective capability set is always + // cleared when such a credential change is made, + // regardless of the setting of the "keep capabilities" flag.) + if !t.creds.KeepCaps { + t.creds.PermittedCaps = 0 + t.creds.EffectiveCaps = 0 + } + } + // """ + // 2. If the effective user ID is changed from 0 to nonzero, then all + // capabilities are cleared from the effective set. + // + // 3. If the effective user ID is changed from nonzero to 0, then the + // permitted set is copied to the effective set. + // """ + if oldE == root && newE != root { + t.creds.EffectiveCaps = 0 + } else if oldE != root && newE == root { + t.creds.EffectiveCaps = t.creds.PermittedCaps + } + // "4. If the filesystem user ID is changed from 0 to nonzero (see + // setfsuid(2)), then the following capabilities are cleared from the + // effective set: ..." + // (filesystem UIDs aren't implemented, nor are any of the capabilities in + // question) + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + if oldE != newE { + t.parentDeathSignal = 0 + } +} + +// SetGID implements the semantics of setgid(2). +func (t *Task) SetGID(gid auth.GID) error { + if !gid.Ok() { + return syserror.EINVAL + } + t.mu.Lock() + defer t.mu.Unlock() + kgid := t.creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + if t.creds.HasCapability(linux.CAP_SETGID) { + t.setKGIDsUncheckedLocked(kgid, kgid, kgid) + return nil + } + if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID { + return syserror.EPERM + } + t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID) + return nil +} + +// SetREGID implements the semantics of setregid(2). 
+func (t *Task) SetREGID(r, e auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + newR := t.creds.RealKGID + if r.Ok() { + newR = t.creds.UserNamespace.MapToKGID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := t.creds.EffectiveKGID + if e.Ok() { + newE = t.creds.UserNamespace.MapToKGID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !t.creds.HasCapability(linux.CAP_SETGID) { + if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID { + return syserror.EPERM + } + if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID { + return syserror.EPERM + } + } + newS := t.creds.SavedKGID + if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) { + newS = newE + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESGID implements the semantics of the setresgid(2) syscall. +func (t *Task) SetRESGID(r, e, s auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + var err error + newR := t.creds.RealKGID + if r.Ok() { + newR, err = t.creds.UseGID(r) + if err != nil { + return err + } + } + newE := t.creds.EffectiveKGID + if e.Ok() { + newE, err = t.creds.UseGID(e) + if err != nil { + return err + } + } + newS := t.creds.SavedKGID + if s.Ok() { + newS, err = t.creds.UseGID(s) + if err != nil { + return err + } + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { + oldE := t.creds.EffectiveKGID + t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + if oldE != newE { + t.parentDeathSignal = 0 + } +} + +// SetExtraGIDs attempts to change t's supplemental groups. All IDs are +// interpreted as being in t's user namespace. +func (t *Task) SetExtraGIDs(gids []auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + if !t.creds.HasCapability(linux.CAP_SETGID) { + return syserror.EPERM + } + kgids := make([]auth.KGID, len(gids)) + for i, gid := range gids { + kgid := t.creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + kgids[i] = kgid + } + t.creds.ExtraKGIDs = kgids + return nil +} + +// SetCapabilitySets attempts to change t's permitted, inheritable, and +// effective capability sets. +func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Permitted: This is a limiting superset for the effective capabilities + // that the thread may assume." - capabilities(7) + if effective & ^permitted != 0 { + return syserror.EPERM + } + // "It is also a limiting superset for the capabilities that may be added + // to the inheritable set by a thread that does not have the CAP_SETPCAP + // capability in its effective set." + if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) { + return syserror.EPERM + } + // "If a thread drops a capability from its permitted set, it can never + // reacquire that capability (unless it execve(2)s ..." + if permitted & ^t.creds.PermittedCaps != 0 { + return syserror.EPERM + } + // "... if a capability is not in the bounding set, then a thread can't add + // this capability to its inheritable set, even if it was in its permitted + // capabilities ..." 
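+	// For example (hypothetical values): if CAP_NET_ADMIN has been dropped
+	// from the bounding set and is not already inheritable, then a request
+	// that adds CAP_NET_ADMIN to the inheritable set fails the check below
+	// with EPERM, even if CAP_NET_ADMIN is still in the permitted set.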
+ if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 { + return syserror.EPERM + } + t.creds.PermittedCaps = permitted + t.creds.InheritableCaps = inheritable + t.creds.EffectiveCaps = effective + return nil +} + +// DropBoundingCapability attempts to drop capability cp from t's capability +// bounding set. +func (t *Task) DropBoundingCapability(cp linux.Capability) error { + t.mu.Lock() + defer t.mu.Unlock() + if !t.creds.HasCapability(linux.CAP_SETPCAP) { + return syserror.EPERM + } + t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + return nil +} + +// SetUserNamespace attempts to move c into ns. +func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { + t.mu.Lock() + defer t.mu.Unlock() + + // "A process reassociating itself with a user namespace must have the + // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) + // + // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN + // in ns (by rule 3 in auth.Credentials.HasCapability). + if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + return syserror.EPERM + } + + t.creds.UserNamespace = ns + // "The child process created by clone(2) with the CLONE_NEWUSER flag + // starts out with a complete set of capabilities in the new user + // namespace. Likewise, a process that creates a new user namespace using + // unshare(2) or joins an existing user namespace using setns(2) gains a + // full set of capabilities in that namespace." + t.creds.PermittedCaps = auth.AllCapabilities + t.creds.InheritableCaps = 0 + t.creds.EffectiveCaps = auth.AllCapabilities + t.creds.BoundingCaps = auth.AllCapabilities + // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER + // flag sets the "securebits" flags (see capabilities(7)) to their default + // values (all flags disabled) in the child (for clone(2)) or caller (for + // unshare(2), or setns(2)." - user_namespaces(7) + t.creds.KeepCaps = false + + return nil +} + +// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. +func (t *Task) SetKeepCaps(k bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.creds.KeepCaps = k +} + +// updateCredsForExec updates t.creds to reflect an execve(). +// +// NOTE: We currently do not implement privileged executables +// (set-user/group-ID bits and file capabilities). This allows us to make a lot +// of simplifying assumptions: +// +// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which +// disables the features we don't support anyway, is always set. This +// drastically simplifies this function. +// +// - We don't implement AT_SECURE, because no_new_privs always being set means +// that the conditions that require AT_SECURE never arise. (Compare Linux's +// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) +// +// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since +// seccomp-bpf is also allowed if the task has no_new_privs set. +// +// - Task.ptraceAttach does not serialize with execve as it does in Linux, +// since no_new_privs being set has the same effect as the presence of an +// unprivileged tracer. +// +// Preconditions: t.mu must be locked. +func (t *Task) updateCredsForExecLocked() { + // """ + // During an execve(2), the kernel calculates the new capabilities of + // the process using the following algorithm: + // + // P'(permitted) = (P(inheritable) & F(inheritable)) | + // (F(permitted) & cap_bset) + // + // P'(effective) = F(effective) ? 
P'(permitted) : 0 + // + // P'(inheritable) = P(inheritable) [i.e., unchanged] + // + // where: + // + // P denotes the value of a thread capability set before the + // execve(2) + // + // P' denotes the value of a thread capability set after the + // execve(2) + // + // F denotes a file capability set + // + // cap_bset is the value of the capability bounding set + // + // ... + // + // In order to provide an all-powerful root using capability sets, during + // an execve(2): + // + // 1. If a set-user-ID-root program is being executed, or the real user ID + // of the process is 0 (root) then the file inheritable and permitted sets + // are defined to be all ones (i.e. all capabilities enabled). + // + // 2. If a set-user-ID-root program is being executed, then the file + // effective bit is defined to be one (enabled). + // + // The upshot of the above rules, combined with the capabilities + // transformations described above, is that when a process execve(2)s a + // set-user-ID-root program, or when a process with an effective UID of 0 + // execve(2)s a program, it gains all capabilities in its permitted and + // effective capability sets, except those masked out by the capability + // bounding set. + // """ - capabilities(7) + // (ambient capability sets omitted) + // + // As the last paragraph implies, the case of "a set-user-ID root program + // is being executed" also includes the case where (namespace) root is + // executing a non-set-user-ID program; the actual check is just based on + // the effective user ID. + var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 + fileEffective := false + root := t.creds.UserNamespace.MapToKUID(auth.RootUID) + if t.creds.EffectiveKUID == root || t.creds.RealKUID == root { + newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps + if t.creds.EffectiveKUID == root { + fileEffective = true + } + } + + // Now we enter poorly-documented, somewhat confusing territory. (The + // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds + // is not very helpful.) My reading of it is: + // + // If at least one of the following is true: + // + // A1. The execing task is ptraced, and the tracer did not have + // CAP_SYS_PTRACE in the execing task's user namespace at the time of + // PTRACE_ATTACH. + // + // A2. The execing task shares its FS context with at least one task in + // another thread group. + // + // A3. The execing task has no_new_privs set. + // + // AND at least one of the following is true: + // + // B1. The new effective user ID (which may come from set-user-ID, or be the + // execing task's existing effective user ID) is not equal to the task's + // real UID. + // + // B2. The new effective group ID (which may come from set-group-ID, or be + // the execing task's existing effective group ID) is not equal to the + // task's real GID. + // + // B3. The new permitted capability set contains capabilities not in the + // task's permitted capability set. + // + // Then: + // + // C1. Limit the new permitted capability set to the task's permitted + // capability set. + // + // C2. If either the task does not have CAP_SETUID in its user namespace, or + // the task has no_new_privs set, force the new effective UID and GID to + // the task's real UID and GID. + // + // But since no_new_privs is always set (A3 is always true), this becomes + // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 + // is a no-op. So we can just do C1 and C2 unconditionally. 
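+	// To illustrate the simplification with concrete (hypothetical) values:
+	// a caller with RealKUID 1000 and EffectiveKUID 0 (namespace root) took
+	// the branch above, so newPermitted = InheritableCaps | BoundingCaps and
+	// fileEffective is true; C2 below then forces the effective IDs back to
+	// the real IDs, and C1 intersects newPermitted with the existing
+	// permitted set. A caller with no root IDs leaves newPermitted as 0, so
+	// both the permitted and effective sets end up cleared.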
+	if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
+		t.creds.EffectiveKUID = t.creds.RealKUID
+		t.creds.EffectiveKGID = t.creds.RealKGID
+		t.parentDeathSignal = 0
+	}
+	// (Saved set-user-ID is always set to the new effective user ID, and saved
+	// set-group-ID is always set to the new effective group ID, regardless of
+	// the above.)
+	t.creds.SavedKUID = t.creds.RealKUID
+	t.creds.SavedKGID = t.creds.RealKGID
+	t.creds.PermittedCaps &= newPermitted
+	if fileEffective {
+		t.creds.EffectiveCaps = t.creds.PermittedCaps
+	} else {
+		t.creds.EffectiveCaps = 0
+	}
+
+	// prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
+	// calls to execve(2).
+	t.creds.KeepCaps = false
+
+	// "The bounding set is inherited at fork(2) from the thread's parent, and
+	// is preserved across an execve(2)". So we're done.
+}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
new file mode 100644
index 000000000..18efacb19
--- /dev/null
+++ b/pkg/sentry/kernel/task_log.go
@@ -0,0 +1,137 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+	"sort"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+	// maxStackDebugBytes is the maximum number of user stack bytes that may be
+	// printed by debugDumpStack.
+	maxStackDebugBytes = 1024
+)
+
+// Infof logs a formatted info message by calling log.Infof.
+func (t *Task) Infof(fmt string, v ...interface{}) {
+	if log.IsLogging(log.Info) {
+		log.Infof(t.logPrefix.Load().(string)+fmt, v...)
+	}
+}
+
+// Warningf logs a formatted warning message by calling log.Warningf.
+func (t *Task) Warningf(fmt string, v ...interface{}) {
+	if log.IsLogging(log.Warning) {
+		log.Warningf(t.logPrefix.Load().(string)+fmt, v...)
+	}
+}
+
+// Debugf logs a formatted debug message, prefixed with the task ID, by
+// calling log.Debugf.
+func (t *Task) Debugf(fmt string, v ...interface{}) {
+	if log.IsLogging(log.Debug) {
+		log.Debugf(t.logPrefix.Load().(string)+fmt, v...)
+	}
+}
+
+// IsLogging returns true iff this level is being logged.
+func (t *Task) IsLogging(level log.Level) bool {
+	return log.IsLogging(level)
+}
+
+// DebugDumpState logs task state at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) DebugDumpState() {
+	t.debugDumpRegisters()
+	t.debugDumpStack()
+	if mm := t.MemoryManager(); mm != nil {
+		t.Debugf("Mappings:\n%s", mm)
+	}
+	t.Debugf("FDMap:\n%s", t.FDMap())
+}
+
+// debugDumpRegisters logs register state at log level debug.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) debugDumpRegisters() { + if !t.IsLogging(log.Debug) { + return + } + regmap, err := t.Arch().RegisterMap() + if err != nil { + t.Debugf("Registers: %v", err) + } else { + t.Debugf("Registers:") + var regs []string + for reg := range regmap { + regs = append(regs, reg) + } + sort.Strings(regs) + for _, reg := range regs { + t.Debugf("%-8s = %016x", reg, regmap[reg]) + } + } +} + +// debugDumpStack logs user stack contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpStack() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application stack dump.") + return + } + t.Debugf("Stack:") + start := usermem.Addr(t.Arch().Stack()) + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + +// updateLogPrefix updates the task's cached log prefix to reflect its +// current thread ID. +// +// Preconditions: The task's owning TaskSet.mu must be locked. +func (t *Task) updateLogPrefixLocked() { + // Use the task's TID in the root PID namespace for logging. + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t])) +} diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go new file mode 100644 index 000000000..4df2e53d3 --- /dev/null +++ b/pkg/sentry/kernel/task_net.go @@ -0,0 +1,35 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" +) + +// IsNetworkNamespaced returns true if t is in a non-root network namespace. +func (t *Task) IsNetworkNamespaced() bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns +} + +// NetworkContext returns the network stack used by the task. NetworkContext +// may return nil if no network stack is available. +func (t *Task) NetworkContext() inet.Stack { + if t.IsNetworkNamespaced() { + return nil + } + return t.k.networkStack +} diff --git a/pkg/sentry/kernel/task_resources.go b/pkg/sentry/kernel/task_resources.go new file mode 100644 index 000000000..e529f0c2d --- /dev/null +++ b/pkg/sentry/kernel/task_resources.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// TaskResources is the subset of a task's data provided by its creator that is +// not provided by the loader. +type TaskResources struct { + // SignalMask is the set of signals whose delivery is currently blocked. + // + // FIXME: Determine if we also need RealSignalMask + SignalMask linux.SignalSet + + // FSContext is the filesystem context. + *FSContext + + // FDMap provides access to files to the task. + *FDMap + + // Tracks abstract sockets that are in use. + AbstractSockets *AbstractSocketNamespace +} + +// newTaskResources returns a new TaskResources, taking an additional reference +// on fdm. +func newTaskResources(fdm *FDMap, fc *FSContext) *TaskResources { + fdm.IncRef() + return &TaskResources{ + FDMap: fdm, + FSContext: fc, + AbstractSockets: NewAbstractSocketNamespace(), + } +} + +// release releases all resources held by the TaskResources. release is called +// by the task when it exits. +func (tr *TaskResources) release() { + tr.FDMap.DecRef() + tr.FDMap = nil + tr.FSContext.DecRef() + tr.FSContext = nil + tr.AbstractSockets = nil +} + +// Fork returns a duplicate of tr. +// +// FIXME: Preconditions: When tr is owned by a Task, that task's +// signal mutex must be locked, or Fork must be called by the task's goroutine. +func (tr *TaskResources) Fork(shareFiles bool, shareFSContext bool) *TaskResources { + var fdmap *FDMap + if shareFiles { + fdmap = tr.FDMap + fdmap.IncRef() + } else { + fdmap = tr.FDMap.Fork() + } + + var fsc *FSContext + if shareFSContext { + fsc = tr.FSContext + fsc.IncRef() + } else { + fsc = tr.FSContext.Fork() + } + + return &TaskResources{ + SignalMask: tr.SignalMask, + FDMap: fdmap, + FSContext: fsc, + AbstractSockets: tr.AbstractSockets, + } +} + +// FDMap returns t's FDMap. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) FDMap() *FDMap { + return t.tr.FDMap +} + +// FSContext returns t's FSContext. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) FSContext() *FSContext { + return t.tr.FSContext +} + +// MountNamespace returns t's MountNamespace. MountNamespace does not take an additional +// reference on the returned MountNamespace. +func (t *Task) MountNamespace() *fs.MountNamespace { + return t.k.mounts +} + +// AbstractSockets returns t's AbstractSocketNamespace. +func (t *Task) AbstractSockets() *AbstractSocketNamespace { + return t.tr.AbstractSockets +} + +// IsChrooted returns true if the root directory of t's FSContext is not the +// root directory of t's MountNamespace. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. 
+func (t *Task) IsChrooted() bool {
+	realRoot := t.k.mounts.Root()
+	defer realRoot.DecRef()
+	return t.tr.FSContext.root != realRoot
+}
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
new file mode 100644
index 000000000..94ce5582b
--- /dev/null
+++ b/pkg/sentry/kernel/task_run.go
@@ -0,0 +1,346 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"bytes"
+	"runtime"
+	"sync/atomic"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// A taskRunState is a reified state in the task state machine. See README.md
+// for details. The canonical list of all run states, as well as transitions
+// between them, is given in run_states.dot.
+//
+// The set of possible states is enumerable and completely defined by the
+// kernel package, so taskRunState would ideally be represented by a
+// discriminated union. However, Go does not support sum types.
+//
+// Hence, as with TaskStop, data-free taskRunStates should be represented as
+// typecast nils to avoid unnecessary allocation.
+type taskRunState interface {
+	// execute executes the code associated with this state over the given task
+	// and returns the following state. If execute returns nil, the task
+	// goroutine should exit.
+	//
+	// It is valid to tail-call a following state's execute to avoid the
+	// overhead of converting the following state to an interface object and
+	// checking for stops, provided that the tail-call cannot recurse.
+	execute(*Task) taskRunState
+}
+
+// run runs the task goroutine.
+//
+// threadID is a dummy value set to the task's TID in the root PID namespace
+// to make it visible in stack dumps. A goroutine for a given task can be
+// identified by searching for Task.run()'s argument value.
+func (t *Task) run(threadID uintptr) {
+	// Construct t.blockingTimer here. We do this here because we can't
+	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
+	// kernel.timekeeper.SetClocks() hasn't been called yet.
+	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
+	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
+	defer t.blockingTimer.Destroy()
+	t.blockingTimerChan = blockingTimerChan
+
+	// Activate our address space.
+	t.Activate()
+	// The corresponding t.Deactivate occurs in the exit path
+	// (runExitMain.execute) so that when
+	// Platform.CooperativelySharesAddressSpace() == true, we give up the
+	// AddressSpace before the task goroutine finishes executing.
+
+	// Ensure that thread group timers for execution time reflect that this
+	// task now exists.
+ t.tg.tm.kick() + + // If this is a newly-started task, it should check for participation in + // group stops. If this is a task resuming after restore, it was + // interrupted by saving. In either case, the task is initially + // interrupted. + t.interruptSelf() + + for { + // Explanation for this ordering: + // + // - A freshly-started task that is stopped should not do anything + // before it enters the stop. + // + // - If taskRunState.execute returns nil, the task goroutine should + // exit without checking for a stop. + // + // - Task.Start won't start Task.run if t.runState is nil, so this + // ordering is safe. + t.doStop() + t.runState = t.runState.execute(t) + if t.runState == nil { + t.accountTaskGoroutineEnter(TaskGoroutineNonexistent) + t.goroutineStopped.Done() + t.tg.liveGoroutines.Done() + t.tg.pidns.owner.liveGoroutines.Done() + t.tg.pidns.owner.runningGoroutines.Done() + + // Keep argument alive because stack trace for dead variables may not be correct. + runtime.KeepAlive(threadID) + return + } + } +} + +// doStop is called by Task.run to block until the task is not stopped. +func (t *Task) doStop() { + if atomic.LoadInt32(&t.stopCount) == 0 { + return + } + t.Deactivate() + // NOTE: t.Activate() must be called without any locks held, so + // this defer must precede the defer for unlocking the signal mutex. + defer t.Activate() + t.accountTaskGoroutineEnter(TaskGoroutineStopped) + defer t.accountTaskGoroutineLeave(TaskGoroutineStopped) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.runningGoroutines.Add(-1) + defer t.tg.pidns.owner.runningGoroutines.Add(1) + t.goroutineStopped.Add(-1) + defer t.goroutineStopped.Add(1) + for t.stopCount > 0 { + t.endStopCond.Wait() + } +} + +// The runApp state checks for interrupts before executing untrusted +// application code. +type runApp struct{} + +func (*runApp) execute(t *Task) taskRunState { + if t.interrupted() { + // Checkpointing instructs tasks to stop by sending an interrupt, so we + // must check for stops before entering runInterrupt (instead of + // tail-calling it). + return (*runInterrupt)(nil) + } + + // We're about to switch to the application again. If there's still a + // unhandled SyscallRestartErrno that wasn't translated to an EINTR, + // restart the syscall that was interrupted. If there's a saved signal + // mask, restore it. (Note that restoring the saved signal mask may unblock + // a pending signal, causing another interruption, but that signal should + // not interact with the interrupted syscall.) + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre == ERESTART_RESTARTBLOCK { + t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscallWithRestartBlock() + } else { + t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscall() + } + } + t.haveSyscallReturn = false + } + if t.haveSavedSignalMask { + t.SetSignalMask(t.savedSignalMask) + t.haveSavedSignalMask = false + if t.interrupted() { + return (*runInterrupt)(nil) + } + } + + // Apply restartable sequences. 
+ if t.rseqPreempted { + t.rseqPreempted = false + if t.rseqCPUAddr != 0 { + if err := t.rseqCopyOutCPU(); err != nil { + t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(sigPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + } + t.rseqInterrupt() + } + + // Check if we need to enable single-stepping. Tracers expect that the + // kernel preserves the value of the single-step flag set by PTRACE_SETREGS + // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this + // includes our ptrace platform, by the way), so we should only clear the + // single-step flag if we're responsible for setting it. (clearSinglestep + // is therefore analogous to Linux's TIF_FORCED_TF.) + // + // Strictly speaking, we should also not clear the single-step flag if we + // single-step through an instruction that sets the single-step flag + // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their + // own TF. (Famous last words, I know.) + clearSinglestep := false + if t.hasTracer() { + t.tg.pidns.owner.mu.RLock() + if t.ptraceSinglestep { + clearSinglestep = !t.Arch().SingleStep() + t.Arch().SetSingleStep() + } + t.tg.pidns.owner.mu.RUnlock() + } + + t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) + info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + + if clearSinglestep { + t.Arch().ClearSingleStep() + } + + switch err { + case nil: + // Handle application system call. + return t.doSyscall() + + case platform.ErrContextInterrupt: + // Interrupted by platform.Context.Interrupt(). Re-enter the run + // loop to figure out why. + return (*runApp)(nil) + + case platform.ErrContextSignal: + // Looks like a signal has been delivered to us. If it's a synchronous + // signal (SEGV, SIGBUS, etc.), it should be sent to the application + // thread that received it. + sig := linux.Signal(info.Signo) + + // Was it a fault that we should handle internally? If so, this wasn't + // an application-generated signal and we should continue execution + // normally. + if at.Any() { + addr := usermem.Addr(info.Addr()) + err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + if err == nil { + // The fault was handled appropriately. + // We can resume running the application. + return (*runApp)(nil) + } + + // Is this a vsyscall that we need emulate? + if at.Execute { + if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + return t.doVsyscall(addr, sysno) + } + } + + // The JVM will trigger these errors constantly, so don't + // spam logs with this error. + if err == syserror.EFAULT || err == syserror.EPERM { + t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) + } else { + t.Warningf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) + } + t.DebugDumpState() + + // Continue to signal handling. + // + // Convert a BusError error to a SIGBUS from a SIGSEGV. All + // other info bits stay the same (address, etc.). + if _, ok := err.(*memmap.BusError); ok { + sig = linux.SIGBUS + info.Signo = int32(linux.SIGBUS) + } + } + + switch sig { + case linux.SIGILL: + // N.B. The debug stuff here is arguably + // expensive. Don't fret. This gets called + // about 5 times for a typical application, if + // that. + t.Debugf("SIGILL @ %x", t.Arch().IP()) + + // Is this a CPUID instruction? 
+ expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + break + } + + // Treat it like any other synchronous signal. + fallthrough + + case linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: + // Synchronous signal. Send it to ourselves. Assume the signal is + // legitimate and force it (work around the signal being ignored or + // blocked) like Linux does. Conveniently, this is even the correct + // behavior for SIGTRAP from single-stepping. + t.forceSignal(linux.Signal(sig), false /* unconditional */) + t.SendSignal(info) + + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case linux.SIGPROF: + // It's a profiling interrupt: there's not much + // we can do. We've already paid a decent cost + // by intercepting the signal, at this point we + // simply ignore it. + + default: + // Asynchronous signal. Let the system deal with it. + t.k.sendExternalSignal(info, "application") + } + + return (*runApp)(nil) + + case platform.ErrContextCPUPreempted: + // Ensure that RSEQ critical sections are interrupted and per-thread + // CPU values are updated before the next platform.Context.Switch(). + t.rseqPreempted = true + return (*runApp)(nil) + + default: + // What happened? Can't continue. + t.Warningf("Unexpected SwitchToApp error: %v", err) + t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)}) + return (*runExit)(nil) + } +} + +// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. +func (t *Task) waitGoroutineStoppedOrExited() { + t.goroutineStopped.Wait() +} + +// WaitExited blocks until all task goroutines in tg have exited. +// +// WaitExited does not correspond to anything in Linux; it's provided so that +// external callers of Kernel.CreateProcess can wait for the created thread +// group to terminate. +func (tg *ThreadGroup) WaitExited() { + tg.liveGoroutines.Wait() +} + +// Yield yields the processor for the calling task. +func (t *Task) Yield() { + atomic.AddUint64(&t.yieldCount, 1) + runtime.Gosched() +} diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go new file mode 100644 index 000000000..b50139077 --- /dev/null +++ b/pkg/sentry/kernel/task_sched.go @@ -0,0 +1,329 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// CPU scheduling, real and fake. 
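+//
+// This file does not implement an actual scheduler; it tracks just enough
+// state (TaskGoroutineSchedInfo) to report CPU usage and
+// /proc/[pid]/status-style scheduling state, and to emulate CPU affinity and
+// a virtualized CPU number for the application.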
+ +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskGoroutineState is a coarse representation of the current execution +// status of a kernel.Task goroutine. +type TaskGoroutineState int + +const ( + // TaskGoroutineNonexistent indicates that the task goroutine has either + // not yet been created by Task.Start() or has returned from Task.run(). + // This must be the zero value for TaskGoroutineState. + TaskGoroutineNonexistent TaskGoroutineState = iota + + // TaskGoroutineRunningSys indicates that the task goroutine is executing + // sentry code. + TaskGoroutineRunningSys + + // TaskGoroutineRunningApp indicates that the task goroutine is executing + // application code. + TaskGoroutineRunningApp + + // TaskGoroutineBlockedInterruptible indicates that the task goroutine is + // blocked in Task.block(), and hence may be woken by Task.interrupt() + // (e.g. due to signal delivery). + TaskGoroutineBlockedInterruptible + + // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is + // stopped outside of Task.block() and Task.doStop(), and hence cannot be + // woken by Task.interrupt(). + TaskGoroutineBlockedUninterruptible + + // TaskGoroutineStopped indicates that the task goroutine is blocked in + // Task.doStop(). TaskGoroutineStopped is similar to + // TaskGoroutineBlockedUninterruptible, but is a separate state to make it + // possible to determine when Task.stop is meaningful. + TaskGoroutineStopped +) + +// TaskGoroutineSchedInfo contains task goroutine scheduling state which must +// be read and updated atomically. +type TaskGoroutineSchedInfo struct { + // Timestamp was the value of Kernel.cpuClock when this + // TaskGoroutineSchedInfo was last updated. + Timestamp uint64 + + // State is the current state of the task goroutine. + State TaskGoroutineState + + // UserTicks is the amount of time the task goroutine has spent executing + // its associated Task's application code, in units of linux.ClockTick. + UserTicks uint64 + + // SysTicks is the amount of time the task goroutine has spent executing in + // the sentry, in units of linux.ClockTick. + SysTicks uint64 +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.gosched.State = state + t.goschedSeq.EndWrite() +} + +// Preconditions: The caller must be running on the task goroutine, and leaving +// a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). +func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != state { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. 
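+	// Only time spent executing application code is charged to UserTicks
+	// here; time spent in the sentry is charged to SysTicks by
+	// accountTaskGoroutineEnter, and time spent blocked or stopped is
+	// charged to neither.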
+ if state == TaskGoroutineRunningApp { + t.gosched.UserTicks += now - t.gosched.Timestamp + } + t.gosched.Timestamp = now + t.gosched.State = TaskGoroutineRunningSys + t.goschedSeq.EndWrite() +} + +// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. +// Most clients should use t.CPUStats() instead. +func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { + return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) +} + +// CPUStats returns the CPU usage statistics of t. +func (t *Task) CPUStats() usage.CPUStats { + return t.cpuStatsAt(t.k.CPUClockNow()) +} + +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making +// the returned stats non-monotonic. +func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { + tsched := t.TaskGoroutineSchedInfo() + if tsched.Timestamp < now { + // Update stats to reflect execution since the last update to + // t.gosched. + switch tsched.State { + case TaskGoroutineRunningSys: + tsched.SysTicks += now - tsched.Timestamp + case TaskGoroutineRunningApp: + tsched.UserTicks += now - tsched.Timestamp + } + } + return usage.CPUStats{ + UserTime: time.Duration(tsched.UserTicks * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.SysTicks * uint64(linux.ClockTick)), + VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), + } +} + +// CPUStats returns the combined CPU usage statistics of all past and present +// threads in tg. +func (tg *ThreadGroup) CPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + // Hack to get a pointer to the Kernel. + if tg.leader == nil { + // Per comment on tg.leader, this is only possible if nothing in the + // ThreadGroup has ever executed anyway. + return usage.CPUStats{} + } + now := tg.leader.k.CPUClockNow() + stats := tg.exitedCPUStats + // Account for active tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + stats.Accumulate(t.cpuStatsAt(now)) + } + return stats +} + +// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return +// resource usage statistics for all children of [tg] that have terminated and +// been waited for. These statistics will include the resources used by +// grandchildren, and further removed descendants, if all of the intervening +// descendants waited on their terminated children." +func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.childCPUStats +} + +// StateStatus returns a string representation of the task's current state, +// appropriate for /proc/[pid]/status. +func (t *Task) StateStatus() string { + switch s := t.TaskGoroutineSchedInfo().State; s { + case TaskGoroutineNonexistent: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.exitState { + case TaskExitZombie: + return "Z (zombie)" + case TaskExitDead: + return "X (dead)" + default: + // The task goroutine can't exit before passing through + // runExitNotify, so this indicates that the task has been created, + // but the task goroutine hasn't yet started. 
The Linux equivalent + // is struct task_struct::state == TASK_NEW + // (kernel/fork.c:copy_process() => + // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is + // masked out by TASK_REPORT for /proc/[pid]/status, leaving only + // TASK_RUNNING. + return "R (running)" + } + case TaskGoroutineRunningSys, TaskGoroutineRunningApp: + return "R (running)" + case TaskGoroutineBlockedInterruptible: + return "S (sleeping)" + case TaskGoroutineStopped: + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + switch t.stop.(type) { + case *groupStop: + return "T (stopped)" + case *ptraceStop: + return "t (tracing stop)" + } + fallthrough + case TaskGoroutineBlockedUninterruptible: + // This is the name Linux uses for TASK_UNINTERRUPTIBLE and + // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): + // fs/proc/array.c:task_state_array. + return "D (disk sleep)" + default: + panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) + } +} + +// CPUMask returns a copy of t's allowed CPU mask. +func (t *Task) CPUMask() sched.CPUSet { + t.mu.Lock() + defer t.mu.Unlock() + return t.allowedCPUMask.Copy() +} + +// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of +// mask. +// +// Preconditions: mask.Size() == +// sched.CPUSetSize(t.Kernel().ApplicationCores()). +func (t *Task) SetCPUMask(mask sched.CPUSet) error { + if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { + panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) + } + + // Remove CPUs in mask above Kernel.applicationCores. + mask.ClearAbove(t.k.applicationCores) + + // Ensure that at least 1 CPU is still allowed. + if mask.NumCPUs() == 0 { + return syserror.EINVAL + } + + if t.k.useHostCores { + // No-op; pretend the mask was immediately changed back. + return nil + } + + t.tg.pidns.owner.mu.RLock() + rootTID := t.tg.pidns.owner.Root.tids[t] + t.tg.pidns.owner.mu.RUnlock() + + t.mu.Lock() + defer t.mu.Unlock() + t.allowedCPUMask = mask + atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID)) + return nil +} + +// CPU returns the cpu id for a given task. +func (t *Task) CPU() int32 { + if t.k.useHostCores { + return int32(hostcpu.GetCPU()) + } + + return atomic.LoadInt32(&t.cpu) +} + +// assignCPU returns the virtualized CPU number for the task with global TID +// tid and allowedCPUMask allowed. +func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { + // To pretend that threads are evenly distributed to allowed CPUs, choose n + // to be less than the number of CPUs in allowed ... + n := int(tid) % int(allowed.NumCPUs()) + // ... then pick the nth CPU in allowed. + allowed.ForEachCPU(func(c uint) { + if n--; n == 0 { + cpu = int32(c) + } + }) + return cpu +} + +// Niceness returns t's niceness. +func (t *Task) Niceness() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness +} + +// Priority returns t's priority. +func (t *Task) Priority() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness + 20 +} + +// SetNiceness sets t's niceness to n. +func (t *Task) SetNiceness(n int) { + t.mu.Lock() + defer t.mu.Unlock() + t.niceness = n +} + +// NumaPolicy returns t's current numa policy. +func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + return t.numaPolicy, t.numaNodeMask +} + +// SetNumaPolicy sets t's numa policy. 
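+// The policy and node mask are recorded as-is on the task and reported back
+// by NumaPolicy; no validation is performed here.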
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + t.numaPolicy = policy + t.numaNodeMask = nodeMask +} diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go new file mode 100644 index 000000000..2340256b0 --- /dev/null +++ b/pkg/sentry/kernel/task_signals.go @@ -0,0 +1,1056 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file defines the behavior of task signal handling. + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SignalAction is an internal signal action. +type SignalAction int + +// Available signal actions. +// Note that although we refer the complete set internally, +// the application is only capable of using the Default and +// Ignore actions from the system call interface. +const ( + SignalActionTerm SignalAction = iota + SignalActionCore + SignalActionStop + SignalActionIgnore + SignalActionHandler +) + +// Default signal handler actions. Note that for most signals, +// (except SIGKILL and SIGSTOP) these can be overridden by the app. +var defaultActions = map[linux.Signal]SignalAction{ + // POSIX.1-1990 standard. + linux.SIGHUP: SignalActionTerm, + linux.SIGINT: SignalActionTerm, + linux.SIGQUIT: SignalActionCore, + linux.SIGILL: SignalActionCore, + linux.SIGABRT: SignalActionCore, + linux.SIGFPE: SignalActionCore, + linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects + linux.SIGSEGV: SignalActionCore, + linux.SIGPIPE: SignalActionTerm, + linux.SIGALRM: SignalActionTerm, + linux.SIGTERM: SignalActionTerm, + linux.SIGUSR1: SignalActionTerm, + linux.SIGUSR2: SignalActionTerm, + linux.SIGCHLD: SignalActionIgnore, + linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects + linux.SIGSTOP: SignalActionStop, + linux.SIGTSTP: SignalActionStop, + linux.SIGTTIN: SignalActionStop, + linux.SIGTTOU: SignalActionStop, + // POSIX.1-2001 standard. + linux.SIGBUS: SignalActionCore, + linux.SIGPROF: SignalActionTerm, + linux.SIGSYS: SignalActionCore, + linux.SIGTRAP: SignalActionCore, + linux.SIGURG: SignalActionIgnore, + linux.SIGVTALRM: SignalActionTerm, + linux.SIGXCPU: SignalActionCore, + linux.SIGXFSZ: SignalActionCore, + // The rest on linux. + linux.SIGSTKFLT: SignalActionTerm, + linux.SIGIO: SignalActionTerm, + linux.SIGPWR: SignalActionTerm, + linux.SIGWINCH: SignalActionIgnore, +} + +// computeAction figures out what to do given a signal number +// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop, +// and SIGKILL always results in a SignalActionTerm. +// Signal 0 is always ignored as many programs use it for various internal functions +// and don't expect it to do anything. 
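+// (Signal 0 is the null signal used by kill(2) and friends purely to probe
+// for a process's existence and the caller's permission to signal it.)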
+// +// In the event the signal is not one of these, act.Handler determines what +// happens next. +// If act.Handler is: +// 0, the default action is taken; +// 1, the signal is ignored; +// anything else, the function returns SignalActionHandler. +func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { + switch sig { + case linux.SIGSTOP: + return SignalActionStop + case linux.SIGKILL: + return SignalActionTerm + case linux.Signal(0): + return SignalActionIgnore + } + + switch act.Handler { + case arch.SignalActDefault: + return defaultActions[sig] + case arch.SignalActIgnore: + return SignalActionIgnore + default: + return SignalActionHandler + } +} + +// UnblockableSignals contains the set of signals which cannot be blocked. +var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) + +// StopSignals is the set of signals whose default action is SignalActionStop. +var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) + +// dequeueSignalLocked returns a pending unmasked signal. If there are no +// pending unmasked signals, dequeueSignalLocked returns nil. +// +// Preconditions: t.tg.signalHandlers.mu must be locked. +func (t *Task) dequeueSignalLocked() *arch.SignalInfo { + if info := t.pendingSignals.dequeue(t.tr.SignalMask); info != nil { + return info + } + if info := t.tg.pendingSignals.dequeue(t.tr.SignalMask); info != nil { + return info + } + return nil +} + +// TakeSignal returns a pending signal not blocked by mask. Signal handlers are +// not affected. If there are no pending signals not blocked by mask, +// TakeSignal returns a nil SignalInfo. +func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if info := t.pendingSignals.dequeue(mask); info != nil { + return info + } + if info := t.tg.pendingSignals.dequeue(mask); info != nil { + return info + } + return nil +} + +// discardSpecificLocked removes all instances of the given signal from all +// signal queues in tg. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) { + tg.pendingSignals.discardSpecific(sig) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.pendingSignals.discardSpecific(sig) + } +} + +// PendingSignals returns the set of pending signals. +func (t *Task) PendingSignals() linux.SignalSet { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet +} + +// deliverSignal delivers the given signal and returns the following run state. +func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState { + sigact := computeAction(linux.Signal(info.Signo), act) + + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + // Signals that are ignored, cause a thread group stop, or + // terminate the thread group do not interact with interrupted + // syscalls; in Linux terms, they are never returned to the signal + // handling path from get_signal => get_signal_to_deliver. The + // behavior of an interrupted syscall is determined by the first + // signal that is actually handled (by userspace). 
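+			// Hence only SignalActionHandler resolves the restart decision
+			// here: ERESTARTSYS is restarted only if the handler was
+			// installed with SA_RESTART, while ERESTARTNOHAND and
+			// ERESTART_RESTARTBLOCK always turn into EINTR once a handler
+			// runs.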
+ if sigact == SignalActionHandler { + switch { + case sre == ERESTARTNOHAND: + fallthrough + case sre == ERESTART_RESTARTBLOCK: + fallthrough + case (sre == ERESTARTSYS && !act.IsRestart()): + t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1))) + default: + t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().RestartSyscall() + } + } + } + } + + switch sigact { + case SignalActionTerm, SignalActionCore: + // "Default action is to terminate the process." - signal(7) + t.Debugf("Signal %d: terminating thread group", info.Signo) + t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + return (*runExit)(nil) + + case SignalActionStop: + // "Default action is to stop the process." + t.initiateGroupStop(info) + + case SignalActionIgnore: + // "Default action is to ignore the signal." + t.Debugf("Signal %d: ignored", info.Signo) + + case SignalActionHandler: + // Try to deliver the signal to the user-configured handler. + t.Debugf("Signal %d: delivering to handler", info.Signo) + if err := t.deliverSignalToHandler(info, act); err != nil { + t.Warningf("Failed to deliver signal %+v to user handler: %v", info, err) + // Send a forced SIGSEGV. If the signal that couldn't be delivered + // was a SIGSEGV, force the handler to SIG_DFL. + t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + } + + default: + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + } + return (*runInterrupt)(nil) +} + +// deliverSignalToHandler changes the task's userspace state to enter the given +// user-configured handler for the given signal. +func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error { + // Signal delivery to an application handler interrupts restartable + // sequences. + t.rseqInterrupt() + + // Are executing on the main stack, + // or the provided alternate stack? + sp := usermem.Addr(t.Arch().Stack()) + + // N.B. This is a *copy* of the alternate stack that the user's signal + // handler expects to see in its ucontext (even if it's not in use). + alt := t.signalStack + if act.IsOnStack() && alt.IsEnabled() { + alt.SetOnStack() + if !t.OnSignalStack(alt) { + sp = usermem.Addr(alt.Top()) + } + } + + // Set up the signal handler. If we have a saved signal mask, the signal + // handler should run with the current mask, but sigreturn should restore + // the saved one. + st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + mask := t.tr.SignalMask + if t.haveSavedSignalMask { + mask = t.savedSignalMask + } + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { + return err + } + t.haveSavedSignalMask = false + + // Add our signal mask. + newMask := t.tr.SignalMask | act.Mask + if !act.IsNoDefer() { + newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) + } + t.SetSignalMask(newMask) + + return nil +} + +var ctrlResume = &SyscallControl{ignoreReturn: true} + +// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if +// rt is true). +func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { + st := t.Stack() + sigset, err := t.Arch().SignalRestore(st, rt) + if err != nil { + return nil, err + } + + // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. 
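+	// The mask comes from the untrusted signal frame, so silently drop the
+	// unblockable signals rather than failing, as Linux does.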
+	t.SetSignalMask(sigset &^ UnblockableSignals)
+
+	// TODO: sys_rt_sigreturn also calls restore_altstack from
+	// uc.stack, allowing the signal handler to implicitly mutate the signal
+	// stack.
+
+	return ctrlResume, nil
+}
+
+// SendSignal sends the given signal to t.
+//
+// The following errors may be returned:
+//
+//	syserror.ESRCH - The task has exited.
+//	syserror.EINVAL - The signal is not valid.
+//	syserror.EAGAIN - The signal is realtime, and cannot be queued.
+//
+func (t *Task) SendSignal(info *arch.SignalInfo) error {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	return t.sendSignalLocked(info, false /* group */)
+}
+
+// SendGroupSignal sends the given signal to t's thread group.
+func (t *Task) SendGroupSignal(info *arch.SignalInfo) error {
+	t.tg.pidns.owner.mu.RLock()
+	defer t.tg.pidns.owner.mu.RUnlock()
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	return t.sendSignalLocked(info, true /* group */)
+}
+
+// SendSignal sends the given signal to tg, using tg's leader to determine if
+// the signal is blocked.
+func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+	return tg.leader.sendSignalLocked(info, true /* group */)
+}
+
+// Preconditions: The TaskSet mutex must be locked.
+func (t *Task) onCPULocked(includeSys bool) bool {
+	// Task is exiting.
+	if t.exitState != TaskExitNone {
+		return false
+	}
+
+	switch t.TaskGoroutineSchedInfo().State {
+	case TaskGoroutineRunningSys:
+		return includeSys
+	case TaskGoroutineRunningApp:
+		return true
+	default:
+		return false
+	}
+}
+
+// SendTimerSignal mimics the process timer signal delivery behavior in Linux:
+// signals are delivered to the thread that triggers the timer expiration (see
+// kernel/time/posix-cpu-timers.c:check_process_timers()). This means that
+// 1) the thread is running on a CPU at the time, and
+// 2) a thread that runs more frequently will get more of those signals.
+//
+// We approximate this behavior by selecting a running task in a round-robin
+// fashion. Statistically, a thread running more often should have a higher
+// probability of being selected.
+func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+
+	// Find the next running thread.
+	var t *Task
+	if tg.lastTimerSignalTask == nil {
+		t = tg.tasks.Front()
+	} else {
+		t = tg.lastTimerSignalTask.Next()
+	}
+
+	// Iterate from lastTimerSignalTask.Next() to the last task in the task list.
+	for t != nil {
+		if t.onCPULocked(includeSys) {
+			tg.lastTimerSignalTask = t
+			return t.sendSignalLocked(info, true /* group */)
+		}
+		t = t.Next()
+	}
+
+	// t is nil when we reach here. If lastTimerSignalTask is not nil, iterate
+	// from Front to lastTimerSignalTask.
+	if tg.lastTimerSignalTask != nil {
+		for t := tg.tasks.Front(); t != tg.lastTimerSignalTask.Next(); t = t.Next() {
+			if t.onCPULocked(includeSys) {
+				tg.lastTimerSignalTask = t
+				return t.sendSignalLocked(info, true /* group */)
+			}
+		}
+	}
+
+	// No running threads? Just try the leader.
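+	// The signal is still enqueued as a group signal, so it will be handled
+	// by whichever task next dequeues it with the signal unblocked.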
+ tg.lastTimerSignalTask = tg.leader + return tg.leader.sendSignalLocked(info, true /* group */) +} + +func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { + if t.exitState == TaskExitDead { + return syserror.ESRCH + } + sig := linux.Signal(info.Signo) + if sig == 0 { + return nil + } + if !sig.IsValid() { + return syserror.EINVAL + } + + // Signal side effects apply even if the signal is ultimately discarded. + t.tg.applySignalSideEffectsLocked(sig) + + // TODO: "Only signals for which the "init" process has established a + // signal handler can be sent to the "init" process by other members of the + // PID namespace. This restriction applies even to privileged processes, + // and prevents other members of the PID namespace from accidentally + // killing the "init" process." - pid_namespaces(7). We don't currently do + // this for child namespaces, though we should; we also don't do this for + // the root namespace (the same restriction applies to global init on + // Linux), where whether or not we should is much murkier. In practice, + // most sandboxed applications are not prepared to function as an init + // process. + + // Unmasked, ignored signals are discarded without being queued, unless + // they will be visible to a tracer. Even for group signals, it's the + // originally-targeted task's signal mask and tracer that matter; compare + // Linux's kernel/signal.c:__send_signal() => prepare_signal() => + // sig_ignored(). + ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore + if linux.SignalSetOf(sig)&t.tr.SignalMask == 0 && ignored && !t.hasTracer() { + t.Debugf("Discarding ignored signal %d", sig) + return nil + } + + q := &t.pendingSignals + if group { + q = &t.tg.pendingSignals + } + if !q.enqueue(info) { + if sig.IsRealtime() { + return syserror.EAGAIN + } + t.Debugf("Discarding duplicate signal %d", sig) + return nil + } + + // Find a receiver to notify. Note that the task we choose to notify, if + // any, may not be the task that actually dequeues and handles the signal; + // e.g. a racing signal mask change may cause the notified task to become + // ineligible, or a racing sibling task may dequeue the signal first. + if t.canReceiveSignalLocked(sig) { + t.Debugf("Notified of signal %d", sig) + t.interrupt() + return nil + } + if group { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.Debugf("Notified of group signal %d", sig) + nt.interrupt() + return nil + } + } + t.Debugf("No task notified of signal %d", sig) + return nil +} + +func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { + switch { + case linux.SignalSetOf(sig)&StopSignals != 0: + // Stop signals cause all prior SIGCONT to be discarded. (This is + // despite the fact this has little effect since SIGCONT's most + // important effect is applied when the signal is sent in the branch + // below, not when the signal is delivered.) + tg.discardSpecificLocked(linux.SIGCONT) + case sig == linux.SIGCONT: + // "The SIGCONT signal has a side effect of waking up (all threads of) + // a group-stopped process. This side effect happens before + // signal-delivery-stop. The tracer can't suppress this side effect (it + // can only suppress signal injection, which only causes the SIGCONT + // handler to not be executed in the tracee, if such a handler is + // installed." - ptrace(2) + tg.endGroupStopLocked(true) + case sig == linux.SIGKILL: + // "SIGKILL does not generate signal-delivery-stop and therefore the + // tracer can't suppress it. 
SIGKILL kills even within system calls + // (syscall-exit-stop is not generated prior to death by SIGKILL)." - + // ptrace(2) + // + // Note that this differs from ThreadGroup.requestExit in that it + // ignores tg.execing. + if !tg.exiting { + tg.exiting = true + tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + } + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.killLocked() + } + } +} + +// canReceiveSignalLocked returns true if t should be interrupted to receive +// the given signal. canReceiveSignalLocked is analogous to Linux's +// kernel/signal.c:wants_signal(), but see below for divergences. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // - Do not choose tasks that are blocking the signal. + if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + return false + } + // - No need to check Task.exitState, as the exit path sets every bit in the + // signal mask when it transitions from TaskExitNone to TaskExitInitiated. + // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the + // task group via applySignalSideEffects => killLocked. + // - Do not choose stopped tasks, which cannot handle signals. + if t.stop != nil { + return false + } + // - TODO: No special case for when t is also the sending task, + // because the identity of the sender is unknown. + // - Do not choose tasks that have already been interrupted, as they may be + // busy handling another signal. + if len(t.interruptChan) != 0 { + return false + } + return true +} + +// findSignalReceiverLocked returns a task in tg that should be interrupted to +// receive the given signal. If no such task exists, findSignalReceiverLocked +// returns nil. +// +// Linux actually records curr_target to balance the group signal targets. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.canReceiveSignalLocked(sig) { + return t + } + } + return nil +} + +// forceSignal ensures that the task is not ignoring or blocking the given +// signal. If unconditional is true, forceSignal takes action even if the +// signal isn't being ignored or blocked. +func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.forceSignalLocked(sig, unconditional) +} + +func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { + blocked := linux.SignalSetOf(sig)&t.tr.SignalMask != 0 + act := t.tg.signalHandlers.actions[sig] + ignored := act.Handler == arch.SignalActIgnore + if blocked || ignored || unconditional { + act.Handler = arch.SignalActDefault + t.tg.signalHandlers.actions[sig] = act + if blocked { + t.setSignalMaskLocked(t.tr.SignalMask &^ linux.SignalSetOf(sig)) + } + } +} + +// SignalMask returns a copy of t's signal mask. +func (t *Task) SignalMask() linux.SignalSet { + return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.tr.SignalMask))) +} + +// SetSignalMask sets t's signal mask. +// +// Preconditions: SetSignalMask can only be called by the task goroutine. +// t.exitState < TaskExitZombie. +func (t *Task) SetSignalMask(mask linux.SignalSet) { + // By precondition, t prevents t.tg from completing an execve and mutating + // t.tg.signalHandlers, so we can skip the TaskSet mutex. 
+ t.tg.signalHandlers.mu.Lock() + t.setSignalMaskLocked(mask) + t.tg.signalHandlers.mu.Unlock() +} + +// Preconditions: The signal mutex must be locked. +func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { + oldMask := t.tr.SignalMask + atomic.StoreUint64((*uint64)(&t.tr.SignalMask), uint64(mask)) + + // If the new mask blocks any signals that were not blocked by the old + // mask, and at least one such signal is pending in tg.pendingSignals, and + // t has been woken, it could be the case that t was woken to handle that + // signal, but will no longer do so as a result of its new signal mask, so + // we have to pick a replacement. + blocked := mask &^ oldMask + blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet + if blockedGroupPending != 0 && t.interrupted() { + linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.interrupt() + return + } + }) + // We have to re-issue the interrupt consumed by t.interrupted() since + // it might have been for a different reason. + t.interruptSelf() + } + + // Conversely, if the new mask unblocks any signals that were blocked by + // the old mask, and at least one such signal is pending, we may now need + // to handle that signal. + unblocked := oldMask &^ mask + unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) + if unblockedPending != 0 { + t.interruptSelf() + } +} + +// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's +// comment). +// +// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { + t.savedSignalMask = mask + t.haveSavedSignalMask = true +} + +// SignalStack returns the task-private signal stack. +func (t *Task) SignalStack() arch.SignalStack { + return t.signalStack +} + +// OnSignalStack returns true if, when the task resumes running, it will run on +// the task-private signal stack. +func (t *Task) OnSignalStack(s arch.SignalStack) bool { + sp := usermem.Addr(t.Arch().Stack()) + return usermem.Addr(s.Addr) <= sp && sp < usermem.Addr(s.Addr+s.Size) +} + +// SetSignalStack sets the task-private signal stack and clears the +// SignalStackFlagDisable, since we have a signal stack. +func (t *Task) SetSignalStack(alt arch.SignalStack) error { + // Mask out irrelevant parts: only disable matters. + alt.Flags &= arch.SignalStackFlagDisable + t.signalStack = alt + return nil +} + +// SetSignalAct atomically sets the thread group's signal action for signal sig +// to *actptr (if actptr is not nil) and returns the old signal action. +func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) { + if !sig.IsValid() { + return arch.SignalAct{}, syserror.EINVAL + } + + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + sh := tg.signalHandlers + sh.mu.Lock() + defer sh.mu.Unlock() + oldact := sh.actions[sig] + if actptr != nil { + if sig == linux.SIGKILL || sig == linux.SIGSTOP { + return oldact, syserror.EINVAL + } + + act := *actptr + act.Mask &^= UnblockableSignals + sh.actions[sig] = act + // From POSIX, by way of Linux: + // + // "Setting a signal action to SIG_IGN for a signal that is pending + // shall cause the pending signal to be discarded, whether or not it is + // blocked." 
+ // + // "Setting a signal action to SIG_DFL for a signal that is pending and + // whose default action is to ignore the signal (for example, SIGCHLD), + // shall cause the pending signal to be discarded, whether or not it is + // blocked." + if computeAction(sig, act) == SignalActionIgnore { + tg.discardSpecificLocked(sig) + } + } + return oldact, nil +} + +// CopyOutSignalAct converts the given SignalAct into an architecture-specific +// type and then copies it out to task memory. +func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { + n := t.Arch().NewSignalAct() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalAct copies an architecture-specific sigaction type from task +// memory and then converts it into a SignalAct. +func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { + n := t.Arch().NewSignalAct() + var s arch.SignalAct + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// CopyOutSignalStack converts the given SignalStack into an +// architecture-specific type and then copies it out to task memory. +func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { + n := t.Arch().NewSignalStack() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalStack copies an architecture-specific stack_t from task memory +// and then converts it into a SignalStack. +func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { + n := t.Arch().NewSignalStack() + var s arch.SignalStack + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// groupStop is a TaskStop placed on tasks that have received a stop signal +// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from +// the ptrace man page.) +type groupStop struct{} + +// Killable implements TaskStop.Killable. +func (*groupStop) Killable() bool { return true } + +type groupStopPhase int + +const ( + // groupStopNone indicates that a thread group is not in, or attempting to + // enter or leave, a group stop. + groupStopNone groupStopPhase = iota + + // groupStopDequeued indicates that at least one task in a thread group has + // dequeued a stop signal (or dequeued any signal and entered a + // signal-delivery-stop as a result, which allows ptrace to change the + // signal into a stop signal), but temporarily dropped the signal mutex + // without initiating the group stop. + // + // groupStopDequeued is analogous to JOBCTL_STOP_DEQUEUED in Linux. + groupStopDequeued + + // groupStopInitiated indicates that a task in a thread group has initiated + // a group stop, but not all tasks in the thread group have acknowledged + // entering the group stop. + // + // groupStopInitiated is represented by JOBCTL_STOP_PENDING && + // !SIGNAL_STOP_STOPPED in Linux. + groupStopInitiated + + // groupStopComplete indicates that all tasks in a thread group have + // acknowledged entering the group stop, and the last one to do so has + // notified the thread group's parent. + // + // groupStopComplete is represented by JOBCTL_STOP_PENDING && + // SIGNAL_STOP_STOPPED in Linux. + groupStopComplete +) + +// initiateGroupStop attempts to initiate a group stop based on a +// previously-dequeued stop signal. +// +// Preconditions: The caller must be running on the task goroutine. 
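+// The caller must have already dequeued the stop signal and set
+// tg.groupStopPhase to groupStopDequeued; if the phase has since changed, or
+// the thread group is exiting or execing, the stop is dropped.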
+func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.tg.groupStopPhase != groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing signal", info.Signo) + return + } + if t.tg.exiting { + t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) + return + } + if t.tg.execing != nil { + t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) + return + } + t.Debugf("Signal %d: stopping thread group", info.Signo) + t.tg.groupStopPhase = groupStopInitiated + t.tg.groupStopSignal = linux.Signal(info.Signo) + t.tg.groupStopCount = 0 + for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { + t2.groupStopRequired = true + t2.groupStopAcknowledged = false + t2.interrupt() + } +} + +// endGroupStopLocked ensures that all prior stop signals received by tg are +// not stopping tg and will not stop tg in the future. If broadcast is true, +// parent and tracer notification will be scheduled if appropriate. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { + // Discard all previously-queued stop signals. + linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) + + if tg.groupStopPhase != groupStopNone { + tg.leader.Debugf("Ending group stop currently in phase %d", tg.groupStopPhase) + if tg.groupStopPhase == groupStopInitiated || tg.groupStopPhase == groupStopComplete { + tg.groupStopSignal = 0 + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() + } + } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so + // that one of the continuing tasks does so. (Linux does + // something similar.) The reason we do this is to keep locking + // sane. In order to send a signal to the parent, we need to + // lock its signal mutex, but we're already holding tg's signal + // mutex, and the TaskSet mutex must be locked for writing for + // us to hold two signal mutexes. Since we don't want to + // require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by + // releasing the mutexes we're already holding, just let the + // continuing thread group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = tg.groupStopPhase == groupStopInitiated + tg.groupContWaitable = true + } + } + // If groupStopPhase was groupStopDequeued, setting it to groupStopNone + // will cause following calls to initiateGroupStop to recognize that + // the group stop has been cancelled. + tg.groupStopPhase = groupStopNone + } +} + +// signalStop sends a signal to t's thread group of a new group stop, group +// continue, or ptrace stop, if appropriate. code and status are set in the +// signal sent to tg, if any. +// +// Preconditions: The TaskSet mutex must be locked (for reading or writing). 
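+// The SIGCHLD is suppressed if t's SIGCHLD handler is SIG_IGN or was
+// installed with SA_NOCLDSTOP; compare Linux's
+// kernel/signal.c:do_notify_parent_cldstop().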
+func (t *Task) signalStop(target *Task, code int32, status int32) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] + if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) { + sigchld := &arch.SignalInfo{ + Signo: int32(linux.SIGCHLD), + Code: code, + } + sigchld.SetPid(int32(t.tg.pidns.tids[target])) + sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetStatus(status) + // TODO: Set utime, stime. + t.sendSignalLocked(sigchld, true /* group */) + } +} + +// The runInterrupt state handles conditions indicated by interrupts. +type runInterrupt struct{} + +func (*runInterrupt) execute(t *Task) taskRunState { + // Interrupts are de-duplicated (if t is interrupted twice before + // t.interrupted() is called, t.interrupted() will only return true once), + // so early exits from this function must re-enter the runInterrupt state + // to check for more interrupt-signaled conditions. + + t.tg.signalHandlers.mu.Lock() + + // Did we just leave a group stop? + if t.tg.groupContNotify { + t.tg.groupContNotify = false + sig := t.tg.groupStopSignal + intr := t.tg.groupContInterrupted + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.RLock() + // For consistency with Linux, if the parent and (thread group + // leader's) tracer are in the same thread group, deduplicate + // notifications. + notifyParent := t.tg.leader.parent != nil + if tracer := t.tg.leader.ptraceTracer.Load().(*Task); tracer != nil { + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + } + // Sending CLD_STOPPED to the tracer doesn't really make any sense; + // the thread group leader may have already entered the stop and + // notified its tracer accordingly. But it's consistent with + // Linux... + if intr { + tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + if !notifyParent { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) + } else { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) + } + } else { + tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + tracer.tg.eventQueue.Notify(EventGroupContinue) + } + } + if notifyParent { + // If groupContInterrupted, do as Linux does and pretend the group + // stop completed just before it ended. The theoretical behavior in + // this case would be to send a SIGCHLD indicating the completed + // stop, followed by a SIGCHLD indicating the continue. However, + // SIGCHLD is a standard signal, so the latter would always be + // dropped. Hence sending only the former is equivalent. + if intr { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) + } else { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) + } + } + t.tg.pidns.owner.mu.RUnlock() + return (*runInterrupt)(nil) + } + + // Do we need to enter a group stop? 
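+	// If so, acknowledge it, notify the tracer and/or parent once the last
+	// task in the thread group has acknowledged, and enter a groupStop
+	// (unless we have already been killed).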
+ if t.groupStopRequired { + t.groupStopRequired = false + sig := t.tg.groupStopSignal + notifyParent := false + if !t.groupStopAcknowledged { + t.groupStopAcknowledged = true + t.tg.groupStopCount++ + if t.tg.groupStopCount == t.tg.activeTasks { + t.Debugf("Completing group stop") + notifyParent = true + t.tg.groupStopPhase = groupStopComplete + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + } + } + // Drop the signal mutex so we can take the TaskSet mutex. + t.tg.signalHandlers.mu.Unlock() + + t.tg.pidns.owner.mu.RLock() + if t.tg.leader.parent == nil { + notifyParent = false + } + if tracer := t.Tracer(); tracer != nil { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) + // For consistency with Linux, if the parent and tracer are in the + // same thread group, deduplicate notification signals. + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) + } else { + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + } + } else { + t.tg.signalHandlers.mu.Lock() + if !t.killedLocked() { + t.beginInternalStopLocked((*groupStop)(nil)) + } + t.tg.signalHandlers.mu.Unlock() + } + if notifyParent { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + t.tg.pidns.owner.mu.RUnlock() + + return (*runInterrupt)(nil) + } + + // Are there signals pending? + if info := t.dequeueSignalLocked(); info != nil { + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 && t.tg.groupStopPhase == groupStopNone { + // Indicate that we've dequeued a stop signal before + // unlocking the signal mutex; initiateGroupStop will check + // that the phase hasn't changed (or is at least another + // "stop signal dequeued" phase) after relocking it. + t.tg.groupStopPhase = groupStopDequeued + } + if t.ptraceSignalLocked(info) { + // Dequeueing the signal action must wait until after the + // signal-delivery-stop ends since the tracer can change or + // suppress the signal. + t.tg.signalHandlers.mu.Unlock() + return (*runInterruptAfterSignalDeliveryStop)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) + } + + t.tg.signalHandlers.mu.Unlock() + return (*runApp)(nil) +} + +type runInterruptAfterSignalDeliveryStop struct{} + +func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + // Can't defer unlock: deliverSignal must be called without holding TaskSet + // mutex. + sig := linux.Signal(t.ptraceCode) + defer func() { + t.ptraceSiginfo = nil + }() + if !sig.IsValid() { + t.tg.pidns.owner.mu.Unlock() + return (*runInterrupt)(nil) + } + info := t.ptraceSiginfo + if sig != linux.Signal(info.Signo) { + info.Signo = int32(sig) + info.Errno = 0 + info.Code = arch.SignalInfoUser + // pid isn't a valid field for all signal numbers, but Linux + // doesn't care (kernel/signal.c:ptrace_signal()). + // + // Linux uses t->parent for the tid and uid here, which is the tracer + // if it hasn't detached or the real parent otherwise. + parent := t.parent + if tracer := t.Tracer(); tracer != nil { + parent = tracer + } + if parent == nil { + // Tracer has detached and t was created by Kernel.CreateProcess(). + // Pretend the parent is in an ancestor PID + user namespace. 
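+			// A PID of 0 and the overflow UID are what a process in a
+			// descendant PID + user namespace would observe for such a
+			// parent.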
+ info.SetPid(0) + info.SetUid(int32(auth.OverflowUID)) + } else { + info.SetPid(int32(t.tg.pidns.tids[parent])) + info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } + } + t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.Unlock() + // If the signal is masked, re-queue it. + if linux.SignalSetOf(sig)&t.tr.SignalMask != 0 { + t.sendSignalLocked(info, false /* group */) + t.tg.signalHandlers.mu.Unlock() + return (*runInterrupt)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go new file mode 100644 index 000000000..801cb3395 --- /dev/null +++ b/pkg/sentry/kernel/task_start.go @@ -0,0 +1,252 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskConfig defines the configuration of a new Task (see below). +type TaskConfig struct { + // Kernel is the owning Kernel. + *Kernel + + // Parent is the new task's parent. Parent may be nil. + Parent *Task + + // ThreadGroup is the ThreadGroup the new task belongs to. + *ThreadGroup + + // TaskContext is the TaskContext of the new task. + *TaskContext + + // TaskResources is the TaskResources of the new task. + *TaskResources + + // Credentials is the Credentials of the new task. + Credentials *auth.Credentials + + // Niceness is the niceness of the new task. + Niceness int + + // If NetworkNamespaced is true, the new task should observe a non-root + // network namespace. + NetworkNamespaced bool + + // AllowedCPUMask contains the cpus that this task can run on. + AllowedCPUMask sched.CPUSet + + // UTSNamespace is the UTSNamespace of the new task. + UTSNamespace *UTSNamespace + + // IPCNamespace is the IPCNamespace of the new task. + IPCNamespace *IPCNamespace +} + +// NewTask creates a new task defined by TaskConfig. +// Whether or not NewTask is successful, it takes ownership of both TaskContext +// and TaskResources of the TaskConfig. +// +// NewTask does not start the returned task; the caller must call Task.Start. +func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { + t, err := ts.newTask(cfg) + if err != nil { + cfg.TaskContext.release() + cfg.TaskResources.release() + return nil, err + } + return t, nil +} + +// newTask is a helper for TaskSet.NewTask that only takes ownership of TaskContext +// and TaskResources of the TaskConfig if it succeeds. 
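+//
+// newTask can only fail before the new task is made visible to the rest of
+// the system (while checking for a racing group exit or execve and while
+// allocating thread IDs); once TIDs have been assigned there is no rollback.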
+func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { + tg := cfg.ThreadGroup + tc := cfg.TaskContext + t := &Task{ + taskNode: taskNode{ + tg: tg, + parent: cfg.Parent, + children: make(map[*Task]struct{}), + }, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + tr: *cfg.TaskResources, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + creds: cfg.Credentials, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespaced, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + rseqCPU: -1, + futexWaiter: futex.NewWaiter(), + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.ptraceTracer.Store((*Task)(nil)) + // We don't construct t.blockingTimer until Task.run(); see that function + // for justification. + + // Make the new task (and possibly thread group) visible to the rest of + // the system atomically. + ts.mu.Lock() + defer ts.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting || tg.execing != nil { + // If the caller is in the same thread group, then what we return + // doesn't matter too much since the caller will exit before it returns + // to userspace. If the caller isn't in the same thread group, then + // we're in uncharted territory and can return whatever we want. + return nil, syserror.EINTR + } + if err := ts.assignTIDsLocked(t); err != nil { + return nil, err + } + // Below this point, newTask is expected not to fail (there is no rollback + // of assignTIDsLocked or any of the following). + + // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. + // This is the earliest point at which we can do so (since t now has thread + // IDs). + t.updateLogPrefixLocked() + + if t.parent != nil { + t.parent.children[t] = struct{}{} + } + + if tg.leader == nil { + // New thread group. + tg.leader = t + if parentPG := tg.parentPG(); parentPG == nil { + tg.createSession() + } else { + // Inherit the process group. + parentPG.incRefWithParent(parentPG) + tg.processGroup = parentPG + } + } + tg.tasks.PushBack(t) + tg.tasksCount++ + tg.liveTasks++ + tg.activeTasks++ + + // Propagate external TaskSet stops to the new task. + t.stopCount = ts.stopCount + + t.mu.Lock() + defer t.mu.Unlock() + + t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) + + t.startTime = t.k.RealtimeClock().Now() + + return t, nil +} + +// assignTIDsLocked ensures that new task t is visible in all PID namespaces in +// which it should be visible. +// +// Preconditions: ts.mu must be locked for writing. +func (ts *TaskSet) assignTIDsLocked(t *Task) error { + type allocatedTID struct { + ns *PIDNamespace + tid ThreadID + } + var allocatedTIDs []allocatedTID + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid, err := ns.allocateTID() + if err != nil { + // Failure. Remove the tids we already allocated in descendant + // namespaces. + for _, a := range allocatedTIDs { + delete(a.ns.tasks, a.tid) + delete(a.ns.tids, t) + } + return err + } + ns.tasks[tid] = t + ns.tids[t] = tid + allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) + } + return nil +} + +// allocateTID returns an unused ThreadID from ns. +// +// Preconditions: ns.owner.mu must be locked for writing. 
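+// TIDs are handed out in increasing order starting just after the previous
+// allocation, wrapping around to InitTID+1 after TasksLimit; EAGAIN is
+// returned only if every TID in the namespace is already in use.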
+func (ns *PIDNamespace) allocateTID() (ThreadID, error) { + if ns.exiting { + // "In this case, a subsequent fork(2) into this PID namespace will + // fail with the error ENOMEM; it is not possible to create a new + // processes [sic] in a PID namespace whose init process has + // terminated." - pid_namespaces(7) + return 0, syserror.ENOMEM + } + tid := ns.last + for { + // Next. + tid++ + if tid > TasksLimit { + tid = InitTID + 1 + } + + // Is it available? + _, ok := ns.tasks[tid] + if !ok { + ns.last = tid + return tid, nil + } + + // Did we do a full cycle? + if tid == ns.last { + // No tid available. + return 0, syserror.EAGAIN + } + } +} + +// Start starts the task goroutine. Start must be called exactly once for each +// task returned by NewTask. +// +// 'tid' must be the task's TID in the root PID namespace and it's used for +// debugging purposes only (set as parameter to Task.run to make it visible +// in stack dumps). +func (t *Task) Start(tid ThreadID) { + // If the task was restored, it may be "starting" after having already exited. + if t.runState == nil { + return + } + t.goroutineStopped.Add(1) + t.tg.liveGoroutines.Add(1) + t.tg.pidns.owner.liveGoroutines.Add(1) + t.tg.pidns.owner.runningGoroutines.Add(1) + + // Task is now running in system mode. + t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) + + // Use the task's TID in the root PID namespace to make it visible in stack dumps. + go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops +} diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go new file mode 100644 index 000000000..feaf6cae4 --- /dev/null +++ b/pkg/sentry/kernel/task_stop.go @@ -0,0 +1,226 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements task stops, which represent the equivalent of Linux's +// uninterruptible sleep states in a way that is compatible with save/restore. +// Task stops comprise both internal stops (which form part of the task's +// "normal" control flow) and external stops (which do not); see README.md for +// details. +// +// There are multiple interfaces for interacting with stops because there are +// multiple cases to consider: +// +// - A task goroutine can begin a stop on its associated task (e.g. a +// vfork() syscall stopping the calling task until the child task releases its +// MM). In this case, calling Task.interrupt is both unnecessary (the task +// goroutine obviously cannot be blocked in Task.block or executing application +// code) and undesirable (as it may spuriously interrupt a in-progress +// syscall). +// +// Beginning internal stops in this case is implemented by +// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing, +// there are no instances of this case that begin external stops, except for +// autosave; however, autosave terminates the sentry without ending the +// external stop, so the spurious interrupt is moot. 
+// +// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all +// tasks being stopped in preparation for state checkpointing). If the task +// goroutine may be in Task.block or executing application code, it must be +// interrupted by Task.interrupt for it to actually enter the stop; since, +// strictly speaking, we have no way of determining this, we call +// Task.interrupt unconditionally. +// +// Beginning external stops in this case is implemented by +// Task.BeginExternalStop. As of this writing, there are no instances of this +// case that begin internal stops. +// +// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an +// exiting task resuming a sibling task that has been blocked in an execve() +// syscall waiting for other tasks to exit). In this case, Task.endStopCond +// must be notified to kick the task goroutine out of Task.doStop. +// +// Ending internal stops in this case is implemented by +// Task.endInternalStopLocked. Ending external stops in this case is +// implemented by Task.EndExternalStop. +// +// - Hypothetically, a task goroutine can end an internal stop on its +// associated task. As of this writing, there are no instances of this case. +// However, any instances of this case could still use the above functions, +// since notifying Task.endStopCond would be unnecessary but harmless. + +import ( + "fmt" + "sync/atomic" +) + +// A TaskStop is a condition visible to the task control flow graph that +// prevents a task goroutine from running or exiting, i.e. an internal stop. +// +// NOTE: Most TaskStops don't contain any data; they're +// distinguished by their type. The obvious way to implement such a TaskStop +// is: +// +// type groupStop struct{} +// func (groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop(groupStop{}) +// +// However, this doesn't work because the state package can't serialize values, +// only pointers. Furthermore, the correctness of save/restore depends on the +// ability to pass a TaskStop to endInternalStop that will compare equal to the +// TaskStop that was passed to beginInternalStop, even if a save/restore cycle +// occurred between the two. As a result, the current idiom is to always use a +// typecast nil for data-free TaskStops: +// +// type groupStop struct{} +// func (*groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop((*groupStop)(nil)) +// +// This is pretty gross, but the alternatives seem grosser. +type TaskStop interface { + // Killable returns true if Task.Kill should end the stop prematurely. + // Killable is analogous to Linux's TASK_WAKEKILL. + Killable() bool +} + +// beginInternalStop indicates the start of an internal stop that applies to t. +// +// Preconditions: The task must not already be in an internal stop (i.e. t.stop +// == nil). The caller must be running on the task goroutine. +func (t *Task) beginInternalStop(s TaskStop) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginInternalStopLocked(s) +} + +// Preconditions: The signal mutex must be locked. All preconditions for +// Task.beginInternalStop also apply. 
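Concretely, a data-free stop written in the typecast-nil idiom described above might look like the following. The exampleStop name is purely illustrative, and the commented call sites are assumed to run with the locks documented for the respective functions.

    // A stop with no data of its own: only the (pointer) type identifies it.
    type exampleStop struct{}

    // Killable implements TaskStop.Killable.
    func (*exampleStop) Killable() bool { return true }

    // The task goroutine enters the stop with a typed nil pointer:
    //
    //     t.beginInternalStop((*exampleStop)(nil))
    //
    // Whoever ends it first checks, under the signal mutex, that the stop it
    // expects is the one actually in effect:
    //
    //     t.tg.signalHandlers.mu.Lock()
    //     if _, ok := t.stop.(*exampleStop); ok {
    //         t.endInternalStopLocked()
    //     }
    //     t.tg.signalHandlers.mu.Unlock()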
+func (t *Task) beginInternalStopLocked(s TaskStop) { + if t.stop != nil { + panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) + } + t.Debugf("Entering internal stop %#v", s) + t.stop = s + t.beginStopLocked() +} + +// endInternalStopLocked indicates the end of an internal stop that applies to +// t. endInternalStopLocked does not wait for the task to resume. +// +// The caller is responsible for ensuring that the internal stop they expect +// actually applies to t; this requires holding the signal mutex which protects +// t.stop, which is why there is no endInternalStop that locks the signal mutex +// for you. +// +// Preconditions: The signal mutex must be locked. The task must be in an +// internal stop (i.e. t.stop != nil). +func (t *Task) endInternalStopLocked() { + if t.stop == nil { + panic("Attempting to leave non-existent internal stop") + } + t.Debugf("Leaving internal stop %#v", t.stop) + t.stop = nil + t.endStopLocked() +} + +// BeginExternalStop indicates the start of an external stop that applies to t. +// BeginExternalStop does not wait for t's task goroutine to stop. +func (t *Task) BeginExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginStopLocked() + t.interrupt() +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task +// goroutine to resume. +func (t *Task) EndExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endStopLocked() +} + +// beginStopLocked increments t.stopCount to indicate that a new internal or +// external stop applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) beginStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 { + // Most likely overflow. + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } +} + +// endStopLocked decerements t.stopCount to indicate that an existing internal +// or external stop no longer applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) endStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } else if newval == 0 { + t.endStopCond.Signal() + } +} + +// BeginExternalStop indicates the start of an external stop that applies to +// all current and future tasks in ts. BeginExternalStop does not wait for +// task goroutines to stop. +func (ts *TaskSet) BeginExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount++ + if ts.stopCount <= 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.beginStopLocked() + t.tg.signalHandlers.mu.Unlock() + t.interrupt() + } +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task +// goroutines to resume. 
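The stop machinery above reduces to a counter plus a condition variable: every stop increments t.stopCount, ending the last one signals t.endStopCond, and the stopped task goroutine waits for the count to hit zero. A self-contained toy of that shape (not the Task API) is:

    package main

    import (
        "fmt"
        "sync"
        "sync/atomic"
    )

    // stopper mimics the shape of Task.stopCount and Task.endStopCond.
    type stopper struct {
        mu        sync.Mutex // stands in for the signal mutex
        stopCount int32
        cond      *sync.Cond
    }

    func newStopper() *stopper {
        s := &stopper{}
        s.cond = sync.NewCond(&s.mu)
        return s
    }

    // beginStopLocked and endStopLocked require s.mu, like their Task analogues.
    func (s *stopper) beginStopLocked() { atomic.AddInt32(&s.stopCount, 1) }

    func (s *stopper) endStopLocked() {
        if atomic.AddInt32(&s.stopCount, -1) == 0 {
            s.cond.Signal()
        }
    }

    // waitWhileStopped plays the role of Task.doStop: block while any stop applies.
    func (s *stopper) waitWhileStopped() {
        s.mu.Lock()
        defer s.mu.Unlock()
        for atomic.LoadInt32(&s.stopCount) > 0 {
            s.cond.Wait()
        }
    }

    func main() {
        s := newStopper()
        s.mu.Lock()
        s.beginStopLocked() // an external stop begins
        s.mu.Unlock()

        go func() {
            s.mu.Lock()
            s.endStopLocked() // the stop ends; the waiter is signaled
            s.mu.Unlock()
        }()

        s.waitWhileStopped()
        fmt.Println("resumed")
    }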
+func (ts *TaskSet) EndExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount-- + if ts.stopCount < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.endStopLocked() + t.tg.signalHandlers.mu.Unlock() + } +} diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go new file mode 100644 index 000000000..79f4ff60c --- /dev/null +++ b/pkg/sentry/kernel/task_syscall.go @@ -0,0 +1,434 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "os" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel +// include/linux/errno.h. These errnos are never returned to userspace +// directly, but are used to communicate the expected behavior of an +// interrupted syscall from the syscall to signal handling. +type SyscallRestartErrno int + +// These numeric values are significant because ptrace syscall exit tracing can +// observe them. +// +// For all of the following errnos, if the syscall is not interrupted by a +// signal delivered to a user handler, the syscall is restarted. +const ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = SyscallRestartErrno(512) + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = SyscallRestartErrno(513) + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = SyscallRestartErrno(514) + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) +) + +// Error implements error.Error. +func (e SyscallRestartErrno) Error() string { + // Descriptions are borrowed from strace. 
+ switch e { + case ERESTARTSYS: + return "to be restarted if SA_RESTART is set" + case ERESTARTNOINTR: + return "to be restarted" + case ERESTARTNOHAND: + return "to be restarted if no handler" + case ERESTART_RESTARTBLOCK: + return "interrupted by signal" + default: + return "(unknown interrupt error)" + } +} + +// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by +// rv, the value in a syscall return register. +func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { + switch int(rv) { + case -int(ERESTARTSYS): + return ERESTARTSYS, true + case -int(ERESTARTNOINTR): + return ERESTARTNOINTR, true + case -int(ERESTARTNOHAND): + return ERESTARTNOHAND, true + case -int(ERESTART_RESTARTBLOCK): + return ERESTART_RESTARTBLOCK, true + default: + return 0, false + } +} + +// SyscallRestartBlock represents the restart block for a syscall restartable +// with a custom function. It encapsulates the state required to restart a +// syscall across a S/R. +type SyscallRestartBlock interface { + Restart(t *Task) (uintptr, error) +} + +// SyscallControl is returned by syscalls to control the behavior of +// Task.doSyscallInvoke. +type SyscallControl struct { + // next is the state that the task goroutine should switch to. If next is + // nil, the task goroutine should continue to syscall exit as usual. + next taskRunState + + // If ignoreReturn is true, Task.doSyscallInvoke should not store any value + // in the task's syscall return value register. + ignoreReturn bool +} + +var ( + // CtrlDoExit is returned by the implementations of the exit and exit_group + // syscalls to enter the task exit path directly, skipping syscall exit + // tracing. + CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} + + // ctrlStopAndReinvokeSyscall is returned by syscalls using the external + // feature before syscall execution. This causes Task.doSyscallInvoke + // to return runSyscallReinvoke, allowing Task.run to check for stops + // before immediately re-invoking the syscall (skipping the re-checking + // of seccomp filters and ptrace which would confuse userspace + // tracing). + ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} + + // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at + // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather + // than tail-calling it, allowing stops to be checked before syscall exit. + ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} +) + +func (t *Task) invokeExternal() { + t.BeginExternalStop() + go func() { // S/R-SAFE: External control flow. + defer t.EndExternalStop() + t.SyscallTable().External(t.Kernel()) + }() +} + +func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { + s := t.SyscallTable() + + fe := s.FeatureEnable.Word(sysno) + + var straceContext interface{} + if bits.IsAnyOn32(fe, StraceEnableBits) { + straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) + } + + if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { + t.invokeExternal() + // Ensure we check for stops, then invoke the syscall again. + ctrl = ctrlStopAndReinvokeSyscall + } else { + fn := s.Lookup(sysno) + if fn != nil { + // Call our syscall implementation. + rval, ctrl, err = fn(t, args) + } else { + // Use the missing function if not found. 
+ rval, err = t.SyscallTable().Missing(t, sysno, args) + } + } + + if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { + t.invokeExternal() + // Don't reinvoke the syscall. + } + + if bits.IsAnyOn32(fe, StraceEnableBits) { + s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) + } + + return +} + +// doSyscall is the entry point for an invocation of a system call specified by +// the current state of t's registers. +// +// The syscall path is very hot; avoid defer. +func (t *Task) doSyscall() taskRunState { + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + + // Tracers expect to see this between when the task traps into the kernel + // to perform a syscall and when the syscall is actually invoked. + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + + // Check seccomp filters. The nil check is for performance (as seccomp use + // is rare), not needed for correctness. + if t.syscallFilters != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + case seccompResultDeny: + t.Debugf("Syscall %d: denied by seccomp", sysno) + return (*runSyscallExit)(nil) + case seccompResultAllow: + // ok + case seccompResultKill: + t.Debugf("Syscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + case seccompResultTrace: + t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) + return (*runSyscallAfterPtraceEventSeccomp)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doSyscallEnter(sysno, args) +} + +type runSyscallAfterPtraceEventSeccomp struct{} + +func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." - + // ptrace(2) + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "The tracer can skip the system call by changing the syscall number to + // -1." 
- Documentation/prctl/seccomp_filter.txt + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil).execute(t) + } + args := t.Arch().SyscallArgs() + return t.doSyscallEnter(sysno, args) +} + +func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { + if next, ok := t.ptraceSyscallEnter(); ok { + return next + } + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallAfterSyscallEnterStop struct{} + +func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(sigPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil) + } + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallAfterSysemuStop struct{} + +func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(sigPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + return (*runSyscallExit)(nil).execute(t) +} + +func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + + if ctrl != nil { + if !ctrl.ignoreReturn { + t.Arch().SetReturn(rval) + } + if ctrl.next != nil { + return ctrl.next + } + } else if err != nil { + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.haveSyscallReturn = true + } else { + t.Arch().SetReturn(rval) + } + + return (*runSyscallExit)(nil).execute(t) +} + +type runSyscallReinvoke struct{} + +func (*runSyscallReinvoke) execute(t *Task) taskRunState { + if t.killed() { + // It's possible that since the last execution, the task has + // been forcible killed. Invoking the system call here could + // result in an infinite loop if it is again preempted by an + // external stop and reinvoked. + return (*runInterrupt)(nil) + } + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +type runSyscallExit struct{} + +func (*runSyscallExit) execute(t *Task) taskRunState { + t.ptraceSyscallExit() + return (*runApp)(nil) +} + +// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as +// indicated by an execution fault at address addr. doVsyscall returns the +// task's next run state. +func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { + // Grab the caller up front, to make sure there's a sensible stack. + caller := t.Arch().Native(uintptr(0)) + if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + return (*runApp)(nil) + } + + // For _vsyscalls_, there is no need to translate System V calling convention + // to syscall ABI because they both use RDI, RSI, and RDX for the first three + // arguments and none of the vsyscalls uses more than two arguments. 
+ args := t.Arch().SyscallArgs() + if t.syscallFilters != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { + case seccompResultDeny: + t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) + return (*runApp)(nil) + case seccompResultAllow: + // ok + case seccompResultTrace: + t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) + return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doVsyscallInvoke(sysno, args, caller) +} + +type runVsyscallAfterPtraceEventSeccomp struct { + addr usermem.Addr + sysno uintptr + caller interface{} +} + +func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "... the syscall may not be changed to another system call using the + // orig_rax register. It may only be changed to -1 order [sic] to skip the + // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - + // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip + // causes do_exit(SIGSYS), and changing sp is ignored. + if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + } + if sysno == ^uintptr(0) { + return (*runApp)(nil) + } + return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) +} + +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + if ctrl != nil { + t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(0) + } else if err == nil { + t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(uintptr(rval)) + } else { + t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) + if err == syserror.EFAULT { + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(sigPriv(linux.SIGSEGV)) + // A return is not emulated in this case. + return (*runApp)(nil) + } + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + } + t.Arch().SetIP(t.Arch().Value(caller)) + t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) + return (*runApp)(nil) +} + +// ExtractErrno extracts an integer error number from the error. +// The syscall number is purely for context in the error case. Use -1 if +// syscall number is unknown. +func (t *Task) ExtractErrno(err error, sysno int) int { + switch err := err.(type) { + case nil: + return 0 + case syscall.Errno: + return int(err) + case SyscallRestartErrno: + return int(err) + case *memmap.BusError: + // Bus errors may generate SIGBUS, but for syscalls they still + // return EFAULT. See case in task_run.go where the fault is + // handled (and the SIGBUS is delivered). 
+ return int(syscall.EFAULT) + case *os.PathError: + return t.ExtractErrno(err.Err, sysno) + case *os.LinkError: + return t.ExtractErrno(err.Err, sysno) + case *os.SyscallError: + return t.ExtractErrno(err.Err, sysno) + default: + if errno, ok := syserror.TranslateError(err); ok { + return int(errno) + } + } + panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) +} diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go new file mode 100644 index 000000000..82ef858a1 --- /dev/null +++ b/pkg/sentry/kernel/task_test.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" +) + +func TestTaskCPU(t *testing.T) { + for _, test := range []struct { + mask sched.CPUSet + tid ThreadID + cpu int32 + }{ + { + mask: []byte{0xff}, + tid: 1, + cpu: 0, + }, + { + mask: []byte{0xff}, + tid: 10, + cpu: 1, + }, + { + // more than 8 cpus. + mask: []byte{0xff, 0xff}, + tid: 10, + cpu: 9, + }, + { + // missing the first cpu. + mask: []byte{0xfe}, + tid: 1, + cpu: 1, + }, + { + mask: []byte{0xfe}, + tid: 10, + cpu: 3, + }, + { + // missing the fifth cpu. + mask: []byte{0xef}, + tid: 10, + cpu: 2, + }, + } { + assigned := assignCPU(test.mask, test.tid) + if test.cpu != assigned { + t.Errorf("assignCPU(%v, %v) got %v, want %v", test.mask, test.tid, assigned, test.cpu) + } + } + +} diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go new file mode 100644 index 000000000..7a62ab674 --- /dev/null +++ b/pkg/sentry/kernel/task_usermem.go @@ -0,0 +1,298 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// _MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// Reads and writes that exceed this size may be silently truncated. +// (Linux: include/linux/fs.h:MAX_RW_COUNT) +var _MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) + +// Activate ensures that the task has an active address space. +func (t *Task) Activate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Activate(); err != nil { + panic("unable to activate mm: " + err.Error()) + } + } +} + +// Deactivate relinquishes the task's active address space. 
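Every expectation in TestTaskCPU above is consistent with assignCPU choosing the ((tid-1) mod n)-th allowed CPU, where n is the number of set bits in the mask. assignCPU itself is not shown in this hunk, so the standalone re-derivation below is only a guess that happens to reproduce the table:

    package main

    import "fmt"

    // pickCPU returns the ((tid-1) mod n)-th set bit of mask, where n is the
    // number of set bits; a guessed rule that matches the TestTaskCPU vectors.
    func pickCPU(mask []byte, tid int32) int32 {
        var allowed []int32
        for i, b := range mask {
            for bit := uint(0); bit < 8; bit++ {
                if (b>>bit)&1 != 0 {
                    allowed = append(allowed, int32(8*i)+int32(bit))
                }
            }
        }
        return allowed[(int(tid)-1)%len(allowed)]
    }

    func main() {
        fmt.Println(pickCPU([]byte{0xfe}, 10)) // 3, as in the test table
        fmt.Println(pickCPU([]byte{0xef}, 10)) // 2
    }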
+func (t *Task) Deactivate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Deactivate(); err != nil { + panic("unable to deactivate mm: " + err.Error()) + } + } +} + +// CopyIn copies a fixed-size value or slice of fixed-size values in from the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { + return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInBytes is a fast version of CopyIn if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOut copies a fixed-size value or slice of fixed-size values out to the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not writeable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) { + return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOutBytes is a fast version of CopyOut if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInString copies a NUL-terminated string of length at most maxlen in from +// the task's memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { + return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInVector copies a NULL-terminated vector of strings from the task's +// memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// maxElemSize is the maximum size of each individual element. +// +// maxTotalSize is the maximum total length of all elements plus the total +// number of elements. For example, the following strings correspond to +// the following set of sizes: +// +// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) +// { "abc" } => 4 (3 for length, 1 for elements) +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { + var v []string + for { + argAddr := t.Arch().Native(0) + if _, err := t.CopyIn(addr, argAddr); err != nil { + return v, err + } + if t.Arch().Value(argAddr) == 0 { + break + } + // Each string has a zero terminating byte counted, so copying out a string + // requires at least one byte of space. Also, see the calculation below. 
+ if maxTotalSize <= 0 { + return nil, syserror.ENOMEM + } + thisMax := maxElemSize + if maxTotalSize < thisMax { + thisMax = maxTotalSize + } + arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + if err != nil { + return v, err + } + v = append(v, arg) + addr += usermem.Addr(t.Arch().Width()) + maxTotalSize -= len(arg) + 1 + } + return v, nil +} + +// CopyOutIovecs converts src to an array of struct iovecs and copies it to the +// memory mapped at addr. +// +// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for ; !src.IsEmpty(); src = src.Tail() { + ar := src.Head() + usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + default: + return syserror.ENOSYS + } + + return nil +} + +// CopyInIovecs copies an array of numIovecs struct iovecs from the memory +// mapped at addr, converts them to usermem.AddrRanges, and returns them as a +// usermem.AddrRangeSeq. +// +// CopyInIovecs shares the following properties with Linux's +// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): +// +// - If the length of any AddrRange would exceed the range of an ssize_t, +// CopyInIovecs returns EINVAL. +// +// - If the length of any AddrRange would cause its end to overflow, +// CopyInIovecs returns EFAULT. +// +// - The combined length of all AddrRanges is limited to _MAX_RW_COUNT. If the +// combined length of all AddrRanges would otherwise exceed this amount, ranges +// beyond _MAX_RW_COUNT are silently truncated. +// +// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { + if numIovecs == 0 { + return usermem.AddrRangeSeq{}, nil + } + + var dst []usermem.AddrRange + if numIovecs > 1 { + dst = make([]usermem.AddrRange, 0, numIovecs) + } + + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := 0; i < numIovecs; i++ { + if _, err := t.CopyInBytes(addr, b); err != nil { + return usermem.AddrRangeSeq{}, err + } + + base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) + length := usermem.ByteOrder.Uint64(b[8:16]) + if length > math.MaxInt64 { + return usermem.AddrRangeSeq{}, syserror.EINVAL + } + ar, ok := base.ToRange(length) + if !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + if numIovecs == 1 { + // Special case to avoid allocating dst. + return usermem.AddrRangeSeqOf(ar).TakeFirst(_MAX_RW_COUNT), nil + } + dst = append(dst, ar) + + addr += itemLen + } + + default: + return usermem.AddrRangeSeq{}, syserror.ENOSYS + } + + // Truncate to _MAX_RW_COUNT. 
+ var total uint64 + for i := range dst { + dstlen := uint64(dst[i].Length()) + if rem := uint64(_MAX_RW_COUNT) - total; rem < dstlen { + dst[i].End -= usermem.Addr(dstlen - rem) + dstlen = rem + } + total += dstlen + } + + return usermem.AddrRangeSeqFromSlice(dst), nil +} + +// SingleIOSequence returns a usermem.IOSequence representing [addr, +// addr+length) in t's address space. If length exceeds _MAX_RW_COUNT, it is +// silently truncated. +// +// SingleIOSequence is analogous to Linux's +// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and +// write syscalls in Linux do not use import_single_range(), but are still +// truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) +func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if length > _MAX_RW_COUNT { + length = _MAX_RW_COUNT + } + ar, ok := addr.ToRange(uint64(length)) + if !ok { + return usermem.IOSequence{}, syserror.EFAULT + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: usermem.AddrRangeSeqOf(ar), + Opts: opts, + }, nil +} + +// IovecsIOSequence returns a usermem.IOSequence representing the array of +// iovcnt struct iovecs at addr in t's address space. opts applies to the +// returned IOSequence, not the reading of the struct iovec array. +// +// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). +// +// Preconditions: As for Task.CopyInIovecs. +func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { + return usermem.IOSequence{}, syserror.EINVAL + } + ars, err := t.CopyInIovecs(addr, iovcnt) + if err != nil { + return usermem.IOSequence{}, err + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: ars, + Opts: opts, + }, nil +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go new file mode 100644 index 000000000..8fffd3446 --- /dev/null +++ b/pkg/sentry/kernel/thread_group.go @@ -0,0 +1,269 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// A ThreadGroup is a logical grouping of tasks that has widespread +// significance to other kernel features (e.g. signal handling). ("Thread +// groups" are usually called "processes" in userspace documentation.) +// +// ThreadGroup is a superset of Linux's struct signal_struct. +type ThreadGroup struct { + threadGroupNode + + // signalHandlers is the set of signal handlers used by every task in this + // thread group. (signalHandlers may also be shared with other thread + // groups.) 
+ // + // signalHandlers.mu (hereafter "the signal mutex") protects state related + // to signal handling, as well as state that usually needs to be atomic + // with signal handling, for all ThreadGroups and Tasks using + // signalHandlers. (This is analogous to Linux's use of struct + // sighand_struct::siglock.) + // + // The signalHandlers pointer can only be mutated during an execve + // (Task.finishExec). Consequently, when it's possible for a task in the + // thread group to be completing an execve, signalHandlers is protected by + // the owning TaskSet.mu. Otherwise, it is possible to read the + // signalHandlers pointer without synchronization. In particular, + // completing an execve requires that all other tasks in the thread group + // have exited, so task goroutines do not need the owning TaskSet.mu to + // read the signalHandlers pointer of their thread groups. + signalHandlers *SignalHandlers + + // pendingSignals is the set of pending signals that may be handled by any + // task in this thread group. + // + // pendingSignals is protected by the signal mutex. + pendingSignals pendingSignals + + // lastTimerSignalTask records the last task we deliver a process timer signal to. + // Please see SendTimerSignal for more details. + // + // lastTimerSignalTask is protected by the signal mutex. + lastTimerSignalTask *Task + + // groupStopPhase indicates the state of a group stop in progress on the + // thread group, if any. + // + // groupStopPhase is protected by the signal mutex. + groupStopPhase groupStopPhase + + // groupStopSignal is the signal that caused a group stop to be initiated. + // groupStopSignal is only meaningful if groupStopPhase is + // groupStopInitiated or groupStopComplete. + // + // groupStopSignal is protected by the signal mutex. + groupStopSignal linux.Signal + + // groupStopCount is the number of non-exited tasks in the thread group + // that have acknowledged an initiated group stop. groupStopCount is only + // meaningful if groupStopPhase is groupStopInitiated. + // + // groupStopCount is protected by the signal mutex. + groupStopCount int + + // If groupStopWaitable is true, the thread group is indicating a waitable + // group stop event (as defined by EventChildGroupStop). + // + // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set + // and group_exit_code being non-zero. + // + // groupStopWaitable is protected by the signal mutex. + groupStopWaitable bool + + // If groupContNotify is true, then a SIGCONT has recently ended a group + // stop on this thread group, and the first task to observe it should + // notify its parent. + // + // groupContNotify is protected by the signal mutex. + groupContNotify bool + + // If groupContNotify is true, groupContInterrupted is true iff SIGCONT + // ended a group stop in phase groupStopInitiated. If groupContNotify is + // false, groupContInterrupted is meaningless. + // + // Analogues in Linux: + // + // - groupContNotify && groupContInterrupted is represented by + // SIGNAL_CLD_STOPPED. + // + // - groupContNotify && !groupContInterrupted is represented by + // SIGNAL_CLD_CONTINUED. + // + // - !groupContNotify is represented by neither flag being set. + // + // groupContInterrupted is protected by the signal mutex. + groupContInterrupted bool + + // If groupContWaitable is true, the thread group is indicating a waitable + // continue event (as defined by EventGroupContinue). + // + // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED. 
+ // + // groupContWaitable is protected by the signal mutex. + groupContWaitable bool + + // exiting is true if all tasks in the ThreadGroup should exit. exiting is + // analogous to Linux's SIGNAL_GROUP_EXIT. + // + // exiting is protected by the signal mutex. exiting can only transition + // from false to true. + exiting bool + + // exitStatus is the thread group's exit status. + // + // While exiting is false, exitStatus is protected by the signal mutex. + // When exiting becomes true, exitStatus becomes immutable. + exitStatus ExitStatus + + // terminationSignal is the signal that this thread group's leader will + // send to its parent when it exits. + // + // terminationSignal is protected by the TaskSet mutex. + terminationSignal linux.Signal + + // liveGoroutines is the number of non-exited task goroutines in the thread + // group. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // tm contains process timers. TimerManager fields are immutable. + tm TimerManager + + // exitedCPUStats is the CPU usage for all exited tasks in the thread + // group. exitedCPUStats is protected by the TaskSet mutex. + exitedCPUStats usage.CPUStats + + // childCPUStats is the CPU usage of all joined descendants of this thread + // group. childCPUStats is protected by the TaskSet mutex. + childCPUStats usage.CPUStats + + // ioUsage is the I/O usage for all exited tasks in the thread group. + // The ioUsage pointer is immutable. + ioUsage *usage.IO + + // maxRSS is the historical maximum resident set size of the thread group, updated when: + // + // - A task in the thread group exits, since after all tasks have + // exited the MemoryManager is no longer reachable. + // + // - The thread group completes an execve, since this changes + // MemoryManagers. + // + // maxRSS is protected by the TaskSet mutex. + maxRSS uint64 + + // childMaxRSS is the maximum resident set size in bytes of all joined + // descendants of this thread group. + // + // childMaxRSS is protected by the TaskSet mutex. + childMaxRSS uint64 + + // Resource limits for this ThreadGroup. The limits pointer is immutable. + limits *limits.LimitSet + + // processGroup is the processGroup for this thread group. + // + // processGroup is protected by the TaskSet mutex. + processGroup *ProcessGroup + + // execed indicates an exec has occurred since creation. This will be + // set by finishExec, and new TheadGroups will have this field cleared. + // When execed is set, the processGroup may no longer be changed. + // + // execed is protected by the TaskSet mutex. + execed bool + + // rscr is the thread group's RSEQ critical region. + rscr atomic.Value `state:".(*RSEQCriticalRegion)"` +} + +// NewThreadGroup returns a new, empty thread group in PID namespace ns. The +// thread group leader will send its parent terminationSignal when it exits. +// The new thread group isn't visible to the system until a task has been +// created inside of it by a successful call to TaskSet.NewTask. 
+func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { + tg := &ThreadGroup{ + threadGroupNode: threadGroupNode{ + pidns: ns, + }, + signalHandlers: sh, + terminationSignal: terminationSignal, + ioUsage: &usage.IO{}, + limits: limits, + } + tg.tm = newTimerManager(tg, monotonicClock) + tg.rscr.Store(&RSEQCriticalRegion{}) + return tg +} + +// saveRscr is invoked by stateify. +func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion { + return tg.rscr.Load().(*RSEQCriticalRegion) +} + +// loadRscr is invoked by stateify. +func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) { + tg.rscr.Store(rscr) +} + +// SignalHandlers returns the signal handlers used by tg. +// +// Preconditions: The caller must provide the synchronization required to read +// tg.signalHandlers, as described in the field's comment. +func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { + return tg.signalHandlers +} + +// Timer returns tg's timers. +func (tg *ThreadGroup) Timer() *TimerManager { + return &tg.tm +} + +// Limits returns tg's limits. +func (tg *ThreadGroup) Limits() *limits.LimitSet { + return tg.limits +} + +// release releases the thread group's resources. +func (tg *ThreadGroup) release() { + // This must be done without holding the TaskSet mutex since thread group + // timers call SendSignal with Timer.mu locked. + tg.tm.destroy() +} + +// forEachChildThreadGroupLocked iterates over all child ThreadGroups. +// +// Precondition: TaskSet.mu must be held. +func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + for child := range t.children { + if child == child.tg.leader { + fn(child.tg) + } + } + } +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go new file mode 100644 index 000000000..440da9dad --- /dev/null +++ b/pkg/sentry/kernel/threads.go @@ -0,0 +1,443 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TasksLimit is the maximum number of threads for an untrusted application. +// Linux doesn't really limit this directly, rather it is limited by total +// memory size, stacks allocated and a global maximum. There's no real reason +// for us to limit it either, (esp. since threads are backed by go routines), +// and we would expect to hit resource limits long before hitting this number. +// However, for correctness, we still check that the user doesn't exceed this +// number. +// +// Note that because of the way futexes are implemented, there *are* in fact +// serious restrictions on valid thread IDs. They are limited to 2^30 - 1 +// (kernel/fork.c:MAX_THREADS). +const TasksLimit = (1 << 16) + +// ThreadID is a generic thread identifier.
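The rscr field and its save/load hooks rely on the usual atomic.Value pattern: readers Load a pointer with no lock held, and writers publish a brand-new immutable value rather than mutating in place. A self-contained sketch of that pattern (toy types, not RSEQCriticalRegion itself):

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // region stands in for RSEQCriticalRegion: a small record that is replaced
    // wholesale instead of being mutated.
    type region struct {
        start, end uintptr
    }

    type group struct {
        // rscr always holds a *region; readers Load it without a lock and
        // writers Store a fresh value, as ThreadGroup.rscr does.
        rscr atomic.Value
    }

    func newGroup() *group {
        g := &group{}
        g.rscr.Store(&region{}) // seed with an empty region, like NewThreadGroup
        return g
    }

    func (g *group) setRegion(start, end uintptr) {
        g.rscr.Store(&region{start: start, end: end})
    }

    func (g *group) currentRegion() *region {
        return g.rscr.Load().(*region)
    }

    func main() {
        g := newGroup()
        g.setRegion(0x1000, 0x1040)
        r := g.currentRegion()
        fmt.Printf("%#x-%#x\n", r.start, r.end)
    }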
+type ThreadID int32 + +// String returns a decimal representation of the ThreadID. +func (tid ThreadID) String() string { + return fmt.Sprintf("%d", tid) +} + +// InitTID is the TID given to the first task added to each PID namespace. The +// thread group led by InitTID is called the namespace's init process. The +// death of a PID namespace's init process causes all tasks visible in that +// namespace to be killed. +const InitTID ThreadID = 1 + +// A TaskSet comprises all tasks in a system. +type TaskSet struct { + // mu protects all relationships betweens tasks and thread groups in the + // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) + mu sync.RWMutex `state:"nosave"` + + // Root is the root PID namespace, in which all tasks in the TaskSet are + // visible. The Root pointer is immutable. + Root *PIDNamespace + + // sessions is the set of all sessions. + sessions sessionList + + // stopCount is the number of active external stops applicable to all tasks + // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been + // paired with a call to TaskSet.EndExternalStop). stopCount is protected + // by mu. + // + // stopCount is not saved for the same reason as Task.stopCount; it is + // always reset to zero after restore. + stopCount int32 `state:"nosave"` + + // liveGoroutines is the number of non-exited task goroutines in the + // TaskSet. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // runningGoroutines is the number of running task goroutines in the + // TaskSet. + // + // runningGoroutines is not saved; its counter value is required to be zero + // at time of save (but note that this is not necessarily the same thing as + // sync.WaitGroup's zero value). + runningGoroutines sync.WaitGroup `state:"nosave"` +} + +// newTaskSet returns a new, empty TaskSet. +func newTaskSet() *TaskSet { + ts := &TaskSet{} + ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace()) + return ts +} + +// forEachThreadGroupLocked applies f to each thread group in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { + for t := range ts.Root.tids { + if t == t.tg.leader { + f(t.tg) + } + } +} + +// A PIDNamespace represents a PID namespace, a bimap between thread IDs and +// tasks. See the pid_namespaces(7) man page for further details. +// +// N.B. A task is said to be visible in a PID namespace if the PID namespace +// contains a thread ID that maps to that task. +type PIDNamespace struct { + // owner is the TaskSet that this PID namespace belongs to. The owner + // pointer is immutable. + owner *TaskSet + + // parent is the PID namespace of the process that created this one. If + // this is the root PID namespace, parent is nil. The parent pointer is + // immutable. + // + // Invariant: All tasks that are visible in this namespace are also visible + // in all ancestor namespaces. + parent *PIDNamespace + + // userns is the user namespace with which this PID namespace is + // associated. Privileged operations on this PID namespace must have + // appropriate capabilities in userns. The userns pointer is immutable. + userns *auth.UserNamespace + + // The following fields are protected by owner.mu. + + // last is the last ThreadID to be allocated in this namespace. 
+ last ThreadID + + // tasks is a mapping from ThreadIDs in this namespace to tasks visible in + // the namespace. + tasks map[ThreadID]*Task + + // tids is a mapping from tasks visible in this namespace to their + // identifiers in this namespace. + tids map[*Task]ThreadID + + // sessions is a mapping from SessionIDs in this namespace to sessions + // visible in the namespace. + sessions map[SessionID]*Session + + // sids is a mapping from sessions visible in this namespace to their + // identifiers in this namespace. + sids map[*Session]SessionID + + // processGroups is a mapping from ProcessGroupIDs in this namespace to + // process groups visible in the namespace. + processGroups map[ProcessGroupID]*ProcessGroup + + // pgids is a mapping from process groups visible in this namespace to + // their identifiers in this namespace. + pgids map[*ProcessGroup]ProcessGroupID + + // exiting indicates that the namespace's init process is exiting or has + // exited. + exiting bool +} + +func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { + return &PIDNamespace{ + owner: ts, + parent: parent, + userns: userns, + tasks: make(map[ThreadID]*Task), + tids: make(map[*Task]ThreadID), + sessions: make(map[SessionID]*Session), + sids: make(map[*Session]SessionID), + processGroups: make(map[ProcessGroupID]*ProcessGroup), + pgids: make(map[*ProcessGroup]ProcessGroupID), + } +} + +// NewChild returns a new, empty PID namespace that is a child of ns. Authority +// over the new PID namespace is controlled by userns. +func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(ns.owner, ns, userns) +} + +// TaskWithID returns the task with thread ID tid in PID namespace ns. If no +// task has that TID, TaskWithID returns nil. +func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tasks[tid] +} + +// ThreadGroupWithID returns the thread group lead by the task with thread ID +// tid in PID namespace ns. If no task has that TID, or if the task with that +// TID is not a thread group leader, ThreadGroupWithID returns nil. +func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + t := ns.tasks[tid] + if t == nil { + return nil + } + if t != t.tg.leader { + return nil + } + return t.tg +} + +// IDOfTask returns the TID assigned to the given task in PID namespace ns. If +// the task is not visible in that namespace, IDOfTask returns 0. (This return +// value is significant in some cases, e.g. getppid() is documented as +// returning 0 if the caller's parent is in an ancestor namespace and +// consequently not visible to the caller.) If the task is nil, IDOfTask returns +// 0. +func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tids[t] +} + +// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. +// If the task is not visible in that namespace, IDOfThreadGroup returns 0. +func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return ns.tids[tg.leader] +} + +// Tasks returns a snapshot of the tasks in ns. 
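A short usage fragment for the lookup API above (not from the change itself; ns and childNS are two *PIDNamespace values and tid a ThreadID obtained elsewhere):

    // Resolve a TID to its thread group, then map the group back into another
    // (e.g. child) PID namespace.
    if tg := ns.ThreadGroupWithID(tid); tg != nil {
        // IDOfThreadGroup returns 0 if tg's leader is not visible in childNS.
        childTID := childNS.IDOfThreadGroup(tg)
        fmt.Printf("tid %v in the child namespace: %v\n", tid, childTID)
    }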
+func (ns *PIDNamespace) Tasks() []*Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + tasks := make([]*Task, 0, len(ns.tasks)) + for t := range ns.tids { + tasks = append(tasks, t) + } + return tasks +} + +// ThreadGroups returns a snapshot of the thread groups in ns. +func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + var tgs []*ThreadGroup + for t := range ns.tids { + if t == t.tg.leader { + tgs = append(tgs, t.tg) + } + } + return tgs +} + +// UserNamespace returns the user namespace associated with PID namespace ns. +func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { + return ns.userns +} + +// A threadGroupNode defines the relationship between a thread group and the +// rest of the system. Conceptually, threadGroupNode is data belonging to the +// owning TaskSet, as if TaskSet contained a field `nodes +// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, +// threadGroupNode is embedded in the ThreadGroup it represents. +// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose +// threadGroupEntry's methods on ThreadGroup to make it implement +// threadGroupLinker.) +type threadGroupNode struct { + // pidns is the PID namespace containing the thread group and all of its + // member tasks. The pidns pointer is immutable. + pidns *PIDNamespace + + // eventQueue is notified whenever a event of interest to Task.Wait occurs + // in a child of this thread group, or a ptrace tracee of a task in this + // thread group. Events are defined in task_exit.go. + // + // Note that we cannot check and save this wait queue similarly to other + // wait queues, as the queue will not be empty by the time of saving, due + // to the wait sourced from Exec(). + eventQueue waiter.Queue `state:"nosave"` + + // leader is the thread group's leader, which is the oldest task in the + // thread group; usually the last task in the thread group to call + // execve(), or if no such task exists then the first task in the thread + // group, which was created by a call to fork() or clone() without + // CLONE_THREAD. Once a thread group has been made visible to the rest of + // the system by TaskSet.newTask, leader is never nil. + // + // Note that it's possible for the leader to exit without causing the rest + // of the thread group to exit; in such a case, leader will still be valid + // and non-nil, but leader will not be in tasks. + // + // leader is protected by the TaskSet mutex. + leader *Task + + // If execing is not nil, it is a task in the thread group that has killed + // all other tasks so that it can become the thread group leader and + // perform an execve. (execing may already be the thread group leader.) + // + // execing is analogous to Linux's signal_struct::group_exit_task. + // + // execing is protected by the TaskSet mutex. + execing *Task + + // tasks is all tasks in the thread group that have not yet been reaped. + // + // tasks is protected by both the TaskSet mutex and the signal mutex: + // Mutating tasks requires locking the TaskSet mutex for writing *and* + // locking the signal mutex. Reading tasks requires locking the TaskSet + // mutex *or* locking the signal mutex. + tasks taskList + + // tasksCount is the number of tasks in the thread group that have not yet + // been reaped; equivalently, tasksCount is the number of tasks in tasks. + // + // tasksCount is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. 
+ tasksCount int + + // liveTasks is the number of tasks in the thread group that have not yet + // reached TaskExitZombie. + // + // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). + liveTasks int + + // activeTasks is the number of tasks in the thread group that have not yet + // reached TaskExitInitiated. + // + // activeTasks is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + activeTasks int +} + +// PIDNamespace returns the PID namespace containing tg. +func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { + return tg.pidns +} + +// TaskSet returns the TaskSet containing tg. +func (tg *ThreadGroup) TaskSet() *TaskSet { + return tg.pidns.owner +} + +// Leader returns tg's leader. +func (tg *ThreadGroup) Leader() *Task { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.leader +} + +// Count returns the number of non-exited threads in the group. +func (tg *ThreadGroup) Count() int { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + var count int + for t := tg.tasks.Front(); t != nil; t = t.Next() { + count++ + } + return count +} + +// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for +// all tasks in tg. +func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + var tasks []ThreadID + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if id, ok := pidns.tids[t]; ok { + tasks = append(tasks, id) + } + } + return tasks +} + +// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader +// is dead, ID returns 0. +func (tg *ThreadGroup) ID() ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.pidns.tids[tg.leader] +} + +// A taskNode defines the relationship between a task and the rest of the +// system. The comments on threadGroupNode also apply to taskNode. +type taskNode struct { + // tg is the thread group that this task belongs to. The tg pointer is + // immutable. + tg *ThreadGroup `state:"wait"` + + // taskEntry links into tg.tasks. Note that this means that + // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread + // group. See threadGroupNode.tasks for synchronization info. + taskEntry + + // parent is the task's parent. parent may be nil. + // + // parent is protected by the TaskSet mutex. + parent *Task + + // children is this task's children. + // + // children is protected by the TaskSet mutex. + children map[*Task]struct{} + + // If childPIDNamespace is not nil, all new tasks created by this task will + // be members of childPIDNamespace rather than this one. (As a corollary, + // this task becomes unable to create sibling tasks in the same thread + // group.) + // + // childPIDNamespace is exclusive to the task goroutine. + childPIDNamespace *PIDNamespace +} + +// ThreadGroup returns the thread group containing t. +func (t *Task) ThreadGroup() *ThreadGroup { + return t.tg +} + +// PIDNamespace returns the PID namespace containing t. +func (t *Task) PIDNamespace() *PIDNamespace { + return t.tg.pidns +} + +// TaskSet returns the TaskSet containing t. +func (t *Task) TaskSet() *TaskSet { + return t.tg.pidns.owner +} + +// Timekeeper returns the system Timekeeper. +func (t *Task) Timekeeper() *Timekeeper { + return t.k.timekeeper +} + +// Parent returns t's parent. +func (t *Task) Parent() *Task { + return t.parent +} + +// ThreadID returns t's thread ID in its own PID namespace. 
If the task is +// dead, ThreadID returns 0. +func (t *Task) ThreadID() ThreadID { + return t.tg.pidns.IDOfTask(t) +} diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD new file mode 100644 index 000000000..84f31b2dc --- /dev/null +++ b/pkg/sentry/kernel/time/BUILD @@ -0,0 +1,32 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "time_state", + srcs = [ + "time.go", + ], + out = "time_state.go", + package = "time", +) + +go_library( + name = "time", + srcs = [ + "context.go", + "time.go", + "time_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go new file mode 100644 index 000000000..ac4dc01d8 --- /dev/null +++ b/pkg/sentry/kernel/time/context.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the time package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxRealtimeClock is a Context.Value key for the current real time. + CtxRealtimeClock contextID = iota +) + +// RealtimeClockFromContext returns the real time clock associated with context +// ctx. +func RealtimeClockFromContext(ctx context.Context) Clock { + if v := ctx.Value(CtxRealtimeClock); v != nil { + return v.(Clock) + } + return nil +} + +// NowFromContext returns the current real time associated with context ctx. +func NowFromContext(ctx context.Context) Time { + if clk := RealtimeClockFromContext(ctx); clk != nil { + return clk.Now() + } + panic("encountered context without RealtimeClock") +} diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go new file mode 100644 index 000000000..c223c2f19 --- /dev/null +++ b/pkg/sentry/kernel/time/time.go @@ -0,0 +1,649 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package time defines the Timer type, which provides a periodic timer that +// works by sampling a user-provided clock. 
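The context.go file above uses the usual context-key pattern: an unexported key type prevents collisions with other packages, and a typed accessor hides the type assertion (NowFromContext additionally panics if no clock was attached). A standalone sketch of the same pattern, using the standard library context package rather than the sentry's, is shown below; the time package whose doc comment appears just above then begins.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// clock is a stand-in for the package's Clock interface; only Now is shown.
type clock interface {
	Now() time.Time
}

type realClock struct{}

func (realClock) Now() time.Time { return time.Now() }

// ctxKey plays the role of contextID: an unexported type means no other
// package can accidentally collide with this key.
type ctxKey int

const ctxRealtimeClock ctxKey = iota

// realtimeClockFromContext mirrors RealtimeClockFromContext: a typed accessor
// that hides the type assertion and returns nil when no clock was attached.
func realtimeClockFromContext(ctx context.Context) clock {
	if v := ctx.Value(ctxRealtimeClock); v != nil {
		return v.(clock)
	}
	return nil
}

func main() {
	ctx := context.WithValue(context.Background(), ctxRealtimeClock, realClock{})
	if c := realtimeClockFromContext(ctx); c != nil {
		fmt.Println("now:", c.Now())
	}
}
```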
+package time + +import ( + "fmt" + "math" + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Events that may be generated by a Clock. +const ( + // ClockEventSet occurs when a Clock undergoes a discontinuous change. + ClockEventSet waiter.EventMask = 1 << iota + + // ClockEventRateIncrease occurs when the rate at which a Clock advances + // increases significantly, such that values returned by previous calls to + // Clock.WallTimeUntil may be too large. + ClockEventRateIncrease +) + +// Time represents an instant in time with nanosecond precision. +// +// Time may represent time with respect to any clock and may not have any +// meaning in the real world. +type Time struct { + ns int64 +} + +var ( + // MinTime is the zero time instant, the lowest possible time that can + // be represented by Time. + MinTime = Time{ns: math.MinInt64} + + // MaxTime is the highest possible time that can be represented by + // Time. + MaxTime = Time{ns: math.MaxInt64} + + // ZeroTime represents the zero time in an unspecified Clock's domain. + ZeroTime = Time{ns: 0} +) + +const ( + // MinDuration is the minimum duration representable by time.Duration. + MinDuration = time.Duration(math.MinInt64) + + // MaxDuration is the maximum duration representable by time.Duration. + MaxDuration = time.Duration(math.MaxInt64) +) + +// FromNanoseconds returns a Time representing the point ns nanoseconds after +// an unspecified Clock's zero time. +func FromNanoseconds(ns int64) Time { + return Time{ns} +} + +// FromSeconds returns a Time representing the point s seconds after an +// unspecified Clock's zero time. +func FromSeconds(s int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + return Time{s * 1e9} +} + +// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real +// time Unix clock domain. +func FromUnix(s int64, ns int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + t := s * 1e9 + if t > math.MaxInt64-ns { + return MaxTime + } + return Time{t + ns} +} + +// FromTimespec converts from Linux Timespec to Time. +func FromTimespec(ts linux.Timespec) Time { + return Time{ts.ToNsecCapped()} +} + +// FromTimeval converts a Linux Timeval to Time. +func FromTimeval(tv linux.Timeval) Time { + return Time{tv.ToNsecCapped()} +} + +// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock +// domain. If t represents walltime, this is nanoseconds since the Unix epoch. +func (t Time) Nanoseconds() int64 { + return t.ns +} + +// Seconds returns seconds elapsed since the zero time in t's Clock domain. If +// t represents walltime, this is seconds since Unix epoch. +func (t Time) Seconds() int64 { + return t.Nanoseconds() / time.Second.Nanoseconds() +} + +// Timespec converts Time to a Linux timespec. +func (t Time) Timespec() linux.Timespec { + return linux.NsecToTimespec(t.Nanoseconds()) +} + +// Unix returns the (seconds, nanoseconds) representation of t such that +// seconds*1e9 + nanoseconds = t. +func (t Time) Unix() (s int64, ns int64) { + s = t.ns / 1e9 + ns = t.ns % 1e9 + return +} + +// TimeT converts Time to a Linux time_t. +func (t Time) TimeT() linux.TimeT { + return linux.NsecToTimeT(t.Nanoseconds()) +} + +// Timeval converts Time to a Linux timeval. +func (t Time) Timeval() linux.Timeval { + return linux.NsecToTimeval(t.Nanoseconds()) +} + +// Add adds the duration of d to t. 
+func (t Time) Add(d time.Duration) Time { + if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { + return MaxTime + } + if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { + return MinTime + } + return Time{int64(t.ns) + d.Nanoseconds()} +} + +// AddTime adds the duration of u to t. +func (t Time) AddTime(u Time) Time { + return t.Add(time.Duration(u.ns)) +} + +// Equal reports whether the two times represent the same instant in time. +func (t Time) Equal(u Time) bool { + return t.ns == u.ns +} + +// Before reports whether the instant t is before the instant u. +func (t Time) Before(u Time) bool { + return t.ns < u.ns +} + +// After reports whether the instant t is after the instant u. +func (t Time) After(u Time) bool { + return t.ns > u.ns +} + +// Sub returns the duration of t - u. +// +// N.B. This measure may not make sense for every Time returned by ktime.Clock. +// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to +// estimate that wall time. +func (t Time) Sub(u Time) time.Duration { + dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond + switch { + case u.Add(dur).Equal(t): + return dur + case t.Before(u): + return MinDuration + default: + return MaxDuration + } +} + +// IsMin returns whether t represents the lowest possible time instant. +func (t Time) IsMin() bool { + return t == MinTime +} + +// IsZero returns whether t represents the zero time instant in t's Clock domain. +func (t Time) IsZero() bool { + return t == ZeroTime +} + +// String returns the time represented in nanoseconds as a string. +func (t Time) String() string { + return fmt.Sprintf("%dns", t.Nanoseconds()) +} + +// A Clock is an abstract time source. +type Clock interface { + // Now returns the current time in nanoseconds according to the Clock. + Now() Time + + // WallTimeUntil returns the estimated wall time until Now will return a + // value greater than or equal to t, given that a recent call to Now + // returned now. If t has already passed, WallTimeUntil may return 0 or a + // negative value. + // + // WallTimeUntil must be abstract to support Clocks that do not represent + // wall time (e.g. thread group execution timers). Clocks that represent + // wall times may embed the WallRateClock type to obtain an appropriate + // trivial implementation of WallTimeUntil. + // + // WallTimeUntil is used to determine when associated Timers should next + // check for expirations. Returning too small a value may result in + // spurious Timer goroutine wakeups, while returning too large a value may + // result in late expirations. Implementations should usually err on the + // side of underestimating. + WallTimeUntil(t, now Time) time.Duration + + // Waitable methods may be used to subscribe to Clock events. Waiters will + // not be preserved by Save and must be re-established during restore. + // + // Since Clock events are transient, implementations of + // waiter.Waitable.Readiness should return 0. + waiter.Waitable +} + +// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the +// same rate as wall time. +type WallRateClock struct{} + +// WallTimeUntil implements Clock.WallTimeUntil. +func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { + return t.Sub(now) +} + +// NoClockEvents implements waiter.Waitable for Clocks that do not generate +// events. +type NoClockEvents struct{} + +// Readiness implements waiter.Waitable.Readiness. 
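A Clock implementation therefore needs Now, WallTimeUntil, and the waiter.Waitable methods, and the two helper types above make the common cases free to write. As an illustration, a test-only clock that advances only when explicitly stepped could embed both helpers as sketched below (manualClock and advance are hypothetical names, not part of this package); NoClockEvents' Readiness method, declared after its comment above, follows.

```go
package faketime

import (
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
)

// manualClock is a hypothetical Clock that only advances when a test calls
// advance. It elapses at the wall rate whenever it does advance, so
// WallRateClock's trivial WallTimeUntil applies, and it never generates
// clock events, so NoClockEvents supplies the waiter.Waitable methods.
type manualClock struct {
	now int64 // nanoseconds; unsynchronized, test-only

	ktime.WallRateClock
	ktime.NoClockEvents
}

// Now implements Clock.Now.
func (c *manualClock) Now() ktime.Time {
	return ktime.FromNanoseconds(c.now)
}

// advance moves the clock forward by ns nanoseconds.
func (c *manualClock) advance(ns int64) {
	c.now += ns
}
```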
+func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (NoClockEvents) EventUnregister(e *waiter.Entry) { +} + +// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and +// defining waiter.Waitable.Readiness as required by Clock. +type ClockEventsQueue struct { + waiter.Queue +} + +// Readiness implements waiter.Waitable.Readiness. +func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// A TimerListener receives expirations from a Timer. +type TimerListener interface { + // Notify is called when its associated Timer expires. exp is the number of + // expirations. + // + // Notify is called with the associated Timer's mutex locked, so Notify + // must not take any locks that precede Timer.mu in lock order. + // + // Preconditions: exp > 0. + Notify(exp uint64) + + // Destroy is called when the timer is destroyed. + Destroy() +} + +// Setting contains user-controlled mutable Timer properties. +type Setting struct { + // Enabled is true if the timer is running. + Enabled bool + + // Next is the time in nanoseconds of the next expiration. + Next Time + + // Period is the time in nanoseconds between expirations. If Period is + // zero, the timer will not automatically restart after expiring. + // + // Invariant: Period >= 0. + Period time.Duration +} + +// SettingFromSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as a time relative to c.Now(). +func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + if value < 0 { + return Setting{}, syserror.EINVAL + } + if value == 0 { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: c.Now().Add(value), + Period: interval, + }, nil +} + +// SettingFromAbsSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as an absolute time. +func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { + if value.Before(ZeroTime) { + return Setting{}, syserror.EINVAL + } + if value.IsZero() { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: value, + Period: interval, + }, nil +} + +// SpecFromSetting converts a timestamp and a Setting to a (relative value, +// interval) pair, as used by most Linux syscalls that return a struct +// itimerval or struct itimerspec. +func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { + if !s.Enabled { + return 0, s.Period + } + return s.Next.Sub(now), s.Period +} + +// advancedTo returns an updated Setting and a number of expirations after +// the associated Clock indicates a time of now. +// +// Settings may be created by successive calls to advancedTo with decreasing +// values of now (i.e. time may appear to go backward). Supporting this is +// required to support non-monotonic clocks, as well as allowing +// Timer.clock.Now() to be called without holding Timer.mu. 
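SettingFromSpec pins a relative (value, interval) pair to an absolute Next using a clock reading, and SpecFromSetting reverses that at a later reading, which is the shape most itimer-style interfaces need. The following self-contained sketch compresses the round trip, omitting error handling and the absolute-time variant and using lower-case stand-in names; advancedTo, whose comment appears just above, is defined immediately after it.

```go
package main

import (
	"fmt"
	"time"
)

// setting mirrors the Setting fields used by the conversion helpers above.
type setting struct {
	enabled bool
	next    int64 // ns, absolute in the clock's domain
	period  time.Duration
}

// settingFromSpec mirrors SettingFromSpec: value is interpreted relative to now.
func settingFromSpec(now int64, value, interval time.Duration) setting {
	if value == 0 {
		return setting{period: interval}
	}
	return setting{enabled: true, next: now + value.Nanoseconds(), period: interval}
}

// specFromSetting mirrors SpecFromSetting: recover the remaining relative value.
func specFromSetting(now int64, s setting) (time.Duration, time.Duration) {
	if !s.enabled {
		return 0, s.period
	}
	return time.Duration(s.next-now) * time.Nanosecond, s.period
}

func main() {
	s := settingFromSpec(100, 50*time.Nanosecond, 10*time.Nanosecond)
	fmt.Println(s.enabled, s.next, s.period) // true 150 10ns
	v, p := specFromSetting(120, s)
	fmt.Println(v, p) // 30ns 10ns
}
```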
+func (s Setting) advancedTo(now Time) (Setting, uint64) { + if !s.Enabled { + return s, 0 + } + if s.Next.After(now) { + return s, 0 + } + if s.Period == 0 { + s.Enabled = false + return s, 1 + } + exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) + s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) + return s, exp +} + +// Timer is an optionally-periodic timer driven by sampling a user-specified +// Clock. Timer's semantics support the requirements of Linux's interval timers +// (setitimer(2), timer_create(2), timerfd_create(2)). +// +// Timers should be created using NewTimer and must be cleaned up by calling +// Timer.Destroy when no longer used. +type Timer struct { + // clock is the time source. clock is immutable. + clock Clock + + // listener is notified of expirations. listener is immutable. + listener TimerListener + + // mu protects the following mutable fields. + mu sync.Mutex `state:"nosave"` + + // setting is the timer setting. setting is protected by mu. + setting Setting + + // paused is true if the Timer is paused. paused is protected by mu. + paused bool + + // kicker is used to wake the Timer goroutine. The kicker pointer is + // immutable, but its state is protected by mu. + kicker *time.Timer `state:"nosave"` + + // entry is registered with clock.EventRegister. entry is immutable. + // + // Per comment in Clock, entry must be re-registered after restore; per + // comment in Timer.Load, this is done in Timer.Resume. + entry waiter.Entry `state:"nosave"` + + // events is the channel that will be notified whenever entry receives an + // event. It is also closed by Timer.Destroy to instruct the Timer + // goroutine to exit. + events chan struct{} `state:"nosave"` +} + +// timerTickEvents are Clock events that require the Timer goroutine to Tick +// prematurely. +const timerTickEvents = ClockEventSet | ClockEventRateIncrease + +// NewTimer returns a new Timer that will obtain time from clock and send +// expirations to listener. The Timer is initially stopped and has no first +// expiration or period configured. +func NewTimer(clock Clock, listener TimerListener) *Timer { + t := &Timer{ + clock: clock, + listener: listener, + } + t.init() + return t +} + +// After waits for the duration to elapse according to clock and then sends a +// notification on the returned channel. The timer is started immediately and +// will fire exactly once. The second return value is the start time used with +// the duration. +// +// Callers must call Timer.Destroy. +func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) { + notifier, tchan := NewChannelNotifier() + t := NewTimer(clock, notifier) + now := clock.Now() + + t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: now.Add(duration), + }) + return t, now, tchan +} + +// init initializes Timer state that is not preserved across save/restore. If +// init has already been called, calling it again is a no-op. +// +// Preconditions: t.mu must be locked, or the caller must have exclusive access +// to t. +func (t *Timer) init() { + if t.kicker != nil { + return + } + // If t.kicker is nil, the Timer goroutine can't be running, so we can't + // race with it. + t.kicker = time.NewTimer(0) + t.entry, t.events = waiter.NewChannelEntry(nil) + t.clock.EventRegister(&t.entry, timerTickEvents) + go t.runGoroutine() // S/R-SAFE: synchronized by t.mu +} + +// Destroy releases resources owned by the Timer. A Destroyed Timer must not be +// used again; in particular, a Destroyed Timer should not be Saved. 
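For an enabled periodic setting, advancedTo charges one expiration for reaching Next plus one more for every whole Period elapsed since, then moves Next past now, so a sampling-based timer that wakes up late still reports every expiration it missed. A small standalone mirror of that arithmetic follows (advance is an illustrative name, not this package's API); Destroy, documented just above, comes after it.

```go
package main

import "fmt"

// advance mirrors the expiration arithmetic in advancedTo for an enabled,
// periodic setting: one expiration for reaching next, plus one for every
// whole period that has elapsed since then.
func advance(next, period, now int64) (newNext int64, exp uint64) {
	if now < next {
		return next, 0
	}
	exp = 1 + uint64(now-next)/uint64(period)
	return next + period*int64(exp), exp
}

func main() {
	// Next=10ns, Period=5ns, sampled at now=27ns: expirations at 10, 15, 20
	// and 25 have all passed, so exp=4 and the next expiration moves to 30ns.
	next, exp := advance(10, 5, 27)
	fmt.Println(next, exp) // 30 4
}
```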
+func (t *Timer) Destroy() { + // Stop the Timer, ensuring that the Timer goroutine will not call + // t.kicker.Reset, before calling t.kicker.Stop. + t.mu.Lock() + t.setting.Enabled = false + t.mu.Unlock() + t.kicker.Stop() + // Unregister t.entry, ensuring that the Clock will not send to t.events, + // before closing t.events to instruct the Timer goroutine to exit. + t.clock.EventUnregister(&t.entry) + close(t.events) + t.listener.Destroy() +} + +func (t *Timer) runGoroutine() { + for { + select { + case <-t.kicker.C: + case _, ok := <-t.events: + if !ok { + // Channel closed by Destroy. + return + } + } + t.Tick() + } +} + +// Tick requests that the Timer immediately check for expirations and +// re-evaluate when it should next check for expirations. +func (t *Timer) Tick() { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + return + } + s, exp := t.setting.advancedTo(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) +} + +// Pause pauses the Timer, ensuring that it does not generate any further +// expirations until Resume is called. If the Timer is already paused, Pause +// has no effect. +func (t *Timer) Pause() { + t.mu.Lock() + defer t.mu.Unlock() + t.paused = true + // t.kicker may be nil if we were restored but never resumed. + if t.kicker != nil { + t.kicker.Stop() + } +} + +// Resume ends the effect of Pause. If the Timer is not paused, Resume has no +// effect. +func (t *Timer) Resume() { + t.mu.Lock() + defer t.mu.Unlock() + if !t.paused { + return + } + t.paused = false + + // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume + // because save/restore will restore Timers before + // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed + // by a kernel.Timekeeper then the Timer goroutine will panic if it calls + // t.clock.Now(). + t.init() + + // Kick the Timer goroutine in case it was already initialized, but the + // Timer goroutine was sleeping. + t.kicker.Reset(0) +} + +// Get returns a snapshot of the Timer's current Setting and the time +// (according to the Timer's Clock) at which the snapshot was taken. +// +// Preconditions: The Timer must not be paused (since its Setting cannot +// be advanced to the current time while it is paused.) +func (t *Timer) Get() (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) + } + s, exp := t.setting.advancedTo(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) + return now, s +} + +// Swap atomically changes the Timer's Setting and returns the Timer's previous +// Setting and the time (according to the Timer's Clock) at which the snapshot +// was taken. Setting s.Enabled to true starts the Timer, while setting +// s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. +func (t *Timer) Swap(s Setting) (Time, Setting) { + return t.SwapAnd(s, nil) +} + +// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, +// and returns the Timer's previous Setting and the time (according to the +// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true +// starts the timer, while setting s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. f cannot call any Timer methods +// since it is called with the Timer mutex locked. 
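Because After hands back both the Timer and the notification channel, a blocking one-shot wait is just a channel receive plus the mandatory Destroy. A usage sketch under that assumption is below (sleepOn is a hypothetical caller, not part of this package); SwapAnd, documented just above, follows.

```go
package example

import (
	"time"

	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
)

// sleepOn blocks until d has elapsed according to clk. It is only a usage
// sketch of the After helper shown above; sleepOn itself is hypothetical.
func sleepOn(clk ktime.Clock, d time.Duration) {
	t, _, ch := ktime.After(clk, d)
	// After documents that callers must call Timer.Destroy. Destroy also
	// closes ch (via the ChannelNotifier), but only after the expiration
	// has already been received below.
	defer t.Destroy()
	<-ch
}
```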
+func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) + } + oldS, oldExp := t.setting.advancedTo(now) + if oldExp > 0 { + t.listener.Notify(oldExp) + } + if f != nil { + f() + } + newS, newExp := s.advancedTo(now) + t.setting = newS + if newExp > 0 { + t.listener.Notify(newExp) + } + t.resetKickerLocked(now) + return now, oldS +} + +// Preconditions: t.mu must be locked. +func (t *Timer) resetKickerLocked(now Time) { + if t.setting.Enabled { + // Clock.WallTimeUntil may return a negative value. This is fine; + // time.when treats negative Durations as 0. + t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) + } + // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases + // resetKickerLocked will be called from the Timer goroutine itself, in + // which case t.kicker has already fired and t.kicker.Stop will be an + // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer + // => runtime.deltimer). +} + +// Clock returns the Clock used by t. +func (t *Timer) Clock() Clock { + return t.clock +} + +// ChannelNotifier is a TimerListener that sends a message on an empty struct +// channel. +// +// ChannelNotifier cannot be saved or loaded. +type ChannelNotifier struct { + // tchan must be a buffered channel. + tchan chan struct{} +} + +// NewChannelNotifier creates a new channel notifier. +// +// If the notifier is used with a timer, Timer.Destroy will close the channel +// returned here. +func NewChannelNotifier() (TimerListener, <-chan struct{}) { + tchan := make(chan struct{}, 1) + return &ChannelNotifier{tchan}, tchan +} + +// Notify implements ktime.TimerListener.Notify. +func (c *ChannelNotifier) Notify(uint64) { + select { + case c.tchan <- struct{}{}: + default: + } +} + +// Destroy implements ktime.TimerListener.Destroy and will close the channel. +func (c *ChannelNotifier) Destroy() { + close(c.tchan) +} diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go new file mode 100644 index 000000000..3f16c1676 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper.go @@ -0,0 +1,270 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// Timekeeper manages all of the kernel clocks. +type Timekeeper struct { + // clocks are the clock sources. + // + // These are not saved directly, as the new machine's clock may behave + // differently. + // + // It is set only once, by SetClocks. + clocks sentrytime.Clocks `state:"nosave"` + + // bootTime is the realtime when the system "booted". i.e., when + // SetClocks was called in the initial (not restored) run. 
+ bootTime ktime.Time + + // monotonicOffset is the offset to apply to the monotonic clock output + // from clocks. + // + // It is set only once, by SetClocks. + monotonicOffset int64 `state:"nosave"` + + // restored indicates that this Timekeeper was restored from a state + // file. + restored bool `state:"nosave"` + + // saveMonotonic is the (offset) value of the monotonic clock at the + // time of save. + // + // It is only valid if restored is true. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveMonotonic int64 + + // saveRealtime is the value of the realtime clock at the time of save. + // + // It is only valid if restored is true. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveRealtime int64 + + // params manages the parameter page. + params *VDSOParamPage + + // mu protects destruction with stop and wg. + mu sync.Mutex `state:"nosave"` + + // stop is used to tell the update goroutine to exit. + stop chan struct{} `state:"nosave"` + + // wg is used to indicate that the update goroutine has exited. + wg sync.WaitGroup `state:"nosave"` +} + +// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. +// NewTimekeeper does not take ownership of paramPage. +// +// SetClocks must be called on the returned Timekeeper before it is usable. +func NewTimekeeper(platform platform.Platform, paramPage platform.FileRange) (*Timekeeper, error) { + return &Timekeeper{ + params: NewVDSOParamPage(platform, paramPage), + }, nil +} + +// SetClocks the backing clock source. +// +// SetClocks must be called before the Timekeeper is used, and it may not be +// called more than once, as changing the clock source without extra correction +// could cause time discontinuities. +// +// It must also be called after Load. +func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { + // Update the params, marking them "not ready", as we may need to + // restart calibration on this new machine. + if t.restored { + if err := t.params.Write(func() vdsoParams { + return vdsoParams{} + }); err != nil { + panic("unable to reset VDSO params: " + err.Error()) + } + } + + if t.clocks != nil { + panic("SetClocks called on previously-initialized Timekeeper") + } + + t.clocks = c + + // Compute the offset of the monotonic clock from the base Clocks. + // + // In a fresh (not restored) sentry, monotonic time starts at zero. + // + // In a restored sentry, monotonic time jumps forward by approximately + // the same amount as real time. There are no guarantees here, we are + // just making a best-effort attempt to to make it appear that the app + // was simply not scheduled for a long period, rather than that the + // real time clock was changed. + // + // If real time went backwards, it remains the same. + wantMonotonic := int64(0) + + nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Unable to get current monotonic time: " + err.Error()) + } + + nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime) + if err != nil { + panic("Unable to get current realtime: " + err.Error()) + } + + if t.restored { + wantMonotonic = t.saveMonotonic + elapsed := nowRealtime - t.saveRealtime + if elapsed > 0 { + wantMonotonic += elapsed + } + } + + t.monotonicOffset = wantMonotonic - nowMonotonic + + if !t.restored { + // Hold on to the initial "boot" time. 
+ t.bootTime = ktime.FromNanoseconds(nowRealtime) + } + + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// startUpdater starts an update goroutine that keeps the clocks updated. +// +// mu must be held. +func (t *Timekeeper) startUpdater() { + if t.stop != nil { + // Timekeeper already started + return + } + t.stop = make(chan struct{}) + + // Keep the clocks up to date. + // + // Note that the Go runtime uses host CLOCK_MONOTONIC to service the + // timer, so it may run at a *slightly* different rate from the + // application CLOCK_MONOTONIC. That is fine, as we only need to update + // at approximately this rate. + timer := time.NewTicker(sentrytime.ApproxUpdateInterval) + t.wg.Add(1) + go func() { // S/R-SAFE: stopped during save. + for { + // Start with an update immediately, so the clocks are + // ready ASAP. + + // Call Update within a Write block to prevent the VDSO + // from using the old params between Update and + // Write. + if err := t.params.Write(func() vdsoParams { + monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() + + var p vdsoParams + if monotonicOk { + p.monotonicReady = 1 + p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) + p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset + p.monotonicFrequency = monotonicParams.Frequency + } + if realtimeOk { + p.realtimeReady = 1 + p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) + p.realtimeBaseRef = int64(realtimeParams.BaseRef) + p.realtimeFrequency = realtimeParams.Frequency + } + + log.Debugf("Updating VDSO parameters: %+v", p) + + return p + }); err != nil { + log.Warningf("Unable to update VDSO parameter page: %v", err) + } + + select { + case <-timer.C: + case <-t.stop: + t.wg.Done() + return + } + } + }() +} + +// stopUpdater stops the update goroutine, blocking until it exits. +// +// mu must be held. +func (t *Timekeeper) stopUpdater() { + if t.stop == nil { + // Updater not running. + return + } + + close(t.stop) + t.wg.Wait() + t.stop = nil +} + +// Destroy destroys the Timekeeper, freeing all associated resources. +func (t *Timekeeper) Destroy() { + t.mu.Lock() + defer t.mu.Unlock() + + t.stopUpdater() +} + +// PauseUpdates stops clock parameter updates. This should only be used when +// Tasks are not running and thus cannot access the clock. +func (t *Timekeeper) PauseUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.stopUpdater() +} + +// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. +func (t *Timekeeper) ResumeUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// GetTime returns the current time in nanoseconds. +func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { + if t.clocks == nil { + panic("Timekeeper used before initialized with SetClocks") + } + now, err := t.clocks.GetTime(c) + if err == nil && c == sentrytime.Monotonic { + now += t.monotonicOffset + } + return now, err +} + +// BootTime returns the system boot real time. +func (t *Timekeeper) BootTime() ktime.Time { + return t.bootTime +} diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go new file mode 100644 index 000000000..aee983ac7 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -0,0 +1,41 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// beforeSave is invoked by stateify. +func (t *Timekeeper) beforeSave() { + if t.stop != nil { + panic("pauseUpdates must be called before Save") + } + + // N.B. we want the *offset* monotonic time. + var err error + if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil { + panic("unable to get current monotonic time: " + err.Error()) + } + + if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil { + panic("unable to get current realtime: " + err.Error()) + } +} + +// afterLoad is invoked by stateify. +func (t *Timekeeper) afterLoad() { + t.restored = true +} diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go new file mode 100644 index 000000000..08bacba4f --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -0,0 +1,156 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// mockClocks is a sentrytime.Clocks that simply returns the times in the +// struct. +type mockClocks struct { + monotonic int64 + realtime int64 +} + +// Update implements sentrytime.Clocks.Update. It does nothing. +func (*mockClocks) Update() (monotonicParams sentrytime.Parameters, monotonicOk bool, realtimeParam sentrytime.Parameters, realtimeOk bool) { + return +} + +// Update implements sentrytime.Clocks.GetTime. +func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { + switch id { + case sentrytime.Monotonic: + return c.monotonic, nil + case sentrytime.Realtime: + return c.realtime, nil + default: + return 0, syserror.EINVAL + } +} + +// stateTestClocklessTimekeeper returns a test Timekeeper which has not had +// SetClocks called. 
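Stepping back to startUpdater and stopUpdater in timekeeper.go above: they follow the common ticker plus stop-channel plus WaitGroup shape, where closing stop both wakes and terminates the goroutine and wg.Wait makes shutdown synchronous. A minimal, self-contained sketch of just that shape is below (updater, start, and stopAndWait are illustrative names); the test helper introduced by the comment just above follows it.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// updater mirrors the start/stop discipline of the Timekeeper's update
// goroutine: start is a no-op if already running, stop closes the channel
// and then waits for the goroutine to acknowledge via wg.
type updater struct {
	stop chan struct{}
	wg   sync.WaitGroup
}

func (u *updater) start(interval time.Duration, work func()) {
	if u.stop != nil {
		return // already running
	}
	u.stop = make(chan struct{})
	ticker := time.NewTicker(interval)
	u.wg.Add(1)
	go func() {
		defer u.wg.Done()
		defer ticker.Stop()
		for {
			work() // do an update immediately, then on every tick
			select {
			case <-ticker.C:
			case <-u.stop:
				return
			}
		}
	}()
}

func (u *updater) stopAndWait() {
	if u.stop == nil {
		return
	}
	close(u.stop)
	u.wg.Wait()
	u.stop = nil
}

func main() {
	var u updater
	u.start(10*time.Millisecond, func() { fmt.Println("update") })
	time.Sleep(25 * time.Millisecond)
	u.stopAndWait()
}
```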
+func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { + ctx := contexttest.Context(tb) + p := platform.FromContext(ctx) + fr, err := p.Memory().Allocate(usermem.PageSize, usage.Anonymous) + if err != nil { + tb.Fatalf("failed to allocate memory: %v", err) + } + return &Timekeeper{ + params: NewVDSOParamPage(p, fr), + } +} + +func stateTestTimekeeper(tb testing.TB) *Timekeeper { + t := stateTestClocklessTimekeeper(tb) + t.SetClocks(sentrytime.NewCalibratedClocks()) + return t +} + +// TestTimekeeperMonotonicZero tests that monotonic time starts at zero. +func TestTimekeeperMonotonicZero(t *testing.T) { + c := &mockClocks{ + monotonic: 100000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.SetClocks(c) + defer tk.Destroy() + + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 0 { + t.Errorf("GetTime got %d want 0", now) + } + + c.monotonic += 10 + + now, err = tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 10 { + t.Errorf("GetTime got %d want 10", now) + } +} + +// TestTimekeeperMonotonicJumpForward tests that monotonic time jumps forward +// after restore. +func TestTimekeeperMonotonicForward(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 600000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = true + tk.saveMonotonic = 100000 + tk.saveRealtime = 400000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should jump ahead by 200000 to 300000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 300000 { + t.Errorf("GetTime got %d want 300000", now) + } +} + +// TestTimekeeperMonotonicJumpBackwards tests that monotonic time does not jump +// backwards when realtime goes backwards. +func TestTimekeeperMonotonicJumpBackwards(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 400000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = true + tk.saveMonotonic = 100000 + tk.saveRealtime = 600000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should remain at 100000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees and we don't want to jump the monotonic clock backwards like + // realtime did. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 100000 { + t.Errorf("GetTime got %d want 100000", now) + } +} diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go new file mode 100644 index 000000000..03a3310be --- /dev/null +++ b/pkg/sentry/kernel/timer.go @@ -0,0 +1,282 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
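The three timekeeper tests above pin down the offset rule in SetClocks: monotonic time starts at zero on a fresh boot, jumps forward by the elapsed realtime across a restore, and never moves backwards. The sketch below compresses that rule into one illustrative helper (not part of the package) and re-derives the expected value from TestTimekeeperMonotonicForward; timer.go, whose license header ends just above, then begins.

```go
package example

// monotonicOffset mirrors the computation in SetClocks. nowMono and nowReal
// are readings from the new host's clocks; savedMono and savedReal are the
// values captured at save time (unused on a fresh boot).
func monotonicOffset(restored bool, savedMono, savedReal, nowMono, nowReal int64) int64 {
	want := int64(0) // fresh boots start monotonic time at zero
	if restored {
		want = savedMono
		if elapsed := nowReal - savedReal; elapsed > 0 {
			want += elapsed // realtime moved forward: monotonic jumps too
		}
		// If realtime moved backwards, want stays at savedMono.
	}
	return want - nowMono
}

// With the numbers from TestTimekeeperMonotonicForward: savedMono=100000,
// savedReal=400000, nowMono=900000, nowReal=600000 gives want=300000 and an
// offset of -600000, so GetTime(Monotonic) = 900000 + (-600000) = 300000.
```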
+ +package kernel + +import ( + "fmt" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. + ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. +func (tg *ThreadGroup) UserCPUClock() ktime.Clock { + return tg.tm.virtClock +} + +// CPUClock returns a ktime.Clock that measures the time that a thread group +// has spent executing, including sentry time. +func (tg *ThreadGroup) CPUClock() ktime.Clock { + return tg.tm.profClock +} + +// Now implements ktime.Clock.Now. +func (tgc *tgClock) Now() ktime.Time { + stats := tgc.tg.CPUStats() + if tgc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// WallTimeUntil implements ktime.Clock.WallTimeUntil. +func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { + // The assumption here is that the time spent in this process (not matter + // virtual or prof) should not exceed wall time * active tasks, since + // Task.exitThreadGroup stops accounting as it transitions to + // TaskExitInitiated. + tgc.tg.pidns.owner.mu.RLock() + n := tgc.tg.activeTasks + tgc.tg.pidns.owner.mu.RUnlock() + if n == 0 { + if t.Before(now) { + return 0 + } + // The timer tick raced with thread group exit, after which no more + // tasks can enter the thread group. So tgc.Now() will never advance + // again. Return a large delay; the timer should be stopped long before + // it comes again anyway. + return time.Hour + } + // This is a lower bound on the amount of time that can elapse before an + // associated timer expires, so returning this value tends to result in a + // sequence of closely-spaced ticks just before timer expiry. To avoid + // this, round up to the nearest ClockTick; CPU usage measurements are + // limited to this resolution anyway. + remaining := time.Duration(int64(t.Sub(now))/int64(n)) * time.Nanosecond + return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick +} + +// taskClock is a ktime.Clock that measures the time that a task has spent +// executing. 
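The rounding in tgClock.WallTimeUntil above deserves a concrete number: the remaining CPU time is divided across the runnable tasks, giving a lower bound on the wall time until expiry, and the result is rounded up to the statistics granularity so the timer goroutine is not woken for sub-tick slivers. A standalone sketch with a worked value follows (roundUp is an illustrative name, and the 10ms tick assumes the conventional USER_HZ of 100); taskClock, introduced by the comment just above, comes after it.

```go
package main

import (
	"fmt"
	"time"
)

// roundUp mirrors the rounding in tgClock.WallTimeUntil: divide the remaining
// CPU time across n runnable tasks, then round up to the clock-tick
// granularity so sub-tick wakeups are avoided.
func roundUp(remaining time.Duration, n int64, tick time.Duration) time.Duration {
	per := remaining / time.Duration(n)
	return ((per + tick - time.Nanosecond) / tick) * tick
}

func main() {
	// Assuming the conventional USER_HZ of 100, one clock tick is 10ms.
	const tick = 10 * time.Millisecond
	// 25ms of CPU time left before expiry, spread over 2 running tasks:
	// at least 12.5ms of wall time must pass, which rounds up to 20ms.
	fmt.Println(roundUp(25*time.Millisecond, 2, tick)) // 20ms
}
```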
+type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. +func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. +type signalNotifier struct { + tg *ThreadGroup + signal linux.Signal + realTimer bool + includeSys bool +} + +// Notify implements ktime.TimerListener.Notify. +func (s *signalNotifier) Notify(exp uint64) { + // Since all signals sent using a signalNotifier are standard (not + // real-time) signals, we can ignore the number of expirations and send + // only a single signal. + if s.realTimer { + // real timer signal sent to leader. See kernel/time/itimer.c:it_real_fn + s.tg.SendSignal(sigPriv(s.signal)) + } else { + s.tg.SendTimerSignal(sigPriv(s.signal), s.includeSys) + } +} + +// Destroy implements ktime.TimerListener.Destroy. +func (s *signalNotifier) Destroy() {} + +// TimerManager is a collection of supported process cpu timers. +type TimerManager struct { + // Clocks used to drive thread group execution time timers. + virtClock *tgClock + profClock *tgClock + + RealTimer *ktime.Timer + VirtualTimer *ktime.Timer + ProfTimer *ktime.Timer + SoftLimitTimer *ktime.Timer + HardLimitTimer *ktime.Timer +} + +// newTimerManager returns a new instance of TimerManager. 
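The two limit timers in TimerManager encode RLIMIT_CPU directly: the soft-limit timer delivers SIGXCPU when CPU time reaches the current limit and then once per CPU-second after it, while the hard-limit timer delivers a single SIGKILL at the maximum, as applyCPULimits further below shows. A compressed sketch of the settings a finite limit would produce is below (setting and cpuLimitSettings are illustrative stand-ins); newTimerManager, documented just above, follows.

```go
package example

import "time"

// setting is a local stand-in for ktime.Setting with only the fields the
// CPU-limit timers use.
type setting struct {
	enabled bool
	next    time.Duration // CPU time at which the timer first fires
	period  time.Duration // 0 means one-shot
}

// cpuLimitSettings mirrors applyCPULimits for a finite RLIMIT_CPU.
func cpuLimitSettings(cur, max time.Duration) (soft, hard setting) {
	soft = setting{enabled: true, next: cur, period: time.Second} // SIGXCPU, repeating
	hard = setting{enabled: true, next: max}                      // SIGKILL, one-shot
	return
}

// cpuLimitSettings(2*time.Second, 5*time.Second) yields SIGXCPU at 2s of CPU
// time and every CPU-second after that, and SIGKILL at 5s.
```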
+func newTimerManager(tg *ThreadGroup, monotonicClock ktime.Clock) TimerManager { + virtClock := &tgClock{tg: tg, includeSys: false} + profClock := &tgClock{tg: tg, includeSys: true} + tm := TimerManager{ + virtClock: virtClock, + profClock: profClock, + RealTimer: ktime.NewTimer(monotonicClock, &signalNotifier{ + tg: tg, + signal: linux.SIGALRM, + realTimer: true, + includeSys: false, + }), + VirtualTimer: ktime.NewTimer(virtClock, &signalNotifier{ + tg: tg, + signal: linux.SIGVTALRM, + realTimer: false, + includeSys: false, + }), + ProfTimer: ktime.NewTimer(profClock, &signalNotifier{ + tg: tg, + signal: linux.SIGPROF, + realTimer: false, + includeSys: true, + }), + SoftLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ + tg: tg, + signal: linux.SIGXCPU, + realTimer: false, + includeSys: true, + }), + HardLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ + tg: tg, + signal: linux.SIGKILL, + realTimer: false, + includeSys: true, + }), + } + tm.applyCPULimits(tg.Limits().Get(limits.CPU)) + return tm +} + +// Save saves this TimerManger. + +// destroy destroys all timers. +func (tm *TimerManager) destroy() { + tm.RealTimer.Destroy() + tm.VirtualTimer.Destroy() + tm.ProfTimer.Destroy() + tm.SoftLimitTimer.Destroy() + tm.HardLimitTimer.Destroy() +} + +func (tm *TimerManager) applyCPULimits(l limits.Limit) { + tm.SoftLimitTimer.Swap(ktime.Setting{ + Enabled: l.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(l.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + }) + tm.HardLimitTimer.Swap(ktime.Setting{ + Enabled: l.Max != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(l.Max) * time.Second).Nanoseconds()), + }) +} + +// kick is called when the number of threads in the thread group associated +// with tm increases. +func (tm *TimerManager) kick() { + tm.virtClock.Notify(ktime.ClockEventRateIncrease) + tm.profClock.Notify(ktime.ClockEventRateIncrease) +} + +// pause is to pause the timers and stop timer signal delivery. +func (tm *TimerManager) pause() { + tm.RealTimer.Pause() + tm.VirtualTimer.Pause() + tm.ProfTimer.Pause() + tm.SoftLimitTimer.Pause() + tm.HardLimitTimer.Pause() +} + +// resume is to resume the timers and continue timer signal delivery. +func (tm *TimerManager) resume() { + tm.RealTimer.Resume() + tm.VirtualTimer.Resume() + tm.ProfTimer.Resume() + tm.SoftLimitTimer.Resume() + tm.HardLimitTimer.Resume() +} diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go new file mode 100644 index 000000000..58e9b4d1b --- /dev/null +++ b/pkg/sentry/kernel/uts_namespace.go @@ -0,0 +1,100 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// UTSNamespace represents a UTS namespace, a holder of two system identifiers: +// the hostname and domain name. +type UTSNamespace struct { + // mu protects all fields below. 
+ mu sync.Mutex `state:"nosave"` + hostName string + domainName string + + // userns is the user namespace associated with the UTSNamespace. + // Privileged operations on this UTSNamespace must have appropriate + // capabilities in userns. + // + // userns is immutable. + userns *auth.UserNamespace +} + +// NewUTSNamespace creates a new UTS namespace. +func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { + return &UTSNamespace{ + hostName: hostName, + domainName: domainName, + userns: userns, + } +} + +// UTSNamespace returns the task's UTS namespace. +func (t *Task) UTSNamespace() *UTSNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.utsns +} + +// HostName returns the host name of this UTS namespace. +func (u *UTSNamespace) HostName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.hostName +} + +// SetHostName sets the host name of this UTS namespace. +func (u *UTSNamespace) SetHostName(host string) { + u.mu.Lock() + defer u.mu.Unlock() + u.hostName = host +} + +// DomainName returns the domain name of this UTS namespace. +func (u *UTSNamespace) DomainName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.domainName +} + +// SetDomainName sets the domain name of this UTS namespace. +func (u *UTSNamespace) SetDomainName(domain string) { + u.mu.Lock() + defer u.mu.Unlock() + u.domainName = domain +} + +// UserNamespace returns the user namespace associated with this UTS namespace. +func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return u.userns +} + +// Clone makes a copy of this UTS namespace, associating the given user +// namespace. +func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return &UTSNamespace{ + hostName: u.hostName, + domainName: u.domainName, + userns: userns, + } +} diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go new file mode 100644 index 000000000..0bacbea49 --- /dev/null +++ b/pkg/sentry/kernel/vdso.go @@ -0,0 +1,145 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// vdsoParams are the parameters exposed to the VDSO. +// +// They are exposed to the VDSO via a parameter page managed by VDSOParamPage, +// which also includes a sequence counter. +type vdsoParams struct { + monotonicReady uint64 + monotonicBaseCycles int64 + monotonicBaseRef int64 + monotonicFrequency uint64 + + realtimeReady uint64 + realtimeBaseCycles int64 + realtimeBaseRef int64 + realtimeFrequency uint64 +} + +// VDSOParamPage manages a VDSO parameter page. +// +// Its memory layout looks like: +// +// type page struct { +// // seq is a sequence counter that protects the fields below. 
+// seq uint64 +// vdsoParams +// } +// +// Everything in the struct is 8 bytes for easy alignment. +// +// It must be kept in sync with params in vdso/vdso_time.cc. +type VDSOParamPage struct { + // The parameter page is fr, allocated from platform.Memory(). + platform platform.Platform + fr platform.FileRange + + // seq is the current sequence count written to the page. + // + // A write is in progress if bit 1 of the counter is set. + // + // Timekeeper's updater goroutine may call Write before equality is + // checked in state_test_util tests, causing this field to change across + // save / restore. + seq uint64 +} + +// NewVDSOParamPage returns a VDSOParamPage. +// +// Preconditions: +// +// * fr is a single page allocated from platform.Memory(). VDSOParamPage does +// not take ownership of fr; it must remain allocated for the lifetime of the +// VDSOParamPage. +// +// * VDSOParamPage must be the only writer to fr. +// +// * platform.Memory().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(platform platform.Platform, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{platform: platform, fr: fr} +} + +// access returns a mapping of the param page. +func (v *VDSOParamPage) access() (safemem.Block, error) { + bs, err := v.platform.Memory().MapInternal(v.fr, usermem.ReadWrite) + if err != nil { + return safemem.Block{}, err + } + if bs.NumBlocks() != 1 { + panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks())) + } + return bs.Head(), nil +} + +// incrementSeq increments the sequence counter in the param page. +func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { + next := v.seq + 1 + old, err := safemem.SwapUint64(paramPage, next) + if err != nil { + return err + } + + if old != v.seq { + return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq) + } + + v.seq = next + return nil +} + +// Write updates the VDSO parameters. +// +// Write starts a write block, calls f to get the new parameters, writes +// out the new parameters, then ends the write block. +func (v *VDSOParamPage) Write(f func() vdsoParams) error { + paramPage, err := v.access() + if err != nil { + return err + } + + // Write begin. + next := v.seq + 1 + if next%2 != 1 { + panic("Out-of-order sequence count") + } + + err = v.incrementSeq(paramPage) + if err != nil { + return err + } + + // Get the new params. + p := f() + buf := binary.Marshal(nil, usermem.ByteOrder, p) + + // Skip the sequence counter. + if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { + panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err)) + } + + // Write end. + return v.incrementSeq(paramPage) +} diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go new file mode 100644 index 000000000..a9e84673f --- /dev/null +++ b/pkg/sentry/kernel/version.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Version defines the application-visible system version.
+type Version struct {
+	// Operating system name (e.g. "Linux").
+	Sysname string
+
+	// Operating system release (e.g. "3.11.10-amd64").
+	Release string
+
+	// Operating system version. On Linux this takes the shape
+	// "#VERSION CONFIG_FLAGS TIMESTAMP"
+	// where:
+	// - VERSION is a sequence counter incremented on every successful build
+	// - CONFIG_FLAGS is a space-separated list of major enabled kernel features
+	//   (e.g. "SMP" and "PREEMPT")
+	// - TIMESTAMP is the build timestamp as returned by `date`
+	Version string
+}
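Returning to the VDSOParamPage writer protocol above: the odd/even sequence counter is one half of a classic seqlock, and the VDSO-side reader (in vdso/vdso_time.cc) pairs with it by rejecting odd counts and retrying if the count changes across its read of the parameters. The following self-contained Go sketch pairs both halves, with plain atomics standing in for the shared page; it is only an illustration of the protocol, not the actual reader.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// page stands in for the shared parameter page: a sequence counter plus the
// protected values (here just one).
type page struct {
	seq   atomic.Uint64
	value atomic.Int64
}

// write mirrors VDSOParamPage.Write: bump seq to odd, publish, bump to even.
func (p *page) write(v int64) {
	p.seq.Add(1) // now odd: a write is in progress
	p.value.Store(v)
	p.seq.Add(1) // now even: the write is complete
}

// read mirrors what a seqlock reader must do: snapshot seq, reject odd
// values, and retry if seq changed while the values were being read.
func (p *page) read() int64 {
	for {
		before := p.seq.Load()
		if before%2 != 0 {
			continue // writer in progress
		}
		v := p.value.Load()
		if p.seq.Load() == before {
			return v
		}
	}
}

func main() {
	var p page
	p.write(42)
	fmt.Println(p.read()) // 42
}
```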