Diffstat (limited to 'pkg/sentry/kernel')
112 files changed, 25856 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD new file mode 100644 index 000000000..25fe1921b --- /dev/null +++ b/pkg/sentry/kernel/BUILD @@ -0,0 +1,241 @@ +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "pending_signals_list", + out = "pending_signals_list.go", + package = "kernel", + prefix = "pendingSignal", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*pendingSignal", + "Linker": "*pendingSignal", + }, +) + +go_template_instance( + name = "process_group_list", + out = "process_group_list.go", + package = "kernel", + prefix = "processGroup", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*ProcessGroup", + "Linker": "*ProcessGroup", + }, +) + +go_template_instance( + name = "seqatomic_taskgoroutineschedinfo", + out = "seqatomic_taskgoroutineschedinfo_unsafe.go", + package = "kernel", + suffix = "TaskGoroutineSchedInfo", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "TaskGoroutineSchedInfo", + }, +) + +go_template_instance( + name = "session_list", + out = "session_list.go", + package = "kernel", + prefix = "session", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Session", + "Linker": "*Session", + }, +) + +go_template_instance( + name = "task_list", + out = "task_list.go", + package = "kernel", + prefix = "task", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Task", + "Linker": "*Task", + }, +) + +go_template_instance( + name = "socket_list", + out = "socket_list.go", + package = "kernel", + prefix = "socket", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*SocketEntry", + "Linker": "*SocketEntry", + }, +) + +proto_library( + name = "uncaught_signal", + srcs = ["uncaught_signal.proto"], + visibility = ["//visibility:public"], + deps = ["//pkg/sentry/arch:registers_proto"], +) + +go_library( + name = "kernel", + srcs = [ + "abstract_socket_namespace.go", + "aio.go", + "context.go", + "fd_table.go", + "fd_table_unsafe.go", + "fs_context.go", + "ipc_namespace.go", + "kernel.go", + "kernel_opts.go", + "kernel_state.go", + "pending_signals.go", + "pending_signals_list.go", + "pending_signals_state.go", + "posixtimer.go", + "process_group_list.go", + "ptrace.go", + "ptrace_amd64.go", + "ptrace_arm64.go", + "rseq.go", + "seccomp.go", + "seqatomic_taskgoroutineschedinfo_unsafe.go", + "session_list.go", + "sessions.go", + "signal.go", + "signal_handlers.go", + "socket_list.go", + "syscalls.go", + "syscalls_state.go", + "syslog.go", + "task.go", + "task_acct.go", + "task_block.go", + "task_clone.go", + "task_context.go", + "task_exec.go", + "task_exit.go", + "task_futex.go", + "task_identity.go", + "task_list.go", + "task_log.go", + "task_net.go", + "task_run.go", + "task_sched.go", + "task_signals.go", + "task_start.go", + "task_stop.go", + "task_syscall.go", + "task_usermem.go", + "thread_group.go", + "threads.go", + "timekeeper.go", + "timekeeper_state.go", + "tty.go", + "uts_namespace.go", + "vdso.go", + "version.go", + ], + imports = [ + "gvisor.dev/gvisor/pkg/bpf", + "gvisor.dev/gvisor/pkg/sentry/device", + "gvisor.dev/gvisor/pkg/tcpip", + ], + visibility = ["//:sandbox"], + deps = [ + ":uncaught_signal_go_proto", + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/binary", + "//pkg/bits", + "//pkg/bpf", + "//pkg/context", + "//pkg/cpuid", + "//pkg/eventchannel", + "//pkg/fspath", + 
"//pkg/log", + "//pkg/metric", + "//pkg/refs", + "//pkg/safemem", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", + "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/pipefs", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/hostcpu", + "//pkg/sentry/inet", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/epoll", + "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/kernel/semaphore", + "//pkg/sentry/kernel/shm", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/socket/netlink/port", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/time", + "//pkg/sentry/unimpl", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/state", + "//pkg/state/statefile", + "//pkg/state/wire", + "//pkg/sync", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/stack", + "//pkg/usermem", + "//pkg/waiter", + "//tools/go_marshal/marshal", + ], +) + +go_test( + name = "kernel_test", + size = "small", + srcs = [ + "fd_table_test.go", + "table_test.go", + "task_test.go", + "timekeeper_test.go", + ], + library = ":kernel", + deps = [ + "//pkg/abi", + "//pkg/context", + "//pkg/sentry/arch", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/filetest", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/pgalloc", + "//pkg/sentry/time", + "//pkg/sentry/usage", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/kernel/README.md b/pkg/sentry/kernel/README.md new file mode 100644 index 000000000..427311be8 --- /dev/null +++ b/pkg/sentry/kernel/README.md @@ -0,0 +1,108 @@ +This package contains: + +- A (partial) emulation of the "core Linux kernel", which governs task + execution and scheduling, system call dispatch, and signal handling. See + below for details. + +- The top-level interface for the sentry's Linux kernel emulation in general, + used by the `main` function of all versions of the sentry. This interface + revolves around the `Env` type (defined in `kernel.go`). + +# Background + +In Linux, each schedulable context is referred to interchangeably as a "task" or +"thread". Tasks can be divided into userspace and kernel tasks. In the sentry, +scheduling is managed by the Go runtime, so each schedulable context is a +goroutine; only "userspace" (application) contexts are referred to as tasks, and +represented by Task objects. (From this point forward, "task" refers to the +sentry's notion of a task unless otherwise specified.) + +At a high level, Linux application threads can be thought of as repeating a "run +loop": + +- Some amount of application code is executed in userspace. + +- A trap (explicit syscall invocation, hardware interrupt or exception, etc.) + causes control flow to switch to the kernel. + +- Some amount of kernel code is executed in kernelspace, e.g. to handle the + cause of the trap. + +- The kernel "returns from the trap" into application code. + +Analogously, each task in the sentry is associated with a *task goroutine* that +executes that task's run loop (`Task.run` in `task_run.go`). 
However, the +sentry's task run loop differs in structure in order to support saving execution +state to, and resuming execution from, checkpoints. + +While in kernelspace, a Linux thread can be descheduled (cease execution) in a +variety of ways: + +- It can yield or be preempted, becoming temporarily descheduled but still + runnable. At present, the sentry delegates scheduling of runnable threads to + the Go runtime. + +- It can exit, becoming permanently descheduled. The sentry's equivalent is + returning from `Task.run`, terminating the task goroutine. + +- It can enter interruptible sleep, a state in which it can be woken by a + caller-defined wakeup or the receipt of a signal. In the sentry, + interruptible sleep (which is ambiguously referred to as *blocking*) is + implemented by making all events that can end blocking (including signal + notifications) communicated via Go channels and using `select` to multiplex + wakeup sources; see `task_block.go`. + +- It can enter uninterruptible sleep, a state in which it can only be woken by + a caller-defined wakeup. Killable sleep is a closely related variant in + which the task can also be woken by SIGKILL. (These definitions also include + Linux's "group-stopped" (`TASK_STOPPED`) and "ptrace-stopped" + (`TASK_TRACED`) states.) + +To maximize compatibility with Linux, sentry checkpointing appears as a spurious +signal-delivery interrupt on all tasks; interrupted system calls return `EINTR` +or are automatically restarted as usual. However, these semantics require that +uninterruptible and killable sleeps do not appear to be interrupted. In other +words, the state of the task, including its progress through the interrupted +operation, must be preserved by checkpointing. For many such sleeps, the wakeup +condition is application-controlled, making it infeasible to wait for the sleep +to end before checkpointing. Instead, we must support checkpointing progress +through sleeping operations. + +# Implementation + +We break the task's control flow graph into *states*, delimited by: + +1. Points where uninterruptible and killable sleeps may occur. For example, + there exists a state boundary between signal dequeueing and signal delivery + because there may be an intervening ptrace signal-delivery-stop. + +2. Points where sleep-induced branches may "rejoin" normal execution. For + example, the syscall exit state exists because it can be reached immediately + following a synchronous syscall, or after a task that is sleeping in + `execve()` or `vfork()` resumes execution. + +3. Points containing large branches. This is strictly for organizational + purposes. For example, the state that processes interrupt-signaled + conditions is kept separate from the main "app" state to reduce the size of + the latter. + +4. `SyscallReinvoke`, which does not correspond to anything in Linux, and + exists solely to serve the autosave feature. + + + +States before which a stop may occur are represented as implementations of the +`taskRunState` interface named `run(state)`, allowing them to be saved and +restored. States that cannot be immediately preceded by a stop are simply `Task` +methods named `do(state)`. + +Conditions that can require task goroutines to cease execution for unknown +lengths of time are called *stops*. Stops are divided into *internal stops*, +which are stops whose start and end conditions are implemented within the +sentry, and *external stops*, which are stops whose start and end conditions are +not known to the sentry. 
Hence all uninterruptible and killable sleeps are +internal stops, and the existence of a pending checkpoint operation is an +external stop. Internal stops are reified into instances of the `TaskStop` type, +while external stops are merely counted. The task run loop alternates between +checking for stops and advancing the task's state. This allows checkpointing to +hold tasks in a stopped state while waiting for all tasks in the system to stop. diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go new file mode 100644 index 000000000..920fe4329 --- /dev/null +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" +) + +// +stateify savable +type abstractEndpoint struct { + ep transport.BoundEndpoint + wr *refs.WeakRef + name string + ns *AbstractSocketNamespace +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (e *abstractEndpoint) WeakRefGone() { + e.ns.mu.Lock() + if e.ns.endpoints[e.name].ep == e.ep { + delete(e.ns.endpoints, e.name) + } + e.ns.mu.Unlock() +} + +// AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +// +// +stateify savable +type AbstractSocketNamespace struct { + mu sync.Mutex `state:"nosave"` + + // Keeps mapping from name to endpoint. + endpoints map[string]abstractEndpoint +} + +// NewAbstractSocketNamespace returns a new AbstractSocketNamespace. +func NewAbstractSocketNamespace() *AbstractSocketNamespace { + return &AbstractSocketNamespace{ + endpoints: make(map[string]abstractEndpoint), + } +} + +// A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on +// its backing object. +type boundEndpoint struct { + transport.BoundEndpoint + rc refs.RefCounter +} + +// Release implements transport.BoundEndpoint.Release. +func (e *boundEndpoint) Release() { + e.rc.DecRef() + e.BoundEndpoint.Release() +} + +// BoundEndpoint retrieves the endpoint bound to the given name. The return +// value is nil if no endpoint was bound. +func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndpoint { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + return nil + } + + rc := ep.wr.Get() + if rc == nil { + delete(a.endpoints, name) + return nil + } + + return &boundEndpoint{ep.ep, rc} +} + +// Bind binds the given socket. +// +// When the last reference managed by rc is dropped, ep may be removed from the +// namespace. 
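//
// A minimal usage sketch (the caller-side names here are assumed for
// illustration, not defined in this file): a UNIX socket bound to the
// abstract name "\0foo" would do roughly
//
//	if err := abstractNS.Bind("foo", boundEP, sockRef); err != nil {
//		return err // EADDRINUSE if a live endpoint already holds the name
//	}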
+func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { + a.mu.Lock() + defer a.mu.Unlock() + + if ep, ok := a.endpoints[name]; ok { + if rc := ep.wr.Get(); rc != nil { + rc.DecRef() + return syscall.EADDRINUSE + } + } + + ae := abstractEndpoint{ep: ep, name: name, ns: a} + ae.wr = refs.NewWeakRef(rc, &ae) + a.endpoints[name] = ae + return nil +} diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go new file mode 100644 index 000000000..0ac78c0b8 --- /dev/null +++ b/pkg/sentry/kernel/aio.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" +) + +// AIOCallback is an function that does asynchronous I/O on behalf of a task. +type AIOCallback func(context.Context) + +// QueueAIO queues an AIOCallback which will be run asynchronously. +func (t *Task) QueueAIO(cb AIOCallback) { + ctx := taskAsyncContext{t: t} + wg := &t.TaskSet().aioGoroutines + wg.Add(1) + go func() { + cb(ctx) + wg.Done() + }() +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + +// Value implements context.Context.Value. 
+func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD new file mode 100644 index 000000000..2bc49483a --- /dev/null +++ b/pkg/sentry/kernel/auth/BUILD @@ -0,0 +1,69 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "atomicptr_credentials", + out = "atomicptr_credentials_unsafe.go", + package = "auth", + suffix = "Credentials", + template = "//pkg/sync:generic_atomicptr", + types = { + "Value": "Credentials", + }, +) + +go_template_instance( + name = "id_map_range", + out = "id_map_range.go", + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_range", + types = { + "T": "uint32", + }, +) + +go_template_instance( + name = "id_map_set", + out = "id_map_set.go", + consts = { + "minDegree": "3", + }, + package = "auth", + prefix = "idMap", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint32", + "Range": "idMapRange", + "Value": "uint32", + "Functions": "idMapFunctions", + }, +) + +go_library( + name = "auth", + srcs = [ + "atomicptr_credentials_unsafe.go", + "auth.go", + "capability_set.go", + "context.go", + "credentials.go", + "id.go", + "id_map.go", + "id_map_functions.go", + "id_map_range.go", + "id_map_set.go", + "user_namespace.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/bits", + "//pkg/context", + "//pkg/log", + "//pkg/sync", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go new file mode 100644 index 000000000..847d121aa --- /dev/null +++ b/pkg/sentry/kernel/auth/auth.go @@ -0,0 +1,22 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package auth implements an access control model that is a subset of Linux's. +// +// The auth package supports two kinds of access controls: user/group IDs and +// capabilities. Each resource in the security model is associated with a user +// namespace; "privileged" operations check that the operator's credentials +// have the required user/group IDs or capabilities within the user namespace +// of accessed resources. +package auth diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go new file mode 100644 index 000000000..fc8c6745c --- /dev/null +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -0,0 +1,61 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" +) + +// A CapabilitySet is a set of capabilities implemented as a bitset. The zero +// value of CapabilitySet is a set containing no capabilities. +type CapabilitySet uint64 + +// AllCapabilities is a CapabilitySet containing all valid capabilities. +var AllCapabilities = CapabilitySetOf(linux.CAP_LAST_CAP+1) - 1 + +// CapabilitySetOf returns a CapabilitySet containing only the given +// capability. +func CapabilitySetOf(cp linux.Capability) CapabilitySet { + return CapabilitySet(bits.MaskOf64(int(cp))) +} + +// CapabilitySetOfMany returns a CapabilitySet containing the given capabilities. +func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet { + var cs uint64 + for _, cp := range cps { + cs |= bits.MaskOf64(int(cp)) + } + return CapabilitySet(cs) +} + +// TaskCapabilities represents all the capability sets for a task. Each of these +// sets is explained in greater detail in capabilities(7). +type TaskCapabilities struct { + // Permitted is a limiting superset for the effective capabilities that + // the thread may assume. + PermittedCaps CapabilitySet + // Inheritable is a set of capabilities preserved across an execve(2). + InheritableCaps CapabilitySet + // Effective is the set of capabilities used by the kernel to perform + // permission checks for the thread. + EffectiveCaps CapabilitySet + // Bounding is a limiting superset for the capabilities that a thread + // can add to its inheritable set using capset(2). + BoundingCaps CapabilitySet + // Ambient is a set of capabilities that are preserved across an + // execve(2) of a program that is not privileged. + AmbientCaps CapabilitySet +} diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go new file mode 100644 index 000000000..ef5723127 --- /dev/null +++ b/pkg/sentry/kernel/auth/context.go @@ -0,0 +1,36 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +// contextID is the auth package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCredentials is a Context.Value key for Credentials. + CtxCredentials contextID = iota +) + +// CredentialsFromContext returns a copy of the Credentials used by ctx, or a +// set of Credentials with no capabilities if ctx does not have Credentials. 
+func CredentialsFromContext(ctx context.Context) *Credentials { + if v := ctx.Value(CtxCredentials); v != nil { + return v.(*Credentials) + } + return NewAnonymousCredentials() +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go new file mode 100644 index 000000000..6862f2ef5 --- /dev/null +++ b/pkg/sentry/kernel/auth/credentials.go @@ -0,0 +1,262 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Credentials contains information required to authorize privileged operations +// in a user namespace. +// +// +stateify savable +type Credentials struct { + // Real/effective/saved user/group IDs in the root user namespace. None of + // these should ever be NoID. + RealKUID KUID + EffectiveKUID KUID + SavedKUID KUID + RealKGID KGID + EffectiveKGID KGID + SavedKGID KGID + + // Filesystem user/group IDs are not implemented. "... setfsuid() is + // nowadays unneeded and should be avoided in new applications (likewise + // for setfsgid(2))." - setfsuid(2) + + // Supplementary groups used by set/getgroups. + // + // ExtraKGIDs slices are immutable, allowing multiple Credentials with the + // same ExtraKGIDs to share the same slice. + ExtraKGIDs []KGID + + // The capability sets applicable to this set of credentials. + PermittedCaps CapabilitySet + InheritableCaps CapabilitySet + EffectiveCaps CapabilitySet + BoundingCaps CapabilitySet + // Ambient capabilities are not introduced until Linux 4.3. + + // KeepCaps is the flag for PR_SET_KEEPCAPS which allow capabilities to be + // maintained after a switch from root user to non-root user via setuid(). + KeepCaps bool + + // The user namespace associated with the owner of the credentials. + UserNamespace *UserNamespace +} + +// NewAnonymousCredentials returns a set of credentials with no capabilities in +// any user namespace. +func NewAnonymousCredentials() *Credentials { + // Create a new root user namespace. Since the new namespace's owner is + // KUID 0 and the returned credentials have non-zero KUID/KGID, the + // returned credentials do not have any capabilities in the new namespace. + // Since the new namespace is not part of any existing user namespace + // hierarchy, the returned credentials do not have any capabilities in any + // other namespace. + return &Credentials{ + RealKUID: NobodyKUID, + EffectiveKUID: NobodyKUID, + SavedKUID: NobodyKUID, + RealKGID: NobodyKGID, + EffectiveKGID: NobodyKGID, + SavedKGID: NobodyKGID, + UserNamespace: NewRootUserNamespace(), + } +} + +// NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. +// global root) in user namespace ns. +func NewRootCredentials(ns *UserNamespace) *Credentials { + // I can't find documentation for this anywhere, but it's correct for the + // inheritable capability set to be initially empty (the capabilities test + // checks for this property). 
+ return &Credentials{ + RealKUID: RootKUID, + EffectiveKUID: RootKUID, + SavedKUID: RootKUID, + RealKGID: RootKGID, + EffectiveKGID: RootKGID, + SavedKGID: RootKGID, + PermittedCaps: AllCapabilities, + EffectiveCaps: AllCapabilities, + BoundingCaps: AllCapabilities, + UserNamespace: ns, + } +} + +// NewUserCredentials returns a set of credentials based on the given UID, GIDs, +// and capabilities in a given namespace. If all arguments are their zero +// values, this returns the same credentials as NewRootCredentials. +func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { + creds := NewRootCredentials(ns) + + // Set the UID. + uid := kuid + creds.RealKUID = uid + creds.EffectiveKUID = uid + creds.SavedKUID = uid + + // Set GID. + gid := kgid + creds.RealKGID = gid + creds.EffectiveKGID = gid + creds.SavedKGID = gid + + // Set additional GIDs. + creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) + + // Set capabilities. + if capabilities != nil { + creds.PermittedCaps = capabilities.PermittedCaps + creds.EffectiveCaps = capabilities.EffectiveCaps + creds.BoundingCaps = capabilities.BoundingCaps + creds.InheritableCaps = capabilities.InheritableCaps + // TODO(nlacasse): Support ambient capabilities. + } else { + // If no capabilities are specified, grant capabilities consistent with + // setresuid + setresgid from NewRootCredentials to the given uid and + // gid. + if kuid == RootKUID { + creds.PermittedCaps = AllCapabilities + creds.EffectiveCaps = AllCapabilities + } else { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 + } + creds.BoundingCaps = AllCapabilities + } + + return creds +} + +// Fork generates an identical copy of a set of credentials. +func (c *Credentials) Fork() *Credentials { + nc := new(Credentials) + *nc = *c // Copy-by-value; this is legal for all fields. + return nc +} + +// InGroup returns true if c is in group kgid. Compare Linux's +// kernel/groups.c:in_group_p(). +func (c *Credentials) InGroup(kgid KGID) bool { + if c.EffectiveKGID == kgid { + return true + } + for _, extraKGID := range c.ExtraKGIDs { + if extraKGID == kgid { + return true + } + } + return false +} + +// HasCapabilityIn returns true if c has capability cp in ns. +func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { + for { + // "1. A process has a capability inside a user namespace if it is a member + // of that namespace and it has the capability in its effective capability + // set." - user_namespaces(7) + if c.UserNamespace == ns { + return CapabilitySetOf(cp)&c.EffectiveCaps != 0 + } + // "3. ... A process that resides in the parent of the user namespace and + // whose effective user ID matches the owner of the namespace has all + // capabilities in the namespace." + if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { + return true + } + // "2. If a process has a capability in a user namespace, then it has that + // capability in all child (and further removed descendant) namespaces as + // well." + if ns.parent == nil { + return false + } + ns = ns.parent + } +} + +// HasCapability returns true if c has capability cp in its user namespace. +func (c *Credentials) HasCapability(cp linux.Capability) bool { + return c.HasCapabilityIn(cp, c.UserNamespace) +} + +// UseUID checks that c can use uid in its user namespace, then translates it +// to the root user namespace. 
+// +// The checks UseUID does are common, but you should verify that it's doing +// exactly what you want. +func (c *Credentials) UseUID(uid UID) (KUID, error) { + // uid must be mapped. + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return NoID, syserror.EINVAL + } + // If c has CAP_SETUID, then it can use any UID in its user namespace. + if c.HasCapability(linux.CAP_SETUID) { + return kuid, nil + } + // Otherwise, c must already have the UID as its real, effective, or saved + // set-user-ID. + if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { + return kuid, nil + } + return NoID, syserror.EPERM +} + +// UseGID checks that c can use gid in its user namespace, then translates it +// to the root user namespace. +func (c *Credentials) UseGID(gid GID) (KGID, error) { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return NoID, syserror.EINVAL + } + if c.HasCapability(linux.CAP_SETGID) { + return kgid, nil + } + if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { + return kgid, nil + } + return NoID, syserror.EPERM +} + +// SetUID translates the provided uid to the root user namespace and updates c's +// uids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetUID(uid UID) error { + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + c.RealKUID = kuid + c.EffectiveKUID = kuid + c.SavedKUID = kuid + return nil +} + +// SetGID translates the provided gid to the root user namespace and updates c's +// gids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetGID(gid GID) error { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + c.RealKGID = kgid + c.EffectiveKGID = kgid + c.SavedKGID = kgid + return nil +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go new file mode 100644 index 000000000..0a58ba17c --- /dev/null +++ b/pkg/sentry/kernel/auth/id.go @@ -0,0 +1,121 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" +) + +// UID is a user ID in an unspecified user namespace. +type UID uint32 + +// GID is a group ID in an unspecified user namespace. +type GID uint32 + +// In the root user namespace, user/group IDs have a 1-to-1 relationship with +// the users/groups they represent. In other user namespaces, this is not the +// case; for example, two different unmapped users may both "have" the overflow +// UID. This means that it is generally only valid to compare user and group +// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such +// IDs to emphasize this distinction. ("k" is for "key", as in "unique key". +// Linux also uses the prefix "k", but I think they mean "kernel".) 
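//
// For example (mappings assumed for illustration): in a child namespace that
// maps host UID 1000 to in-namespace UID 0, that user is KUID(1000)
// everywhere, but UID(0) inside the child namespace and UID(1000) in the
// root namespace; a KUID with no mapping in the child has no valid UID there
// and is usually presented as the overflow UID 65534.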
+ +// KUID is a user ID in the root user namespace. +type KUID uint32 + +// KGID is a group ID in the root user namespace. +type KGID uint32 + +const ( + // NoID is uint32(-1). -1 is consistently used as a special value, in Linux + // and by extension in the auth package, to mean "no ID": + // + // - ID mapping returns -1 if the ID is not mapped. + // + // - Most set*id() syscalls accept -1 to mean "do not change this ID". + NoID = math.MaxUint32 + + // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The + // "overflow UID" is usually [1] used when translating a user ID between + // namespaces fails because the ID is not mapped. (We don't implement this + // file, so the overflow UID is constant.) + // + // [1] "There is one notable case where unmapped user and group IDs are not + // converted to the corresponding overflow ID value. When viewing a uid_map + // or gid_map file in which there is no mapping for the second field, that + // field is displayed as 4294967295 (-1 as an unsigned integer);" - + // user_namespaces(7) + OverflowUID = UID(65534) + OverflowGID = GID(65534) + + // NobodyKUID is the user ID usually reserved for the least privileged user + // "nobody". + NobodyKUID = KUID(65534) + NobodyKGID = KGID(65534) + + // RootKUID is the user ID usually used for the most privileged user "root". + RootKUID = KUID(0) + RootKGID = KGID(0) + RootUID = UID(0) + RootGID = GID(0) +) + +// Ok returns true if uid is not -1. +func (uid UID) Ok() bool { + return uid != NoID +} + +// Ok returns true if gid is not -1. +func (gid GID) Ok() bool { + return gid != NoID +} + +// Ok returns true if kuid is not -1. +func (kuid KUID) Ok() bool { + return kuid != NoID +} + +// Ok returns true if kgid is not -1. +func (kgid KGID) Ok() bool { + return kgid != NoID +} + +// OrOverflow returns uid if it is valid and the overflow UID otherwise. +func (uid UID) OrOverflow() UID { + if uid.Ok() { + return uid + } + return OverflowUID +} + +// OrOverflow returns gid if it is valid and the overflow GID otherwise. +func (gid GID) OrOverflow() GID { + if gid.Ok() { + return gid + } + return OverflowGID +} + +// In translates kuid into user namespace ns. If kuid is not mapped in ns, In +// returns NoID. +func (kuid KUID) In(ns *UserNamespace) UID { + return ns.MapFromKUID(kuid) +} + +// In translates kgid into user namespace ns. If kgid is not mapped in ns, In +// returns NoID. +func (kgid KGID) In(ns *UserNamespace) GID { + return ns.MapFromKGID(kgid) +} diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go new file mode 100644 index 000000000..28cbe159d --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map.go @@ -0,0 +1,285 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. 
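//
// Translation composes through the namespace chain: the root-namespace ID is
// first mapped by the parent namespace and the result is then mapped by ns
// itself. For example (mappings assumed), KUID 1000 might map to UID 100 in
// the parent and then to UID 0 in ns; if any step has no mapping, the result
// is NoID.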
+func (ns *UserNamespace) MapFromKUID(kuid KUID) UID { + if ns.parent == nil { + return UID(kuid) + } + return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid)))) +} + +// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns. +func (ns *UserNamespace) MapFromKGID(kgid KGID) GID { + if ns.parent == nil { + return GID(kgid) + } + return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid)))) +} + +// MapToKUID translates uid, a UID in ns, to a UID in the root namespace. +func (ns *UserNamespace) MapToKUID(uid UID) KUID { + if ns.parent == nil { + return KUID(uid) + } + return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))) +} + +// MapToKGID translates gid, a GID in ns, to a GID in the root namespace. +func (ns *UserNamespace) MapToKGID(gid GID) KGID { + if ns.parent == nil { + return KGID(gid) + } + return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))) +} + +func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 { + if id == NoID { + return NoID + } + ns.mu.Lock() + defer ns.mu.Unlock() + if it := m.FindSegment(id); it.Ok() { + return it.Value() + (id - it.Start()) + } + return NoID +} + +// allIDsMapped returns true if all IDs in the range [start, end) are mapped in +// m. +// +// Preconditions: end >= start. +func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { + ns.mu.Lock() + defer ns.mu.Unlock() + return m.SpanRange(idMapRange{start, end}) == end-start +} + +// An IDMapEntry represents a mapping from a range of contiguous IDs in a user +// namespace to an equally-sized range of contiguous IDs in the namespace's +// parent. +// +// +stateify savable +type IDMapEntry struct { + // FirstID is the first ID in the range in the namespace. + FirstID uint32 + + // FirstParentID is the first ID in the range in the parent namespace. + FirstParentID uint32 + + // Length is the number of IDs in the range. + Length uint32 +} + +// SetUIDMap instructs ns to translate UIDs as specified by entries. +// +// Note: SetUIDMap does not place an upper bound on the number of entries, but +// Linux does. This restriction is implemented in SetUIDMap's caller, the +// implementation of /proc/[pid]/uid_map. +func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + // "After the creation of a new user namespace, the uid_map file of *one* + // of the processes in the namespace may be written to *once* to define the + // mapping of user IDs in the new user namespace. An attempt to write more + // than once to a uid_map file in a user namespace fails with the error + // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) + if !ns.uidMapFromParent.IsEmpty() { + return syserror.EPERM + } + // "At least one line must be written to the file." + if len(entries) == 0 { + return syserror.EINVAL + } + // """ + // In order for a process to write to the /proc/[pid]/uid_map + // (/proc/[pid]/gid_map) file, all of the following requirements must be + // met: + // + // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability + // in the user namespace of the process pid. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { + return syserror.EPERM + } + // "2. The writing process must either be in the user namespace of the process + // pid or be in the parent user namespace of the process pid." 
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + // """ + // 3. (see trySetUIDMap) + // + // 4. One of the following two cases applies: + // + // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability + // in the parent user namespace. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { + // """ + // * Or otherwise all of the following restrictions apply: + // + // + The data written to uid_map (gid_map) must consist of a single line + // that maps the writing process' effective user ID (group ID) in the + // parent user namespace to a user ID (group ID) in the user namespace. + // """ + if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { + return syserror.EPERM + } + // """ + // + The writing process must have the same effective user ID as the + // process that created the user namespace. + // """ + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + } + // trySetUIDMap leaves data in maps if it fails. + if err := ns.trySetUIDMap(entries); err != nil { + ns.uidMapFromParent.RemoveAll() + ns.uidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { + for _, e := range entries { + // Determine upper bounds and check for overflow. This implicitly + // checks for NoID. + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + // "3. The mapped user IDs (group IDs) must in turn have a mapping in + // the parent user namespace." + // Only the root namespace has a nil parent, and root is assigned + // mappings when it's created, so SetUIDMap would have returned EPERM + // without reaching this point if ns is root. + if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + // If either of these Adds fail, we have an overlapping range. + if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// SetGIDMap instructs ns to translate GIDs as specified by entries. +func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + if !ns.gidMapFromParent.IsEmpty() { + return syserror.EPERM + } + if len(entries) == 0 { + return syserror.EINVAL + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { + return syserror.EPERM + } + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { + if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { + return syserror.EPERM + } + // It's correct for this to still be UID. + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + // "In the case of gid_map, use of the setgroups(2) system call must + // first be denied by writing "deny" to the /proc/[pid]/setgroups file + // (see below) before writing to gid_map." (This file isn't implemented + // in the version of Linux we're emulating; see comment in + // UserNamespace.) 
+ } + if err := ns.trySetGIDMap(entries); err != nil { + ns.gidMapFromParent.RemoveAll() + ns.gidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { + for _, e := range entries { + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// UIDMap returns the user ID mappings configured for ns. If no mappings +// have been configured, UIDMap returns nil. +func (ns *UserNamespace) UIDMap() []IDMapEntry { + return ns.getIDMap(&ns.uidMapToParent) +} + +// GIDMap returns the group ID mappings configured for ns. If no mappings +// have been configured, GIDMap returns nil. +func (ns *UserNamespace) GIDMap() []IDMapEntry { + return ns.getIDMap(&ns.gidMapToParent) +} + +func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { + ns.mu.Lock() + defer ns.mu.Unlock() + var entries []IDMapEntry + for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { + entries = append(entries, IDMapEntry{ + FirstID: it.Start(), + FirstParentID: it.Value(), + Length: it.Range().Length(), + }) + } + return entries +} diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go new file mode 100644 index 000000000..432dbfb6d --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -0,0 +1,45 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +// idMapFunctions "implements" generic interface segment.Functions for +// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one +// user namespace to non-overlapping ranges of contiguous IDs in another user +// namespace. Each such ID mapping is implemented as a range-to-value mapping +// in the set such that [range.Start(), range.End()) => [value, value + +// range.Length()). +type idMapFunctions struct{} + +func (idMapFunctions) MinKey() uint32 { + return 0 +} + +func (idMapFunctions) MaxKey() uint32 { + return NoID +} + +func (idMapFunctions) ClearValue(*uint32) {} + +func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) { + // Mapped ranges have to be contiguous. 
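	// For example (values assumed): the adjacent segments [0, 5) => 100 and
	// [5, 8) => 105 merge into the single segment [0, 8) => 100, while
	// [0, 5) => 100 and [5, 8) => 200 do not, since the combined range would
	// no longer be a single ID-plus-constant-offset translation.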
+ if val1+r1.Length() != val2 { + return 0, false + } + return val1, true +} + +func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) { + return val, val + (split - r.Start) +} diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go new file mode 100644 index 000000000..9dd52c860 --- /dev/null +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -0,0 +1,129 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A UserNamespace represents a user namespace. See user_namespaces(7) for +// details. +// +// +stateify savable +type UserNamespace struct { + // parent is this namespace's parent. If this is the root namespace, parent + // is nil. The parent pointer is immutable. + parent *UserNamespace + + // owner is the effective UID of the namespace's creator in the root + // namespace. owner is immutable. + owner KUID + + // mu protects the following fields. + // + // If mu will be locked in multiple UserNamespaces, it must be locked in + // descendant namespaces before ancestors. + mu sync.Mutex `state:"nosave"` + + // Mappings of user/group IDs between this namespace and its parent. + // + // All ID maps, once set, cannot be changed. This means that successful + // UID/GID translations cannot be racy. + uidMapFromParent idMapSet + uidMapToParent idMapSet + gidMapFromParent idMapSet + gidMapToParent idMapSet + + // TODO(b/27454212): Support disabling setgroups(2). +} + +// NewRootUserNamespace returns a UserNamespace that is appropriate for a +// system's root user namespace. +func NewRootUserNamespace() *UserNamespace { + var ns UserNamespace + // """ + // The initial user namespace has no parent namespace, but, for + // consistency, the kernel provides dummy user and group ID mapping files + // for this namespace. Looking at the uid_map file (gid_map is the same) + // from a shell in the initial namespace shows: + // + // $ cat /proc/$$/uid_map + // 0 0 4294967295 + // """ - user_namespaces(7) + for _, m := range []*idMapSet{ + &ns.uidMapFromParent, + &ns.uidMapToParent, + &ns.gidMapFromParent, + &ns.gidMapToParent, + } { + if !m.Add(idMapRange{0, math.MaxUint32}, 0) { + panic("Failed to insert into empty ID map") + } + } + return &ns +} + +// Root returns the root of the user namespace tree containing ns. +func (ns *UserNamespace) Root() *UserNamespace { + for ns.parent != nil { + ns = ns.parent + } + return ns +} + +// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user +// namespaces." - user_namespaces(7) +const maxUserNamespaceDepth = 32 + +func (ns *UserNamespace) depth() int { + var i int + for ns != nil { + i++ + ns = ns.parent + } + return i +} + +// NewChildUserNamespace returns a new user namespace created by a caller with +// credentials c. 
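//
// A sketch of a typical caller (e.g. handling CLONE_NEWUSER; the surrounding
// control flow is assumed, not part of this package):
//
//	newNS, err := creds.NewChildUserNamespace()
//	if err != nil {
//		return err // EUSERS if nested too deeply, EPERM if creds are unmapped
//	}
//	// The child namespace starts with no ID mappings; they are established
//	// later via SetUIDMap/SetGIDMap (i.e. writes to /proc/[pid]/uid_map and
//	// gid_map).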
+func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { + if c.UserNamespace.depth() >= maxUserNamespaceDepth { + // "... Calls to unshare(2) or clone(2) that would cause this limit to + // be exceeded fail with the error EUSERS." - user_namespaces(7) + return nil, syserror.EUSERS + } + // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective + // user ID or the effective group ID of the caller does not have a mapping + // in the parent namespace (see user_namespaces(7))." - clone(2) + // "CLONE_NEWUSER requires that the user ID and group ID of the calling + // process are mapped to user IDs and group IDs in the user namespace of + // the calling process at the time of the call." - unshare(2) + if !c.EffectiveKUID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + if !c.EffectiveKGID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + return &UserNamespace{ + parent: c.UserNamespace, + owner: c.EffectiveKUID, + // "When a user namespace is created, it starts without a mapping of + // user IDs (group IDs) to the parent user namespace." - + // user_namespaces(7) + }, nil +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go new file mode 100644 index 000000000..dd5f0f5fa --- /dev/null +++ b/pkg/sentry/kernel/context.go @@ -0,0 +1,114 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCanTrace is a Context.Value key for a function with the same + // signature and semantics as kernel.Task.CanTrace. + CtxCanTrace contextID = iota + + // CtxKernel is a Context.Value key for a Kernel. + CtxKernel + + // CtxPIDNamespace is a Context.Value key for a PIDNamespace. + CtxPIDNamespace + + // CtxTask is a Context.Value key for a Task. + CtxTask + + // CtxUTSNamespace is a Context.Value key for a UTSNamespace. + CtxUTSNamespace + + // CtxIPCNamespace is a Context.Value key for a IPCNamespace. + CtxIPCNamespace +) + +// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense +// as kernel.Task.CanTrace. +func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool { + if v := ctx.Value(CtxCanTrace); v != nil { + return v.(func(*Task, bool) bool)(t, attach) + } + return false +} + +// KernelFromContext returns the Kernel in which ctx is executing, or nil if +// there is no such Kernel. +func KernelFromContext(ctx context.Context) *Kernel { + if v := ctx.Value(CtxKernel); v != nil { + return v.(*Kernel) + } + return nil +} + +// PIDNamespaceFromContext returns the PID namespace in which ctx is executing, +// or nil if there is no such PID namespace. 
+func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace { + if v := ctx.Value(CtxPIDNamespace); v != nil { + return v.(*PIDNamespace) + } + return nil +} + +// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, +// or nil if there is no such UTS namespace. +func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { + if v := ctx.Value(CtxUTSNamespace); v != nil { + return v.(*UTSNamespace) + } + return nil +} + +// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, +// or nil if there is no such IPC namespace. +func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { + if v := ctx.Value(CtxIPCNamespace); v != nil { + return v.(*IPCNamespace) + } + return nil +} + +// TaskFromContext returns the Task associated with ctx, or nil if there is no +// such Task. +func TaskFromContext(ctx context.Context) *Task { + if v := ctx.Value(CtxTask); v != nil { + return v.(*Task) + } + return nil +} + +// Deadline implements context.Context.Deadline. +func (*Task) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done implements context.Context.Done. +func (*Task) Done() <-chan struct{} { + return nil +} + +// Err implements context.Context.Err. +func (*Task) Err() error { + return nil +} diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD new file mode 100644 index 000000000..9d26392c0 --- /dev/null +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "contexttest", + testonly = 1, + srcs = ["contexttest.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sentry/kernel", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + ], +) diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go new file mode 100644 index 000000000..22c340e56 --- /dev/null +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -0,0 +1,40 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package contexttest provides a test context.Context which includes +// a dummy kernel pointing to a valid platform. +package contexttest + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" +) + +// Context returns a Context that may be used in tests. Uses ptrace as the +// platform.Platform, and provides a stub kernel that only serves to point to +// the platform. 
+func Context(tb testing.TB) context.Context { + ctx := contexttest.Context(tb) + k := &kernel.Kernel{ + Platform: platform.FromContext(ctx), + } + k.SetMemoryFile(pgalloc.MemoryFileFromContext(ctx)) + ctx.(*contexttest.TestContext).RegisterValue(kernel.CtxKernel, k) + return ctx +} diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD new file mode 100644 index 000000000..75eedd5a2 --- /dev/null +++ b/pkg/sentry/kernel/epoll/BUILD @@ -0,0 +1,51 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "epoll_list", + out = "epoll_list.go", + package = "epoll", + prefix = "pollEntry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*pollEntry", + "Linker": "*pollEntry", + }, +) + +go_library( + name = "epoll", + srcs = [ + "epoll.go", + "epoll_list.go", + "epoll_state.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/refs", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sync", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "epoll_test", + size = "small", + srcs = [ + "epoll_test.go", + ], + library = ":epoll", + deps = [ + "//pkg/sentry/contexttest", + "//pkg/sentry/fs/filetest", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go new file mode 100644 index 000000000..4c0f1e41f --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -0,0 +1,462 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package epoll provides an implementation of Linux's IO event notification +// facility. See epoll(7) for more details. +package epoll + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EntryFlags is a bitmask that holds an entry's flags. +type EntryFlags int + +// Valid entry flags. +const ( + OneShot EntryFlags = 1 << iota + EdgeTriggered +) + +// FileIdentifier identifies a file. We cannot use just the FD because it could +// potentially be reassigned. We also cannot use just the file pointer because +// it is possible to have multiple entries for the same file object as long as +// they are created with different FDs (i.e., the FDs point to the same file). +// +// +stateify savable +type FileIdentifier struct { + File *fs.File `state:"wait"` + Fd int32 +} + +// pollEntry holds all the state associated with an event poll entry, that is, +// a file being observed by an event poll object. 
+// +// +stateify savable +type pollEntry struct { + pollEntryEntry + file *refs.WeakRef `state:"manual"` + id FileIdentifier `state:"wait"` + userData [2]int32 + waiter waiter.Entry `state:"manual"` + mask waiter.EventMask + flags EntryFlags + + epoll *EventPoll + + // We cannot save the current list pointer as it points into EventPoll + // struct, while state framework currently does not support such + // in-struct pointers. Instead, EventPoll will properly set this field + // in its loading logic. + curList *pollEntryList `state:"nosave"` +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +// weakReferenceGone is called when the file in the weak reference is destroyed. +// The poll entry is removed in response to this. +func (p *pollEntry) WeakRefGone() { + p.epoll.RemoveEntry(p.id) +} + +// EventPoll holds all the state associated with an event poll object, that is, +// collection of files to observe and their current state. +// +// +stateify savable +type EventPoll struct { + fsutil.FilePipeSeek `state:"zerovalue"` + fsutil.FileNotDirReaddir `state:"zerovalue"` + fsutil.FileNoFsync `state:"zerovalue"` + fsutil.FileNoopFlush `state:"zerovalue"` + fsutil.FileNoIoctl `state:"zerovalue"` + fsutil.FileNoMMap `state:"zerovalue"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // Wait queue is used to notify interested parties when the event poll + // object itself becomes readable or writable. + waiter.Queue `state:"zerovalue"` + + // files is the map of all the files currently being observed, it is + // protected by mu. + mu sync.Mutex `state:"nosave"` + files map[FileIdentifier]*pollEntry + + // listsMu protects manipulation of the lists below. It needs to be a + // different lock to avoid circular lock acquisition order involving + // the wait queue mutexes and mu. The full order is mu, observed file + // wait queue mutex, then listsMu; this allows listsMu to be acquired + // when (*pollEntry).Callback is called. + // + // An entry is always in one of the following lists: + // readyList -- when there's a chance that it's ready to have + // events delivered to epoll waiters. Given that being + // ready is a transient state, the Readiness() and + // readEvents() functions always call the entry's file + // Readiness() function to confirm it's ready. + // waitingList -- when there's no chance that the entry is ready, + // so it's waiting for the (*pollEntry).Callback to be called + // on it before it gets moved to the readyList. + // disabledList -- when the entry is disabled. This happens when + // a one-shot entry gets delivered via readEvents(). + listsMu sync.Mutex `state:"nosave"` + readyList pollEntryList + waitingList pollEntryList + disabledList pollEntryList +} + +// cycleMu is used to serialize all the cycle checks. This is only used when +// an event poll file is added as an entry to another event poll. Such checks +// are serialized to avoid lock acquisition order inversion: if a thread is +// adding A to B, and another thread is adding B to A, each would acquire A's +// and B's mutexes in reverse order, and could cause deadlocks. Having this +// lock prevents this by allowing only one check at a time to happen. +// +// We do the cycle check to prevent callers from introducing potentially +// infinite recursions. If a caller were to add A to B and then B to A, for +// event poll A to know if it's readable, it would need to check event poll B, +// which in turn would need event poll A and so on indefinitely. 
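// exampleCycle is a hypothetical sketch (not part of this change) of the
// scenario described above: once event poll A observes event poll B, making
// B observe A would close the loop, so the second AddEntry below is expected
// to fail with ELOOP (see AddEntry's cycle check further down). The FD
// numbers are arbitrary.
func exampleCycle(ctx context.Context) error {
	fileA, fileB := NewEventPoll(ctx), NewEventPoll(ctx)
	a := fileA.FileOperations.(*EventPoll)
	b := fileB.FileOperations.(*EventPoll)
	// A observes B.
	if err := a.AddEntry(FileIdentifier{File: fileB, Fd: 4}, 0, waiter.EventIn, [2]int32{}); err != nil {
		return err
	}
	// B observing A would create a cycle; this returns syscall.ELOOP.
	return b.AddEntry(FileIdentifier{File: fileA, Fd: 5}, 0, waiter.EventIn, [2]int32{})
}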
+var cycleMu sync.Mutex + +// NewEventPoll allocates and initializes a new event poll object. +func NewEventPoll(ctx context.Context) *fs.File { + // name matches fs/eventpoll.c:epoll_create1. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) + // Release the initial dirent reference after NewFile takes a reference. + defer dirent.DecRef() + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ + files: make(map[FileIdentifier]*pollEntry), + }) +} + +// Release implements fs.FileOperations.Release. +func (e *EventPoll) Release() { + // We need to take the lock now because files may be attempting to + // remove entries in parallel if they get destroyed. + e.mu.Lock() + defer e.mu.Unlock() + + // Go through all entries and clean up. + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + entry.file.Drop() + } + e.files = nil +} + +// Read implements fs.FileOperations.Read. +func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// Write implements fs.FileOperations.Write. +func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// eventsAvailable determines if 'e' has events available for delivery. +func (e *EventPoll) eventsAvailable() bool { + e.listsMu.Lock() + + for it := e.readyList.Front(); it != nil; { + entry := it + it = it.Next() + + // If the entry is ready, we know 'e' has at least one entry + // ready for delivery. + ready := entry.id.File.Readiness(entry.mask) + if ready != 0 { + e.listsMu.Unlock() + return true + } + + // Entry is not ready, so move it to waiting list. + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } + + e.listsMu.Unlock() + + return false +} + +// Readiness determines if the event poll object is currently readable (i.e., +// if there are pending events for delivery). +func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + if (mask&waiter.EventIn) != 0 && e.eventsAvailable() { + ready |= waiter.EventIn + } + + return ready +} + +// ReadEvents returns up to max available events. +func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { + var local pollEntryList + var ret []linux.EpollEvent + + e.listsMu.Lock() + + // Go through all entries we believe may be ready. + for it := e.readyList.Front(); it != nil && len(ret) < max; { + entry := it + it = it.Next() + + // Check the entry's readiness. It it's not really ready, we + // just put it back in the waiting list and move on to the next + // entry. + ready := entry.id.File.Readiness(entry.mask) & entry.mask + if ready == 0 { + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + + continue + } + + // Add event to the array that will be returned to caller. + ret = append(ret, linux.EpollEvent{ + Events: uint32(ready), + Data: entry.userData, + }) + + // The entry is consumed, so we must move it to the disabled + // list in case it's one-shot, or back to the wait list if it's + // edge-triggered. If it's neither, we leave it in the ready + // list so that its readiness can be checked the next time + // around; however, we must move it to the end of the list so + // that other events can be delivered as well. 
+ e.readyList.Remove(entry) + if entry.flags&OneShot != 0 { + e.disabledList.PushBack(entry) + entry.curList = &e.disabledList + } else if entry.flags&EdgeTriggered != 0 { + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } else { + local.PushBack(entry) + } + } + + e.readyList.PushBackList(&local) + + e.listsMu.Unlock() + + return ret +} + +// Callback implements waiter.EntryCallback.Callback. +// +// Callback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +func (p *pollEntry) Callback(*waiter.Entry) { + e := p.epoll + + e.listsMu.Lock() + + if p.curList == &e.waitingList { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList + e.listsMu.Unlock() + + e.Notify(waiter.EventIn) + return + } + + e.listsMu.Unlock() +} + +// initEntryReadiness initializes the entry's state with regards to its +// readiness by placing it in the appropriate list and registering for +// notifications. +func (e *EventPoll) initEntryReadiness(entry *pollEntry) { + // A new entry starts off in the waiting list. + e.listsMu.Lock() + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + e.listsMu.Unlock() + + // Register for event notifications. + f := entry.id.File + f.EventRegister(&entry.waiter, entry.mask) + + // Check if the file happens to already be in a ready state. + ready := f.Readiness(entry.mask) & entry.mask + if ready != 0 { + entry.Callback(&entry.waiter) + } +} + +// observes checks if event poll object e is directly or indirectly observing +// event poll object ep. It uses a bounded recursive depth-first search. +func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool { + // If we reached the maximum depth, we'll consider that we found it + // because we don't want to allow chains that are too long. + if depthLeft <= 0 { + return true + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Go through each observed file and check if it is or observes ep. + for id := range e.files { + f, ok := id.File.FileOperations.(*EventPoll) + if !ok { + continue + } + + if f == ep || f.observes(ep, depthLeft-1) { + return true + } + } + + return false +} + +// AddEntry adds a new file to the collection of files observed by e. +func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + // Acquire cycle check lock if another event poll is being added. + ep, ok := id.File.FileOperations.(*EventPoll) + if ok { + cycleMu.Lock() + defer cycleMu.Unlock() + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file already has an entry. + if _, ok := e.files[id]; ok { + return syscall.EEXIST + } + + // Check if a cycle would be created. We use 4 as the limit because + // that's the value used by linux and we want to emulate it. + if ep != nil { + if e == ep { + return syscall.EINVAL + } + + if ep.observes(e, 4) { + return syscall.ELOOP + } + } + + // Create new entry and add it to map. + // + // N.B. Even though we are creating a weak reference here, we know it + // won't trigger a callback because we hold a reference to the file + // throughout the execution of this function. + entry := &pollEntry{ + id: id, + userData: data, + epoll: e, + flags: flags, + mask: mask, + } + entry.waiter.Callback = entry + e.files[id] = entry + entry.file = refs.NewWeakRef(id.File, entry) + + // Initialize the readiness state of the new entry. 
+ e.initEntryReadiness(entry) + + return nil +} + +// UpdateEntry updates the flags, mask and user data associated with a file that +// is already part of the collection of observed files. +func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister the old mask and remove entry from the list it's in, so + // (*pollEntry).Callback is guaranteed to not be called on this entry anymore. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove entry from whatever list it's in. This ensure that no other + // threads have access to this entry as the only way left to find it + // is via e.files, but we hold e.mu, which prevents that. + e.listsMu.Lock() + entry.curList.Remove(entry) + e.listsMu.Unlock() + + // Initialize new readiness state. + entry.flags = flags + entry.mask = mask + entry.userData = data + e.initEntryReadiness(entry) + + return nil +} + +// RemoveEntry a files from the collection of observed files. +func (e *EventPoll) RemoveEntry(id FileIdentifier) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister from file first so that no concurrent attempts will be + // made to manipulate the file. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove from the current list. + e.listsMu.Lock() + entry.curList.Remove(entry) + entry.curList = nil + e.listsMu.Unlock() + + // Remove file from map, and drop weak reference. + delete(e.files, id) + entry.file.Drop() + + return nil +} + +// UnregisterEpollWaiters removes the epoll waiter objects from the waiting +// queues. This is different from Release() as the file is not dereferenced. +func (e *EventPoll) UnregisterEpollWaiters() { + e.mu.Lock() + defer e.mu.Unlock() + + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go new file mode 100644 index 000000000..7c61e0258 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -0,0 +1,51 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/waiter" +) + +// afterLoad is invoked by stateify. +func (p *pollEntry) afterLoad() { + p.waiter.Callback = p + p.file = refs.NewWeakRef(p.id.File, p) + p.id.File.EventRegister(&p.waiter, p.mask) +} + +// afterLoad is invoked by stateify. 
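// exampleWatch is a hypothetical sketch (not part of this change) of the
// basic lifecycle implemented in epoll.go: register a file, drain whatever is
// ready, then drop the entry. "file" stands in for any *fs.File being
// observed; the FD number is arbitrary.
func exampleWatch(ctx context.Context, file *fs.File) []linux.EpollEvent {
	ep := NewEventPoll(ctx).FileOperations.(*EventPoll)
	id := FileIdentifier{File: file, Fd: 3}
	if err := ep.AddEntry(id, EdgeTriggered, waiter.EventIn, [2]int32{}); err != nil {
		return nil
	}
	events := ep.ReadEvents(16) // Deliver at most 16 events per call.
	ep.RemoveEntry(id)
	return events
}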
+func (e *EventPoll) afterLoad() { + e.listsMu.Lock() + defer e.listsMu.Unlock() + + for _, ls := range []*pollEntryList{&e.waitingList, &e.readyList, &e.disabledList} { + for it := ls.Front(); it != nil; it = it.Next() { + it.curList = ls + } + } + + for it := e.waitingList.Front(); it != nil; { + entry := it + it = it.Next() + + if entry.id.File.Readiness(entry.mask) != 0 { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList + e.Notify(waiter.EventIn) + } + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go new file mode 100644 index 000000000..22630e9c5 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -0,0 +1,54 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs/filetest" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestFileDestroyed(t *testing.T) { + f := filetest.NewTestFile(t) + id := FileIdentifier{f, 12} + + efile := NewEventPoll(contexttest.Context(t)) + e := efile.FileOperations.(*EventPoll) + if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { + t.Fatalf("addEntry failed: %v", err) + } + + // Check that we get an event reported twice in a row. + evt := e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + evt = e.ReadEvents(1) + if len(evt) != 1 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 1, len(evt)) + } + + // Destroy the file. Check that we get no more events. + f.DecRef() + + evt = e.ReadEvents(1) + if len(evt) != 0 { + t.Fatalf("Unexpected number of ready events: want %v, got %v", 0, len(evt)) + } + +} diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD new file mode 100644 index 000000000..9983a32e5 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "eventfd", + srcs = ["eventfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + library = ":eventfd", + deps = [ + "//pkg/sentry/contexttest", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go new file mode 100644 index 000000000..87951adeb --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -0,0 +1,285 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd provides an implementation of Linux's file-based event +// notification. +package eventfd + +import ( + "math" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventOperations represents an event with the semantics of Linux's file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in certain +// situations they may be converted into a host-backed eventfd. +// +// +stateify savable +type EventOperations struct { + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // Mutex that protects accesses to the fields of this event. + mu sync.Mutex `state:"nosave"` + + // Queue is used to notify interested parties when the event object + // becomes readable or writable. + wq waiter.Queue `state:"zerovalue"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int +} + +// New creates a new event object with the supplied initial value and mode. +func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { + // name matches fs/eventfd.c:eventfd_file_create. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]") + // Release the initial dirent reference after NewFile takes a reference. + defer dirent.DecRef() + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ + val: initVal, + semMode: semMode, + hostfd: -1, + }) +} + +// HostFD returns the host eventfd associated with this event. +func (e *EventOperations) HostFD() (int, error) { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + return e.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if e.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, err := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(e.val), uintptr(flags), 0) + if err != 0 { + return -1, err + } + + if err := fdnotifier.AddFD(int32(fd), &e.wq); err != nil { + syscall.Close(int(fd)) + return -1, err + } + + e.hostfd = int(fd) + return e.hostfd, nil +} + +// Release implements fs.FileOperations.Release. +func (e *EventOperations) Release() { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.RemoveFD(int32(e.hostfd)) + syscall.Close(e.hostfd) + e.hostfd = -1 + } +} + +// Read implements fs.FileOperations.Read. 
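// exampleHostBacked is a hypothetical sketch (not part of this change): once
// HostFD is called, the event is backed by a real host eventfd, and later
// reads and writes are forwarded to it instead of the in-sentry counter.
func exampleHostBacked(ctx context.Context) (int, error) {
	e := New(ctx, 0 /* initVal */, false /* semMode */).FileOperations.(*EventOperations)
	return e.HostFD() // Creates the host eventfd on first use.
}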
+func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements fs.FileOperations.Write. +func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Must be called with e.mu locked. +func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + + if _, err := syscall.Read(e.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error { + e.mu.Lock() + + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostRead(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if e.val == 0 { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if e.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + e.val-- + } else { + val = e.val + e.val = 0 + } + + e.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + e.wq.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Must be called with e.mu locked. +func (e *EventOperations) hostWrite(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(e.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return e.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (e *EventOperations) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + e.mu.Lock() + + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostWrite(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-e.val { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + e.val += val + e.mu.Unlock() + + // Always trigger a notification. + e.wq.Notify(waiter.EventIn) + + return nil +} + +// Readiness returns the ready events for the event fd. +func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + e.mu.Lock() + if e.hostfd >= 0 { + defer e.mu.Unlock() + return fdnotifier.NonBlockingPoll(int32(e.hostfd), mask) + } + + ready := waiter.EventMask(0) + if e.val > 0 { + ready |= waiter.EventIn + } + + if e.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + e.mu.Unlock() + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. 
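// exampleReadModes is a hypothetical sketch (not part of this change)
// contrasting the two counter modes handled by read() above: a semaphore-mode
// read consumes exactly 1, while a default-mode read returns the whole
// counter and resets it to 0.
func exampleReadModes(ctx context.Context) (uint64, uint64) {
	readCounter := func(ev *fs.File) uint64 {
		e := ev.FileOperations.(*EventOperations)
		var buf [8]byte
		if _, err := e.Read(ctx, nil, usermem.BytesIOSequence(buf[:]), 0); err != nil {
			return 0
		}
		return usermem.ByteOrder.Uint64(buf[:])
	}
	semVal := readCounter(New(ctx, 3, true /* semMode */))    // Reads 1, leaving 2.
	plainVal := readCounter(New(ctx, 3, false /* semMode */)) // Reads 3, leaving 0.
	return semVal, plainVal
}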
+func (e *EventOperations) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + e.wq.EventRegister(entry, mask) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *EventOperations) EventUnregister(entry *waiter.Entry) { + e.wq.EventUnregister(entry) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go new file mode 100644 index 000000000..9b4892f74 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestEventfd(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, initVal, false) + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + event.EventRegister(&w, waiter.EventIn) + defer event.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := event.Writev(ctx, usermem.BytesIOSequence(data)) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventfdStat(t *testing.T) { + ctx := contexttest.Context(t) + + // Make a new event that is writable. + event := New(ctx, 0, false) + + // Create and submit an stat request. 
+ uattr, err := event.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + t.Fatalf("eventfd stat request failed: %v", err) + } + if uattr.Size != 0 { + t.Fatal("EventFD size should be 0") + } +} diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD new file mode 100644 index 000000000..2b3955598 --- /dev/null +++ b/pkg/sentry/kernel/fasync/BUILD @@ -0,0 +1,18 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "fasync", + srcs = ["fasync.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go new file mode 100644 index 000000000..153d2cd9b --- /dev/null +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -0,0 +1,188 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fasync provides FIOASYNC related functionality. +package fasync + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +// New creates a new fs.FileAsync. +func New() fs.FileAsync { + return &FileAsync{} +} + +// NewVFS2 creates a new vfs.FileAsync. +func NewVFS2() vfs.FileAsync { + return &FileAsync{} +} + +// FileAsync sends signals when the registered file is ready for IO. +// +// +stateify savable +type FileAsync struct { + // e is immutable after first use (which is protected by mu below). + e waiter.Entry + + // regMu protects registeration and unregistration actions on e. + // + // regMu must be held while registration decisions are being made + // through the registration action itself. + // + // Lock ordering: regMu, mu. + regMu sync.Mutex `state:"nosave"` + + // mu protects all following fields. + // + // Lock ordering: e.mu, mu. + mu sync.Mutex `state:"nosave"` + requester *auth.Credentials + registered bool + + // Only one of the following is allowed to be non-nil. + recipientPG *kernel.ProcessGroup + recipientTG *kernel.ThreadGroup + recipientT *kernel.Task +} + +// Callback sends a signal. +func (a *FileAsync) Callback(e *waiter.Entry) { + a.mu.Lock() + if !a.registered { + a.mu.Unlock() + return + } + t := a.recipientT + tg := a.recipientTG + if a.recipientPG != nil { + tg = a.recipientPG.Originator() + } + if tg != nil { + t = tg.Leader() + } + if t == nil { + // No recipient has been registered. + a.mu.Unlock() + return + } + c := t.Credentials() + // Logic from sigio_perm in fs/fcntl.c. 
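	// That is: the signal is delivered only if the requester is root
	// (effective KUID 0), or if the requester's effective or real KUID
	// matches the recipient's real or saved KUID; otherwise it is quietly
	// dropped, as in Linux.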
+ if a.requester.EffectiveKUID == 0 || + a.requester.EffectiveKUID == c.SavedKUID || + a.requester.EffectiveKUID == c.RealKUID || + a.requester.RealKUID == c.SavedKUID || + a.requester.RealKUID == c.RealKUID { + t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO)) + } + a.mu.Unlock() +} + +// Register sets the file which will be monitored for IO events. +// +// The file must not be currently registered. +func (a *FileAsync) Register(w waiter.Waitable) { + a.regMu.Lock() + defer a.regMu.Unlock() + a.mu.Lock() + + if a.registered { + a.mu.Unlock() + panic("registering already registered file") + } + + if a.e.Callback == nil { + a.e.Callback = a + } + a.registered = true + + a.mu.Unlock() + w.EventRegister(&a.e, waiter.EventIn|waiter.EventOut|waiter.EventErr|waiter.EventHUp) +} + +// Unregister stops monitoring a file. +// +// The file must be currently registered. +func (a *FileAsync) Unregister(w waiter.Waitable) { + a.regMu.Lock() + defer a.regMu.Unlock() + a.mu.Lock() + + if !a.registered { + a.mu.Unlock() + panic("unregistering unregistered file") + } + + a.registered = false + + a.mu.Unlock() + w.EventUnregister(&a.e) +} + +// Owner returns who is currently getting signals. All return values will be +// nil if no one is set to receive signals. +func (a *FileAsync) Owner() (*kernel.Task, *kernel.ThreadGroup, *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + return a.recipientT, a.recipientTG, a.recipientPG +} + +// SetOwnerTask sets the owner (who will receive signals) to a specified task. +// Only this owner will receive signals. +func (a *FileAsync) SetOwnerTask(requester *kernel.Task, recipient *kernel.Task) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = recipient + a.recipientTG = nil + a.recipientPG = nil +} + +// SetOwnerThreadGroup sets the owner (who will receive signals) to a specified +// thread group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerThreadGroup(requester *kernel.Task, recipient *kernel.ThreadGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = recipient + a.recipientPG = nil +} + +// SetOwnerProcessGroup sets the owner (who will receive signals) to a +// specified process group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = recipient +} + +// ClearOwner unsets the current signal recipient. +func (a *FileAsync) ClearOwner() { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = nil + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = nil +} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go new file mode 100644 index 000000000..4b7d234a4 --- /dev/null +++ b/pkg/sentry/kernel/fd_table.go @@ -0,0 +1,638 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math" + "strings" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FDFlags define flags for an individual descriptor. +// +// +stateify savable +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags +// representation. +func (f FDFlags) ToLinuxFileFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.O_CLOEXEC + } + return +} + +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags +// representation. +func (f FDFlags) ToLinuxFDFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.FD_CLOEXEC + } + return +} + +// descriptor holds the details about a file descriptor, namely a pointer to +// the file itself and the descriptor flags. +// +// Note that this is immutable and can only be changed via operations on the +// descriptorTable. +// +// It contains both VFS1 and VFS2 file types, but only one of them can be set. +// +// +stateify savable +type descriptor struct { + // TODO(gvisor.dev/issue/1624): Remove fs.File. + file *fs.File + fileVFS2 *vfs.FileDescription + flags FDFlags +} + +// FDTable is used to manage File references and flags. +// +// +stateify savable +type FDTable struct { + refs.AtomicRefCount + k *Kernel + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // next is start position to find fd. + next int32 + + // used contains the number of non-nil entries. It must be accessed + // atomically. It may be read atomically without holding mu (but not + // written). + used int32 + + // descriptorTable holds descriptors. + descriptorTable `state:".(map[int32]descriptor)"` +} + +func (f *FDTable) saveDescriptorTable() map[int32]descriptor { + m := make(map[int32]descriptor) + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + m[fd] = descriptor{ + file: file, + fileVFS2: fileVFS2, + flags: flags, + } + }) + return m +} + +func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + f.init() // Initialize table. + for fd, d := range m { + f.setAll(fd, d.file, d.fileVFS2, d.flags) + + // Note that we do _not_ need to acquire a extra table reference here. The + // table reference will already be accounted for in the file, so we drop the + // reference taken by set above. + switch { + case d.file != nil: + d.file.DecRef() + case d.fileVFS2 != nil: + d.fileVFS2.DecRef() + } + } +} + +// drop drops the table reference. +func (f *FDTable) drop(file *fs.File) { + // Release locks. + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF}) + + // Send inotify events. + d := file.Dirent + var ev uint32 + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + if file.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + d.InotifyEvent(ev, 0) + + // Drop the table reference. + file.DecRef() +} + +// dropVFS2 drops the table reference. +func (f *FDTable) dropVFS2(file *vfs.FileDescription) { + // Release any POSIX lock possibly held by the FDTable. 
Range {0, 0} means the + // entire file. + err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET) + if err != nil && err != syserror.ENOLCK { + panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + } + + // Generate inotify events. + ev := uint32(linux.IN_CLOSE_NOWRITE) + if file.IsWritable() { + ev = linux.IN_CLOSE_WRITE + } + file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) + + // Drop the table's reference. + file.DecRef() +} + +// NewFDTable allocates a new FDTable that may be used by tasks in k. +func (k *Kernel) NewFDTable() *FDTable { + f := &FDTable{k: k} + f.init() + return f +} + +// destroy removes all of the file descriptors from the map. +func (f *FDTable) destroy() { + f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDTable) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDTable) Size() int { + size := atomic.LoadInt32(&f.used) + return int(size) +} + +// forEach iterates over all non-nil files in sorted order. +// +// It is the caller's responsibility to acquire an appropriate lock. +func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { + // retries tracks the number of failed TryIncRef attempts for the same FD. + retries := 0 + fd := int32(0) + for { + file, fileVFS2, flags, ok := f.getAll(fd) + if !ok { + break + } + switch { + case file != nil: + if !file.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations)) + } + continue // Race caught. + } + fn(fd, file, nil, flags) + file.DecRef() + case fileVFS2 != nil: + if !fileVFS2.TryIncRef() { + retries++ + if retries > 1000 { + panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl())) + } + continue // Race caught. + } + fn(fd, nil, fileVFS2, flags) + fileVFS2.DecRef() + } + retries = 0 + fd++ + } +} + +// String is a stringer for FDTable. +func (f *FDTable) String() string { + var buf strings.Builder + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + switch { + case file != nil: + n, _ := file.Dirent.FullName(nil /* root */) + fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n) + + case fileVFS2 != nil: + vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() + name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + if err != nil { + fmt.Fprintf(&buf, "<err: %v>\n", err) + return + } + fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name) + } + }) + return buf.String() +} + +// NewFDs allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. 
+ if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.get(i); d == nil { + f.set(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.set(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + + return fds, nil +} + +// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available +// greater than or equal to the fd parameter. All files will share the set +// flags. Success is guaranteed to be all or none. +func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { + if fd < 0 { + // Don't accept negative FDs. + return nil, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if fd >= end { + return nil, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + if fd < f.next { + fd = f.next + } + + // Install all entries. + for i := fd; i < end && len(fds) < len(files); i++ { + if d, _, _ := f.getVFS2(i); d == nil { + f.setVFS2(i, files[len(fds)], flags) // Set the descriptor. + fds = append(fds, i) // Record the file descriptor. + } + } + + // Failure? Unwind existing FDs. + if len(fds) < len(files) { + for _, i := range fds { + f.setVFS2(i, nil, FDFlags{}) // Zap entry. + } + return nil, syscall.EMFILE + } + + if fd == f.next { + // Update next search start position. + f.next = fds[len(fds)-1] + 1 + } + + return fds, nil +} + +// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for +// the given file description. If it succeeds, it takes a reference on file. +func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + if minfd < 0 { + // Don't accept negative FDs. + return -1, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if minfd >= end { + return -1, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + fd := minfd + if fd < f.next { + fd = f.next + } + for fd < end { + if d, _, _ := f.getVFS2(fd); d == nil { + f.setVFS2(fd, file, flags) + if fd == f.next { + // Update next search start position. + f.next = fd + 1 + } + return fd, nil + } + fd++ + } + return -1, syscall.EMFILE +} + +// NewFDAt sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. 
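// exampleAllocate is a hypothetical sketch (not part of this change) of the
// allocation contract implemented by NewFDs/NewFDVFS2 above: the new FD is
// the lowest free slot at or above the requested minimum, and exceeding the
// RLIMIT_NOFILE-derived bound surfaces as EMFILE.
func exampleAllocate(ctx context.Context, t *FDTable, file *fs.File) (int32, error) {
	fds, err := t.NewFDs(ctx, 0 /* lowest acceptable fd */, []*fs.File{file}, FDFlags{CloseOnExec: true})
	if err != nil {
		return -1, err // Typically syscall.EMFILE when the table is full.
	}
	return fds[0], nil
}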
+func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { + return f.newFDAt(ctx, fd, file, nil, flags) +} + +// NewFDAtVFS2 sets the file reference for the given FD. If there is an active +// reference for that FD, the ref count for that existing reference is +// decremented. +func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { + return f.newFDAt(ctx, fd, nil, file, flags) +} + +func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + // Check the limit for the provided file. + if limitSet := limits.FromContext(ctx); limitSet != nil { + if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { + return syscall.EMFILE + } + } + + // Install the entry. + f.mu.Lock() + defer f.mu.Unlock() + f.setAll(fd, file, fileVFS2, flags) + return nil +} + +// SetFlags sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.get(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.set(fd, file, flags) + return nil +} + +// SetFlagsVFS2 sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlagsVFS2(fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.getVFS2(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.setVFS2(fd, file, flags) + return nil +} + +// Get returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.get(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + +// GetVFS2 returns a reference to the file and the flags for the FD or nil if no +// file is defined for the given fd. +// +// N.B. Callers are required to use DecRef when they are done. +// +//go:nosplit +func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { + if fd < 0 { + return nil, FDFlags{} + } + + for { + file, flags, _ := f.getVFS2(fd) + if file != nil { + if !file.TryIncRef() { + continue // Race caught. + } + // Reference acquired. + return file, flags + } + // No file available. + return nil, FDFlags{} + } +} + +// GetFDs returns a sorted list of valid fds. +// +// Precondition: The caller must be running on the task goroutine, or Task.mu +// must be locked. +func (f *FDTable) GetFDs() []int32 { + fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) + f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { + fds = append(fds, fd) + }) + return fds +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. 
The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefs() []*fs.File { + files := make([]*fs.File, 0, f.Size()) + f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// GetRefsVFS2 returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, f.Size()) + f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { + file.IncRef() // Acquire a reference for caller. + files = append(files, file) + }) + return files +} + +// Fork returns an independent FDTable. +func (f *FDTable) Fork() *FDTable { + clone := f.k.NewFDTable() + + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + // The set function here will acquire an appropriate table + // reference for the clone. We don't need anything else. + switch { + case file != nil: + clone.set(fd, file, flags) + case fileVFS2 != nil: + clone.setVFS2(fd, fileVFS2, flags) + } + }) + return clone +} + +// Remove removes an FD from and returns a non-file iff successful. +// +// N.B. Callers are required to use DecRef when they are done. +func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { + if fd < 0 { + return nil, nil + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Update current available position. + if fd < f.next { + f.next = fd + } + + orig, orig2, _, _ := f.getAll(fd) + + // Add reference for caller. + switch { + case orig != nil: + orig.IncRef() + case orig2 != nil: + orig2.IncRef() + } + if orig != nil || orig2 != nil { + f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + } + return orig, orig2 +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { + f.mu.Lock() + defer f.mu.Unlock() + + f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if cond(file, fileVFS2, flags) { + f.set(fd, nil, FDFlags{}) // Clear from table. + // Update current available position. + if fd < f.next { + f.next = fd + } + } + }) +} diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go new file mode 100644 index 000000000..29f95a2c4 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_test.go @@ -0,0 +1,228 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "runtime" + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/filetest" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" +) + +const ( + // maxFD is the maximum FD to try to create in the map. 
+ // + // This number of open files has been seen in the wild. + maxFD = 2 * 1024 +) + +func runTest(t testing.TB, fn func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet)) { + t.Helper() // Don't show in stacks. + + // Create the limits and context. + limitSet := limits.NewLimitSet() + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + ctx := contexttest.WithLimitSet(contexttest.Context(t), limitSet) + + // Create a test file.; + file := filetest.NewTestFile(t) + + // Create the table. + fdTable := new(FDTable) + fdTable.init() + + // Run the test. + fn(ctx, fdTable, file, limitSet) +} + +// TestFDTableMany allocates maxFD FDs, i.e. maxes out the FDTable, until there +// is no room, then makes sure that NewFDAt works and also that if we remove +// one and add one that works too. +func TestFDTableMany(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + for i := 0; i < maxFD; i++ { + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Allocated %v FDs but wanted to allocate %v", i, maxFD) + } + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(0, r) in full map: got nil, wanted error") + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + i := int32(2) + fdTable.Remove(i) + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i { + t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err) + } + }) +} + +func TestFDTableOverLimit(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if _, err := fdTable.NewFDs(ctx, maxFD, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD, f): got nil, wanted error") + } + + if _, err := fdTable.NewFDs(ctx, maxFD-2, []*fs.File{file, file, file}, FDFlags{}); err == nil { + t.Fatalf("fdTable.NewFDs(maxFD-2, {f,f,f}): got nil, wanted error") + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-3, []*fs.File{file, file, file}, FDFlags{}); err != nil { + t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err) + } else { + for _, fd := range fds { + fdTable.Remove(fd) + } + } + + if fds, err := fdTable.NewFDs(ctx, maxFD-1, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != maxFD-1 { + t.Fatalf("fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 0 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } + }) +} + +// TestFDTable does a set of simple tests to make sure simple adds, removes, +// GetRefs, and DecRefs work. The ordering is just weird enough that a +// table-driven approach seemed clumsy. +func TestFDTable(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, limitSet *limits.LimitSet) { + // Cap the limit at one. 
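		// limits.Limit{1, maxFD} sets Cur to 1 and Max to maxFD, so only a
		// single FD can be installed until the limit is raised again below.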
+ limitSet.Set(limits.NumberOfFiles, limits.Limit{1, maxFD}, true) + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to an empty 1-size map: got %v, want nil", err) + } + + if _, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err == nil { + t.Fatalf("Adding an FD to a filled 1-size map: got nil, wanted an error") + } + + // Remove the previous limit. + limitSet.Set(limits.NumberOfFiles, limits.Limit{maxFD, maxFD}, true) + + if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil { + t.Fatalf("Adding an FD to a resized map: got %v, want nil", err) + } else if len(fds) != 1 || fds[0] != 1 { + t.Fatalf("Added an FD to a resized map: got %v, want {1}", fds) + } + + if err := fdTable.NewFDAt(ctx, 1, file, FDFlags{}); err != nil { + t.Fatalf("Replacing FD 1 via fdTable.NewFDAt(1, r, FDFlags{}): got %v, wanted nil", err) + } + + if err := fdTable.NewFDAt(ctx, maxFD+1, file, FDFlags{}); err == nil { + t.Fatalf("Using an FD that was too large via fdTable.NewFDAt(%v, r, FDFlags{}): got nil, wanted an error", maxFD+1) + } + + if ref, _ := fdTable.Get(1); ref == nil { + t.Fatalf("fdTable.Get(1): got nil, wanted %v", file) + } + + if ref, _ := fdTable.Get(2); ref != nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) + } + + ref, _ := fdTable.Remove(1) + if ref == nil { + t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") + } + ref.DecRef() + + if ref, _ := fdTable.Remove(1); ref != nil { + t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") + } + }) +} + +func TestDescriptorFlags(t *testing.T) { + runTest(t, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + if err := fdTable.NewFDAt(ctx, 2, file, FDFlags{CloseOnExec: true}); err != nil { + t.Fatalf("fdTable.NewFDAt(2, r, FDFlags{}): got %v, wanted nil", err) + } + + newFile, flags := fdTable.Get(2) + if newFile == nil { + t.Fatalf("fdTable.Get(2): got a %v, wanted nil", newFile) + } + + if !flags.CloseOnExec { + t.Fatalf("new File flags %v don't match original %d\n", flags, 0) + } + }) +} + +func BenchmarkFDLookupAndDecRef(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + b.StartTimer() // Benchmark. + for i := 0; i < b.N; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }) +} + +func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { + b.StopTimer() // Setup. + + runTest(b, func(ctx context.Context, fdTable *FDTable, file *fs.File, _ *limits.LimitSet) { + fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file, file, file, file, file}, FDFlags{}) + if err != nil { + b.Fatalf("fdTable.NewFDs: got %v, wanted nil", err) + } + + concurrency := runtime.GOMAXPROCS(0) + if concurrency < 4 { + concurrency = 4 + } + each := b.N / concurrency + + b.StartTimer() // Benchmark. 
+ var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < each; i++ { + tf, _ := fdTable.Get(fds[i%len(fds)]) + tf.DecRef() + } + }() + } + wg.Wait() + }) +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go new file mode 100644 index 000000000..7fd97dc53 --- /dev/null +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -0,0 +1,169 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type descriptorTable struct { + // slice is a *[]unsafe.Pointer, where each element is actually + // *descriptor object, updated atomically. + // + // Changes to the slice itself requiring holding FDTable.mu. + slice unsafe.Pointer `state:".(map[int32]*descriptor)"` +} + +// init initializes the table. +func (f *FDTable) init() { + var slice []unsafe.Pointer // Empty slice. + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) +} + +// get gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) { + file, _, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getVFS2 gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) { + _, file, flags, ok := f.getAll(fd) + return file, flags, ok +} + +// getAll gets a file entry. +// +// The boolean indicates whether this was in range. +// +//go:nosplit +func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + if fd >= int32(len(slice)) { + return nil, nil, FDFlags{}, false + } + d := (*descriptor)(atomic.LoadPointer(&slice[fd])) + if d == nil { + return nil, nil, FDFlags{}, true + } + if d.file != nil && d.fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + return d.file, d.fileVFS2, d.flags, true +} + +// set sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { + f.setAll(fd, file, nil, flags) +} + +// setVFS2 sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. +func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) { + f.setAll(fd, nil, file, flags) +} + +// setAll sets an entry. +// +// This handles accounting changes, as well as acquiring and releasing the +// reference needed by the table iff the file is different. +// +// Precondition: mu must be held. 
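// The pattern used by descriptorTable -- lock-free reads via atomic loads of
// both the slice header and each element, writers serialized by a mutex, and
// growth done by publishing a fresh backing array -- is illustrated by the
// simplified, hypothetical sketch below. It mirrors the shape of the real
// code but is not part of it, and it additionally assumes a "sync" import.

type sketchEntry struct{ name string }

type sketchTable struct {
	mu    sync.Mutex     // Serializes writers and slice growth.
	slice unsafe.Pointer // *[]unsafe.Pointer; each element is a *sketchEntry.
}

func newSketchTable() *sketchTable {
	var s []unsafe.Pointer // Empty slice.
	return &sketchTable{slice: unsafe.Pointer(&s)}
}

// get is lock-free: it performs only atomic loads.
func (t *sketchTable) get(i int) *sketchEntry {
	s := *(*[]unsafe.Pointer)(atomic.LoadPointer(&t.slice))
	if i < 0 || i >= len(s) {
		return nil
	}
	return (*sketchEntry)(atomic.LoadPointer(&s[i]))
}

// set requires t.mu. Growth copies into a larger array and atomically
// publishes the new header, so concurrent readers always see a consistent
// (possibly momentarily stale) snapshot.
func (t *sketchTable) set(i int, e *sketchEntry) {
	t.mu.Lock()
	defer t.mu.Unlock()
	s := *(*[]unsafe.Pointer)(atomic.LoadPointer(&t.slice))
	if i >= len(s) {
		end := i + 1
		if end < 2*len(s) {
			end = 2 * len(s)
		}
		grown := make([]unsafe.Pointer, end)
		copy(grown, s)
		s = grown
		atomic.StorePointer(&t.slice, unsafe.Pointer(&s))
	}
	atomic.StorePointer(&s[i], unsafe.Pointer(e))
}

// Example: t := newSketchTable(); t.set(3, &sketchEntry{name: "stdin"});
// t.get(3) returns the entry, t.get(7) returns nil.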
+func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + if file != nil && fileVFS2 != nil { + panic("VFS1 and VFS2 files set") + } + + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + + // Grow the table as required. + if last := int32(len(slice)); fd >= last { + end := fd + 1 + if end < 2*last { + end = 2 * last + } + slice = append(slice, make([]unsafe.Pointer, end-last)...) + atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) + } + + var desc *descriptor + if file != nil || fileVFS2 != nil { + desc = &descriptor{ + file: file, + fileVFS2: fileVFS2, + flags: flags, + } + } + + // Update the single element. + orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc))) + + // Acquire a table reference. + if desc != nil { + switch { + case desc.file != nil: + if orig == nil || desc.file != orig.file { + desc.file.IncRef() + } + case desc.fileVFS2 != nil: + if orig == nil || desc.fileVFS2 != orig.fileVFS2 { + desc.fileVFS2.IncRef() + } + } + } + + // Drop the table reference. + if orig != nil { + switch { + case orig.file != nil: + if desc == nil || desc.file != orig.file { + f.drop(orig.file) + } + case orig.fileVFS2 != nil: + if desc == nil || desc.fileVFS2 != orig.fileVFS2 { + f.dropVFS2(orig.fileVFS2) + } + } + } + + // Adjust used. + switch { + case orig == nil && desc != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && desc == nil: + atomic.AddInt32(&f.used, -1) + } +} diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go new file mode 100644 index 000000000..47f78df9a --- /dev/null +++ b/pkg/sentry/kernel/fs_context.go @@ -0,0 +1,283 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// FSContext contains filesystem context. +// +// This includes umask and working directory. +// +// +stateify savable +type FSContext struct { + refs.AtomicRefCount + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // root is the filesystem root. Will be nil iff the FSContext has been + // destroyed. + root *fs.Dirent + + // rootVFS2 is the filesystem root. + rootVFS2 vfs.VirtualDentry + + // cwd is the current working directory. Will be nil iff the FSContext + // has been destroyed. + cwd *fs.Dirent + + // cwdVFS2 is the current working directory. + cwdVFS2 vfs.VirtualDentry + + // umask is the current file mode creation mask. When a thread using this + // context invokes a syscall that creates a file, bits set in umask are + // removed from the permissions that the file is created with. + umask uint +} + +// newFSContext returns a new filesystem context. 
+func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + f := FSContext{ + root: root, + cwd: cwd, + umask: umask, + } + f.EnableLeakCheck("kernel.FSContext") + return &f +} + +// NewFSContextVFS2 returns a new filesystem context. +func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + f := FSContext{ + rootVFS2: root, + cwdVFS2: cwd, + umask: umask, + } + f.EnableLeakCheck("kernel.FSContext") + return &f +} + +// destroy is the destructor for an FSContext. +// +// This will call DecRef on both root and cwd Dirents. If either call to +// DecRef returns an error, then it will be propagated. If both calls to +// DecRef return an error, then the one from root.DecRef will be propagated. +// +// Note that there may still be calls to WorkingDirectory() or RootDirectory() +// (that return nil). This is because valid references may still be held via +// proc files or other mechanisms. +func (f *FSContext) destroy() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.rootVFS2.DecRef() + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef() + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef() + f.root = nil + f.cwd.DecRef() + f.cwd = nil + } +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FSContext) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Fork forks this FSContext. +// +// This is not a valid call after destroy. +func (f *FSContext) Fork() *FSContext { + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.cwdVFS2.IncRef() + f.rootVFS2.IncRef() + } else { + f.cwd.IncRef() + f.root.IncRef() + } + + return &FSContext{ + cwd: f.cwd, + root: f.root, + cwdVFS2: f.cwdVFS2, + rootVFS2: f.rootVFS2, + umask: f.umask, + } +} + +// WorkingDirectory returns the current working directory. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) WorkingDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + + f.cwd.IncRef() + return f.cwd +} + +// WorkingDirectoryVFS2 returns the current working directory. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.cwdVFS2.IncRef() + return f.cwdVFS2 +} + +// SetWorkingDirectory sets the current working directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetWorkingDirectory called with nil dirent") + } + + f.mu.Lock() + defer f.mu.Unlock() + + if f.cwd == nil { + panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) + } + + old := f.cwd + f.cwd = d + d.IncRef() + old.DecRef() +} + +// SetWorkingDirectoryVFS2 sets the current working directory. +// This will take an extra reference on the VirtualDentry. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { + f.mu.Lock() + defer f.mu.Unlock() + + old := f.cwdVFS2 + f.cwdVFS2 = d + d.IncRef() + old.DecRef() +} + +// RootDirectory returns the current filesystem root. 
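// A small caller-side sketch of the reference discipline (hasSameRootAndCwd
// is a hypothetical helper; it assumes a live, undestroyed FSContext and uses
// RootDirectory, whose definition follows below).
func hasSameRootAndCwd(f *FSContext) bool {
	root := f.RootDirectory()
	defer root.DecRef()
	cwd := f.WorkingDirectory()
	defer cwd.DecRef()
	return root == cwd
}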
+// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) RootDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + if f.root != nil { + f.root.IncRef() + } + return f.root +} + +// RootDirectoryVFS2 returns the current filesystem root. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.rootVFS2.IncRef() + return f.rootVFS2 +} + +// SetRootDirectory sets the root directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetRootDirectory called with nil dirent") + } + + f.mu.Lock() + defer f.mu.Unlock() + + if f.root == nil { + panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d)) + } + + old := f.root + f.root = d + d.IncRef() + old.DecRef() +} + +// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { + if !vd.Ok() { + panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") + } + + f.mu.Lock() + + if !f.rootVFS2.Ok() { + f.mu.Unlock() + panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd)) + } + + old := f.rootVFS2 + vd.IncRef() + f.rootVFS2 = vd + f.mu.Unlock() + old.DecRef() +} + +// Umask returns the current umask. +func (f *FSContext) Umask() uint { + f.mu.Lock() + defer f.mu.Unlock() + return f.umask +} + +// SwapUmask atomically sets the current umask and returns the old umask. +func (f *FSContext) SwapUmask(mask uint) uint { + f.mu.Lock() + defer f.mu.Unlock() + old := f.umask + f.umask = mask + return old +} diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD new file mode 100644 index 000000000..c5021f2db --- /dev/null +++ b/pkg/sentry/kernel/futex/BUILD @@ -0,0 +1,57 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "atomicptr_bucket", + out = "atomicptr_bucket_unsafe.go", + package = "futex", + suffix = "Bucket", + template = "//pkg/sync:generic_atomicptr", + types = { + "Value": "bucket", + }, +) + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "futex", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Waiter", + "Linker": "*Waiter", + }, +) + +go_library( + name = "futex", + srcs = [ + "atomicptr_bucket_unsafe.go", + "futex.go", + "waiter_list.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/sentry/memmap", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "futex_test", + size = "small", + srcs = ["futex_test.go"], + library = ":futex", + deps = [ + "//pkg/sync", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go new file mode 100644 index 000000000..732e66da4 --- /dev/null +++ b/pkg/sentry/kernel/futex/futex.go @@ -0,0 +1,795 @@ +// Copyright 2018 The gVisor Authors. 
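// The stored umask is applied by file-creating call sites; conceptually it
// works as in the hypothetical helper below (a sketch, not an existing call
// site in this package).
func effectiveMode(f *FSContext, requested uint) uint {
	// Bits set in the umask are removed from the requested creation mode,
	// e.g. 0666 &^ 0022 == 0644.
	return requested &^ f.Umask()
}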
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package futex provides an implementation of the futex interface as found in +// the Linux kernel. It allows one to easily transform Wait() calls into waits +// on a channel, which is useful in a Go-based kernel, for example. +package futex + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// KeyKind indicates the type of a Key. +type KeyKind int + +const ( + // KindPrivate indicates a private futex (a futex syscall with the + // FUTEX_PRIVATE_FLAG set). + KindPrivate KeyKind = iota + + // KindSharedPrivate indicates a shared futex on a private memory mapping. + // Although KindPrivate and KindSharedPrivate futexes both use memory + // addresses to identify futexes, they do not interoperate (in Linux, the + // two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key + // comparison). + KindSharedPrivate + + // KindSharedMappable indicates a shared futex on a memory mapping other + // than a private anonymous memory mapping. + KindSharedMappable +) + +// Key represents something that a futex waiter may wait on. +type Key struct { + // Kind is the type of the Key. + Kind KeyKind + + // Mappable is the memory-mapped object that is represented by the Key. + // Mappable is always nil if Kind is not KindSharedMappable, and may be nil + // even if it is. + Mappable memmap.Mappable + + // MappingIdentity is the MappingIdentity associated with Mappable. + // MappingIdentity is always nil is Mappable is nil, and may be nil even if + // it isn't. + MappingIdentity memmap.MappingIdentity + + // If Kind is KindPrivate or KindSharedPrivate, Offset is the represented + // memory address. Otherwise, Offset is the represented offset into + // Mappable. + Offset uint64 +} + +func (k *Key) release() { + if k.MappingIdentity != nil { + k.MappingIdentity.DecRef() + } + k.Mappable = nil + k.MappingIdentity = nil +} + +func (k *Key) clone() Key { + if k.MappingIdentity != nil { + k.MappingIdentity.IncRef() + } + return *k +} + +// Preconditions: k.Kind == KindPrivate or KindSharedPrivate. +func (k *Key) addr() usermem.Addr { + return usermem.Addr(k.Offset) +} + +// matches returns true if a wakeup on k2 should wake a waiter waiting on k. +func (k *Key) matches(k2 *Key) bool { + // k.MappingIdentity is ignored; it's only used for reference counting. + return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset +} + +// Target abstracts memory accesses and keys. +type Target interface { + // SwapUint32 gives access to usermem.IO.SwapUint32. + SwapUint32(addr usermem.Addr, new uint32) (uint32, error) + + // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32. + CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + + // LoadUint32 gives access to usermem.IO.LoadUint32. 
+ LoadUint32(addr usermem.Addr) (uint32, error) + + // GetSharedKey returns a Key with kind KindSharedPrivate or + // KindSharedMappable corresponding to the memory mapped at address addr. + // + // If GetSharedKey returns a Key with a non-nil MappingIdentity, a + // reference is held on the MappingIdentity, which must be dropped by the + // caller when the Key is no longer in use. + GetSharedKey(addr usermem.Addr) (Key, error) +} + +// check performs a basic equality check on the given address. +func check(t Target, addr usermem.Addr, val uint32) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + if cur != val { + return syserror.EAGAIN + } + return nil +} + +// atomicOp performs a complex operation on the given address. +func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { + opType := (opIn >> 28) & 0xf + cmp := (opIn >> 24) & 0xf + opArg := (opIn >> 12) & 0xfff + cmpArg := opIn & 0xfff + + if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 { + opArg = 1 << opArg + opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag. + } + + var ( + oldVal uint32 + err error + ) + if opType == linux.FUTEX_OP_SET { + oldVal, err = t.SwapUint32(addr, opArg) + if err != nil { + return false, err + } + } else { + for { + oldVal, err = t.LoadUint32(addr) + if err != nil { + return false, err + } + var newVal uint32 + switch opType { + case linux.FUTEX_OP_ADD: + newVal = oldVal + opArg + case linux.FUTEX_OP_OR: + newVal = oldVal | opArg + case linux.FUTEX_OP_ANDN: + newVal = oldVal &^ opArg + case linux.FUTEX_OP_XOR: + newVal = oldVal ^ opArg + default: + return false, syserror.ENOSYS + } + prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) + if err != nil { + return false, err + } + if prev == oldVal { + break // Success. + } + } + } + + switch cmp { + case linux.FUTEX_OP_CMP_EQ: + return oldVal == cmpArg, nil + case linux.FUTEX_OP_CMP_NE: + return oldVal != cmpArg, nil + case linux.FUTEX_OP_CMP_LT: + return oldVal < cmpArg, nil + case linux.FUTEX_OP_CMP_LE: + return oldVal <= cmpArg, nil + case linux.FUTEX_OP_CMP_GT: + return oldVal > cmpArg, nil + case linux.FUTEX_OP_CMP_GE: + return oldVal >= cmpArg, nil + default: + return false, syserror.ENOSYS + } +} + +// Waiter is the struct which gets enqueued into buckets for wake up routines +// and requeue routines to scan and notify. Once a Waiter has been enqueued by +// WaitPrepare(), callers may listen on C for wake up events. +type Waiter struct { + // Synchronization: + // + // - A Waiter that is not enqueued in a bucket is exclusively owned (no + // synchronization applies). + // + // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, + // waiterEntry, bucket, and key are protected by the bucket.mu ("bucket + // lock") of the containing bucket, and bitmask is immutable. Note that + // since bucket is mutated using atomic memory operations, bucket.Load() + // may be called without holding the bucket lock, although it may change + // racily. See WaitComplete(). + // + // - A Waiter is only guaranteed to be no longer queued after calling + // WaitComplete(). + + // waiterEntry links Waiter into bucket.waiters. + waiterEntry + + // bucket is the bucket this waiter is queued in. If bucket is nil, the + // waiter is not waiting and is not in any bucket. + bucket AtomicPtrBucket + + // C is sent to when the Waiter is woken. + C chan struct{} + + // key is what this waiter is waiting on. + key Key + + // The bitmask we're waiting on. + // This is used the case of a FUTEX_WAKE_BITSET. 
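// The 32-bit op word decoded by atomicOp packs four fields: operation type in
// bits 28-31, comparison in bits 24-27, operation argument in bits 12-23, and
// comparison argument in bits 0-11. A hypothetical encoder mirroring that
// layout (encodeFutexOp is not an existing helper in this package):
func encodeFutexOp(opType, opArg, cmp, cmpArg uint32) uint32 {
	return (opType&0xf)<<28 | (cmp&0xf)<<24 | (opArg&0xfff)<<12 | (cmpArg & 0xfff)
}

// For example, encodeFutexOp(linux.FUTEX_OP_ADD, 1, linux.FUTEX_OP_CMP_EQ, 0)
// encodes "add 1 to the value at the second address", with the wake condition
// "old value == 0".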
+ bitmask uint32 + + // tid is the thread ID for the waiter in case this is a PI mutex. + tid uint32 +} + +// NewWaiter returns a new unqueued Waiter. +func NewWaiter() *Waiter { + return &Waiter{ + C: make(chan struct{}, 1), + } +} + +// woken returns true if w has been woken since the last call to WaitPrepare. +func (w *Waiter) woken() bool { + return len(w.C) != 0 +} + +// bucket holds a list of waiters for a given address hash. +// +// +stateify savable +type bucket struct { + // mu protects waiters and contained Waiter state. See comment in Waiter. + mu sync.Mutex `state:"nosave"` + + waiters waiterList `state:"zerovalue"` +} + +// wakeLocked wakes up to n waiters matching the bitmask at the addr for this +// bucket and returns the number of waiters woken. +// +// Preconditions: b.mu must be locked. +func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if !w.key.matches(key) || w.bitmask&bitmask == 0 { + // Not matching. + w = w.Next() + continue + } + + // Remove from the bucket and wake the waiter. + woke := w + w = w.Next() // Next iteration. + b.wakeWaiterLocked(woke) + done++ + } + return done +} + +func (b *bucket) wakeWaiterLocked(w *Waiter) { + // Remove from the bucket and wake the waiter. + b.waiters.Remove(w) + w.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier according + // to the memory model, so nothing may be ordered around it. Since + // we've dequeued w and will never touch it again, we can safely + // store nil to w.bucket here and allow the WaitComplete() to + // short-circuit grabbing the bucket lock. If they somehow miss the + // store, we are still holding the lock, so we can know that they won't + // dequeue w, assume it's free and have the below operation + // afterwards. + w.bucket.Store(nil) +} + +// requeueLocked takes n waiters from the bucket and moves them to naddr on the +// bucket "to". +// +// Preconditions: b and to must be locked. +func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if !w.key.matches(key) { + // Not matching. + w = w.Next() + continue + } + + requeued := w + w = w.Next() // Next iteration. + b.waiters.Remove(requeued) + requeued.key.release() + requeued.key = nkey.clone() + to.waiters.PushBack(requeued) + requeued.bucket.Store(to) + done++ + } + return done +} + +const ( + // bucketCount is the number of buckets per Manager. By having many of + // these we reduce contention when concurrent yet unrelated calls are made. + bucketCount = 1 << bucketCountBits + bucketCountBits = 10 +) + +// getKey returns a Key representing address addr in c. +func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { + // Ensure the address is aligned. + // It must be a DWORD boundary. + if addr&0x3 != 0 { + return Key{}, syserror.EINVAL + } + if private { + return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil + } + return t.GetSharedKey(addr) +} + +// bucketIndexForAddr returns the index into Manager.buckets for addr. +func bucketIndexForAddr(addr usermem.Addr) uintptr { + // - The bottom 2 bits of addr must be 0, per getKey. + // + // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 + // for a canonical address, and (on all existing platforms) bit 47 must be + // 0 for an application address. + // + // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" + // bits. 
We choose one of the simplest possible hash functions that at + // least uses all 45 useful bits in the output, given that bucketCountBits + // == 10. This hash function also has the property that it will usually map + // adjacent addresses to adjacent buckets, slightly improving memory + // locality when an application synchronization structure uses multiple + // nearby futexes. + // + // Note that despite the large number of arithmetic operations in the + // function, many components can be computed in parallel, such that the + // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This + // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + + // (addr >> 42)" without any additional grouping, the compiler puts all 4 + // additions in the critical path. + h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22) + h2 := uintptr(addr>>32) + uintptr(addr>>42) + return (h1 + h2) % bucketCount +} + +// Manager holds futex state for a single virtual address space. +// +// +stateify savable +type Manager struct { + // privateBuckets holds buckets for KindPrivate and KindSharedPrivate + // futexes. + privateBuckets [bucketCount]bucket `state:"zerovalue"` + + // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket + // may be shared by multiple Managers. The sharedBucket pointer is + // immutable. + sharedBucket *bucket +} + +// NewManager returns an initialized futex manager. +func NewManager() *Manager { + return &Manager{ + sharedBucket: &bucket{}, + } +} + +// Fork returns a new Manager. Shared futex clients using the returned Manager +// may interoperate with those using m. +func (m *Manager) Fork() *Manager { + return &Manager{ + sharedBucket: m.sharedBucket, + } +} + +// lockBucket returns a locked bucket for the given key. +func (m *Manager) lockBucket(k *Key) *bucket { + var b *bucket + if k.Kind == KindSharedMappable { + b = m.sharedBucket + } else { + b = &m.privateBuckets[bucketIndexForAddr(k.addr())] + } + b.mu.Lock() + return b +} + +// lockBuckets returns locked buckets for the given keys. +func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { + // Buckets must be consistently ordered to avoid circular lock + // dependencies. We order buckets in m.privateBuckets by index (lowest + // index first), and all buckets in m.privateBuckets precede + // m.sharedBucket. + + // Handle the common case first: + if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { + i1 := bucketIndexForAddr(k1.addr()) + i2 := bucketIndexForAddr(k2.addr()) + b1 := &m.privateBuckets[i1] + b2 := &m.privateBuckets[i2] + switch { + case i1 < i2: + b1.mu.Lock() + b2.mu.Lock() + case i2 < i1: + b2.mu.Lock() + b1.mu.Lock() + default: + b1.mu.Lock() + } + return b1, b2 + } + + // At least one of b1 or b2 should be m.sharedBucket. + b1 := m.sharedBucket + b2 := m.sharedBucket + if k1.Kind != KindSharedMappable { + b1 = m.lockBucket(k1) + } else if k2.Kind != KindSharedMappable { + b2 = m.lockBucket(k2) + } + m.sharedBucket.mu.Lock() + return b1, b2 +} + +// Wake wakes up to n waiters matching the bitmask on the given addr. +// The number of waiters woken is returned. +func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) { + // This function is very hot; avoid defer. 
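// A quick illustration of the locality property described above (a sketch
// assuming the in-package context plus an "fmt" import; the addresses are
// arbitrary examples, and exampleBucketLocality is not an existing function):
func exampleBucketLocality() {
	for _, addr := range []usermem.Addr{0x1000, 0x1004, 0x1008} {
		// Adjacent 4-byte-aligned addresses hash to adjacent buckets:
		// this prints 1, 2, 3.
		fmt.Println(bucketIndexForAddr(addr))
	}
}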
+ k, err := getKey(t, addr, private) + if err != nil { + return 0, err + } + + b := m.lockBucket(&k) + r := b.wakeLocked(&k, bitmask, n) + + b.mu.Unlock() + k.release() + return r, nil +} + +func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { + k1, err := getKey(t, addr, private) + if err != nil { + return 0, err + } + defer k1.release() + k2, err := getKey(t, naddr, private) + if err != nil { + return 0, err + } + defer k2.release() + + b1, b2 := m.lockBuckets(&k1, &k2) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + if checkval { + if err := check(t, addr, val); err != nil { + return 0, err + } + } + + // Wake the number required. + done := b1.wakeLocked(&k1, ^uint32(0), nwake) + + // Requeue the number required. + b1.requeueLocked(b2, &k1, &k2, nreq) + + return done, nil +} + +// Requeue wakes up to nwake waiters on the given addr, and unconditionally +// requeues up to nreq waiters on naddr. +func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq) +} + +// RequeueCmp atomically checks that the addr contains val (via the Target), +// wakes up to nwake waiters on addr and then unconditionally requeues nreq +// waiters on naddr. +func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq) +} + +// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 +// waiters unconditionally from addr1, and, based on the original value at addr2 +// and a comparison encoded in op, wakes up to nwake2 waiters from addr2. +// It returns the total number of waiters woken. +func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { + k1, err := getKey(t, addr1, private) + if err != nil { + return 0, err + } + defer k1.release() + k2, err := getKey(t, addr2, private) + if err != nil { + return 0, err + } + defer k2.release() + + b1, b2 := m.lockBuckets(&k1, &k2) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + done := 0 + cond, err := atomicOp(t, addr2, op) + if err != nil { + return 0, err + } + + // Wake up up to nwake1 entries from the first bucket. + done = b1.wakeLocked(&k1, ^uint32(0), nwake1) + + // Wake up up to nwake2 entries from the second bucket if the + // operation yielded true. + if cond { + done += b2.wakeLocked(&k2, ^uint32(0), nwake2) + } + + return done, nil +} + +// WaitPrepare atomically checks that addr contains val (via the Checker), then +// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the +// Waiter must be subsequently removed by calling WaitComplete, whether or not +// a wakeup is received on w.C. +func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. + select { + case <-w.C: + default: + } + w.key = k + w.bitmask = bitmask + + b := m.lockBucket(&k) + // This function is very hot; avoid defer. + + // Perform our atomic check. 
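// Putting the wake-side pieces together (a sketch: wakeOpExample is not an
// existing function, and encodeFutexOp is the hypothetical encoder sketched
// earlier): atomically add 1 at addr2, wake one waiter at addr1
// unconditionally, and wake one waiter at addr2 only if the old value at
// addr2 was 0.
func wakeOpExample(m *Manager, t Target, addr1, addr2 usermem.Addr) (int, error) {
	op := encodeFutexOp(linux.FUTEX_OP_ADD, 1, linux.FUTEX_OP_CMP_EQ, 0)
	return m.WakeOp(t, addr1, addr2, true /* private */, 1, 1, op)
}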
+ if err := check(t, addr, val); err != nil { + b.mu.Unlock() + w.key.release() + return err + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + + b.mu.Unlock() + return nil +} + +// WaitComplete must be called when a Waiter previously added by WaitPrepare is +// no longer eligible to be woken. +func (m *Manager) WaitComplete(w *Waiter) { + // Remove w from the bucket it's in. + for { + b := w.bucket.Load() + + // If b is nil, the waiter isn't in any bucket anymore. This can't be + // racy because the waiter can't be concurrently re-queued in another + // bucket. + if b == nil { + break + } + + // Take the bucket lock. Note that without holding the bucket lock, the + // waiter is not guaranteed to stay in that bucket, so after we take + // the bucket lock, we must ensure that the bucket hasn't changed: if + // it happens to have changed, we release the old bucket lock and try + // again with the new bucket; if it hasn't changed, we know it won't + // change now because we hold the lock. + b.mu.Lock() + if b != w.bucket.Load() { + b.mu.Unlock() + continue + } + + // Remove waiter from bucket. + b.waiters.Remove(w) + w.bucket.Store(nil) + b.mu.Unlock() + break + } + + // Release references held by the waiter. + w.key.release() +} + +// LockPI attempts to lock the futex following the Priority-inheritance futex +// rules. The lock is acquired only when 'addr' points to 0. The TID of the +// calling task is set to 'addr' to indicate the futex is owned. It returns true +// if the futex was successfully acquired. +// +// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see +// exit_robust_list()). Given we don't support robust lists, although handled +// below, it's never set. +func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) { + k, err := getKey(t, addr, private) + if err != nil { + return false, err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. + select { + case <-w.C: + default: + } + w.key = k + w.tid = tid + + b := m.lockBucket(&k) + // Hot function: avoid defers. + + success, err := m.lockPILocked(w, t, addr, tid, b, try) + if err != nil { + w.key.release() + b.mu.Unlock() + return false, err + } + if success || try { + // Release waiter if it's not going to be a wait. + w.key.release() + } + b.mu.Unlock() + return success, nil +} + +func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) { + for { + cur, err := t.LoadUint32(addr) + if err != nil { + return false, err + } + if (cur & linux.FUTEX_TID_MASK) == tid { + return false, syserror.EDEADLK + } + + if (cur & linux.FUTEX_TID_MASK) == 0 { + // No owner and no waiters, try to acquire the futex. + + // Set TID and preserve owner died status. + val := tid + val |= cur & linux.FUTEX_OWNER_DIED + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + // Linux reacquires the bucket lock on retries, which will re-lookup the + // mapping at the futex address. However, retrying while holding the + // lock is more efficient and reduces the chance of another conflict. + continue + } + // Futex acquired. + return true, nil + } + + // Futex is already owned, prepare to wait. + + if try { + // Caller doesn't want to wait. + return false, nil + } + + // Set waiters bit if not set yet. 
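// The waiter-side protocol around WaitPrepare/WaitComplete looks roughly like
// the hypothetical wrapper below; real callers typically retry on EAGAIN and
// select on a timeout or interrupt alongside w.C.
func waitOnFutex(m *Manager, t Target, addr usermem.Addr, val uint32) error {
	w := NewWaiter()
	if err := m.WaitPrepare(w, t, addr, true /* private */, val, ^uint32(0)); err != nil {
		return err // syserror.EAGAIN means *addr no longer contained val.
	}
	<-w.C             // Block until a matching Wake.
	m.WaitComplete(w) // Always dequeue and drop key references.
	return nil
}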
+ if cur&linux.FUTEX_WAITERS == 0 { + prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + continue + } + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + return false, nil + } +} + +// UnlockPI unlock the futex following the Priority-inheritance futex +// rules. The address provided must contain the caller's TID. If there are +// waiters, TID of the next waiter (FIFO) is set to the given address, and the +// waiter woken up. If there are no waiters, 0 is set to the address. +func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + b := m.lockBucket(&k) + + err = m.unlockPILocked(t, addr, tid, b, &k) + + k.release() + b.mu.Unlock() + return err +} + +func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket, key *Key) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + + if (cur & linux.FUTEX_TID_MASK) != tid { + return syserror.EPERM + } + + var next *Waiter // Who's the next owner? + var next2 *Waiter // Who's the one after that? + for w := b.waiters.Front(); w != nil; w = w.Next() { + if !w.key.matches(key) { + continue + } + + if next == nil { + next = w + } else { + next2 = w + break + } + } + + if next == nil { + // It's safe to set 0 because there are no waiters, no new owner, and the + // executing task is the current owner (no owner died bit). + prev, err := t.CompareAndSwapUint32(addr, cur, 0) + if err != nil { + return err + } + if prev != cur { + // Let user mode handle CAS races. This is different than lock, which + // retries when CAS fails. + return syserror.EAGAIN + } + return nil + } + + // Set next owner's TID, waiters if there are any. Resets owner died bit, if + // set, because the executing task takes over as the owner. + val := next.tid + if next2 != nil { + val |= linux.FUTEX_WAITERS + } + + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return err + } + if prev != cur { + return syserror.EINVAL + } + + b.wakeWaiterLocked(next) + return nil +} diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go new file mode 100644 index 000000000..7c5c7665b --- /dev/null +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -0,0 +1,530 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package futex + +import ( + "math" + "runtime" + "sync/atomic" + "syscall" + "testing" + "unsafe" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// testData implements the Target interface, and allows us to +// treat the address passed for futex operations as an index in +// a byte slice for testing simplicity. 
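// For the PI variants, a minimal trylock/unlock pair might look like the
// hypothetical wrappers below (a sketch; a full lock path would pass
// try=false and block on w.C when LockPI returns false).
func tryLockPI(m *Manager, t Target, addr usermem.Addr, tid uint32) (bool, error) {
	w := NewWaiter()
	// try=true: fail immediately instead of queueing as a waiter.
	return m.LockPI(w, t, addr, tid, true /* private */, true /* try */)
}

func unlockPI(m *Manager, t Target, addr usermem.Addr, tid uint32) error {
	// addr must currently contain tid, i.e. the caller must own the futex.
	return m.UnlockPI(t, addr, tid, true /* private */)
}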
+type testData []byte + +const sizeofInt32 = 4 + +func newTestData(size uint) testData { + return make([]byte, size) +} + +func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + return val, nil +} + +func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + return old, nil + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil +} + +func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil +} + +func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { + return Key{ + Kind: KindSharedMappable, + Offset: uint64(addr), + }, nil +} + +func futexKind(private bool) string { + if private { + return "private" + } + return "shared" +} + +func newPreparedTestWaiter(t *testing.T, m *Manager, ta Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) *Waiter { + w := NewWaiter() + if err := m.WaitPrepare(w, ta, addr, private, val, bitmask); err != nil { + t.Fatalf("WaitPrepare failed: %v", err) + } + return w +} + +func TestFutexWake(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start waiting for wakeup. + w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w) + + // Perform a wakeup. + if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { + t.Errorf("Wake: got (%d, %v), wanted (1, nil)", n, err) + } + + // Expect the waiter to have been woken. + if !w.woken() { + t.Error("waiter not woken") + } + }) + } +} + +func TestFutexWakeBitmask(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start waiting for wakeup. + w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) + defer m.WaitComplete(w) + + // Perform a wakeup using the wrong bitmask. + if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { + t.Errorf("Wake with non-matching bitmask: got (%d, %v), wanted (0, nil)", n, err) + } + + // Expect the waiter to still be waiting. + if w.woken() { + t.Error("waiter woken unexpectedly") + } + + // Perform a wakeup using the right bitmask. + if n, err := m.Wake(d, 0, private, 0x00000001, 1); err != nil || n != 1 { + t.Errorf("Wake with matching bitmask: got (%d, %v), wanted (1, nil)", n, err) + } + + // Expect that the waiter was woken. + if !w.woken() { + t.Error("waiter not woken") + } + }) + } +} + +func TestFutexWakeTwo(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(sizeofInt32) + + // Start three waiters waiting for wakeup. + var ws [3]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } + + // Perform two wakeups. + const wakeups = 2 + if n, err := m.Wake(d, 0, private, ^uint32(0), 2); err != nil || n != wakeups { + t.Errorf("Wake: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } + + // Expect that exactly two waiters were woken. + // We don't get guarantees about exactly which two, + // (although we expect them to be w1 and w2). 
+ awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) + } +} + +func TestFutexWakeUnrelated(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(2 * sizeofInt32) + + // Start two waiters waiting for wakeup on different addresses. + w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform two wakeups on the second address. + if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { + t.Errorf("Wake: got (%d, %v), wanted (1, nil)", n, err) + } + + // Expect that only the second waiter was woken. + if w1.woken() { + t.Error("w1 woken unexpectedly") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) + } +} + +func TestWakeOpEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(2 * sizeofInt32) + + // Perform wakeups with no waiters. + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 0); err != nil || n != 0 { + t.Fatalf("WakeOp: got (%d, %v), wanted (0, nil)", n, err) + } + }) + } +} + +func TestWakeOpFirstNonEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address 0. + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } + + // Expect that both waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) + } +} + +func TestWakeOpSecondNonEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address sizeofInt32. + w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address sizeofInt32 (contingent on + // d.Op(0), which should succeed). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 0, 10, 0); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } + + // Expect that both waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + }) + } +} + +func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address sizeofInt32. 
+ w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Perform 10 wakeups on address sizeofInt32 (contingent on + // d.Op(1), which should fail). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 0, 10, 1); err != nil || n != 0 { + t.Errorf("WakeOp: got (%d, %v), wanted (0, nil)", n, err) + } + + // Expect that neither waiter was woken. + if w1.woken() { + t.Error("w1 woken unexpectedly") + } + if w2.woken() { + t.Error("w2 woken unexpectedly") + } + }) + } +} + +func TestWakeOpAllNonEmpty(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Add two waiters on address sizeofInt32. + w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w3) + w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w4) + + // Perform 10 wakeups on address 0 (unconditionally), and 10 + // wakeups on address sizeofInt32 (contingent on d.Op(0), which + // should succeed). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 0); err != nil || n != 4 { + t.Errorf("WakeOp: got (%d, %v), wanted (4, nil)", n, err) + } + + // Expect that all waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + if !w3.woken() { + t.Error("w3 not woken") + } + if !w4.woken() { + t.Error("w4 not woken") + } + }) + } +} + +func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add two waiters on address 0. + w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w1) + w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(w2) + + // Add two waiters on address sizeofInt32. + w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w3) + w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) + defer m.WaitComplete(w4) + + // Perform 10 wakeups on address 0 (unconditionally), and 10 + // wakeups on address sizeofInt32 (contingent on d.Op(1), which + // should fail). + if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 10, 1); err != nil || n != 2 { + t.Errorf("WakeOp: got (%d, %v), wanted (2, nil)", n, err) + } + + // Expect that only the first two waiters were woken. + if !w1.woken() { + t.Error("w1 not woken") + } + if !w2.woken() { + t.Error("w2 not woken") + } + if w3.woken() { + t.Error("w3 woken unexpectedly") + } + if w4.woken() { + t.Error("w4 woken unexpectedly") + } + }) + } +} + +func TestWakeOpSameAddress(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. 
+ var ws [4]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } + + // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup + // on address 0 (contingent on d.Op(0), which should succeed). + const wakeups = 2 + if n, err := m.WakeOp(d, 0, 0, private, 1, 1, 0); err != nil || n != wakeups { + t.Errorf("WakeOp: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } + + // Expect that exactly two waiters were woken. + awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) + } +} + +func TestWakeOpSameAddressFailingOp(t *testing.T) { + for _, private := range []bool{false, true} { + t.Run(futexKind(private), func(t *testing.T) { + m := NewManager() + d := newTestData(8) + + // Add four waiters on address 0. + var ws [4]*Waiter + for i := range ws { + ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) + defer m.WaitComplete(ws[i]) + } + + // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup + // on address 0 (contingent on d.Op(1), which should fail). + const wakeups = 1 + if n, err := m.WakeOp(d, 0, 0, private, 1, 1, 1); err != nil || n != wakeups { + t.Errorf("WakeOp: got (%d, %v), wanted (%d, nil)", n, err, wakeups) + } + + // Expect that exactly one waiter was woken. + awake := 0 + for i := range ws { + if ws[i].woken() { + awake++ + } + } + if awake != wakeups { + t.Errorf("got %d woken waiters, wanted %d", awake, wakeups) + } + }) + } +} + +const ( + testMutexSize = sizeofInt32 + testMutexLocked uint32 = 1 + testMutexUnlocked uint32 = 0 +) + +// testMutex ties together a testData slice, an address, and a +// futex manager in order to implement the sync.Locker interface. +// Beyond being used as a Locker, this is a simple mechanism for +// changing the underlying values for simpler tests. +type testMutex struct { + a usermem.Addr + d testData + m *Manager +} + +func newTestMutex(addr usermem.Addr, d testData, m *Manager) *testMutex { + return &testMutex{a: addr, d: d, m: m} +} + +// Lock acquires the testMutex. +// This may wait for it to be available via the futex manager. +func (t *testMutex) Lock() { + for { + // Attempt to grab the lock. + if atomic.CompareAndSwapUint32( + (*uint32)(unsafe.Pointer(&t.d[t.a])), + testMutexUnlocked, + testMutexLocked) { + // Lock held. + return + } + + // Wait for it to be "not locked". + w := NewWaiter() + err := t.m.WaitPrepare(w, t.d, t.a, true, testMutexLocked, ^uint32(0)) + if err == syscall.EAGAIN { + continue + } + if err != nil { + // Should never happen. + panic("WaitPrepare returned unexpected error: " + err.Error()) + } + <-w.C + t.m.WaitComplete(w) + } +} + +// Unlock releases the testMutex. +// This will notify any waiters via the futex manager. +func (t *testMutex) Unlock() { + // Unlock. + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) + + // Notify all waiters. + t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) +} + +// This function was shamelessly stolen from mutex_test.go. 
+func HammerMutex(l sync.Locker, loops int, cdone chan bool) { + for i := 0; i < loops; i++ { + l.Lock() + runtime.Gosched() + l.Unlock() + } + cdone <- true +} + +func TestMutexStress(t *testing.T) { + m := NewManager() + d := newTestData(testMutexSize) + tm := newTestMutex(0*testMutexSize, d, m) + c := make(chan bool) + + for i := 0; i < 10; i++ { + go HammerMutex(tm, 1000, c) + } + + for i := 0; i < 10; i++ { + <-c + } +} diff --git a/pkg/sentry/kernel/g3doc/run_states.dot b/pkg/sentry/kernel/g3doc/run_states.dot new file mode 100644 index 000000000..7861fe1f5 --- /dev/null +++ b/pkg/sentry/kernel/g3doc/run_states.dot @@ -0,0 +1,99 @@ +digraph { + subgraph { + App; + } + subgraph { + Interrupt; + InterruptAfterSignalDeliveryStop; + } + subgraph { + Syscall; + SyscallAfterPtraceEventSeccomp; + SyscallEnter; + SyscallAfterSyscallEnterStop; + SyscallAfterSysemuStop; + SyscallInvoke; + SyscallAfterPtraceEventClone; + SyscallAfterExecStop; + SyscallAfterVforkStop; + SyscallReinvoke; + SyscallExit; + } + subgraph { + Vsyscall; + VsyscallAfterPtraceEventSeccomp; + VsyscallInvoke; + } + subgraph { + Exit; + ExitMain; // leave thread group, release resources, reparent children, kill PID namespace and wait if TGID 1 + ExitNotify; // signal parent/tracer, become waitable + ExitDone; // represented by t.runState == nil + } + + // Task exit + Exit -> ExitMain; + ExitMain -> ExitNotify; + ExitNotify -> ExitDone; + + // Execution of untrusted application code + App -> App; + + // Interrupts (usually signal delivery) + App -> Interrupt; + Interrupt -> Interrupt; // if other interrupt conditions may still apply + Interrupt -> Exit; // if killed + + // Syscalls + App -> Syscall; + Syscall -> SyscallEnter; + SyscallEnter -> SyscallInvoke; + SyscallInvoke -> SyscallExit; + SyscallExit -> App; + + // exit, exit_group + SyscallInvoke -> Exit; + + // execve + SyscallInvoke -> SyscallAfterExecStop; + SyscallAfterExecStop -> SyscallExit; + SyscallAfterExecStop -> App; // fatal signal pending + + // vfork + SyscallInvoke -> SyscallAfterVforkStop; + SyscallAfterVforkStop -> SyscallExit; + + // Vsyscalls + App -> Vsyscall; + Vsyscall -> VsyscallInvoke; + Vsyscall -> App; // fault while reading return address from stack + VsyscallInvoke -> App; + + // ptrace-specific branches + Interrupt -> InterruptAfterSignalDeliveryStop; + InterruptAfterSignalDeliveryStop -> Interrupt; + SyscallEnter -> SyscallAfterSyscallEnterStop; + SyscallAfterSyscallEnterStop -> SyscallInvoke; + SyscallAfterSyscallEnterStop -> SyscallExit; // skipped by tracer + SyscallAfterSyscallEnterStop -> App; // fatal signal pending + SyscallEnter -> SyscallAfterSysemuStop; + SyscallAfterSysemuStop -> SyscallExit; + SyscallAfterSysemuStop -> App; // fatal signal pending + SyscallInvoke -> SyscallAfterPtraceEventClone; + SyscallAfterPtraceEventClone -> SyscallExit; + SyscallAfterPtraceEventClone -> SyscallAfterVforkStop; + + // seccomp + Syscall -> App; // SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_KILL, SECCOMP_RET_TRACE without tracer + Syscall -> SyscallAfterPtraceEventSeccomp; // SECCOMP_RET_TRACE + SyscallAfterPtraceEventSeccomp -> SyscallEnter; + SyscallAfterPtraceEventSeccomp -> SyscallExit; // skipped by tracer + SyscallAfterPtraceEventSeccomp -> App; // fatal signal pending + Vsyscall -> VsyscallAfterPtraceEventSeccomp; + VsyscallAfterPtraceEventSeccomp -> VsyscallInvoke; + VsyscallAfterPtraceEventSeccomp -> App; + + // Autosave + SyscallInvoke -> SyscallReinvoke; + SyscallReinvoke -> SyscallInvoke; +} diff --git 
a/pkg/sentry/kernel/g3doc/run_states.png b/pkg/sentry/kernel/g3doc/run_states.png Binary files differnew file mode 100644 index 000000000..b63b60f02 --- /dev/null +++ b/pkg/sentry/kernel/g3doc/run_states.png diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go new file mode 100644 index 000000000..80a070d7e --- /dev/null +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -0,0 +1,58 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" + "gvisor.dev/gvisor/pkg/sentry/kernel/shm" +) + +// IPCNamespace represents an IPC namespace. +// +// +stateify savable +type IPCNamespace struct { + // User namespace which owns this IPC namespace. Immutable. + userNS *auth.UserNamespace + + semaphores *semaphore.Registry + shms *shm.Registry +} + +// NewIPCNamespace creates a new IPC namespace. +func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { + return &IPCNamespace{ + userNS: userNS, + semaphores: semaphore.NewRegistry(userNS), + shms: shm.NewRegistry(userNS), + } +} + +// SemaphoreRegistry returns the semaphore set registry for this namespace. +func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { + return i.semaphores +} + +// ShmRegistry returns the shm segment registry for this namespace. +func (i *IPCNamespace) ShmRegistry() *shm.Registry { + return i.shms +} + +// IPCNamespace returns the task's IPC namespace. +func (t *Task) IPCNamespace() *IPCNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.ipcns +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go new file mode 100644 index 000000000..2177b785a --- /dev/null +++ b/pkg/sentry/kernel/kernel.go @@ -0,0 +1,1682 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernel provides an emulation of the Linux kernel. +// +// See README.md for a detailed overview. +// +// Lock order (outermost locks must be taken first): +// +// Kernel.extMu +// ThreadGroup.timerMu +// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) +// TaskSet.mu +// SignalHandlers.mu +// Task.mu +// runningTasksMu +// +// Locking SignalHandlers.mu in multiple SignalHandlers requires locking +// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same +// time requires locking all of their signal mutexes first. 
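
To make the ordering above concrete, here is a hedged sketch (a hypothetical helper, not part of this change) of code that follows the rule that TaskSet.mu is taken before any SignalHandlers.mu, mirroring what SendContainerSignal does later in this file.

// sketchSignalEachThreadGroup is hypothetical; it only illustrates the lock
// order: TaskSet.mu (read) is acquired first, and each thread group's
// SignalHandlers.mu is nested strictly inside it, never the other way around.
func (k *Kernel) sketchSignalEachThreadGroup(f func(tg *ThreadGroup)) {
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for tg := range k.tasks.Root.tgids {
		tg.signalHandlers.mu.Lock()
		f(tg) // caller's work runs under SignalHandlers.mu
		tg.signalHandlers.mu.Unlock()
	}
}
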
+package kernel + +import ( + "errors" + "fmt" + "path/filepath" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow +// easy access everywhere. To be removed once VFS2 becomes the default. +var VFS2Enabled = false + +// Kernel represents an emulated Linux kernel. It must be initialized by calling +// Init() or LoadFrom(). +// +// +stateify savable +type Kernel struct { + // extMu serializes external changes to the Kernel with calls to + // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel + // remains frozen for the duration of the call; it requires that the Kernel + // is paused as a precondition, which ensures that none of the tasks + // running within the Kernel can affect its state, but extMu is required to + // ensure that concurrent users of the Kernel *outside* the Kernel's + // control cannot affect its state by calling e.g. + // Kernel.SendExternalSignal.) + extMu sync.Mutex `state:"nosave"` + + // started is true if Start has been called. Unless otherwise specified, + // all Kernel fields become immutable once started becomes true. + started bool `state:"nosave"` + + // All of the following fields are immutable unless otherwise specified. + + // Platform is the platform that is used to execute tasks in the created + // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is + // embedded anonymously (the same issue applies). + platform.Platform `state:"nosave"` + + // mf provides application memory. + mf *pgalloc.MemoryFile `state:"nosave"` + + // See InitKernelArgs for the meaning of these fields. 
+ featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + rootNetworkNamespace *inet.Namespace + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + rootAbstractSocketNamespace *AbstractSocketNamespace + + // futexes is the "root" futex.Manager, from which all others are forked. + // This is necessary to ensure that shared futexes are coherent across all + // tasks, including those created by CreateProcess. + futexes *futex.Manager + + // globalInit is the thread group whose leader has ID 1 in the root PID + // namespace. globalInit is stored separately so that it is accessible even + // after all tasks in the thread group have exited, such that ID 1 is no + // longer mapped. + // + // globalInit is mutable until it is assigned by the first successful call + // to CreateProcess, and is protected by extMu. + globalInit *ThreadGroup + + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + + // syslog is the kernel log. + syslog syslog + + // runningTasksMu synchronizes disable/enable of cpuClockTicker when + // the kernel is idle (runningTasks == 0). + // + // runningTasksMu is used to exclude critical sections when the timer + // disables itself and when the first active task enables the timer, + // ensuring that tasks always see a valid cpuClock value. + runningTasksMu sync.Mutex `state:"nosave"` + + // runningTasks is the total count of tasks currently in + // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are + // not blocked or stopped. + // + // runningTasks must be accessed atomically. Increments from 0 to 1 are + // further protected by runningTasksMu (see incRunningTasks). + runningTasks int64 + + // cpuClock is incremented every linux.ClockTick. cpuClock is used to + // measure task CPU usage, since sampling monotonicClock twice on every + // syscall turns out to be unreasonably expensive. This is similar to how + // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), + // although Linux also uses scheduler timing information to improve + // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do + // since "preeemptive" scheduling is managed by the Go runtime, which + // doesn't provide this information. + // + // cpuClock is mutable, and is accessed using atomic memory operations. + cpuClock uint64 + + // cpuClockTicker increments cpuClock. + cpuClockTicker *ktime.Timer `state:"nosave"` + + // cpuClockTickerDisabled indicates that cpuClockTicker has been + // disabled because no tasks are running. + // + // cpuClockTickerDisabled is protected by runningTasksMu. + cpuClockTickerDisabled bool + + // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the + // point it was disabled. It is cached here to avoid a lock ordering + // violation with cpuClockTicker.mu when runningTaskMu is held. + // + // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is + // true. + // + // cpuClockTickerSetting is protected by runningTasksMu. + cpuClockTickerSetting ktime.Setting + + // uniqueID is used to generate unique identifiers. + // + // uniqueID is mutable, and is accessed using atomic memory operations. 
+ uniqueID uint64 + + // nextInotifyCookie is a monotonically increasing counter used for + // generating unique inotify event cookies. + // + // nextInotifyCookie is mutable, and is accessed using atomic memory + // operations. + nextInotifyCookie uint32 + + // netlinkPorts manages allocation of netlink socket port IDs. + netlinkPorts *port.Manager + + // saveErr is the error causing the sandbox to exit during save, if + // any. It is protected by extMu. + saveErr error `state:"nosave"` + + // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. + danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` + + // sockets is the list of all network sockets the system. Protected by + // extMu. + sockets socketList + + // nextSocketEntry is the next entry number to use in sockets. Protected + // by extMu. + nextSocketEntry uint64 + + // deviceRegistry is used to save/restore device.SimpleDevices. + deviceRegistry struct{} `state:".(*device.Registry)"` + + // DirentCacheLimiter controls the number of total dirent entries can be in + // caches. Not all caches use it, only the caches that use host resources use + // the limiter. It may be nil if disabled. + DirentCacheLimiter *fs.DirentCacheLimiter + + // unimplementedSyscallEmitterOnce is used in the initialization of + // unimplementedSyscallEmitter. + unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` + + // unimplementedSyscallEmitter is used to emit unimplemented syscall + // events. This is initialized lazily on the first unimplemented + // syscall. + unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` + + // SpecialOpts contains special kernel options. + SpecialOpts + + // VFS keeps the filesystem state used across the kernel. + vfs vfs.VirtualFilesystem + + // hostMount is the Mount used for file descriptors that were imported + // from the host. + hostMount *vfs.Mount + + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() + // syscalls (as opposed to named pipes created by mknod()). + pipeMount *vfs.Mount + + // shmMount is the Mount used for anonymous files created by the + // memfd_create() syscalls. It is analagous to Linux's shm_mnt. + shmMount *vfs.Mount + + // socketMount is the Mount used for sockets created by the socket() and + // socketpair() syscalls. There are several cases where a socket dentry will + // not be contained in socketMount: + // 1. Socket files created by mknod() + // 2. Socket fds imported from the host (Kernel.hostMount is used for these) + // 3. Socket files created by binding Unix sockets to a file path + socketMount *vfs.Mount + + // If set to true, report address space activation waits as if the task is in + // external wait so that the watchdog doesn't report the task stuck. + SleepForAddressSpaceActivation bool +} + +// InitKernelArgs holds arguments to Init. +type InitKernelArgs struct { + // FeatureSet is the emulated CPU feature set. + FeatureSet *cpuid.FeatureSet + + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + + // RootUserNamespace is the root user namespace. + RootUserNamespace *auth.UserNamespace + + // RootNetworkNamespace is the root network namespace. If nil, no networking + // will be available. + RootNetworkNamespace *inet.Namespace + + // ApplicationCores is the number of logical CPUs visible to sandboxed + // applications. 
The set of logical CPU IDs is [0, ApplicationCores); thus + // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the + // most significant bit in cpu_possible_mask + 1. + ApplicationCores uint + + // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU + // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a + // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it + // will be overridden. + UseHostCores bool + + // ExtraAuxv contains additional auxiliary vector entries that are added to + // each process by the ELF loader. + ExtraAuxv []arch.AuxEntry + + // Vdso holds the VDSO and its parameter page. + Vdso *loader.VDSO + + // RootUTSNamespace is the root UTS namespace. + RootUTSNamespace *UTSNamespace + + // RootIPCNamespace is the root IPC namespace. + RootIPCNamespace *IPCNamespace + + // RootAbstractSocketNamespace is the root Abstract Socket namespace. + RootAbstractSocketNamespace *AbstractSocketNamespace + + // PIDNamespace is the root PID namespace. + PIDNamespace *PIDNamespace +} + +// Init initialize the Kernel with no tasks. +// +// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile +// before calling Init. +func (k *Kernel) Init(args InitKernelArgs) error { + if args.FeatureSet == nil { + return fmt.Errorf("FeatureSet is nil") + } + if args.Timekeeper == nil { + return fmt.Errorf("Timekeeper is nil") + } + if args.Timekeeper.clocks == nil { + return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + } + if args.RootUserNamespace == nil { + return fmt.Errorf("RootUserNamespace is nil") + } + if args.ApplicationCores == 0 { + return fmt.Errorf("ApplicationCores is 0") + } + + k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper + k.tasks = newTaskSet(args.PIDNamespace) + k.rootUserNamespace = args.RootUserNamespace + k.rootUTSNamespace = args.RootUTSNamespace + k.rootIPCNamespace = args.RootIPCNamespace + k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace + k.rootNetworkNamespace = args.RootNetworkNamespace + if k.rootNetworkNamespace == nil { + k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil) + } + k.applicationCores = args.ApplicationCores + if args.UseHostCores { + k.useHostCores = true + maxCPU, err := hostcpu.MaxPossibleCPU() + if err != nil { + return fmt.Errorf("Failed to get maximum CPU number: %v", err) + } + minAppCores := uint(maxCPU) + 1 + if k.applicationCores < minAppCores { + log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) + k.applicationCores = minAppCores + } + } + k.extraAuxv = args.ExtraAuxv + k.vdso = args.Vdso + k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.futexes = futex.NewManager() + k.netlinkPorts = port.New() + + if VFS2Enabled { + if err := k.vfs.Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %v", err) + } + + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create pipefs filesystem: %v", err) + } + defer pipeFilesystem.DecRef() + pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create pipefs mount: %v", err) + } + k.pipeMount = pipeMount + + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + if err 
!= nil { + return fmt.Errorf("failed to create tmpfs filesystem: %v", err) + } + defer tmpfsFilesystem.DecRef() + defer tmpfsRoot.DecRef() + shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create tmpfs mount: %v", err) + } + k.shmMount = shmMount + + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create sockfs filesystem: %v", err) + } + defer socketFilesystem.DecRef() + socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create sockfs mount: %v", err) + } + k.socketMount = socketMount + } + + return nil +} + +// SaveTo saves the state of k to w. +// +// Preconditions: The kernel must be paused throughout the call to SaveTo. +func (k *Kernel) SaveTo(w wire.Writer) error { + saveStart := time.Now() + ctx := k.SupervisorContext() + + // Do not allow other Kernel methods to affect it while it's being saved. + k.extMu.Lock() + defer k.extMu.Unlock() + + // Stop time. + k.pauseTimeLocked() + defer k.resumeTimeLocked() + + // Evict all evictable MemoryFile allocations. + k.mf.StartEvictions() + k.mf.WaitForEvictions() + + // Flush write operations on open files so data reaches backing storage. + // This must come after MemoryFile eviction since eviction may cause file + // writes. + if err := k.tasks.flushWritesToFiles(ctx); err != nil { + return err + } + + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters() + + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + if err := k.flushMountSourceRefs(); err != nil { + return err + } + + // Ensure that all inode and mount release operations have completed. + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Save the CPUID FeatureSet before the rest of the kernel so we can + // verify its compatibility on restore before attempting to restore the + // entire kernel, which may fail on an incompatible machine. + // + // N.B. This will also be saved along with the full kernel save below. + cpuidStart := time.Now() + if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil { + return err + } + log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) + + // Save the kernel state. + kernelStart := time.Now() + stats, err := state.Save(k.SupervisorContext(), w, k) + if err != nil { + return err + } + log.Infof("Kernel save stats: %s", stats.String()) + log.Infof("Kernel save took [%s].", time.Since(kernelStart)) + + // Save the memory file's state. 
+ memoryStart := time.Now() + if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { + return err + } + log.Infof("Memory save took [%s].", time.Since(memoryStart)) + + log.Infof("Overall save took [%s].", time.Since(saveStart)) + + return nil +} + +// flushMountSourceRefs flushes the MountSources for all mounted filesystems +// and open FDs. +func (k *Kernel) flushMountSourceRefs() error { + // Flush all mount sources for currently mounted filesystems in each task. + flushed := make(map[*fs.MountNamespace]struct{}) + k.tasks.mu.RLock() + k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if _, ok := flushed[tg.mounts]; ok { + // Already flushed. + return + } + tg.mounts.FlushMountSourceRefs() + flushed[tg.mounts] = struct{}{} + }) + k.tasks.mu.RUnlock() + + // There may be some open FDs whose filesystems have been unmounted. We + // must flush those as well. + return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + file.Dirent.Inode.MountSource.FlushDirentRefs() + return nil + }) +} + +// forEachFDPaused applies the given function to each open file descriptor in +// each task. +// +// Precondition: Must be called with the kernel paused. +func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { + err = lastErr + } + }) + } + return err +} + +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + if flags := file.Flags(); !flags.Write { + return nil + } + if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + return nil + } + // Here we need all metadata synced. + syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := file.Dirent.FullName(nil /* root */) + // Wrap this error in ErrSaveRejection so that it will trigger a save + // error, rather than a panic. This also allows us to distinguish Fsync + // errors from state file errors in state.Save. + return fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + } + } + return nil + }) +} + +// Preconditions: The kernel must be paused. +func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.tc.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. 
+ if VFS2Enabled { + return + } + + ts.mu.RLock() + defer ts.mu.RUnlock() + + // Tasks that belong to the same process could potentially point to the + // same FDTable. So we retain a map of processed ones to avoid + // processing the same FDTable multiple times. + processed := make(map[*FDTable]struct{}) + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + if _, ok := processed[t.fdTable]; ok { + continue + } + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + }) + processed[t.fdTable] = struct{}{} + } +} + +// LoadFrom returns a new Kernel loaded from args. +func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error { + loadStart := time.Now() + + initAppCores := k.applicationCores + + // Load the pre-saved CPUID FeatureSet. + // + // N.B. This was also saved along with the full kernel below, so we + // don't need to explicitly install it in the Kernel. + cpuidStart := time.Now() + var features cpuid.FeatureSet + if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil { + return err + } + log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) + + // Verify that the FeatureSet is usable on this host. We do this before + // Kernel load so that the explicit CPUID mismatch error has priority + // over floating point state restore errors that may occur on load on + // an incompatible machine. + if err := features.CheckHostCompatible(); err != nil { + return err + } + + // Load the kernel state. + kernelStart := time.Now() + stats, err := state.Load(k.SupervisorContext(), r, k) + if err != nil { + return err + } + log.Infof("Kernel load stats: %s", stats.String()) + log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + + // rootNetworkNamespace should be populated after loading the state file. + // Restore the root network stack. + k.rootNetworkNamespace.RestoreRootStack(net) + + // Load the memory file's state. + memoryStart := time.Now() + if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { + return err + } + log.Infof("Memory load took [%s].", time.Since(memoryStart)) + + log.Infof("Overall load took [%s]", time.Since(loadStart)) + + k.Timekeeper().SetClocks(clocks) + if net != nil { + net.Resume() + } + + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } + + tcpip.AsyncLoading.Wait() + + log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) + + // Applications may size per-cpu structures based on k.applicationCores, so + // it can't change across save/restore. When we are virtualizing CPU + // numbers, this isn't a problem. However, when we are exposing host CPU + // assignments, we can't tolerate an increase in the number of host CPUs, + // which could result in getcpu(2) returning CPUs that applications expect + // not to exist. + if k.useHostCores && initAppCores > k.applicationCores { + return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) + } + + return nil +} + +// UniqueID returns a unique identifier. 
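
SaveTo and LoadFrom above are the two halves of checkpoint/restore. A hedged sketch of how an external caller might drive them follows; the writer, reader, network stack, and clocks are placeholders supplied by the caller, and the error handling is representative only.

// checkpointThenRestoreSketch is hypothetical; it only shows the calling
// convention: pause before SaveTo, and restore into a Kernel whose platform
// and MemoryFile have already been set (see SetMemoryFile and Init).
func checkpointThenRestoreSketch(k, restored *Kernel, w wire.Writer, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
	// SaveTo requires the kernel to be paused for the duration of the call.
	k.Pause()
	err := k.SaveTo(w)
	k.Unpause()
	if err != nil {
		return err
	}

	// LoadFrom re-populates the new kernel, restores the root network stack,
	// and resumes timekeeping with the given clocks.
	return restored.LoadFrom(r, net, clocks)
}
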
+func (k *Kernel) UniqueID() uint64 { + id := atomic.AddUint64(&k.uniqueID, 1) + if id == 0 { + panic("unique identifier generator wrapped around") + } + return id +} + +// CreateProcessArgs holds arguments to kernel.CreateProcess. +type CreateProcessArgs struct { + // Filename is the filename to load as the init binary. + // + // If this is provided as "", File will be checked, then the file will be + // guessed via Argv[0]. + Filename string + + // File is a passed host FD pointing to a file to load as the init binary. + // + // This is checked if and only if Filename is "". + File fsbridge.File + + // Argvv is a list of arguments. + Argv []string + + // Envv is a list of environment variables. + Envv []string + + // WorkingDirectory is the initial working directory. + // + // This defaults to the root if empty. + WorkingDirectory string + + // Credentials is the initial credentials. + Credentials *auth.Credentials + + // FDTable is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDTable. + FDTable *FDTable + + // Umask is the initial umask. + Umask uint + + // Limits is the initial resource limits. + Limits *limits.LimitSet + + // MaxSymlinkTraversals is the maximum number of symlinks to follow + // during resolution. + MaxSymlinkTraversals uint + + // UTSNamespace is the initial UTS namespace. + UTSNamespace *UTSNamespace + + // IPCNamespace is the initial IPC namespace. + IPCNamespace *IPCNamespace + + // PIDNamespace is the initial PID Namespace. + PIDNamespace *PIDNamespace + + // AbstractSocketNamespace is the initial Abstract Socket namespace. + AbstractSocketNamespace *AbstractSocketNamespace + + // MountNamespace optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespace must donate a reference (i.e. + // increment it). + MountNamespace *fs.MountNamespace + + // MountNamespaceVFS2 optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. + // increment it). + MountNamespaceVFS2 *vfs.MountNamespace + + // ContainerID is the container that the process belongs to. + ContainerID string +} + +// NewContext returns a context.Context that represents the task that will be +// created by args.NewContext(k). +func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { + return &createProcessContext{ + Logger: log.Log(), + k: k, + args: args, + } +} + +// createProcessContext is a context.Context that represents the context +// associated with a task that is being created. +type createProcessContext struct { + context.NoopSleeper + log.Logger + k *Kernel + args *CreateProcessArgs +} + +// Value implements context.Context.Value. +func (ctx *createProcessContext) Value(key interface{}) interface{} { + switch key { + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.args.PIDNamespace + case CtxUTSNamespace: + return ctx.args.UTSNamespace + case CtxIPCNamespace: + return ctx.args.IPCNamespace + case auth.CtxCredentials: + return ctx.args.Credentials + case fs.CtxRoot: + if ctx.args.MountNamespace != nil { + // MountNamespace.Root() will take a reference on the root dirent for us. + return ctx.args.MountNamespace.Root() + } + return nil + case vfs.CtxRoot: + if ctx.args.MountNamespaceVFS2 == nil { + return nil + } + // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. 
+ return ctx.args.MountNamespaceVFS2.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2 takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + return ctx.args.Limits + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} + +// CreateProcess creates a new task in a new thread group with the given +// options. The new task has no parent and is in the root PID namespace. +// +// If k.Start() has already been called, then the created process must be +// started by calling kernel.StartProcess(tg). +// +// If k.Start() has not yet been called, then the created task will begin +// running when k.Start() is called. +// +// CreateProcess has no analogue in Linux; it is used to create the initial +// application task, as well as processes started by the control server. +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { + k.extMu.Lock() + defer k.extMu.Unlock() + log.Infof("EXEC: %v", args.Argv) + + ctx := args.NewContext(k) + + var ( + opener fsbridge.Lookup + fsContext *FSContext + mntns *fs.MountNamespace + ) + + if VFS2Enabled { + mntnsVFS2 := args.MountNamespaceVFS2 + if mntnsVFS2 == nil { + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() + } + // Get the root directory from the MountNamespace. + root := args.MountNamespaceVFS2.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + pop := vfs.PathOperation{ + Root: root, + Start: wd, + Path: fspath.Parse(args.WorkingDirectory), + FollowFinalSymlink: true, + } + var err error + wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) + fsContext = NewFSContextVFS2(root, wd, args.Umask) + + } else { + mntns = args.MountNamespace + if mntns == nil { + mntns = k.GlobalInit().Leader().MountNamespace() + mntns.IncRef() + } + // Get the root directory from the MountNamespace. + root := mntns.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + remainingTraversals := args.MaxSymlinkTraversals + wd := root // Default. 
+ if args.WorkingDirectory != "" { + var err error + wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewFSLookup(mntns, root, wd) + fsContext = newFSContext(root, wd, args.Umask) + } + + tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + + // Check which file to start from. + switch { + case args.Filename != "": + // If a filename is given, take that. + // Set File to nil so we resolve the path in LoadTaskImage. + args.File = nil + case args.File != nil: + // If File is set, take the File provided directly. + default: + // Otherwise look at Argv and see if the first argument is a valid path. + if len(args.Argv) == 0 { + return nil, 0, fmt.Errorf("no filename or command provided") + } + if !filepath.IsAbs(args.Argv[0]) { + return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + } + args.Filename = args.Argv[0] + } + + // Create a fresh task context. + remainingTraversals := args.MaxSymlinkTraversals + loadArgs := loader.LoadArgs{ + Opener: opener, + RemainingTraversals: &remainingTraversals, + ResolveFinal: true, + Filename: args.Filename, + File: args.File, + CloseOnExec: false, + Argv: args.Argv, + Envv: args.Envv, + Features: k.featureSet, + } + + tc, se := k.LoadTaskImage(ctx, loadArgs) + if se != nil { + return nil, 0, errors.New(se.String()) + } + + // Take a reference on the FDTable, which will be transferred to + // TaskSet.NewTask(). + args.FDTable.IncRef() + + // Create the task. + config := &TaskConfig{ + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + FSContext: fsContext, + FDTable: args.FDTable, + Credentials: args.Credentials, + NetworkNamespace: k.RootNetworkNamespace(), + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AbstractSocketNamespace: args.AbstractSocketNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, + ContainerID: args.ContainerID, + } + t, err := k.tasks.NewTask(config) + if err != nil { + return nil, 0, err + } + t.traceExecEvent(tc) // Simulate exec for tracing. + + // Success. + tgid := k.tasks.Root.IDOfThreadGroup(tg) + if k.globalInit == nil { + k.globalInit = tg + } + return tg, tgid, nil +} + +// StartProcess starts running a process that was created with CreateProcess. +func (k *Kernel) StartProcess(tg *ThreadGroup) { + t := tg.Leader() + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) +} + +// Start starts execution of all tasks in k. +// +// Preconditions: Start may be called exactly once. +func (k *Kernel) Start() error { + k.extMu.Lock() + defer k.extMu.Unlock() + + if k.globalInit == nil { + return fmt.Errorf("kernel contains no tasks") + } + if k.started { + return fmt.Errorf("kernel already started") + } + + k.started = true + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) + k.cpuClockTicker.Swap(ktime.Setting{ + Enabled: true, + Period: linux.ClockTick, + }) + // If k was created by LoadKernelFrom, timers were stopped during + // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, + // this is a no-op. + k.resumeTimeLocked() + // Start task goroutines. 
+ k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t, tid := range k.tasks.Root.tids { + t.Start(tid) + } + return nil +} + +// pauseTimeLocked pauses all Timers and Timekeeper updates. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) pauseTimeLocked() { + // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before + // Kernel.Start(). + if k.cpuClockTicker != nil { + k.cpuClockTicker.Pause() + } + + // By precondition, nothing else can be interacting with PIDNamespace.tids + // or FDTable.files, so we can iterate them without synchronization. (We + // can't hold the TaskSet mutex when pausing thread group timers because + // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet + // mutex, while holding the Timer mutex.) + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Pause() + for _, it := range t.tg.timers { + it.PauseTimer() + } + } + // This means we'll iterate FDTables shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { + tfd.PauseTimer() + } + } else { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { + tfd.PauseTimer() + } + } + }) + } + } + k.timekeeper.PauseUpdates() +} + +// resumeTimeLocked resumes all Timers and Timekeeper updates. If +// pauseTimeLocked has not been previously called, resumeTimeLocked has no +// effect. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) resumeTimeLocked() { + if k.cpuClockTicker != nil { + k.cpuClockTicker.Resume() + } + + k.timekeeper.ResumeUpdates() + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Resume() + for _, it := range t.tg.timers { + it.ResumeTimer() + } + } + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { + tfd.ResumeTimer() + } + } else { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + } + }) + } + } +} + +func (k *Kernel) incRunningTasks() { + for { + tasks := atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // Standard case. Simply increment. + if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) { + continue + } + return + } + + // Transition from 0 -> 1. Synchronize with other transitions and timer. + k.runningTasksMu.Lock() + tasks = atomic.LoadInt64(&k.runningTasks) + if tasks != 0 { + // We're no longer the first task, no need to + // re-enable. + atomic.AddInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + if !k.cpuClockTickerDisabled { + // Timer was never disabled. + atomic.StoreInt64(&k.runningTasks, 1) + k.runningTasksMu.Unlock() + return + } + + // We need to update cpuClock for all of the ticks missed while we + // slept, and then re-enable the timer. + // + // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify + // always increments cpuClock by 1 regardless of the number of + // expirations as a heuristic to avoid over-accounting in cases of CPU + // throttling. 
+ // + // We want to cover the normal case, when all time should be accounted, + // so we increment for all expirations. Throttling is less concerning + // here because the ticker is only disabled from Notify. This means + // that Notify must schedule and compensate for the throttled period + // before the timer is disabled. Throttling while the timer is disabled + // doesn't matter, as nothing is running or reading cpuClock anyways. + // + // S/R also adds complication, as there are two cases. Recall that + // monotonicClock will jump forward on restore. + // + // 1. If the ticker is enabled during save, then on Restore Notify is + // called with many expirations, covering the time jump, but cpuClock + // is only incremented by 1. + // + // 2. If the ticker is disabled during save, then after Restore the + // first wakeup will call this function and cpuClock will be + // incremented by the number of expirations across the S/R. + // + // These cause very different value of cpuClock. But again, since + // nothing was running while the ticker was disabled, those differences + // don't matter. + setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now()) + if exp > 0 { + atomic.AddUint64(&k.cpuClock, exp) + } + + // Now that cpuClock is updated it is safe to allow other tasks to + // transition to running. + atomic.StoreInt64(&k.runningTasks, 1) + + // N.B. we must unlock before calling Swap to maintain lock ordering. + // + // cpuClockTickerDisabled need not wait until after Swap to become + // true. It is sufficient that the timer *will* be enabled. + k.cpuClockTickerDisabled = false + k.runningTasksMu.Unlock() + + // This won't call Notify (unless it's been ClockTick since setting.At + // above). This means we skip the thread group work in Notify. However, + // since nothing was running while we were disabled, none of the timers + // could have expired. + k.cpuClockTicker.Swap(setting) + + return + } +} + +func (k *Kernel) decRunningTasks() { + tasks := atomic.AddInt64(&k.runningTasks, -1) + if tasks < 0 { + panic(fmt.Sprintf("Invalid running count %d", tasks)) + } + + // Nothing to do. The next CPU clock tick will disable the timer if + // there is still nothing running. This provides approximately one tick + // of slack in which we can switch back and forth between idle and + // active without an expensive transition. +} + +// WaitExited blocks until all tasks in k have exited. +func (k *Kernel) WaitExited() { + k.tasks.liveGoroutines.Wait() +} + +// Kill requests that all tasks in k immediately exit as if group exiting with +// status es. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(es ExitStatus) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.Kill(es) +} + +// Pause requests that all tasks in k temporarily stop executing, and blocks +// until all tasks and asynchronous I/O operations in k have stopped. Multiple +// calls to Pause nest and require an equal number of calls to Unpause to +// resume execution. +func (k *Kernel) Pause() { + k.extMu.Lock() + k.tasks.BeginExternalStop() + k.extMu.Unlock() + k.tasks.runningGoroutines.Wait() + k.tasks.aioGoroutines.Wait() +} + +// Unpause ends the effect of a previous call to Pause. If Unpause is called +// without a matching preceding call to Pause, Unpause may panic. +func (k *Kernel) Unpause() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.EndExternalStop() +} + +// SendExternalSignal injects a signal into the kernel. +// +// context is used only for debugging to describe how the signal was received. 
+//
+// Preconditions: Kernel must have an init process.
+func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.sendExternalSignal(info, context)
+}
+
+// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
+// This function doesn't skip signals like SendExternalSignal does.
+func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return tg.SendSignal(info)
+}
+
+// SendContainerSignal sends the given signal to all processes inside the
+// namespace that match the given container ID.
+func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	var lastErr error
+	for tg := range k.tasks.Root.tgids {
+		if tg.leader.ContainerID() == cid {
+			tg.signalHandlers.mu.Lock()
+			infoCopy := *info
+			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
+				lastErr = err
+			}
+			tg.signalHandlers.mu.Unlock()
+		}
+	}
+	return lastErr
+}
+
+// RebuildTraceContexts rebuilds the trace context for all tasks.
+//
+// Unfortunately, if these are built while tracing is not enabled, then we will
+// not have meaningful trace data. Rebuilding here ensures that we can do so
+// after tracing has been enabled.
+func (k *Kernel) RebuildTraceContexts() {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	for t, tid := range k.tasks.Root.tids {
+		t.rebuildTraceContext(tid)
+	}
+}
+
+// FeatureSet returns the FeatureSet.
+func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
+	return k.featureSet
+}
+
+// Timekeeper returns the Timekeeper.
+func (k *Kernel) Timekeeper() *Timekeeper {
+	return k.timekeeper
+}
+
+// TaskSet returns the TaskSet.
+func (k *Kernel) TaskSet() *TaskSet {
+	return k.tasks
+}
+
+// RootUserNamespace returns the root UserNamespace.
+func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
+	return k.rootUserNamespace
+}
+
+// RootUTSNamespace returns the root UTSNamespace.
+func (k *Kernel) RootUTSNamespace() *UTSNamespace {
+	return k.rootUTSNamespace
+}
+
+// RootIPCNamespace returns the root IPCNamespace.
+func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+	return k.rootIPCNamespace
+}
+
+// RootPIDNamespace returns the root PIDNamespace.
+func (k *Kernel) RootPIDNamespace() *PIDNamespace {
+	return k.tasks.Root
+}
+
+// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
+func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
+	return k.rootAbstractSocketNamespace
+}
+
+// RootNetworkNamespace returns the root network namespace, always non-nil.
+func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
+	return k.rootNetworkNamespace
+}
+
+// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
+// nil if no such thread group exists. GlobalInit may return a thread group
+// containing no tasks if the thread group has already exited.
+func (k *Kernel) GlobalInit() *ThreadGroup {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	return k.globalInit
+}
+
+// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace.
+func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) {
+	k.globalInit = tg
+}
+
+// ApplicationCores returns the number of CPUs visible to sandboxed
+// applications.
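
As a usage example, a hypothetical control-plane helper that delivers SIGTERM to every process in a given container might call SendContainerSignal roughly as follows; the helper itself and the container ID are placeholders, not part of this change.

// stopContainerSketch is hypothetical; it shows the expected shape of the
// arch.SignalInfo argument and the call into SendContainerSignal.
func stopContainerSketch(k *Kernel, cid string) error {
	info := &arch.SignalInfo{
		Signo: int32(linux.SIGTERM),
		Code:  arch.SignalInfoUser,
	}
	return k.SendContainerSignal(cid, info)
}
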
+func (k *Kernel) ApplicationCores() uint { + return k.applicationCores +} + +// RealtimeClock returns the application CLOCK_REALTIME clock. +func (k *Kernel) RealtimeClock() ktime.Clock { + return k.realtimeClock +} + +// MonotonicClock returns the application CLOCK_MONOTONIC clock. +func (k *Kernel) MonotonicClock() ktime.Clock { + return k.monotonicClock +} + +// CPUClockNow returns the current value of k.cpuClock. +func (k *Kernel) CPUClockNow() uint64 { + return atomic.LoadUint64(&k.cpuClock) +} + +// Syslog returns the syslog. +func (k *Kernel) Syslog() *syslog { + return &k.syslog +} + +// GenerateInotifyCookie generates a unique inotify event cookie. +// +// Returned values may overlap with previously returned values if the value +// space is exhausted. 0 is not a valid cookie value, all other values +// representable in a uint32 are allowed. +func (k *Kernel) GenerateInotifyCookie() uint32 { + id := atomic.AddUint32(&k.nextInotifyCookie, 1) + // Wrap-around is explicitly allowed for inotify event cookies. + if id == 0 { + id = atomic.AddUint32(&k.nextInotifyCookie, 1) + } + return id +} + +// NetlinkPorts returns the netlink port manager. +func (k *Kernel) NetlinkPorts() *port.Manager { + return k.netlinkPorts +} + +// SaveError returns the sandbox error that caused the kernel to exit during +// save. +func (k *Kernel) SaveError() error { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.saveErr +} + +// SetSaveError sets the sandbox error that caused the kernel to exit during +// save, if one is not already set. +func (k *Kernel) SetSaveError(err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.saveErr == nil { + k.saveErr = err + } +} + +var _ tcpip.Clock = (*Kernel)(nil) + +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (k *Kernel) NowNanoseconds() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Realtime) + if err != nil { + panic("Kernel.NowNanoseconds: " + err.Error()) + } + return now +} + +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (k *Kernel) NowMonotonic() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Kernel.NowMonotonic: " + err.Error()) + } + return now +} + +// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or +// LoadFrom. +func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { + k.mf = mf +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { + return k.mf +} + +// SupervisorContext returns a Context with maximum privileges in k. It should +// only be used by goroutines outside the control of the emulated kernel +// defined by e. +// +// Callers are responsible for ensuring that the returned Context is not used +// concurrently with changes to the Kernel. +func (k *Kernel) SupervisorContext() context.Context { + return supervisorContext{ + Logger: log.Log(), + k: k, + } +} + +// SocketEntry represents a socket recorded in Kernel.sockets. It implements +// refs.WeakRefUser for sockets stored in the socket table. +// +// +stateify savable +type SocketEntry struct { + socketEntry + k *Kernel + Sock *refs.WeakRef + SockVFS2 *vfs.FileDescription + ID uint64 // Socket table entry number. +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (s *SocketEntry) WeakRefGone() { + s.k.extMu.Lock() + s.k.sockets.Remove(s) + s.k.extMu.Unlock() +} + +// RecordSocket adds a socket to the system-wide socket table for tracking. 
+// +// Precondition: Caller must hold a reference to sock. +func (k *Kernel) RecordSocket(sock *fs.File) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{k: k, ID: id} + s.Sock = refs.NewWeakRef(sock, s) + k.sockets.PushBack(s) + k.extMu.Unlock() +} + +// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for +// tracking. +// +// Precondition: Caller must hold a reference to sock. +// +// Note that the socket table will not hold a reference on the +// vfs.FileDescription, because we do not support weak refs on VFS2 files. +func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{ + k: k, + ID: id, + SockVFS2: sock, + } + k.sockets.PushBack(s) + k.extMu.Unlock() +} + +// ListSockets returns a snapshot of all sockets. +// +// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// to get a reference on a socket in the table. +func (k *Kernel) ListSockets() []*SocketEntry { + k.extMu.Lock() + var socks []*SocketEntry + for s := k.sockets.Front(); s != nil; s = s.Next() { + socks = append(socks, s) + } + k.extMu.Unlock() + return socks +} + +// supervisorContext is a privileged context. +type supervisorContext struct { + context.NoopSleeper + log.Logger + k *Kernel +} + +// Value implements context.Context. +func (ctx supervisorContext) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + // The supervisor context can trace anything. (None of + // supervisorContext's users are expected to invoke ptrace, but ptrace + // permissions are required for certain file accesses.) + return func(*Task, bool) bool { return true } + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.k.rootUTSNamespace + case CtxIPCNamespace: + return ctx.k.rootIPCNamespace + case auth.CtxCredentials: + // The supervisor context is global root. + return auth.NewRootCredentials(ctx.k.rootUserNamespace) + case fs.CtxRoot: + if ctx.k.globalInit != nil { + return ctx.k.globalInit.mounts.Root() + } + return nil + case vfs.CtxRoot: + if ctx.k.globalInit == nil { + return vfs.VirtualDentry{} + } + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + defer mntns.DecRef() + // Root() takes a reference on the root dirent for us. + return mntns.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2() takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + // No limits apply. + return limits.NewLimitSet() + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} + +// Rate limits for the number of unimplemented syscall events. +const ( + unimplementedSyscallsMaxRate = 100 // events per second + unimplementedSyscallBurst = 1000 // events +) + +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. 
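
A short hedged sketch of a consumer of ListSockets (defined above), following the reference rules in its comment; the weak-reference handling for VFS1 entries is an assumption based on the refs.WeakRef API rather than something exercised in this change.

// countLiveSocketsSketch is hypothetical; it walks the socket table and
// counts entries whose sockets are still live.
func countLiveSocketsSketch(k *Kernel) int {
	live := 0
	for _, se := range k.ListSockets() {
		if se.SockVFS2 != nil {
			// VFS2 entries hold no weak reference; take a real reference
			// only if the file has not already been destroyed.
			if se.SockVFS2.TryIncRef() {
				live++
				se.SockVFS2.DecRef()
			}
			continue
		}
		// VFS1 entries hold a weak reference; Get returns nil once the
		// socket is gone, and a successful Get must be paired with DecRef.
		if sock := se.Sock.Get(); sock != nil {
			live++
			sock.DecRef()
		}
	}
	return live
}
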
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + k.unimplementedSyscallEmitterOnce.Do(func() { + k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) + }) + + t := TaskFromContext(ctx) + k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} + +// VFS returns the virtual filesystem for the kernel. +func (k *Kernel) VFS() *vfs.VirtualFilesystem { + return &k.vfs +} + +// SetHostMount sets the hostfs mount. +func (k *Kernel) SetHostMount(mnt *vfs.Mount) { + if k.hostMount != nil { + panic("Kernel.hostMount cannot be set more than once") + } + k.hostMount = mnt +} + +// HostMount returns the hostfs mount. +func (k *Kernel) HostMount() *vfs.Mount { + return k.hostMount +} + +// PipeMount returns the pipefs mount. +func (k *Kernel) PipeMount() *vfs.Mount { + return k.pipeMount +} + +// ShmMount returns the tmpfs mount. +func (k *Kernel) ShmMount() *vfs.Mount { + return k.shmMount +} + +// SocketMount returns the sockfs mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go new file mode 100644 index 000000000..2e66ec587 --- /dev/null +++ b/pkg/sentry/kernel/kernel_opts.go @@ -0,0 +1,20 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// SpecialOpts contains non-standard options for the kernel. +// +// +stateify savable +type SpecialOpts struct{} diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go new file mode 100644 index 000000000..909219086 --- /dev/null +++ b/pkg/sentry/kernel/kernel_state.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/device" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// saveDanglingEndpoints is invoked by stateify. +func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint { + return tcpip.GetDanglingEndpoints() +} + +// loadDanglingEndpoints is invoked by stateify. +func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { + for _, e := range es { + tcpip.AddDanglingEndpoint(e) + } +} + +// saveDeviceRegistry is invoked by stateify. 
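
The saveDanglingEndpoints/loadDanglingEndpoints hooks above, and the saveDeviceRegistry/loadDeviceRegistry pair that follows, use the stateify convention in which a field tagged state:".(T)" is converted to and from T by matching saveX/loadX methods. A hedged sketch of that pattern on an invented type:

// cacheOwner is an invented example of the stateify save/load hook pattern;
// its map is saved as a plain slice of keys and rebuilt on load.
//
// +stateify savable
type cacheOwner struct {
	cache map[string]struct{} `state:".([]string)"`
}

// saveCache is invoked by stateify; it converts the field to its saved form.
func (c *cacheOwner) saveCache() []string {
	keys := make([]string, 0, len(c.cache))
	for k := range c.cache {
		keys = append(keys, k)
	}
	return keys
}

// loadCache is invoked by stateify; it rebuilds the field from the saved form.
func (c *cacheOwner) loadCache(keys []string) {
	c.cache = make(map[string]struct{}, len(keys))
	for _, k := range keys {
		c.cache[k] = struct{}{}
	}
}
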
+func (k *Kernel) saveDeviceRegistry() *device.Registry { + return device.SimpleDevices +} + +// loadDeviceRegistry is invoked by stateify. +func (k *Kernel) loadDeviceRegistry(r *device.Registry) { + device.SimpleDevices.LoadFrom(r) +} diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD new file mode 100644 index 000000000..4486848d2 --- /dev/null +++ b/pkg/sentry/kernel/memevent/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library", "proto_library") + +package(licenses = ["notice"]) + +go_library( + name = "memevent", + srcs = ["memory_events.go"], + visibility = ["//:sandbox"], + deps = [ + ":memory_events_go_proto", + "//pkg/eventchannel", + "//pkg/log", + "//pkg/metric", + "//pkg/sentry/kernel", + "//pkg/sentry/usage", + "//pkg/sync", + ], +) + +proto_library( + name = "memory_events", + srcs = ["memory_events.proto"], + visibility = ["//visibility:public"], +) diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go new file mode 100644 index 000000000..200565bb8 --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memevent implements the memory usage events controller, which +// periodically emits events via the eventchannel. +package memevent + +import ( + "time" + + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/sentry/kernel" + pb "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" +) + +var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", false /*sync*/, "Total number of memory event periods that have elapsed since startup.") + var totalEvents = metric.MustCreateNewUint64Metric("/memory_events/events", false /*sync*/, "Total number of memory events emitted.") + +// MemoryEvents describes the configuration for the global memory event emitter. +type MemoryEvents struct { + k *kernel.Kernel + + // The period is how often to emit an event. The memory events goroutine + // will ensure a minimum of one event is emitted per this period, regardless + // of how much memory usage has changed. + period time.Duration + + // Closing this channel indicates the memory goroutine should stop. + stop chan struct{} + + // done is used to signal when the memory event goroutine has exited. + done sync.WaitGroup +} + +// New creates a new MemoryEvents. +func New(k *kernel.Kernel, period time.Duration) *MemoryEvents { + return &MemoryEvents{ + k: k, + period: period, + stop: make(chan struct{}), + } +} + +// Stop stops the memory usage events emitter goroutine. Stop must not be called +// concurrently with Start and may only be called once. +func (m *MemoryEvents) Stop() { + close(m.stop) + m.done.Wait() +} + +// Start starts the memory usage events emitter goroutine.
Start must not be +// called concurrently with Stop and may only be called once. +func (m *MemoryEvents) Start() { + if m.period == 0 { + return + } + m.done.Add(1) + go m.run() // S/R-SAFE: doesn't interact with saved state. +} + +func (m *MemoryEvents) run() { + defer m.done.Done() + + // Emit the first event immediately on startup. + totalTicks.Increment() + m.emit() + + ticker := time.NewTicker(m.period) + defer ticker.Stop() + + for { + select { + case <-m.stop: + return + case <-ticker.C: + totalTicks.Increment() + m.emit() + } + } +} + +func (m *MemoryEvents) emit() { + totalPlatform, err := m.k.MemoryFile().TotalUsage() + if err != nil { + log.Warningf("Failed to fetch memory usage for memory events: %v", err) + return + } + snapshot, _ := usage.MemoryAccounting.Copy() + total := totalPlatform + snapshot.Mapped + + totalEvents.Increment() + eventchannel.Emit(&pb.MemoryUsageEvent{ + Mapped: snapshot.Mapped, + Total: total, + }) +} diff --git a/pkg/sentry/kernel/memevent/memory_events.proto b/pkg/sentry/kernel/memevent/memory_events.proto new file mode 100644 index 000000000..bf8029ff5 --- /dev/null +++ b/pkg/sentry/kernel/memevent/memory_events.proto @@ -0,0 +1,29 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +// MemoryUsageEvent describes the memory usage of the sandbox at a single +// instant in time. These messages are emitted periodically on the eventchannel. +message MemoryUsageEvent { + // The total memory usage of the sandboxed application in bytes, calculated + // using the 'fast' method. + uint64 total = 1; + + // Memory used to back memory-mapped regions for files in the application, in + // bytes. This corresponds to the usage.MemoryKind.Mapped memory type. + uint64 mapped = 2; +} diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go new file mode 100644 index 000000000..77a35b788 --- /dev/null +++ b/pkg/sentry/kernel/pending_signals.go @@ -0,0 +1,142 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +const ( + // stdSignalCap is the maximum number of instances of a given standard + // signal that may be pending. 
("[If] multiple instances of a standard + // signal are delivered while that signal is currently blocked, then only + // one instance is queued.") - signal(7) + stdSignalCap = 1 + + // rtSignalCap is the maximum number of instances of a given realtime + // signal that may be pending. + // + // TODO(igudger): In Linux, the minimum signal queue size is + // RLIMIT_SIGPENDING, which is by default max_threads/2. + rtSignalCap = 32 +) + +// pendingSignals holds a collection of pending signals. The zero value of +// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; +// users must provide synchronization. +// +// +stateify savable +type pendingSignals struct { + // signals contains all pending signals. + // + // Note that signals is zero-indexed, but signal 1 is the first valid + // signal, so signals[0] contains signals with signo 1 etc. This offset is + // usually handled by using Signal.index(). + signals [linux.SignalMaximum]pendingSignalQueue `state:".([]savedPendingSignal)"` + + // Bit i of pendingSet is set iff there is at least one signal with signo + // i+1 pending. + pendingSet linux.SignalSet `state:"manual"` +} + +// pendingSignalQueue holds a pendingSignalList for a single signal number. +// +// +stateify savable +type pendingSignalQueue struct { + pendingSignalList + length int +} + +// +stateify savable +type pendingSignal struct { + // pendingSignalEntry links into a pendingSignalList. + pendingSignalEntry + *arch.SignalInfo + + // If timer is not nil, it is the IntervalTimer which sent this signal. + timer *IntervalTimer +} + +// enqueue enqueues the given signal. enqueue returns true on success and false +// on failure (if the given signal's queue is full). +// +// Preconditions: info represents a valid signal. +func (p *pendingSignals) enqueue(info *arch.SignalInfo, timer *IntervalTimer) bool { + sig := linux.Signal(info.Signo) + q := &p.signals[sig.Index()] + if sig.IsStandard() { + if q.length >= stdSignalCap { + return false + } + } else if q.length >= rtSignalCap { + return false + } + q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info, timer: timer}) + q.length++ + p.pendingSet |= linux.SignalSetOf(sig) + return true +} + +// dequeue dequeues and returns any pending signal not masked by mask. If no +// unmasked signals are pending, dequeue returns nil. +func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { + // "Real-time signals are delivered in a guaranteed order. Multiple + // real-time signals of the same type are delivered in the order they were + // sent. If different real-time signals are sent to a process, they are + // delivered starting with the lowest-numbered signal. (I.e., low-numbered + // signals have highest priority.) By contrast, if multiple standard + // signals are pending for a process, the order in which they are delivered + // is unspecified. If both standard and real-time signals are pending for a + // process, POSIX leaves it unspecified which is delivered first. Linux, + // like many other implementations, gives priority to standard signals in + // this case." 
- signal(7) + lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask)) + if lowestPendingUnblockedBit >= linux.SignalMaximum { + return nil + } + return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) +} + +func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { + q := &p.signals[sig.Index()] + ps := q.pendingSignalList.Front() + if ps == nil { + return nil + } + q.pendingSignalList.Remove(ps) + q.length-- + if q.length == 0 { + p.pendingSet &^= linux.SignalSetOf(sig) + } + if ps.timer != nil { + ps.timer.updateDequeuedSignalLocked(ps.SignalInfo) + } + return ps.SignalInfo +} + +// discardSpecific causes all pending signals with number sig to be discarded. +func (p *pendingSignals) discardSpecific(sig linux.Signal) { + q := &p.signals[sig.Index()] + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + if ps.timer != nil { + ps.timer.signalRejectedLocked() + } + } + q.pendingSignalList.Reset() + q.length = 0 + p.pendingSet &^= linux.SignalSetOf(sig) +} diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go new file mode 100644 index 000000000..ca8b4e164 --- /dev/null +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -0,0 +1,46 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// +stateify savable +type savedPendingSignal struct { + si *arch.SignalInfo + timer *IntervalTimer +} + +// saveSignals is invoked by stateify. +func (p *pendingSignals) saveSignals() []savedPendingSignal { + var pending []savedPendingSignal + for _, q := range p.signals { + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + pending = append(pending, savedPendingSignal{ + si: ps.SignalInfo, + timer: ps.timer, + }) + } + } + return pending +} + +// loadSignals is invoked by stateify. 
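Stepping back to dequeue above: pendingSet keeps bit i set when signal i+1 has something queued, so masking off the blocked signals and counting trailing zeros yields the lowest-numbered deliverable signal, which is exactly the standard-before-realtime priority that signal(7) describes. A small worked example using the standard math/bits package (the in-tree pkg/bits helper is assumed to behave the same way for this case):

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// Signal numbers are 1-based; bit i of a set represents signal i+1.
	sigBit := func(signo uint) uint64 { return 1 << (signo - 1) }

	pending := sigBit(4) | sigBit(10) // SIGILL (4) and SIGUSR1 (10) queued.
	mask := sigBit(4)                 // SIGILL is currently blocked.

	lowest := bits.TrailingZeros64(pending &^ mask)
	if lowest >= 64 { // nothing pending and unblocked
		fmt.Println("no deliverable signal")
		return
	}
	fmt.Println("deliver signal", lowest+1) // "deliver signal 10"
}

(loadSignals below simply replays each saved signal through the same enqueue path, so the pendingSet bits are rebuilt for free.)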
+func (p *pendingSignals) loadSignals(pending []savedPendingSignal) { + for _, sps := range pending { + p.enqueue(sps.si, sps.timer) + } +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD new file mode 100644 index 000000000..449643118 --- /dev/null +++ b/pkg/sentry/kernel/pipe/BUILD @@ -0,0 +1,54 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "pipe", + srcs = [ + "device.go", + "node.go", + "pipe.go", + "pipe_unsafe.go", + "pipe_util.go", + "reader.go", + "reader_writer.go", + "vfs.go", + "writer.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/buffer", + "//pkg/context", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "pipe_test", + size = "small", + srcs = [ + "node_test.go", + "pipe_test.go", + ], + library = ":pipe", + deps = [ + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go new file mode 100644 index 000000000..89f5d9342 --- /dev/null +++ b/pkg/sentry/kernel/pipe/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import "gvisor.dev/gvisor/pkg/sentry/device" + +// pipeDevice is used for all pipe files. +var pipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go new file mode 100644 index 000000000..4b688c627 --- /dev/null +++ b/pkg/sentry/kernel/pipe/node.go @@ -0,0 +1,139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// inodeOperations implements fs.InodeOperations for pipes. 
+// +// +stateify savable +type inodeOperations struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + + // Marking pipe inodes as virtual allows them to be saved and restored + // even if they have been unlinked. We can get away with this because + // their state exists entirely within the sentry. + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // p is the underlying Pipe object representing this fifo. + p *Pipe + + // Channels for synchronizing the creation of new readers and writers of + // this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on save, + // and either automatically restart (via ERESTARTSYS) or return EINTR on + // resume. On restarts via ERESTARTSYS, the appropriate channel will be + // recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// NewInodeOperations returns a new fs.InodeOperations for a given pipe. +func NewInodeOperations(ctx context.Context, perms fs.FilePermissions, p *Pipe) *inodeOperations { + return &inodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), perms, linux.PIPEFS_MAGIC), + p: p, + } +} + +// GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking +// semantics during open: +// +// "Normally, opening the FIFO blocks until the other end is opened also. A +// process can open a FIFO in nonblocking mode. In this case, opening for +// read-only will succeed even if no-one has opened on the write side yet, +// opening for write-only will fail with ENXIO (no such device or address) +// unless the other end has already been opened. Under Linux, opening a FIFO +// for read and write will succeed both in blocking and nonblocking mode. POSIX +// leaves this behavior undefined. This can be used to open a FIFO for writing +// while there are no readers available." - fifo(7) +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + i.mu.Lock() + defer i.mu.Unlock() + + switch { + case flags.Read && !flags.Write: // O_RDONLY. + r := i.p.Open(ctx, d, flags) + newHandleLocked(&i.rWakeup) + + if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { + if !waitFor(&i.mu, &i.wWakeup, ctx) { + r.DecRef() + return nil, syserror.ErrInterrupted + } + } + + // By now, either we're doing a nonblocking open or we have a writer. On + // a nonblocking read-only open, the open succeeds even if no-one has + // opened the write side yet. + return r, nil + + case flags.Write && !flags.Read: // O_WRONLY. + w := i.p.Open(ctx, d, flags) + newHandleLocked(&i.wWakeup) + + if i.p.isNamed && !i.p.HasReaders() { + // On a nonblocking, write-only open, the open fails with ENXIO if the + // read side isn't open yet. + if flags.NonBlocking { + w.DecRef() + return nil, syserror.ENXIO + } + + if !waitFor(&i.mu, &i.rWakeup, ctx) { + w.DecRef() + return nil, syserror.ErrInterrupted + } + } + return w, nil + + case flags.Read && flags.Write: // O_RDWR. 
+ // Pipes opened for read-write always succeeds without blocking. + rw := i.p.Open(ctx, d, flags) + newHandleLocked(&i.rWakeup) + newHandleLocked(&i.wWakeup) + return rw, nil + + default: + return nil, syserror.EINVAL + } +} + +func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EPIPE +} diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go new file mode 100644 index 000000000..ab75a87ff --- /dev/null +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -0,0 +1,320 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type sleeper struct { + context.Context + ch chan struct{} +} + +func newSleeperContext(t *testing.T) context.Context { + return &sleeper{ + Context: contexttest.Context(t), + ch: make(chan struct{}), + } +} + +func (s *sleeper) SleepStart() <-chan struct{} { + return s.ch +} + +func (s *sleeper) SleepFinish(bool) { +} + +func (s *sleeper) Cancel() { + s.ch <- struct{}{} +} + +func (s *sleeper) Interrupted() bool { + return len(s.ch) != 0 +} + +type openResult struct { + *fs.File + error +} + +var perms fs.FilePermissions = fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, +} + +func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + d := fs.NewDirent(ctx, inode, "pipe") + file, err := n.GetFile(ctx, d, flags) + if err != nil { + t.Fatalf("open with flags %+v failed: %v", flags, err) + } + if doneChan != nil { + doneChan <- struct{}{} + } + return file, err +} + +func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) { + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + d := fs.NewDirent(ctx, inode, "pipe") + file, err := n.GetFile(ctx, d, flags) + if resChan != nil { + resChan <- openResult{file, err} + } + return file, err +} + +func newNamedPipe(t *testing.T) *Pipe { + return NewPipe(true, DefaultPipeSize, usermem.PageSize) +} + +func newAnonPipe(t *testing.T) *Pipe { + return NewPipe(false, DefaultPipeSize, usermem.PageSize) +} + +// assertRecvBlocks ensures that a recv attempt on c blocks for at least +// blockDuration. This is useful for checking that a goroutine that is supposed +// to be executing a blocking operation is actually blocking. +func assertRecvBlocks(t *testing.T, c <-chan struct{}, blockDuration time.Duration, failMsg string) { + select { + case <-c: + t.Fatalf(failMsg) + case <-time.After(blockDuration): + // Ok, blocked for the required duration. 
+ } +} + +func TestReadOpenBlocksForWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Verify that the open for read is blocking. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "open for read not blocking with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone +} + +func TestWriteOpenBlocksForReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Verify that the open for write is blocking + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write not blocking with no readers") + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestMultipleWriteOpenDoesntCountAsReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone1 := make(chan struct{}) + rDone2 := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone1) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone2) + + assertRecvBlocks(t, rDone1, time.Millisecond*100, + "open for read didn't block with no writers") + assertRecvBlocks(t, rDone2, time.Millisecond*100, + "open for read didn't block with no writers") + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + <-wDone + <-rDone2 + <-rDone1 +} + +func TestClosedReaderBlocksWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) + rFile.DecRef() + + wDone := make(chan struct{}) + // This open for write should block because the reader is now gone. + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + assertRecvBlocks(t, wDone, time.Millisecond*100, + "open for write didn't block with no concurrent readers") + + // Open for read again. This should unblock the open for write. + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + <-rDone + <-wDone +} + +func TestReadWriteOpenNeverBlocks(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rwDone := make(chan struct{}) + // Open for read-write never wait for a reader or writer, even if the + // nonblocking flag is not set. 
+ go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true, NonBlocking: false}, rwDone) + <-rwDone +} + +func TestReadWriteOpenUnblocksReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-rDone +} + +func TestReadWriteOpenUnblocksWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + rwDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, Write: true}, rwDone) + + <-rwDone + <-wDone +} + +func TestBlockedOpenIsCancellable(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + done := make(chan openResult) + go testOpen(ctx, t, f, fs.FileFlags{Read: true}, done) + select { + case <-done: + t.Fatalf("open for read didn't block with no writers") + case <-time.After(time.Millisecond * 100): + // Ok. + } + + ctx.(*sleeper).Cancel() + // If the cancel on the sleeper didn't work, the open for read would never + // return. + res := <-done + if res.error != syserror.ErrInterrupted { + t.Fatalf("Cancellation didn't cause GetFile to return fs.ErrInterrupted, got %v.", + res.error) + } +} + +func TestNonblockingReadOpenFileNoWriters(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } +} + +func TestNonblockingWriteOpenFileNoReaders(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != syserror.ENXIO { + t.Fatalf("Nonblocking open for write failed unexpected error %v.", err) + } +} + +func TestNonBlockingReadOpenWithWriter(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + wDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Write: true}, wDone) + + // Open for write blocks since there are no readers yet. + assertRecvBlocks(t, wDone, time.Millisecond*100, + "Open for write didn't block with no reader.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for read failed with error %v.", err) + } + + // Open for write should now be unblocked. + <-wDone +} + +func TestNonBlockingWriteOpenWithReader(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newNamedPipe(t)) + + rDone := make(chan struct{}) + go testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true}, rDone) + + // Open for write blocked, since no reader yet. + assertRecvBlocks(t, rDone, time.Millisecond*100, + "Open for reader didn't block with no writer.") + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true, NonBlocking: true}, nil); err != nil { + t.Fatalf("Nonblocking open for write failed with error %v.", err) + } + + // Open for write should now be unblocked. 
+ <-rDone +} + +func TestAnonReadOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Read: true}, nil); err != nil { + t.Fatalf("open anon pipe for read failed: %v", err) + } +} + +func TestAnonWriteOpen(t *testing.T) { + ctx := newSleeperContext(t) + f := NewInodeOperations(ctx, perms, newAnonPipe(t)) + + if _, err := testOpen(ctx, t, f, fs.FileFlags{Write: true}, nil); err != nil { + t.Fatalf("open anon pipe for write failed: %v", err) + } +} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go new file mode 100644 index 000000000..79645d7d2 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -0,0 +1,419 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipe provides a pipe implementation. +package pipe + +import ( + "fmt" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/buffer" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + // MinimumPipeSize is a hard limit of the minimum size of a pipe. + MinimumPipeSize = 64 << 10 + + // DefaultPipeSize is the system-wide default size of a pipe in bytes. + DefaultPipeSize = MinimumPipeSize + + // MaximumPipeSize is a hard limit on the maximum size of a pipe. + MaximumPipeSize = 8 << 20 +) + +// Pipe is an encapsulation of a platform-independent pipe. +// It manages a buffered byte queue shared between a reader/writer +// pair. +// +// +stateify savable +type Pipe struct { + waiter.Queue `state:"nosave"` + + // isNamed indicates whether this is a named pipe. + // + // This value is immutable. + isNamed bool + + // atomicIOBytes is the maximum number of bytes that the pipe will + // guarantee atomic reads or writes atomically. + // + // This value is immutable. + atomicIOBytes int64 + + // The number of active readers for this pipe. + // + // Access atomically. + readers int32 + + // The number of active writes for this pipe. + // + // Access atomically. + writers int32 + + // mu protects all pipe internal state below. + mu sync.Mutex `state:"nosave"` + + // view is the underlying set of buffers. + // + // This is protected by mu. + view buffer.View + + // max is the maximum size of the pipe in bytes. When this max has been + // reached, writers will get EWOULDBLOCK. + // + // This is protected by mu. + max int64 + + // hadWriter indicates if this pipe ever had a writer. Note that this + // does not necessarily indicate there is *currently* a writer, just + // that there has been a writer at some point since the pipe was + // created. + // + // This is protected by mu. + hadWriter bool +} + +// NewPipe initializes and returns a pipe. +// +// N.B. The size and atomicIOBytes will be bounded. 
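Concretely, the bounding mentioned in the N.B. above and applied by the constructor below clamps sizeBytes into [MinimumPipeSize, MaximumPipeSize] (64 KiB to 8 MiB) and clamps atomicIOBytes into [1, sizeBytes]. A hypothetical in-package test spelling out those rules:

package pipe

import "testing"

func TestNewPipeClampsSizes(t *testing.T) {
	// A request below the minimum is rounded up, and atomicIOBytes can never
	// exceed the (bounded) pipe size.
	p := NewPipe(false /* isNamed */, 4096, 1<<20)
	if p.max != MinimumPipeSize || p.atomicIOBytes != MinimumPipeSize {
		t.Errorf("got (max=%d, atomicIOBytes=%d), want both %d", p.max, p.atomicIOBytes, MinimumPipeSize)
	}

	// A request above the maximum is rounded down, and a non-positive
	// atomicIOBytes degenerates to 1.
	p = NewPipe(false /* isNamed */, 64<<20, 0)
	if p.max != MaximumPipeSize || p.atomicIOBytes != 1 {
		t.Errorf("got (max=%d, atomicIOBytes=%d), want (%d, 1)", p.max, p.atomicIOBytes, MaximumPipeSize)
	}
}

SetFifoSize further down enforces the same bounds at runtime, except that an oversize request fails with EPERM instead of being clamped.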
+func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 + } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } + var p Pipe + initPipe(&p, isNamed, sizeBytes, atomicIOBytes) + return &p +} + +func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 + } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } + pipe.isNamed = isNamed + pipe.max = sizeBytes + pipe.atomicIOBytes = atomicIOBytes +} + +// NewConnectedPipe initializes a pipe and returns a pair of objects +// representing the read and write ends of the pipe. +func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { + p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes) + + // Build an fs.Dirent for the pipe which will be shared by both + // returned files. + perms := fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + } + iops := NewInodeOperations(ctx, perms, p) + ino := pipeDevice.NextIno() + sattr := fs.StableAttr{ + Type: fs.Pipe, + DeviceID: pipeDevice.DeviceID(), + InodeID: ino, + BlockSize: int64(atomicIOBytes), + } + ms := fs.NewPseudoMountSource(ctx) + d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) + // The p.Open calls below will each take a reference on the Dirent. We + // must drop the one we already have. + defer d.DecRef() + return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) +} + +// Open opens the pipe and returns a new file. +// +// Precondition: at least one of flags.Read or flags.Write must be set. +func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.File { + flags.NonSeekable = true + switch { + case flags.Read && flags.Write: + p.rOpen() + p.wOpen() + return fs.NewFile(ctx, d, flags, &ReaderWriter{ + Pipe: p, + }) + case flags.Read: + p.rOpen() + return fs.NewFile(ctx, d, flags, &Reader{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + case flags.Write: + p.wOpen() + return fs.NewFile(ctx, d, flags, &Writer{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + default: + // Precondition violated. + panic("invalid pipe flags") + } +} + +type readOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit limits subsequence reads. + limit func(int64) + + // read performs the actual read operation. + read func(*buffer.View) (int64, error) +} + +// read reads data from the pipe into dst and returns the number of bytes +// read, or returns ErrWouldBlock if the pipe is empty. +// +// Precondition: this pipe must have readers. +func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { + // Don't block for a zero-length read even if the pipe is empty. + if ops.left() == 0 { + return 0, nil + } + + p.mu.Lock() + defer p.mu.Unlock() + return p.readLocked(ctx, ops) +} + +func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { + // Is the pipe empty? + if p.view.Size() == 0 { + if !p.HasWriters() { + // There are no writers, return EOF. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + + // Limit how much we consume. 
+ if ops.left() > p.view.Size() { + ops.limit(p.view.Size()) + } + + // Copy user data; the read op is responsible for trimming. + done, err := ops.read(&p.view) + return done, err +} + +type writeOps struct { + // left returns the bytes remaining. + left func() int64 + + // limit should limit subsequent writes. + limit func(int64) + + // write should write to the provided buffer. + write func(*buffer.View) (int64, error) +} + +// write writes data from sv into the pipe and returns the number of bytes +// written. If no bytes are written because the pipe is full (or has less than +// atomicIOBytes free capacity), write returns ErrWouldBlock. +// +// Precondition: this pipe must have writers. +func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + return p.writeLocked(ctx, ops) +} + +func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { + // Can't write to a pipe with no readers. + if !p.HasReaders() { + return 0, syscall.EPIPE + } + + // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be + // atomic, but requires no atomicity for writes larger than this. + wanted := ops.left() + avail := p.max - p.view.Size() + if wanted > avail { + if wanted <= p.atomicIOBytes { + return 0, syserror.ErrWouldBlock + } + ops.limit(avail) + } + + // Copy user data. + done, err := ops.write(&p.view) + if err != nil { + return done, err + } + + if done < avail { + // Non-failure, but short write. + return done, nil + } + if done < wanted { + // Partial write due to full pipe. Note that this could also be + // the short write case above, we would expect a second call + // and the write to return zero bytes in this case. + return done, syserror.ErrWouldBlock + } + + return done, nil +} + +// rOpen signals a new reader of the pipe. +func (p *Pipe) rOpen() { + atomic.AddInt32(&p.readers, 1) +} + +// wOpen signals a new writer of the pipe. +func (p *Pipe) wOpen() { + p.mu.Lock() + defer p.mu.Unlock() + p.hadWriter = true + atomic.AddInt32(&p.writers, 1) +} + +// rClose signals that a reader has closed their end of the pipe. +func (p *Pipe) rClose() { + newReaders := atomic.AddInt32(&p.readers, -1) + if newReaders < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders)) + } +} + +// wClose signals that a writer has closed their end of the pipe. +func (p *Pipe) wClose() { + newWriters := atomic.AddInt32(&p.writers, -1) + if newWriters < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters)) + } +} + +// HasReaders returns whether the pipe has any active readers. +func (p *Pipe) HasReaders() bool { + return atomic.LoadInt32(&p.readers) > 0 +} + +// HasWriters returns whether the pipe has any active writers. +func (p *Pipe) HasWriters() bool { + return atomic.LoadInt32(&p.writers) > 0 +} + +// rReadinessLocked calculates the read readiness. +// +// Precondition: mu must be held. +func (p *Pipe) rReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasReaders() && p.view.Size() != 0 { + ready |= waiter.EventIn + } + if !p.HasWriters() && p.hadWriter { + // POLLHUP must be suppressed until the pipe has had at least one writer + // at some point. Otherwise a reader thread may poll and immediately get + // a POLLHUP before the writer ever opens the pipe, which the reader may + // interpret as the writer opening then closing the pipe. 
+ ready |= waiter.EventHUp + } + return ready +} + +// rReadiness returns a mask that states whether the read end of the pipe is +// ready for reading. +func (p *Pipe) rReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() +} + +// wReadinessLocked calculates the write readiness. +// +// Precondition: mu must be held. +func (p *Pipe) wReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasWriters() && p.view.Size() < p.max { + ready |= waiter.EventOut + } + if !p.HasReaders() { + ready |= waiter.EventErr + } + return ready +} + +// wReadiness returns a mask that states whether the write end of the pipe +// is ready for writing. +func (p *Pipe) wReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.wReadinessLocked() +} + +// rwReadiness returns a mask that states whether a read-write handle to the +// pipe is ready for IO. +func (p *Pipe) rwReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() | p.wReadinessLocked() +} + +// queued returns the amount of queued data. +func (p *Pipe) queued() int64 { + p.mu.Lock() + defer p.mu.Unlock() + return p.view.Size() +} + +// FifoSize implements fs.FifoSizer.FifoSize. +func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + return p.max, nil +} + +// SetFifoSize implements fs.FifoSizer.SetFifoSize. +func (p *Pipe) SetFifoSize(size int64) (int64, error) { + if size < 0 { + return 0, syserror.EINVAL + } + if size < MinimumPipeSize { + size = MinimumPipeSize // Per spec. + } + if size > MaximumPipeSize { + return 0, syserror.EPERM + } + p.mu.Lock() + defer p.mu.Unlock() + if size < p.view.Size() { + return 0, syserror.EBUSY + } + p.max = size + return size, nil +} diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go new file mode 100644 index 000000000..bda739dbe --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -0,0 +1,139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package pipe + +import ( + "bytes" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestPipeRW(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + wantN := int64(len(msg)) + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if n != wantN || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (%d, nil)", n, err, wantN) + } + + buf := make([]byte, len(msg)) + n, err = r.Readv(ctx, usermem.BytesIOSequence(buf)) + if n != wantN || err != nil || !bytes.Equal(buf, msg) { + t.Fatalf("Readv: got (%d, %v) %q, wanted (%d, nil) %q", n, err, buf, wantN, msg) + } +} + +func TestPipeReadBlock(t *testing.T) { + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, 65536, 4096) + defer r.DecRef() + defer w.DecRef() + + n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) + if n != 0 || err != syserror.ErrWouldBlock { + t.Fatalf("Readv: got (%d, %v), wanted (0, %v)", n, err, syserror.ErrWouldBlock) + } +} + +func TestPipeWriteBlock(t *testing.T) { + const atomicIOBytes = 2 + const capacity = MinimumPipeSize + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := make([]byte, capacity+1) + n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) + if wantN, wantErr := int64(capacity), syserror.ErrWouldBlock; n != wantN || err != wantErr { + t.Fatalf("Writev: got (%d, %v), wanted (%d, %v)", n, err, wantN, wantErr) + } +} + +func TestPipeWriteUntilEnd(t *testing.T) { + const atomicIOBytes = 2 + + ctx := contexttest.Context(t) + r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) + defer r.DecRef() + defer w.DecRef() + + msg := []byte("here's some bytes") + + wDone := make(chan struct{}, 0) + rDone := make(chan struct{}, 0) + defer func() { + // Signal the reader to stop and wait until it does so. + close(wDone) + <-rDone + }() + + go func() { + defer close(rDone) + // Read from r until done is closed. + ctx := contexttest.Context(t) + buf := make([]byte, len(msg)+1) + dst := usermem.BytesIOSequence(buf) + e, ch := waiter.NewChannelEntry(nil) + r.EventRegister(&e, waiter.EventIn) + defer r.EventUnregister(&e) + for { + n, err := r.Readv(ctx, dst) + dst = dst.DropFirst64(n) + if err == syserror.ErrWouldBlock { + select { + case <-ch: + continue + case <-wDone: + // We expect to have 1 byte left in dst since len(buf) == + // len(msg)+1. + if dst.NumBytes() != 1 || !bytes.Equal(buf[:len(msg)], msg) { + t.Errorf("Reader: got %q (%d bytes remaining), wanted %q", buf, dst.NumBytes(), msg) + } + return + } + } + if err != nil { + t.Fatalf("Readv: got unexpected error %v", err) + } + } + }() + + src := usermem.BytesIOSequence(msg) + e, ch := waiter.NewChannelEntry(nil) + w.EventRegister(&e, waiter.EventOut) + defer w.EventUnregister(&e) + for src.NumBytes() != 0 { + n, err := w.Writev(ctx, src) + src = src.DropFirst64(n) + if err == syserror.ErrWouldBlock { + <-ch + continue + } + if err != nil { + t.Fatalf("Writev: got (%d, %v)", n, err) + } + } +} diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go new file mode 100644 index 000000000..dd60cba24 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "unsafe" +) + +// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be +// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that +// concurrent calls cannot deadlock. +// +// Preconditions: x != y. +func lockTwoPipes(x, y *Pipe) { + // Lock the two pipes in order of increasing address. + if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { + x.mu.Lock() + y.mu.Lock() + } else { + y.mu.Lock() + x.mu.Lock() + } +} diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go new file mode 100644 index 000000000..aacf28da2 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -0,0 +1,214 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "io" + "math" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/buffer" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains Pipe file functionality that is tied to neither VFS nor +// the old fs architecture. + +// Release cleans up the pipe's state. +func (p *Pipe) Release() { + p.rClose() + p.wClose() + + // Wake up readers and writers. + p.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read reads from the Pipe into dst. +func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + n, err := p.read(ctx, readOps{ + left: func() int64 { + return dst.NumBytes() + }, + limit: func(l int64) { + dst = dst.TakeFirst64(l) + }, + read: func(view *buffer.View) (int64, error) { + n, err := dst.CopyOutFrom(ctx, view) + dst = dst.DropFirst64(n) + view.TrimFront(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// WriteTo writes to w from the Pipe. +func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { + ops := readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToWriter(w, count) + if !dup { + view.TrimFront(n) + } + count -= n + return n, err + }, + } + n, err := p.read(ctx, ops) + if n > 0 { + p.Notify(waiter.EventOut) + } + return n, err +} + +// Write writes to the Pipe from src. 
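The address-ordered locking in lockTwoPipes above is the usual way to take two peer locks when there is no natural hierarchy: as long as every caller acquires them in the same global order (here, increasing pointer address), calls with the arguments swapped cannot deadlock against each other. A generic illustration of the same discipline (account/transfer are made-up names; nothing pipe-specific is assumed):

package main

import (
	"fmt"
	"sync"
	"unsafe"
)

type account struct {
	mu      sync.Mutex
	balance int64
}

// lockTwo mirrors lockTwoPipes: lock in increasing address order so that
// transfer(a, b) and transfer(b, a) racing each other cannot deadlock.
func lockTwo(x, y *account) {
	if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
		x.mu.Lock()
		y.mu.Lock()
	} else {
		y.mu.Lock()
		x.mu.Lock()
	}
}

func transfer(from, to *account, amount int64) {
	lockTwo(from, to)
	defer from.mu.Unlock()
	defer to.mu.Unlock()
	from.balance -= amount
	to.balance += amount
}

func main() {
	a, b := &account{balance: 100}, &account{}
	var wg sync.WaitGroup
	for i := 0; i < 1000; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			if i%2 == 0 {
				transfer(a, b, 1) // opposite argument orders,
			} else {
				transfer(b, a, 1) // same lock order.
			}
		}(i)
	}
	wg.Wait()
	fmt.Println(a.balance, b.balance) // 100 0, and no deadlock.
}

Back in pipe_util.go, Write below is the mirror image of Read above: the same closure-based writeOps drive Pipe.write, and a successful write wakes any blocked readers with EventIn.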
+func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return src.NumBytes() + }, + limit: func(l int64) { + src = src.TakeFirst64(l) + }, + write: func(view *buffer.View) (int64, error) { + n, err := src.CopyInTo(ctx, view) + src = src.DropFirst64(n) + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// ReadFrom reads from r to the Pipe. +func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { + n, err := p.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromReader(r, count) + count -= n + return n, err + }, + }) + if n > 0 { + p.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. +func (p *Pipe) Readiness(mask waiter.EventMask) waiter.EventMask { + return p.rwReadiness() & mask +} + +// Ioctl implements ioctls on the Pipe. +func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case linux.FIONREAD: + v := p.queued() + if v > math.MaxInt32 { + v = math.MaxInt32 // Silently truncate. + } + // Copy result to userspace. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} + +// waitFor blocks until a new reader or writer of the underlying pipe is +// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this +// function will block for either readers or writers, depending on where +// 'wakeupChan' points. +// +// mu must be held by the caller. waitFor returns with mu held, but it will +// drop mu before blocking for any reader/writers. +func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { + // Ideally this function would simply use a condition variable. However, the + // wait needs to be interruptible via 'sleeper', so we must synchronize via a + // channel. The synchronization below relies on the fact that closing a + // channel unblocks all receives on the channel. + + // Does an appropriate wakeup channel already exist? If not, create a new + // one. This is all done under mu to avoid races. + if *wakeupChan == nil { + *wakeupChan = make(chan struct{}) + } + + // Grab a local reference to the wakeup channel since it may disappear as + // soon as we drop mu. + wakeup := *wakeupChan + + // Drop the lock and prepare to sleep. + mu.Unlock() + cancel := sleeper.SleepStart() + + // Wait for either a new reader/writer to be signalled via 'wakeup', or + // for the sleep to be cancelled. + select { + case <-wakeup: + sleeper.SleepFinish(true) + case <-cancel: + sleeper.SleepFinish(false) + } + + // Take the lock and check if we were woken. If we were woken and + // interrupted, the former takes priority. + mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the pipe to be opened; see waitFor. +// +// Precondition: the mutex protecting wakeupChan must be held.
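For reference, the FIONREAD branch of Ioctl above is what a sandboxed application hits when it asks how many bytes are queued in a pipe. A small guest-side illustration using golang.org/x/sys/unix (an ordinary Linux program; nothing pipe-internal is assumed beyond the ioctl contract):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	var fds [2]int
	if err := unix.Pipe(fds[:]); err != nil {
		panic(err)
	}
	defer unix.Close(fds[0])
	defer unix.Close(fds[1])

	if _, err := unix.Write(fds[1], []byte("hello")); err != nil {
		panic(err)
	}

	// FIONREAD reports the number of bytes currently queued for reading; under
	// gVisor the request is ultimately answered by Pipe.Ioctl above.
	queued, err := unix.IoctlGetInt(fds[0], unix.FIONREAD)
	if err != nil {
		panic(err)
	}
	fmt.Println("queued bytes:", queued) // 5
}

newHandleLocked below is the other half of the waitFor rendezvous: opening one end closes (and clears) the corresponding wakeup channel, which releases every opener blocked in waitFor at once.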
+func newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go new file mode 100644 index 000000000..7724b4452 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/waiter" +) + +// Reader satisfies the fs.FileOperations interface for read-only pipes. +// Reader should be used with !fs.FileFlags.Write to reject writes. +// +// +stateify savable +type Reader struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. +func (r *Reader) Release() { + r.Pipe.rClose() + + // Wake up writers. + r.Pipe.Notify(waiter.EventOut) +} + +// Readiness returns the ready events in the underlying pipe. +func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask { + return r.Pipe.rReadiness() & mask +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go new file mode 100644 index 000000000..b2b5691ee --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -0,0 +1,67 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "io" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ReaderWriter satisfies the FileOperations interface and services both +// read and write requests. This should only be used directly for named pipes. +// pipe(2) and pipe2(2) only support unidirectional pipes and should use +// either pipe.Reader or pipe.Writer. +// +// +stateify savable +type ReaderWriter struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + *Pipe +} + +// Read implements fs.FileOperations.Read. +func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return rw.Pipe.Read(ctx, dst) +} + +// WriteTo implements fs.FileOperations.WriteTo. 
+func (rw *ReaderWriter) WriteTo(ctx context.Context, _ *fs.File, w io.Writer, count int64, dup bool) (int64, error) { + return rw.Pipe.WriteTo(ctx, w, count, dup) +} + +// Write implements fs.FileOperations.Write. +func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return rw.Pipe.Write(ctx, src) +} + +// ReadFrom implements fs.FileOperations.ReadFrom. +func (rw *ReaderWriter) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { + return rw.Pipe.ReadFrom(ctx, r, count) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (rw *ReaderWriter) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return rw.Pipe.Ioctl(ctx, io, args) +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go new file mode 100644 index 000000000..45d4c5fc1 --- /dev/null +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -0,0 +1,468 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/buffer" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// This file contains types enabling the pipe package to be used with the vfs +// package. + +// VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should +// not be copied. +type VFSPipe struct { + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // pipe is the underlying pipe. + pipe Pipe + + // Channels for synchronizing the creation of new readers and writers + // of this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on + // save, and either automatically restart (via ERESTARTSYS) or return + // EINTR on resume. On restarts via ERESTARTSYS, the appropriate + // channel will be recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +// NewVFSPipe returns an initialized VFSPipe. +func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { + var vp VFSPipe + initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes) + return &vp +} + +// ReaderWriterPair returns read-only and write-only FDs for vp. +// +// Preconditions: statusFlags should not contain an open access mode. +func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + // Connected pipes share the same locks. + locks := &vfs.FileLocks{} + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) +} + +// Open opens the pipe represented by vp.
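As a usage sketch for the two construction paths above: an anonymous pipe (pipe(2)/pipe2(2) in VFS2) would build a VFSPipe and hand both ends back through ReaderWriterPair, while named FIFOs go through Open below. Acquiring the pipefs mount and dentry is elided and the helper name is hypothetical; this is only the shape of a caller, not the actual syscall implementation.

package pipeexample

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// newPipeFDs sketches what a pipe2(2) path might do with the VFSPipe API.
// mnt and d must come from the pipefs filesystem, which is not shown here.
func newPipeFDs(mnt *vfs.Mount, d *vfs.Dentry, nonBlock bool) (*vfs.FileDescription, *vfs.FileDescription) {
	var statusFlags uint32
	if nonBlock {
		statusFlags = linux.O_NONBLOCK
	}
	// 64 KiB default capacity, page-sized atomic writes, not a named FIFO.
	vp := pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
	return vp.ReaderWriterPair(mnt, d, statusFlags)
}

Named FIFOs instead reach Open below, which implements the fifo(7) blocking rules quoted in its body.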
+func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { + vp.mu.Lock() + defer vp.mu.Unlock() + + readable := vfs.MayReadFileWithOpenFlags(statusFlags) + writable := vfs.MayWriteFileWithOpenFlags(statusFlags) + if !readable && !writable { + return nil, syserror.EINVAL + } + + fd := vp.newFD(mnt, vfsd, statusFlags, locks) + + // Named pipes have special blocking semantics during open: + // + // "Normally, opening the FIFO blocks until the other end is opened also. A + // process can open a FIFO in nonblocking mode. In this case, opening for + // read-only will succeed even if no-one has opened on the write side yet, + // opening for write-only will fail with ENXIO (no such device or address) + // unless the other end has already been opened. Under Linux, opening a + // FIFO for read and write will succeed both in blocking and nonblocking + // mode. POSIX leaves this behavior undefined. This can be used to open a + // FIFO for writing while there are no readers available." - fifo(7) + switch { + case readable && writable: + // Pipes opened for read-write always succeed without blocking. + newHandleLocked(&vp.rWakeup) + newHandleLocked(&vp.wWakeup) + + case readable: + newHandleLocked(&vp.rWakeup) + // If this pipe is being opened as blocking and there's no + // writer, we have to wait for a writer to open the other end. + if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + fd.DecRef() + return nil, syserror.EINTR + } + + case writable: + newHandleLocked(&vp.wWakeup) + + if vp.pipe.isNamed && !vp.pipe.HasReaders() { + // Non-blocking, write-only opens fail with ENXIO when the read + // side isn't open yet. + if statusFlags&linux.O_NONBLOCK != 0 { + fd.DecRef() + return nil, syserror.ENXIO + } + // Wait for a reader to open the other end. + if !waitFor(&vp.mu, &vp.rWakeup, ctx) { + fd.DecRef() + return nil, syserror.EINTR + } + } + + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return fd, nil +} + +// Preconditions: vp.mu must be held. +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription { + fd := &VFSPipeFD{ + pipe: &vp.pipe, + } + fd.LockFD.Init(locks) + fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }) + + switch { + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): + vp.pipe.rOpen() + vp.pipe.wOpen() + case fd.vfsfd.IsReadable(): + vp.pipe.rOpen() + case fd.vfsfd.IsWritable(): + vp.pipe.wOpen() + default: + panic("invalid pipe flags: must be readable, writable, or both") + } + + return &fd.vfsfd +} + +// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements +// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to +// other FileDescriptions for splice(2) and tee(2). +type VFSPipeFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD + + pipe *Pipe +} + +// Release implements vfs.FileDescriptionImpl.Release. 
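+//
+// Closing the write side wakes readers with EventIn|EventHUp, so a blocked
+// read observes end-of-file; closing the read side wakes writers with
+// EventOut, and a subsequent write fails with EPIPE (raising SIGPIPE). The
+// host-Go analogy, for illustration only:
+//
+//	r, w, _ := os.Pipe()
+//	buf := make([]byte, 4)
+//	w.Close()             // analogous to wClose + Notify(EventIn | EventHUp)
+//	n, err := r.Read(buf) // n == 0, err == io.EOF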
+func (fd *VFSPipeFD) Release() { + var event waiter.EventMask + if fd.vfsfd.IsReadable() { + fd.pipe.rClose() + event |= waiter.EventOut + } + if fd.vfsfd.IsWritable() { + fd.pipe.wClose() + event |= waiter.EventIn | waiter.EventHUp + } + if event == 0 { + panic("invalid pipe flags: must be readable, writable, or both") + } + + fd.pipe.Notify(event) +} + +// Readiness implements waiter.Waitable.Readiness. +func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { + switch { + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): + return fd.pipe.rwReadiness() + case fd.vfsfd.IsReadable(): + return fd.pipe.rReadiness() + case fd.vfsfd.IsWritable(): + return fd.pipe.wReadiness() + default: + panic("pipe FD is neither readable nor writable") + } +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ESPIPE +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.pipe.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) { + fd.pipe.EventUnregister(e) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return fd.pipe.Read(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return fd.pipe.Write(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.pipe.Ioctl(ctx, uio, args) +} + +// PipeSize implements fcntl(F_GETPIPE_SZ). +func (fd *VFSPipeFD) PipeSize() int64 { + // Inline Pipe.FifoSize() rather than calling it with nil Context and + // fs.File and ignoring the returned error (which is always nil). + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + return fd.pipe.max +} + +// SetPipeSize implements fcntl(F_SETPIPE_SZ). +func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { + return fd.pipe.SetFifoSize(size) +} + +// IOSequence returns a useremm.IOSequence that reads up to count bytes from, +// or writes up to count bytes to, fd. +func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { + return usermem.IOSequence{ + IO: fd, + Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(dst)) + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return int64(len(dst)) + }, + limit: func(l int64) { + dst = dst[:l] + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadAt(dst, 0) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// CopyOut implements usermem.IO.CopyOut. 
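+//
+// Like CopyIn above, CopyOut treats the pipe itself as the "address space":
+// bytes copied out land in the pipe's buffer rather than in task memory.
+// This is what allows a VFSPipeFD to be handed to another FileDescription as
+// the usermem.IO side of splice(2)/tee(2). Roughly (fileFD and opts are
+// illustrative):
+//
+//	seq := pipeFD.IOSequence(count)       // IO == pipeFD, addrs == [0, count)
+//	n, err := fileFD.Read(ctx, seq, opts) // file bytes enter the pipe via CopyOut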
+func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(src)) + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return int64(len(src)) + }, + limit: func(l int64) { + src = src[:l] + }, + write: func(view *buffer.View) (int64, error) { + view.Append(src) + return int64(len(src)), nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + origCount := toZero + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return toZero + }, + limit: func(l int64) { + toZero = l + }, + write: func(view *buffer.View) (int64, error) { + view.Grow(view.Size()+toZero, true /* zero */) + return toZero, nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToSafememWriter(dst, uint64(count)) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. +func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromSafememReader(src, uint64(count)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + // How did a pipe get passed as the virtual address space to futex(2)? + panic("VFSPipeFD.SwapUint32 called unexpectedly") +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") +} + +// LoadUint32 implements usermem.IO.LoadUint32. +func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.LoadUint32 called unexpectedly") +} + +// Splice reads up to count bytes from src and writes them to dst. It returns +// the number of bytes moved. +// +// Preconditions: count > 0. 
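+//
+// Splice and Tee handle the pipe-to-pipe case, where both ends are
+// VFSPipeFDs: spliceOrTee locks both pipes via lockTwoPipes (in a consistent
+// order, so concurrent splices in opposite directions cannot deadlock) and
+// copies directly between the two buffers. A syscall-layer caller might
+// dispatch to it roughly as:
+//
+//	dstPipe, dstOK := dstFD.Impl().(*VFSPipeFD)
+//	srcPipe, srcOK := srcFD.Impl().(*VFSPipeFD)
+//	if dstOK && srcOK {
+//		return Splice(ctx, dstPipe, srcPipe, count)
+//	}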
+func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */) +} + +// Tee reads up to count bytes from src and writes them to dst, without +// removing the read bytes from src. It returns the number of bytes copied. +// +// Preconditions: count > 0. +func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */) +} + +// Preconditions: count > 0. +func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { + if dst.pipe == src.pipe { + return 0, syserror.EINVAL + } + + lockTwoPipes(dst.pipe, src.pipe) + defer dst.pipe.mu.Unlock() + defer src.pipe.mu.Unlock() + + n, err := dst.pipe.writeLocked(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(dstView *buffer.View) (int64, error) { + return src.pipe.readLocked(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(srcView *buffer.View) (int64, error) { + n, err := srcView.ReadToSafememWriter(dstView, uint64(count)) + if n > 0 && removeFromSrc { + srcView.TrimFront(int64(n)) + } + return int64(n), err + }, + }) + }, + }) + if n > 0 { + dst.pipe.Notify(waiter.EventIn) + src.pipe.Notify(waiter.EventOut) + } + return n, err +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go new file mode 100644 index 000000000..5bc6aa931 --- /dev/null +++ b/pkg/sentry/kernel/pipe/writer.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.dev/gvisor/pkg/waiter" +) + +// Writer satisfies the fs.FileOperations interface for write-only pipes. +// Writer should be used with !fs.FileFlags.Read to reject reads. +// +// +stateify savable +type Writer struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. +func (w *Writer) Release() { + w.Pipe.wClose() + + // Wake up readers. + w.Pipe.Notify(waiter.EventHUp) +} + +// Readiness returns the ready events in the underlying pipe. 
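+//
+// For the write end, "ready" means there is buffer space (EventOut); once all
+// readers are gone, the pipe is also flagged as errored, which poll(2)
+// surfaces as POLLERR. A hypothetical poller using this mask:
+//
+//	ready := w.Readiness(waiter.EventOut | waiter.EventErr)
+//	if ready&waiter.EventErr != 0 {
+//		// No readers remain; a subsequent write would fail with EPIPE.
+//	}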
+func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask { + return w.Pipe.wReadiness() & mask +} diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go new file mode 100644 index 000000000..2e861a5a8 --- /dev/null +++ b/pkg/sentry/kernel/posixtimer.go @@ -0,0 +1,308 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" +) + +// IntervalTimer represents a POSIX interval timer as described by +// timer_create(2). +// +// +stateify savable +type IntervalTimer struct { + timer *ktime.Timer + + // If target is not nil, it receives signo from timer expirations. If group + // is true, these signals are thread-group-directed. These fields are + // immutable. + target *Task + signo linux.Signal + id linux.TimerID + sigval uint64 + group bool + + // If sigpending is true, a signal to target is already queued, and timer + // expirations should increment overrunCur instead of sending another + // signal. sigpending is protected by target's signal mutex. (If target is + // nil, the timer will never send signals, so sigpending will be unused.) + sigpending bool + + // If sigorphan is true, timer's setting has been changed since sigpending + // last became true, such that overruns should no longer be counted in the + // pending signals si_overrun. sigorphan is protected by target's signal + // mutex. + sigorphan bool + + // overrunCur is the number of overruns that have occurred since the last + // time a signal was sent. overrunCur is protected by target's signal + // mutex. + overrunCur uint64 + + // Consider the last signal sent by this timer that has been dequeued. + // overrunLast is the number of overruns that occurred between when this + // signal was sent and when it was dequeued. Equivalently, overrunLast was + // the value of overrunCur when this signal was dequeued. overrunLast is + // protected by target's signal mutex. + overrunLast uint64 +} + +// DestroyTimer releases it's resources. +func (it *IntervalTimer) DestroyTimer() { + it.timer.Destroy() + it.timerSettingChanged() + // A destroyed IntervalTimer is still potentially reachable via a + // pendingSignal; nil out timer so that it won't be saved. + it.timer = nil +} + +func (it *IntervalTimer) timerSettingChanged() { + if it.target == nil { + return + } + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + it.sigorphan = true + it.overrunCur = 0 + it.overrunLast = 0 +} + +// PauseTimer pauses the associated Timer. +func (it *IntervalTimer) PauseTimer() { + it.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. 
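+//
+// Pause/Resume bracket windows in which expirations must not be delivered,
+// e.g. while the sentry is saving state. Expirations that occur while a
+// previous signal is still queued are not lost; they accumulate in overrunCur
+// and are reported once that signal is dequeued. A hypothetical sequence:
+//
+//	it.PauseTimer()
+//	// ... checkpoint or otherwise quiesce; nothing fires here ...
+//	it.ResumeTimer()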
+func (it *IntervalTimer) ResumeTimer() { + it.timer.Resume() +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) updateDequeuedSignalLocked(si *arch.SignalInfo) { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunLast = it.overrunCur + it.overrunCur = 0 + si.SetOverrun(saturateI32FromU64(it.overrunLast)) +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) signalRejectedLocked() { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunCur++ +} + +// Notify implements ktime.TimerListener.Notify. +func (it *IntervalTimer) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + if it.target == nil { + return ktime.Setting{}, false + } + + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + + if it.sigpending { + it.overrunCur += exp + return ktime.Setting{}, false + } + + // sigpending must be set before sendSignalTimerLocked() so that it can be + // unset if the signal is discarded (in which case sendSignalTimerLocked() + // will return nil). + it.sigpending = true + it.sigorphan = false + it.overrunCur += exp - 1 + si := &arch.SignalInfo{ + Signo: int32(it.signo), + Code: arch.SignalInfoTimer, + } + si.SetTimerID(it.id) + si.SetSigval(it.sigval) + // si_overrun is set when the signal is dequeued. + if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil { + it.signalRejectedLocked() + } + + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. Users of Timer should call +// DestroyTimer instead. +func (it *IntervalTimer) Destroy() { +} + +// IntervalTimerCreate implements timer_create(2). +func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.TimerID, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + + // Allocate a timer ID. + var id linux.TimerID + end := t.tg.nextTimerID + for { + id = t.tg.nextTimerID + _, ok := t.tg.timers[id] + t.tg.nextTimerID++ + if t.tg.nextTimerID < 0 { + t.tg.nextTimerID = 0 + } + if !ok { + break + } + if t.tg.nextTimerID == end { + return 0, syserror.EAGAIN + } + } + + // "The implementation of the default case where evp [sic] is NULL is + // handled inside glibc, which invokes the underlying system call with a + // suitably populated sigevent structure." - timer_create(2). This is + // misleading; the timer_create syscall also handles a NULL sevp as + // described by the man page + // (kernel/time/posix-timers.c:sys_timer_create(), do_timer_create()). This + // must be handled here instead of the syscall wrapper since sigval is the + // timer ID, which isn't available until we allocate it in this function. + if sigev == nil { + sigev = &linux.Sigevent{ + Signo: int32(linux.SIGALRM), + Notify: linux.SIGEV_SIGNAL, + Value: uint64(id), + } + } + + // Construct the timer. + it := &IntervalTimer{ + id: id, + sigval: sigev.Value, + } + switch sigev.Notify { + case linux.SIGEV_NONE: + // leave it.target = nil + case linux.SIGEV_SIGNAL, linux.SIGEV_THREAD: + // POSIX SIGEV_THREAD semantics are implemented in userspace by libc; + // to the kernel, SIGEV_THREAD and SIGEV_SIGNAL are equivalent. (See + // Linux's kernel/time/posix-timers.c:good_sigevent().) 
+ it.target = t.tg.leader + it.group = true + case linux.SIGEV_THREAD_ID: + t.tg.pidns.owner.mu.RLock() + target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] + t.tg.pidns.owner.mu.RUnlock() + if !ok || target.tg != t.tg { + return 0, syserror.EINVAL + } + it.target = target + default: + return 0, syserror.EINVAL + } + if sigev.Notify != linux.SIGEV_NONE { + it.signo = linux.Signal(sigev.Signo) + if !it.signo.IsValid() { + return 0, syserror.EINVAL + } + } + it.timer = ktime.NewTimer(c, it) + + t.tg.timers[id] = it + return id, nil +} + +// IntervalTimerDelete implements timer_delete(2). +func (t *Task) IntervalTimerDelete(id linux.TimerID) error { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return syserror.EINVAL + } + delete(t.tg.timers, id) + it.DestroyTimer() + return nil +} + +// IntervalTimerSettime implements timer_settime(2). +func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs bool) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) + if err != nil { + return linux.Itimerspec{}, err + } + tm, oldS := it.timer.SwapAnd(newS, it.timerSettingChanged) + its = ktime.ItimerspecFromSetting(tm, oldS) + return its, nil +} + +// IntervalTimerGettime implements timer_gettime(2). +func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + tm, s := it.timer.Get() + its := ktime.ItimerspecFromSetting(tm, s) + return its, nil +} + +// IntervalTimerGetoverrun implements timer_getoverrun(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return 0, syserror.EINVAL + } + // By timer_create(2) invariant, either it.target == nil (in which case + // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact + // that t is executing timer_getoverrun(2) means that t.tg can't be + // completing execve, so t.tg.signalHandlers can't be changing, allowing us + // to lock t.tg.signalHandlers.mu without holding the TaskSet mutex. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is consistent with Linux after 78c9c4dfbf8c ("posix-timers: + // Sanitize overrun handling"). + return saturateI32FromU64(it.overrunLast), nil +} + +func saturateI32FromU64(x uint64) int32 { + if x > math.MaxInt32 { + return math.MaxInt32 + } + return int32(x) +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go new file mode 100644 index 000000000..e23e796ef --- /dev/null +++ b/pkg/sentry/kernel/ptrace.go @@ -0,0 +1,1119 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ptraceOptions are the subset of options controlling a task's ptrace behavior +// that are set by ptrace(PTRACE_SETOPTIONS). +// +// +stateify savable +type ptraceOptions struct { + // ExitKill is true if the tracee should be sent SIGKILL when the tracer + // exits. + ExitKill bool + + // If SysGood is true, set bit 7 in the signal number for + // syscall-entry-stop and syscall-exit-stop traps delivered to this task's + // tracer. + SysGood bool + + // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE + // events. + TraceClone bool + + // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC + // events. + TraceExec bool + + // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT + // events. + TraceExit bool + + // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK + // events. + TraceFork bool + + // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP + // events. + TraceSeccomp bool + + // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK + // events. + TraceVfork bool + + // TraceVforkDone is true if the tracer wants to receive + // PTRACE_EVENT_VFORK_DONE events. + TraceVforkDone bool +} + +// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry +// and exit. +type ptraceSyscallMode int + +const ( + // ptraceSyscallNone indicates that the task has never ptrace-stopped, or + // that it was resumed from its last ptrace-stop by PTRACE_CONT or + // PTRACE_DETACH. The task's syscalls will not be intercepted. + ptraceSyscallNone ptraceSyscallMode = iota + + // ptraceSyscallIntercept indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a + // syscall, a ptrace-stop will occur. + ptraceSyscallIntercept + + // ptraceSyscallEmu indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time + // the task enters a syscall, the syscall will be skipped, and a + // ptrace-stop will occur. + ptraceSyscallEmu +) + +// CanTrace checks that t is permitted to access target's state, as defined by +// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it +// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access +// mode PTRACE_MODE_READ. +// +// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a +// racing setuid(2) may change traceability). This may pose a risk when a task +// changes from traceable to not traceable. This is only problematic across +// execve, where privileges may increase. +// +// We currently do not implement privileged executables (set-user/group-ID bits +// and file capabilities), so that case is not reachable. +func (t *Task) CanTrace(target *Task, attach bool) bool { + // "1. If the calling thread and the target thread are in the same thread + // group, access is always allowed." - ptrace(2) + // + // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() + // should not deny sub-threads", first released in Linux 3.12), the rule + // only applies if t and target are the same task. 
But, as that commit + // message puts it, "[any] security check is pointless when the tasks share + // the same ->mm." + if t.tg == target.tg { + return true + } + + // """ + // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, + // doesn't exist until Linux 4.5). + // + // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the + // caller's real UID and GID for the checks in the next step. (Most APIs + // that check the caller's UID and GID use the effective IDs. For + // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs + // instead.) + // + // 3. Deny access if neither of the following is true: + // + // - The real, effective, and saved-set user IDs of the target match the + // caller's user ID, *and* the real, effective, and saved-set group IDs of + // the target match the caller's group ID. + // + // - The caller has the CAP_SYS_PTRACE capability in the user namespace of + // the target. + // + // 4. Deny access if the target process "dumpable" attribute has a value + // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in + // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in + // the user namespace of the target process. + // + // 5. The kernel LSM security_ptrace_access_check() interface is invoked to + // see if ptrace access is permitted. The results depend on the LSM(s). The + // implementation of this interface in the commoncap LSM performs the + // following steps: + // + // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the + // caller's effective capability set; otherwise (the access mode specifies + // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. + // + // b) Deny access if neither of the following is true: + // + // - The caller and the target process are in the same user namespace, and + // the caller's capabilities are a proper superset of the target process's + // permitted capabilities. + // + // - The caller has the CAP_SYS_PTRACE capability in the target process's + // user namespace. + // + // Note that the commoncap LSM does not distinguish between + // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this + // section: "the commoncap LSM ... is always invoked".) + // """ + callerCreds := t.Credentials() + targetCreds := target.Credentials() + if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { + return true + } + if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { + return false + } + if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { + return false + } + var targetMM *mm.MemoryManager + target.WithMuLocked(func(t *Task) { + targetMM = t.MemoryManager() + }) + if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable { + return false + } + if callerCreds.UserNamespace != targetCreds.UserNamespace { + return false + } + if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { + return false + } + return true +} + +// Tracer returns t's ptrace Tracer. +func (t *Task) Tracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +// hasTracer returns true if t has a ptrace tracer attached. +func (t *Task) hasTracer() bool { + // This isn't just inlined into callers so that if Task.Tracer() turns out + // to be too expensive because of e.g. 
interface conversion, we can switch + // to having a separate atomic flag more easily. + return t.Tracer() != nil +} + +// ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +// +// +stateify savable +type ptraceStop struct { + // If frozen is true, the stopped task's tracer is currently operating on + // it, so Task.Kill should not remove the stop. + frozen bool + + // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so + // ptraceFreeze should fail. + listen bool +} + +// Killable implements TaskStop.Killable. +func (s *ptraceStop) Killable() bool { + return !s.frozen +} + +// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been +// killed, the stop is skipped, and beginPtraceStopLocked returns false. +// +// beginPtraceStopLocked does not signal t's tracer or wake it if it is +// waiting. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) beginPtraceStopLocked() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => + // kernel/sched/core.c:__schedule() => signal_pending_state() check, which + // is what prevents tasks from entering ptrace-stops after being killed. + // Note that if t was SIGKILLed and beingPtraceStopLocked is being called + // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before + // entering the exit path, so t.killedLocked() will no longer return true. + // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still + // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be + // changed in the future; SIGKILL is meant to always immediately kill tasks + // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + if t.killedLocked() { + return false + } + t.beginInternalStopLocked(&ptraceStop{}) + return true +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceTrapLocked(code int32) { + // This is unconditional in ptrace_stop(). + t.tg.signalHandlers.mu.Lock() + t.trapStopPending = false + t.tg.signalHandlers.mu.Unlock() + t.ptraceCode = code + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: code, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + if t.beginPtraceStopLocked() { + tracer := t.Tracer() + tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } +} + +// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the +// ptraceStop, temporarily preventing it from being removed by a concurrent +// Task.Kill, and returns true. Otherwise it returns false. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine of t's tracer. +func (t *Task) ptraceFreeze() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return false + } + s, ok := t.stop.(*ptraceStop) + if !ok { + return false + } + if s.listen { + return false + } + s.frozen = true + return true +} + +// ptraceUnfreeze ends the effect of a previous successful call to +// ptraceFreeze. +// +// Preconditions: t must be in a frozen ptraceStop. +func (t *Task) ptraceUnfreeze() { + // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, + // preventing its thread group from completing execve. 
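+	//
+	// This is the tail of the tracer-side protocol used by Task.Ptrace below:
+	// ptraceFreeze pins the stop, the request body operates on the stopped
+	// tracee, and ptraceUnfreeze (or ptraceUnstop, for resuming requests)
+	// releases it again. Roughly:
+	//
+	//	if !target.ptraceFreeze() {
+	//		return syserror.ESRCH // not in a ptrace-stop
+	//	}
+	//	defer target.ptraceUnfreeze()
+	//	// ... PTRACE_PEEKDATA, PTRACE_GETREGSET, etc. ...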
+ t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.ptraceUnfreezeLocked() +} + +// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be +// locked. +func (t *Task) ptraceUnfreezeLocked() { + // Do this even if the task has been killed to ensure a panic if t.stop is + // nil or not a ptraceStop. + t.stop.(*ptraceStop).frozen = false + if t.killedLocked() { + t.endInternalStopLocked() + } +} + +// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, +// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on +// mode and singlestep. +// +// Preconditions: t must be in a frozen ptrace stop. +// +// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace +// stop. +func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.ptraceCode = int32(sig) + t.ptraceSyscallMode = mode + t.ptraceSinglestep = singlestep + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceTraceme() error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if t.hasTracer() { + return syserror.EPERM + } + if t.parent == nil { + // In Linux, only init can not have a parent, and init is assumed never + // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user + // application that may invoke PTRACE_TRACEME; having no parent can + // also occur if all tasks in the parent thread group have exited, and + // failed to find a living thread group to reparent to. The former case + // is treated as if TGID 1 has an exited parent in an invisible + // ancestor PID namespace that is an owner of the root user namespace + // (and consequently has CAP_SYS_PTRACE), and the latter case is a + // special form of the exited parent case below. In either case, + // returning nil here is correct. + return nil + } + if !t.parent.CanTrace(t, true) { + return syserror.EPERM + } + if t.parent.exitState != TaskExitNone { + // Fail silently, as if we were successfully attached but then + // immediately detached. This is consistent with Linux. + return nil + } + t.ptraceTracer.Store(t.parent) + t.parent.ptraceTracees[t] = struct{}{} + return nil +} + +// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and +// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. +func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { + if t.tg == target.tg { + return syserror.EPERM + } + if !t.CanTrace(target, true) { + return syserror.EPERM + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.hasTracer() { + return syserror.EPERM + } + // Attaching to zombies and dead tasks is not permitted; the exit + // notification logic relies on this. Linux allows attaching to PF_EXITING + // tasks, though. + if target.exitState >= TaskExitZombie { + return syserror.EPERM + } + if seize { + if err := target.ptraceSetOptionsLocked(opts); err != nil { + return syserror.EIO + } + } + target.ptraceTracer.Store(t) + t.ptraceTracees[target] = struct{}{} + target.ptraceSeized = seize + target.tg.signalHandlers.mu.Lock() + // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." 
- + // ptrace(2) + if !seize { + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + } + // Undocumented Linux feature: If the tracee is already group-stopped (and + // consequently will not report the SIGSTOP just sent), force it to leave + // and re-enter the stop so that it will switch to a ptrace-stop. + if target.stop == (*groupStop)(nil) { + target.trapStopPending = true + target.endInternalStopLocked() + // TODO(jamieliu): Linux blocks ptrace_attach() until the task has + // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. + } + target.tg.signalHandlers.mu.Unlock() + return nil +} + +// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the +// caller. +// +// Preconditions: target must be a tracee of t in a frozen ptrace stop. +// +// Postconditions: If ptraceDetach returns nil, target will no longer be in a +// ptrace stop. +func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + target.ptraceCode = int32(sig) + target.forgetTracerLocked() + delete(t.ptraceTracees, target) + return nil +} + +// exitPtrace is called in the exit path to detach all of t's tracees. +func (t *Task) exitPtrace() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + for target := range t.ptraceTracees { + if target.ptraceOpts.ExitKill { + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, false /* group */) + target.tg.signalHandlers.mu.Unlock() + } + // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it + // observes the ptraceCode it set before it entered the stop. I believe + // this is consistent with Linux. + target.forgetTracerLocked() + } + // "nil maps cannot be saved" + t.ptraceTracees = make(map[*Task]struct{}) +} + +// forgetTracerLocked detaches t's tracer and ensures that t is no longer +// ptrace-stopped. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) forgetTracerLocked() { + t.ptraceSeized = false + t.ptraceOpts = ptraceOptions{} + t.ptraceSyscallMode = ptraceSyscallNone + t.ptraceSinglestep = false + t.ptraceTracer.Store((*Task)(nil)) + if t.exitTracerNotified && !t.exitTracerAcked { + t.exitTracerAcked = true + t.exitNotifyLocked(true) + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If + // it wasn't, it will be reset via t.groupStopPending after the following. + t.trapStopPending = false + // If t's thread group is in a group stop and t is eligible to participate, + // make it do so. This is essentially the reverse of the special case in + // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling + // of restart from group-stop is currently buggy, but the "as planned" + // behavior is to leave tracee stopped and waiting for SIGCONT." - + // ptrace(2)) + if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { + t.groupStopPending = true + // t already participated in the group stop when it unset + // groupStopPending. 
+ t.groupStopAcknowledged = true + t.interrupt() + } + if _, ok := t.stop.(*ptraceStop); ok { + t.endInternalStopLocked() + } +} + +// ptraceSignalLocked is called after signal dequeueing to check if t should +// enter ptrace signal-delivery-stop. +// +// Preconditions: The signal mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { + if linux.Signal(info.Signo) == linux.SIGKILL { + return false + } + if !t.hasTracer() { + return false + } + // The tracer might change this signal into a stop signal, in which case + // any SIGCONT received after the signal was originally dequeued should + // cancel it. This is consistent with Linux. + t.tg.groupStopDequeued = true + // This is unconditional in ptrace_stop(). + t.trapStopPending = false + // Can't lock the TaskSet mutex while holding a signal mutex. + t.tg.signalHandlers.mu.Unlock() + defer t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + tracer := t.Tracer() + if tracer == nil { + return false + } + t.ptraceCode = info.Signo + t.ptraceSiginfo = info + t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + return true +} + +// ptraceSeccomp is called when a seccomp-bpf filter returns action +// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data +// is the lower 16 bits of the filter's return value. +func (t *Task) ptraceSeccomp(data uint16) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceSeccomp { + return false + } + t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data)) + return true +} + +// ptraceSyscallEnter is called immediately before entering a syscall to check +// if t should enter ptrace syscall-enter-stop. +func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { + if !t.hasTracer() { + return nil, false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.ptraceSyscallMode { + case ptraceSyscallNone: + return nil, false + case ptraceSyscallIntercept: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSyscallEnterStop)(nil), true + case ptraceSyscallEmu: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSysemuStop)(nil), true + } + panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) +} + +// ptraceSyscallExit is called immediately after leaving a syscall to check if +// t should enter ptrace syscall-exit-stop. +func (t *Task) ptraceSyscallExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if t.ptraceSyscallMode != ptraceSyscallIntercept { + return + } + t.Debugf("Entering syscall-exit-stop") + t.ptraceSyscallStopLocked() +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceSyscallStopLocked() { + code := int32(linux.SIGTRAP) + if t.ptraceOpts.SysGood { + code |= 0x80 + } + t.ptraceTrapLocked(code) +} + +type ptraceCloneKind int32 + +const ( + // ptraceCloneKindClone represents a call to Task.Clone where + // TerminationSignal is not SIGCHLD and Vfork is false. 
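+	//
+	// As a rough guide to how the three kinds map onto guest syscalls and the
+	// ptrace events they can generate:
+	//
+	//	clone(2), exit signal != SIGCHLD -> ptraceCloneKindClone (PTRACE_EVENT_CLONE)
+	//	fork(2) or clone(2) with SIGCHLD -> ptraceCloneKindFork  (PTRACE_EVENT_FORK)
+	//	vfork(2) or clone(CLONE_VFORK)   -> ptraceCloneKindVfork (PTRACE_EVENT_VFORK)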
+ ptraceCloneKindClone ptraceCloneKind = iota + + // ptraceCloneKindFork represents a call to Task.Clone where + // TerminationSignal is SIGCHLD and Vfork is false. + ptraceCloneKindFork + + // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is + // true. + ptraceCloneKindVfork +) + +// ptraceClone is called at the end of a clone or fork syscall to check if t +// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK +// stop. child is the new task. +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + event := false + if !opts.Untraced { + switch kind { + case ptraceCloneKindClone: + if t.ptraceOpts.TraceClone { + t.Debugf("Entering PTRACE_EVENT_CLONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindFork: + if t.ptraceOpts.TraceFork { + t.Debugf("Entering PTRACE_EVENT_FORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindVfork: + if t.ptraceOpts.TraceVfork { + t.Debugf("Entering PTRACE_EVENT_VFORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + event = true + } + default: + panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) + } + } + // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE + // options are in effect, then children created by, respectively, vfork(2) + // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit + // signal set to SIGCHLD, and other kinds of clone(2), are automatically + // attached to the same tracer which traced their parent. SIGSTOP is + // delivered to the children, causing them to enter signal-delivery-stop + // after they exit the system call which created them." - ptrace(2) + // + // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is + // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => + // include/linux/ptrace.h:ptrace_init_task(). + if event || opts.InheritTracer { + tracer := t.Tracer() + if tracer != nil { + child.ptraceTracer.Store(tracer) + tracer.ptraceTracees[child] = struct{}{} + // "The "seized" behavior ... is inherited by children that are + // automatically attached using PTRACE_O_TRACEFORK, + // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) + child.ptraceSeized = t.ptraceSeized + // "Flags are inherited by new tracees created and "auto-attached" + // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or + // PTRACE_O_TRACECLONE options." - ptrace(2) + child.ptraceOpts = t.ptraceOpts + child.tg.signalHandlers.mu.Lock() + // "PTRACE_SEIZE: ... Automatically attached children stop with + // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead + // of having SIGSTOP signal delivered to them." - ptrace(2) + if child.ptraceSeized { + child.trapStopPending = true + } else { + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }, nil) + } + // The child will self-interrupt() when its task goroutine starts + // running, so we don't have to. + child.tg.signalHandlers.mu.Unlock() + } + } + return event +} + +// ptraceVforkDone is called after the end of a vfork stop to check if t should +// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's +// PID namespace. 
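+//
+// From the tracer's side this corresponds to PTRACE_O_TRACEVFORKDONE.
+// Roughly, using the x/sys/unix wrappers (traceePid is illustrative):
+//
+//	unix.PtraceSetOptions(traceePid, unix.PTRACE_O_TRACEVFORK|unix.PTRACE_O_TRACEVFORKDONE)
+//	// The tracer then observes PTRACE_EVENT_VFORK when the child is created
+//	// and PTRACE_EVENT_VFORK_DONE once the vforking parent resumes.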
+func (t *Task) ptraceVforkDone(child ThreadID) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceVforkDone { + return false + } + t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child)) + return true +} + +// ptraceExec is called at the end of an execve syscall to check if t should +// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID +// namespace, prior to the execve. (If t did not have a tracer at the time +// oldTID was read, oldTID may be 0. This is consistent with Linux.) +func (t *Task) ptraceExec(oldTID ThreadID) { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + // Recheck with the TaskSet mutex locked. Most ptrace points don't need to + // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC + // is special because both TraceExec and !TraceExec do something if a + // tracer is attached. + if !t.hasTracer() { + return + } + if t.ptraceOpts.TraceExec { + t.Debugf("Entering PTRACE_EVENT_EXEC stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID)) + return + } + // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing + // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] + // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after + // execve(2) returns. This is an ordinary signal (similar to one which can + // be generated by `kill -TRAP`, not a special kind of ptrace-stop. + // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 + // (SI_USER). This signal may be blocked by signal mask, and thus may be + // delivered (much) later." - ptrace(2) + if t.ptraceSeized { + return + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: arch.SignalInfoUser, + }, false /* group */) +} + +// ptraceExit is called early in the task exit path to check if t should enter +// PTRACE_EVENT_EXIT stop. +func (t *Task) ptraceExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceExit { + return + } + t.tg.signalHandlers.mu.Lock() + status := t.exitStatus.Status() + t.tg.signalHandlers.mu.Unlock() + t.Debugf("Entering PTRACE_EVENT_EXIT stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceEventLocked(event int32, msg uint64) { + t.ptraceEventMsg = msg + // """ + // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning + // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An + // additional bit is set in the higher byte of the status word: the value + // status>>8 will be + // + // (SIGTRAP | PTRACE_EVENT_foo << 8). + // + // ... + // + // """ - ptrace(2) + t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) +} + +// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. +func (t *Task) ptraceKill(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + // "This operation is deprecated; do not use it! Instead, send a SIGKILL + // directly using kill(2) or tgkill(2). 
The problem with PTRACE_KILL is + // that it requires the tracee to be in signal-delivery-stop, otherwise it + // may not work (i.e., may complete successfully but won't kill the + // tracee)." - ptrace(2) + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + target.ptraceCode = int32(linux.SIGKILL) + target.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceInterrupt(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + if !target.ptraceSeized { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.killedLocked() || target.exitState >= TaskExitInitiated { + return nil + } + target.trapStopPending = true + if s, ok := target.stop.(*ptraceStop); ok && s.listen { + target.endInternalStopLocked() + } + target.interrupt() + return nil +} + +// Preconditions: The TaskSet mutex must be locked for writing. t must have a +// tracer. +func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { + const valid = uintptr(linux.PTRACE_O_EXITKILL | + linux.PTRACE_O_TRACESYSGOOD | + linux.PTRACE_O_TRACECLONE | + linux.PTRACE_O_TRACEEXEC | + linux.PTRACE_O_TRACEEXIT | + linux.PTRACE_O_TRACEFORK | + linux.PTRACE_O_TRACESECCOMP | + linux.PTRACE_O_TRACEVFORK | + linux.PTRACE_O_TRACEVFORKDONE) + if opts&^valid != 0 { + return syserror.EINVAL + } + t.ptraceOpts = ptraceOptions{ + ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, + SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, + TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, + TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, + TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, + TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil +} + +// Ptrace implements the ptrace system call. +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { + // PTRACE_TRACEME ignores all other arguments. + if req == linux.PTRACE_TRACEME { + return t.ptraceTraceme() + } + // All other ptrace requests operate on a current or future tracee + // specified by pid. + target := t.tg.pidns.TaskWithID(pid) + if target == nil { + return syserror.ESRCH + } + + // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already + // a tracee. + if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { + seize := req == linux.PTRACE_SEIZE + if seize && addr != 0 { + return syserror.EIO + } + return t.ptraceAttach(target, seize, uintptr(data)) + } + // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, + // but does not require that it is ptrace-stopped. + if req == linux.PTRACE_KILL { + return t.ptraceKill(target) + } + if req == linux.PTRACE_INTERRUPT { + return t.ptraceInterrupt(target) + } + // All other ptrace requests require that the target is a ptrace-stopped + // tracee, and freeze the ptrace-stop so the tracee can be operated on. + t.tg.pidns.owner.mu.RLock() + if target.Tracer() != t { + t.tg.pidns.owner.mu.RUnlock() + return syserror.ESRCH + } + if !target.ptraceFreeze() { + t.tg.pidns.owner.mu.RUnlock() + // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, + // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the + // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." 
- + // ptrace(2) + return syserror.ESRCH + } + t.tg.pidns.owner.mu.RUnlock() + // Even if the target has a ptrace-stop active, the tracee's task goroutine + // may not yet have reached Task.doStop; wait for it to do so. This is safe + // because there's no way for target to initiate a ptrace-stop and then + // block (by calling Task.block) before entering it. + // + // Caveat: If tasks were just restored, the tracee's first call to + // Task.Activate (in Task.run) occurs before its first call to Task.doStop, + // which may block if the tracer's address space is active. + t.UninterruptibleSleepStart(true) + target.waitGoroutineStoppedOrExited() + t.UninterruptibleSleepFinish(true) + + // Resuming commands end the ptrace stop, but only if successful. + // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the + // target. + switch req { + case linux.PTRACE_DETACH: + if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_CONT: + if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSCALL: + if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU: + if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_LISTEN: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !target.ptraceSeized { + return syserror.EIO + } + if target.ptraceSiginfo == nil { + return syserror.EIO + } + if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.trapNotifyPending { + target.endInternalStopLocked() + } else { + target.stop.(*ptraceStop).listen = true + target.ptraceUnfreezeLocked() + } + return nil + } + + // All other ptrace requests expect us to unfreeze the stop. + defer target.ptraceUnfreeze() + + switch req { + case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA: + // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and + // PTRACE_PEEKUSER requests have a different API: they store the result + // at the address specified by the data parameter, and the return value + // is the error flag." - ptrace(2) + word := t.Arch().Native(0) + if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ + IgnorePermissions: true, + }); err != nil { + return err + } + _, err := t.CopyOut(data, word) + return err + + case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: + _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ + IgnorePermissions: true, + }) + return err + + case linux.PTRACE_GETREGSET: + // "Read the tracee's registers. addr specifies, in an + // architecture-dependent way, the type of registers to be read. ... 
+ // data points to a struct iovec, which describes the destination + // buffer's location and length. On return, the kernel modifies iov.len + // to indicate the actual number of bytes returned." - ptrace(2) + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + + // Update iovecs to represent the range of the written register set. + end, ok := ar.Start.AddLength(uint64(n)) + if !ok { + panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length())) + } + ar.End = end + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_SETREGSET: + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_GETSIGINFO: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.ptraceSiginfo) + return err + + case linux.PTRACE_SETSIGINFO: + var info arch.SignalInfo + if _, err := t.CopyIn(data, &info); err != nil { + return err + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + target.ptraceSiginfo = &info + return nil + + case linux.PTRACE_GETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.SignalMask()) + return err + + case linux.PTRACE_SETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := t.CopyIn(data, &mask); err != nil { + return err + } + // The target's task goroutine is stopped, so this is safe: + target.SetSignalMask(mask &^ UnblockableSignals) + return nil + + case linux.PTRACE_SETOPTIONS: + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + return target.ptraceSetOptionsLocked(uintptr(data)) + + case linux.PTRACE_GETEVENTMSG: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + return err + + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + + default: + return t.ptraceArch(target, req, addr, data) + } +} diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go new file mode 100644 index 000000000..cef1276ec --- /dev/null +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -0,0 +1,89 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + switch req { + case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + n, err := target.Arch().PtracePeekUser(uintptr(addr)) + if err != nil { + return err + } + _, err = t.CopyOut(data, n) + return err + + case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) + + case linux.PTRACE_GETREGS: + // "Copy the tracee's general-purpose ... registers ... to the address + // data in the tracer. ... (addr is ignored.) Note that SPARC systems + // have the meaning of data and addr reversed ..." + _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_GETFPREGS: + _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETREGS: + _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETFPREGS: + _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + default: + return syserror.EIO + } +} diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go new file mode 100644 index 000000000..d971b96b3 --- /dev/null +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + return syserror.EIO +} diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go new file mode 100644 index 000000000..18416643b --- /dev/null +++ b/pkg/sentry/kernel/rseq.go @@ -0,0 +1,393 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Restartable sequences. +// +// We support two different APIs for restartable sequences. +// +// 1. The upstream interface added in v4.18. +// 2. The interface described in https://lwn.net/Articles/650333/. +// +// Throughout this file and other parts of the kernel, the latter is referred +// to as "old rseq". This interface was never merged upstream, but is supported +// for a limited set of applications that use it regardless. + +// OldRSeqCriticalRegion describes an old rseq critical region. +// +// +stateify savable +type OldRSeqCriticalRegion struct { + // When a task in this thread group has its CPU preempted (as defined by + // platform.ErrContextCPUPreempted) or has a signal delivered to an + // application handler while its instruction pointer is in CriticalSection, + // set the instruction pointer to Restart and application register r10 (on + // amd64) to the former instruction pointer. + CriticalSection usermem.AddrRange + Restart usermem.Addr +} + +// RSeqAvailable returns true if t supports (old and new) restartable sequences. +func (t *Task) RSeqAvailable() bool { + return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() +} + +// SetRSeq registers addr as this thread's rseq structure. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr != 0 { + if t.rseqAddr != addr { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EINVAL + } + return syserror.EBUSY + } + + // rseq must be aligned and correctly sized. + if addr&(linux.AlignOfRSeq-1) != 0 { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { + return syserror.EFAULT + } + + t.rseqAddr = addr + t.rseqSignature = signature + + // Initialize the CPUID. + // + // Linux implicitly does this on return from userspace, where failure + // would cause SIGSEGV. + if err := t.rseqUpdateCPU(); err != nil { + t.rseqAddr = 0 + t.rseqSignature = 0 + + t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return syserror.EFAULT + } + + return nil +} + +// ClearRSeq unregisters addr as this thread's rseq structure. +// +// Preconditions: The caller must be running on the task goroutine. 
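+//
+// A rough sketch of the expected caller (the rseq(2) syscall path), assuming
+// Linux's unregistration convention and a linux.RSEQ_FLAG_UNREGISTER
+// constant: the same addr, length, and signature passed at registration must
+// be passed again to unregister, e.g.
+//
+//	if flags == linux.RSEQ_FLAG_UNREGISTER {
+//		return t.ClearRSeq(addr, length, signature)
+//	}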
+func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error { + if t.rseqAddr == 0 { + return syserror.EINVAL + } + if t.rseqAddr != addr { + return syserror.EINVAL + } + if length != linux.SizeOfRSeq { + return syserror.EINVAL + } + if t.rseqSignature != signature { + return syserror.EPERM + } + + if err := t.rseqClearCPU(); err != nil { + return err + } + + t.rseqAddr = 0 + t.rseqSignature = 0 + + if t.oldRSeqCPUAddr == 0 { + // rseqCPU no longer needed. + t.rseqCPU = -1 + } + + return nil +} + +// OldRSeqCriticalRegion returns a copy of t's thread group's current +// old restartable sequence. +func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion { + return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) +} + +// SetOldRSeqCriticalRegion replaces t's thread group's old restartable +// sequence. +// +// Preconditions: t.RSeqAvailable() == true. +func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { + // These checks are somewhat more lenient than in Linux, which (bizarrely) + // requires r.CriticalSection to be non-empty and r.Restart to be + // outside of r.CriticalSection, even if r.CriticalSection.Start == 0 + // (which disables the critical region). + if r.CriticalSection.Start == 0 { + r.CriticalSection.End = 0 + r.Restart = 0 + t.tg.oldRSeqCritical.Store(&r) + return nil + } + if r.CriticalSection.Start >= r.CriticalSection.End { + return syserror.EINVAL + } + if r.CriticalSection.Contains(r.Restart) { + return syserror.EINVAL + } + // TODO(jamieliu): check that r.CriticalSection and r.Restart are in + // the application address range, for consistency with Linux. + t.tg.oldRSeqCritical.Store(&r) + return nil +} + +// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's +// CPU number. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) OldRSeqCPUAddr() usermem.Addr { + return t.oldRSeqCPUAddr +} + +// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with +// t's CPU number. +// +// Preconditions: t.RSeqAvailable() == true. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { + t.oldRSeqCPUAddr = addr + + // Check that addr is writable. + // + // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's + // unfortunate, but unlikely in a correct program. + if err := t.rseqUpdateCPU(); err != nil { + t.oldRSeqCPUAddr = 0 + return syserror.EINVAL // yes, EINVAL, not err or EFAULT + } + return nil +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqUpdateCPU() error { + if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { + t.rseqCPU = -1 + return nil + } + + t.rseqCPU = int32(hostcpu.GetCPU()) + + // Update both CPUs, even if one fails. + rerr := t.rseqCopyOutCPU() + oerr := t.oldRSeqCopyOutCPU() + + if rerr != nil { + return rerr + } + return oerr +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) oldRSeqCopyOutCPU() error { + if t.oldRSeqCPUAddr == 0 { + return nil + } + + buf := t.CopyScratchBuffer(4) + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) + _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. 
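+//
+// For reference, rseqAddr is assumed to point at a structure mirrored by
+// linux.RSeq, whose first two fields (the only ones updated below) are
+// roughly:
+//
+//	type RSeq struct {
+//		CPUIDStart uint32 // offset 0
+//		CPUID      uint32 // offset 4
+//		// ... remaining fields omitted
+//	}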
+func (t *Task) rseqCopyOutCPU() error { + if t.rseqAddr == 0 { + return nil + } + + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID + // N.B. This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqClearCPU() error { + buf := t.CopyScratchBuffer(8) + // CPUIDStart and CPUID are the first two fields in linux.RSeq. + usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart + usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID + // N.B. This write is not atomic, but since this occurs on the task + // goroutine then as long as userspace uses a single-instruction read + // it can't see an invalid value. + _, err := t.CopyOutBytes(t.rseqAddr, buf) + return err +} + +// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so. +// +// This is a bit complex since both the RSeq and RSeqCriticalSection structs +// are stored in userspace. So we must: +// +// 1. Copy in the address of RSeqCriticalSection from RSeq. +// 2. Copy in RSeqCriticalSection itself. +// 3. Validate critical section struct version, address range, abort address. +// 4. Validate the abort signature (4 bytes preceding abort IP match expected +// signature). +// 5. Clear address of RSeqCriticalSection from RSeq. +// 6. Finally, conditionally abort. +// +// See kernel/rseq.c:rseq_ip_fixup for reference. +// +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqAddrInterrupt() { + if t.rseqAddr == 0 { + return + } + + critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection) + if !ok { + // SetRSeq should validate this. + panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr)) + } + + if t.Arch().Width() != 8 { + // We only handle 64-bit for now. 
+ t.Debugf("Only 64-bit rseq supported.") + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + buf := t.CopyScratchBuffer(8) + if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil { + t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf)) + if critAddr == 0 { + return + } + + var cs linux.RSeqCriticalSection + if _, err := cs.CopyIn(t, critAddr); err != nil { + t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + if cs.Version != 0 { + t.Debugf("Unknown version in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + start := usermem.Addr(cs.Start) + critRange, ok := start.ToRange(cs.PostCommitOffset) + if !ok { + t.Debugf("Invalid start and offset in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + abort := usermem.Addr(cs.Abort) + if critRange.Contains(abort) { + t.Debugf("Abort in critical section in %+v", cs) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Verify signature. + sigAddr := abort - linux.SizeOfRSeqSignature + + buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature) + if _, err := t.CopyInBytes(sigAddr, buf); err != nil { + t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + sig := usermem.ByteOrder.Uint32(buf) + if sig != t.rseqSignature { + t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Clear the critical section address. + // + // NOTE(b/143949567): We don't support any rseq flags, so we always + // restart if we are in the critical section, and thus *always* clear + // critAddrAddr. + if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return + } + + // Finally we can actually decide whether or not to restart. + if !critRange.Contains(usermem.Addr(t.Arch().IP())) { + return + } + + t.Arch().SetIP(uintptr(cs.Abort)) +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) oldRSeqInterrupt() { + r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) + if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) { + t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) + t.Arch().SetIP(uintptr(r.Restart)) + t.Arch().SetOldRSeqInterruptedIP(ip) + } +} + +// Preconditions: The caller must be running on the task goroutine. 
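+// It is expected to be called when the task's CPU is preempted or a signal is
+// about to be delivered to an application handler (see the
+// OldRSeqCriticalRegion comment above); it checks both the new and the old
+// rseq APIs.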
+func (t *Task) rseqInterrupt() { + t.rseqAddrInterrupt() + t.oldRSeqInterrupt() +} diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD new file mode 100644 index 000000000..1b82e087b --- /dev/null +++ b/pkg/sentry/kernel/sched/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "sched", + srcs = [ + "cpuset.go", + "sched.go", + ], + visibility = ["//pkg/sentry:internal"], +) + +go_test( + name = "sched_test", + size = "small", + srcs = ["cpuset_test.go"], + library = ":sched", +) diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go new file mode 100644 index 000000000..c6c436690 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -0,0 +1,105 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import "math/bits" + +const ( + bitsPerByte = 8 + bytesPerLong = 8 // only for 64-bit architectures +) + +// CPUSet contains a bitmap to record CPU information. +// +// Note that this definition is only correct for little-endian architectures, +// since Linux's cpumask_t uses unsigned long. +type CPUSet []byte + +// CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. +func CPUSetSize(num uint) uint { + // NOTE(b/68859821): Applications may expect that the size of a CPUSet in + // bytes is always a multiple of sizeof(unsigned long), since this is true + // in Linux. Thus we always round up. + bytes := (num + bitsPerByte - 1) / bitsPerByte + longs := (bytes + bytesPerLong - 1) / bytesPerLong + return longs * bytesPerLong +} + +// NewCPUSet returns a CPUSet for the given number of CPUs which initially +// contains no CPUs. +func NewCPUSet(num uint) CPUSet { + return CPUSet(make([]byte, CPUSetSize(num))) +} + +// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which +// are present in the set. +func NewFullCPUSet(num uint) CPUSet { + c := NewCPUSet(num) + var i uint + for ; i < num/bitsPerByte; i++ { + c[i] = 0xff + } + if rem := num % bitsPerByte; rem != 0 { + c[i] = (1 << rem) - 1 + } + return c +} + +// Size returns the size of 'c' in bytes. +func (c CPUSet) Size() uint { + return uint(len(c)) +} + +// NumCPUs returns how many cpus are set in the CPUSet. +func (c CPUSet) NumCPUs() uint { + var n int + for _, b := range c { + n += bits.OnesCount8(b) + } + return uint(n) +} + +// Copy returns a copy of the CPUSet. +func (c CPUSet) Copy() CPUSet { + return append(CPUSet(nil), c...) +} + +// Set sets the bit corresponding to cpu. +func (c *CPUSet) Set(cpu uint) { + (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) +} + +// ClearAbove clears bits corresponding to cpu and all higher cpus. 
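+//
+// For example, for a set built with NewFullCPUSet(8), ClearAbove(3) leaves
+// exactly CPUs 0-2 set.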
+func (c *CPUSet) ClearAbove(cpu uint) { + i := cpu / bitsPerByte + if i >= c.Size() { + return + } + (*c)[i] &^= 0xff << (cpu % bitsPerByte) + for i++; i < c.Size(); i++ { + (*c)[i] = 0 + } +} + +// ForEachCPU iterates over the CPUSet and calls fn with the cpu index if +// it's set. +func (c CPUSet) ForEachCPU(fn func(uint)) { + for i := uint(0); i < c.Size()*bitsPerByte; i++ { + bit := uint(1) << (i & (bitsPerByte - 1)) + if uint(c[i/bitsPerByte])&bit == bit { + fn(i) + } + } +} diff --git a/pkg/sentry/kernel/sched/cpuset_test.go b/pkg/sentry/kernel/sched/cpuset_test.go new file mode 100644 index 000000000..3af9f1197 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset_test.go @@ -0,0 +1,44 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import ( + "testing" +) + +func TestNumCPUs(t *testing.T) { + for i := uint(0); i < 1024; i++ { + c := NewCPUSet(i) + for j := uint(0); j < i; j++ { + c.Set(j) + } + n := c.NumCPUs() + if n != i { + t.Errorf("got wrong number of cpus %d, want %d", n, i) + } + } +} + +func TestClearAbove(t *testing.T) { + const n = 1024 + c := NewFullCPUSet(n) + for i := uint(0); i < n; i++ { + cpu := n - i + c.ClearAbove(cpu) + if got := c.NumCPUs(); got != cpu { + t.Errorf("iteration %d: got %d cpus, wanted %d", i, got, cpu) + } + } +} diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go new file mode 100644 index 000000000..de18c9d02 --- /dev/null +++ b/pkg/sentry/kernel/sched/sched.go @@ -0,0 +1,16 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sched implements scheduler related features. +package sched diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go new file mode 100644 index 000000000..c38c5a40c --- /dev/null +++ b/pkg/sentry/kernel/seccomp.go @@ -0,0 +1,217 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
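+
+// This file implements seccomp-bpf filtering for tasks: filters installed via
+// AppendSyscallFilter are evaluated by checkSeccompSyscall before a filtered
+// system call executes, and the resulting SECCOMP_RET_* action is acted upon.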
+ +package kernel + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const maxSyscallFilterInstructions = 1 << 15 + +// seccompData is equivalent to struct seccomp_data, which contains the data +// passed to seccomp-bpf filters. +type seccompData struct { + // nr is the system call number. + nr int32 + + // arch is an AUDIT_ARCH_* value indicating the system call convention. + arch uint32 + + // instructionPointer is the value of the instruction pointer at the time + // of the system call. + instructionPointer uint64 + + // args contains the first 6 system call arguments. + args [6]uint64 +} + +func (d *seccompData) asBPFInput() bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +} + +func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { + si := &arch.SignalInfo{ + Signo: int32(linux.SIGSYS), + Errno: errno, + Code: arch.SYS_SECCOMP, + } + si.SetCallAddr(uint64(ip)) + si.SetSyscall(sysno) + si.SetArch(t.SyscallTable().AuditNumber) + return si +} + +// checkSeccompSyscall applies the task's seccomp filters before the execution +// of syscall sysno at instruction pointer ip. (These parameters must be passed +// in because vsyscalls do not use the values in t.Arch().) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction { + result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) + action := result & linux.SECCOMP_RET_ACTION + switch action { + case linux.SECCOMP_RET_TRAP: + // "Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. ... The SECCOMP_RET_DATA + // portion of the return value will be passed as si_errno." - + // Documentation/prctl/seccomp_filter.txt + t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) + // "The return value register will contain an arch-dependent value." In + // practice, it's ~always the syscall number. + t.Arch().SetReturn(uintptr(sysno)) + + case linux.SECCOMP_RET_ERRNO: + // "Results in the lower 16-bits of the return value being passed to + // userland as the errno without executing the system call." + t.Arch().SetReturn(-uintptr(result.Data())) + + case linux.SECCOMP_RET_TRACE: + // "When returned, this value will cause the kernel to attempt to + // notify a ptrace()-based tracer prior to executing the system call. + // If there is no tracer present, -ENOSYS is returned to userland and + // the system call is not executed." + if !t.ptraceSeccomp(result.Data()) { + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + return linux.SECCOMP_RET_ERRNO + } + + case linux.SECCOMP_RET_ALLOW: + // "Results in the system call being executed." + + case linux.SECCOMP_RET_KILL_THREAD: + // "Results in the task exiting immediately without executing the + // system call. The exit status of the task will be SIGSYS, not + // SIGKILL." 
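+ // Note that this case has no body: the kill itself is left to the
+ // caller, which acts on the returned SECCOMP_RET_KILL_THREAD action.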
+ + default: + // consistent with Linux + return linux.SECCOMP_RET_KILL_THREAD + } + return action +} + +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { + data := seccompData{ + nr: sysno, + arch: t.tc.st.AuditNumber, + instructionPointer: uint64(ip), + } + // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so + // we can't do any slicing tricks or even use copy/append here. + for i, arg := range args { + if i >= len(data.args) { + break + } + data.args[i] = arg.Uint64() + } + input := data.asBPFInput() + + ret := uint32(linux.SECCOMP_RET_ALLOW) + f := t.syscallFilters.Load() + if f == nil { + return ret + } + + // "Every filter successfully installed will be evaluated (in reverse + // order) for each system call the task makes." - kernel/seccomp.c + for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) + if err != nil { + t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) + thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) + } + // "If multiple filters exist, the return value for the evaluation of a + // given system call will always use the highest precedent value." - + // Documentation/prctl/seccomp_filter.txt + // + // (Note that this contradicts prctl(2): "If the filters permit prctl() + // calls, then additional filters can be added; they are run in order + // until the first non-allow result is seen." prctl(2) is incorrect.) + // + // "The ordering ensures that a min_t() over composed return values + // always selects the least permissive choice." - + // include/uapi/linux/seccomp.h + if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { + ret = thisRet + } + } + + return ret +} + +// AppendSyscallFilter adds BPF program p as a system call filter. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { + // While syscallFilters are an atomic.Value we must take the mutex to prevent + // our read-copy-update from happening while another task is syncing syscall + // filters to us, this keeps the filters in a consistent state. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + // Cap the combined length of all syscall filters (plus a penalty of 4 + // instructions per filter beyond the first) to maxSyscallFilterInstructions. + // This restriction is inherited from Linux. + totalLength := p.Length() + var newFilters []bpf.Program + + if sf := t.syscallFilters.Load(); sf != nil { + oldFilters := sf.([]bpf.Program) + for _, f := range oldFilters { + totalLength += f.Length() + 4 + } + newFilters = append(newFilters, oldFilters...) + } + + if totalLength > maxSyscallFilterInstructions { + return syserror.ENOMEM + } + + newFilters = append(newFilters, p) + t.syscallFilters.Store(newFilters) + + if syncAll { + // Note: No new privs is always assumed to be set. + for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { + if ot != t { + var copiedFilters []bpf.Program + copiedFilters = append(copiedFilters, newFilters...) + ot.syscallFilters.Store(copiedFilters) + } + } + } + + return nil +} + +// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current +// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) +// and /proc/[pid]/status. 
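+//
+// Note that only SECCOMP_MODE_NONE and SECCOMP_MODE_FILTER are reported;
+// SECCOMP_MODE_STRICT is never returned here.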
+func (t *Task) SeccompMode() int { + f := t.syscallFilters.Load() + if f != nil && len(f.([]bpf.Program)) > 0 { + return linux.SECCOMP_MODE_FILTER + } + return linux.SECCOMP_MODE_NONE +} diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD new file mode 100644 index 000000000..65e5427c1 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -0,0 +1,49 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "waiter_list", + out = "waiter_list.go", + package = "semaphore", + prefix = "waiter", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*waiter", + "Linker": "*waiter", + }, +) + +go_library( + name = "semaphore", + srcs = [ + "semaphore.go", + "waiter_list.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sync", + "//pkg/syserror", + ], +) + +go_test( + name = "semaphore_test", + size = "small", + srcs = ["semaphore_test.go"], + library = ":semaphore", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go new file mode 100644 index 000000000..c00fa1138 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -0,0 +1,572 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package semaphore implements System V semaphores. +package semaphore + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + valueMax = 32767 // SEMVMX + + // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). + semaphoresMax = 32000 + + // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). + setsMax = 32000 + + // semaphoresTotalMax is "system-wide limit on the number of semaphores" + // (SEMMNS = SEMMNI*SEMMSL). + semaphoresTotalMax = 1024000000 +) + +// Registry maintains a set of semaphores that can be found by key or ID. +// +// +stateify savable +type Registry struct { + // userNS owning the ipc name this registry belongs to. Immutable. + userNS *auth.UserNamespace + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + semaphores map[int32]*Set + lastIDUsed int32 +} + +// Set represents a set of semaphores that can be operated atomically. +// +// +stateify savable +type Set struct { + // registry owning this sem set. Immutable. + registry *Registry + + // Id is a handle that identifies the set. 
+ ID int32 + + // key is an user provided key that can be shared between processes. + key int32 + + // creator is the user that created the set. Immutable. + creator fs.FileOwner + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + owner fs.FileOwner + perms fs.FilePermissions + opTime ktime.Time + changeTime ktime.Time + + // sems holds all semaphores in the set. The slice itself is immutable after + // it's been set, however each 'sem' object in the slice requires 'mu' lock. + sems []sem + + // dead is set to true when the set is removed and can't be reached anymore. + // All waiters must wake up and fail when set is dead. + dead bool +} + +// sem represents a single semaphore from a set. +// +// +stateify savable +type sem struct { + value int16 + waiters waiterList `state:"zerovalue"` + pid int32 +} + +// waiter represents a caller that is waiting for the semaphore value to +// become positive or zero. +// +// +stateify savable +type waiter struct { + waiterEntry + + // value represents how much resource the waiter needs to wake up. + value int16 + ch chan struct{} +} + +// NewRegistry creates a new semaphore set registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + semaphores: make(map[int32]*Set), + } +} + +// FindOrCreate searches for a semaphore set that matches 'key'. If not found, +// it may create a new one if requested. If private is true, key is ignored and +// a new set is always created. If create is false, it fails if a set cannot +// be found. If exclusive is true, it fails if a set with the same key already +// exists. +func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { + if nsems < 0 || nsems > semaphoresMax { + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if !private { + // Look up an existing semaphore. + if set := r.findByKey(key); set != nil { + set.mu.Lock() + defer set.mu.Unlock() + + // Check that caller can access semaphore set. + creds := auth.CredentialsFromContext(ctx) + if !set.checkPerms(creds, fs.PermsFromMode(mode)) { + return nil, syserror.EACCES + } + + // Validate parameters. + if nsems > int32(set.Size()) { + return nil, syserror.EINVAL + } + if create && exclusive { + return nil, syserror.EEXIST + } + return set, nil + } + + if !create { + // Semaphore not found and should not be created. + return nil, syserror.ENOENT + } + } + + // Zero is only valid if an existing set is found. + if nsems == 0 { + return nil, syserror.EINVAL + } + + // Apply system limits. + if len(r.semaphores) >= setsMax { + return nil, syserror.EINVAL + } + if r.totalSems() > int(semaphoresTotalMax-nsems) { + return nil, syserror.EINVAL + } + + // Finally create a new set. + owner := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newSet(ctx, key, owner, owner, perms, nsems) +} + +// RemoveID removes set with give 'id' from the registry and marks the set as +// dead. All waiters will be awakened and fail. +func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { + r.mu.Lock() + defer r.mu.Unlock() + + set := r.semaphores[id] + if set == nil { + return syserror.EINVAL + } + + set.mu.Lock() + defer set.mu.Unlock() + + // "The effective user ID of the calling process must match the creator or + // owner of the semaphore set, or the caller must be privileged." 
+ if !set.checkCredentials(creds) && !set.checkCapability(creds) { + return syserror.EACCES + } + + delete(r.semaphores, set.ID) + set.destroy() + return nil +} + +func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { + set := &Set{ + registry: r, + key: key, + owner: owner, + creator: owner, + perms: perms, + changeTime: ktime.NowFromContext(ctx), + sems: make([]sem, nsems), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.semaphores[id] == nil { + r.lastIDUsed = id + r.semaphores[id] = set + set.ID = id + return set, nil + } + } + + log.Warningf("Semaphore map is full, they must be leaking") + return nil, syserror.ENOMEM +} + +// FindByID looks up a set given an ID. +func (r *Registry) FindByID(id int32) *Set { + r.mu.Lock() + defer r.mu.Unlock() + return r.semaphores[id] +} + +func (r *Registry) findByKey(key int32) *Set { + for _, v := range r.semaphores { + if v.key == key { + return v + } + } + return nil +} + +func (r *Registry) totalSems() int { + totalSems := 0 + for _, v := range r.semaphores { + totalSems += v.Size() + } + return totalSems +} + +func (s *Set) findSem(num int32) *sem { + if num < 0 || int(num) >= s.Size() { + return nil + } + return &s.sems[num] +} + +// Size returns the number of semaphores in the set. Size is immutable. +func (s *Set) Size() int { + return len(s.sems) +} + +// Change changes some fields from the set atomically. +func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error { + s.mu.Lock() + defer s.mu.Unlock() + + // "The effective UID of the calling process must match the owner or creator + // of the semaphore set, or the caller must be privileged." + if !s.checkCredentials(creds) && !s.checkCapability(creds) { + return syserror.EACCES + } + + s.owner = owner + s.perms = perms + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// SetVal overrides a semaphore value, waking up waiters as needed. +func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return syserror.ERANGE + } + + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. + sem.value = val + sem.pid = pid + s.changeTime = ktime.NowFromContext(ctx) + sem.wakeWaiters() + return nil +} + +// SetValAll overrides all semaphores values, waking up waiters as needed. It also +// sets semaphore's PID which was fixed in Linux 4.6. +// +// 'len(vals)' must be equal to 's.Size()'. +func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error { + if len(vals) != s.Size() { + panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size())) + } + + for _, val := range vals { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." 
+ if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + for i, val := range vals { + sem := &s.sems[i] + + // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. + sem.value = int16(val) + sem.pid = pid + sem.wakeWaiters() + } + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// GetVal returns a semaphore value. +func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.value, nil +} + +// GetValAll returns value for all semaphores. +func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return nil, syserror.EACCES + } + + vals := make([]uint16, s.Size()) + for i, sem := range s.sems { + vals[i] = uint16(sem.value) + } + return vals, nil +} + +// GetPID returns the PID set when performing operations in the semaphore. +func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.pid, nil +} + +// ExecuteOps attempts to execute a list of operations to the set. It only +// succeeds when all operations can be applied. No changes are made if it fails. +// +// On failure, it may return an error (retries are hopeless) or it may return +// a channel that can be waited on before attempting again. +func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // Did it race with a removal operation? + if s.dead { + return nil, 0, syserror.EIDRM + } + + // Validate the operations. + readOnly := true + for _, op := range ops { + if s.findSem(int32(op.SemNum)) == nil { + return nil, 0, syserror.EFBIG + } + if op.SemOp != 0 { + readOnly = false + } + } + + if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { + return nil, 0, syserror.EACCES + } + + ch, num, err := s.executeOps(ctx, ops, pid) + if err != nil { + return nil, 0, err + } + return ch, num, nil +} + +func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) { + // Changes to semaphores go to this slice temporarily until they all succeed. + tmpVals := make([]int16, len(s.sems)) + for i := range s.sems { + tmpVals[i] = s.sems[i].value + } + + for _, op := range ops { + sem := &s.sems[op.SemNum] + if op.SemOp == 0 { + // Handle 'wait for zero' operation. + if tmpVals[op.SemNum] != 0 { + // Semaphore isn't 0, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + if op.SemOp < 0 { + // Handle 'wait' operation. + if -op.SemOp > valueMax { + return nil, 0, syserror.ERANGE + } + if -op.SemOp > tmpVals[op.SemNum] { + // Not enough resources, must wait. 
+ if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + // op.SemOp > 0: Handle 'signal' operation. + if tmpVals[op.SemNum] > valueMax-op.SemOp { + return nil, 0, syserror.ERANGE + } + } + + tmpVals[op.SemNum] += op.SemOp + } + } + + // All operations succeeded, apply them. + // TODO(gvisor.dev/issue/137): handle undo operations. + for i, v := range tmpVals { + s.sems[i].value = v + s.sems[i].wakeWaiters() + s.sems[i].pid = pid + } + s.opTime = ktime.NowFromContext(ctx) + return nil, 0, nil +} + +// AbortWait notifies that a waiter is giving up and will not wait on the +// channel anymore. +func (s *Set) AbortWait(num int32, ch chan struct{}) { + s.mu.Lock() + defer s.mu.Unlock() + + sem := &s.sems[num] + for w := sem.waiters.Front(); w != nil; w = w.Next() { + if w.ch == ch { + sem.waiters.Remove(w) + return + } + } + // Waiter may not be found in case it raced with wakeWaiters(). +} + +func (s *Set) checkCredentials(creds *auth.Credentials) bool { + return s.owner.UID == creds.EffectiveKUID || + s.owner.GID == creds.EffectiveKGID || + s.creator.UID == creds.EffectiveKUID || + s.creator.GID == creds.EffectiveKGID +} + +func (s *Set) checkCapability(creds *auth.Credentials) bool { + return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() +} + +func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { + // Are we owner, or in group, or other? + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + + // Are permissions satisfied without capability checks? + if p.SupersetOf(reqPerms) { + return true + } + + return s.checkCapability(creds) +} + +// destroy destroys the set. Caller must hold 's.mu'. +func (s *Set) destroy() { + // Notify all waiters. They will fail on the next attempt to execute + // operations and return error. + s.dead = true + for _, s := range s.sems { + for w := s.waiters.Front(); w != nil; w = w.Next() { + w.ch <- struct{}{} + } + s.waiters.Reset() + } +} + +// wakeWaiters goes over all waiters and checks which of them can be notified. +func (s *sem) wakeWaiters() { + // Note that this will release all waiters waiting for 0 too. + for w := s.waiters.Front(); w != nil; { + if s.value < w.value { + // Still blocked, skip it. + w = w.Next() + continue + } + w.ch <- struct{}{} + old := w + w = w.Next() + s.waiters.Remove(old) + } +} + +func newWaiter(val int16) *waiter { + return &waiter{ + value: val, + ch: make(chan struct{}, 1), + } +} diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go new file mode 100644 index 000000000..e47acefdf --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -0,0 +1,172 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
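+
+// Note that these tests drive the unexported Set.executeOps directly (via the
+// executeOps helper below), which skips the permission and removed-set checks
+// performed by the exported ExecuteOps.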
+ +package semaphore + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +func executeOps(ctx context.Context, t *testing.T, set *Set, ops []linux.Sembuf, block bool) chan struct{} { + ch, _, err := set.executeOps(ctx, ops, 123) + if err != nil { + t.Fatalf("ExecuteOps(ops) failed, err: %v, ops: %+v", err, ops) + } + if block { + if ch == nil { + t.Fatalf("ExecuteOps(ops) got: nil, expected: !nil, ops: %+v", ops) + } + if signalled(ch) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } + } else { + if ch != nil { + t.Fatalf("ExecuteOps(ops) got: %v, expected: nil, ops: %+v", ch, ops) + } + } + return ch +} + +func signalled(ch chan struct{}) bool { + select { + case <-ch: + return true + default: + return false + } +} + +func TestBasic(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + {SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -1 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should not have been signalled, ops: %+v", ops) + } +} + +func TestWaitForZero(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + {SemOp: 0}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ch1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = 0 + chZero1 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 0 + chZero2 := executeOps(ctx, t, set, ops, true) + + ops[0].SemOp = 1 + executeOps(ctx, t, set, ops, false) + if !signalled(ch1) { + t.Fatalf("ExecuteOps(ops) channel should have been signalled, ops: %+v, set: %+v", ops, set) + } + + ops[0].SemOp = -2 + executeOps(ctx, t, set, ops, false) + if !signalled(chZero1) { + t.Fatalf("ExecuteOps(ops) channel zero 1 should have been signalled, ops: %+v, set: %+v", ops, set) + } + if !signalled(chZero2) { + t.Fatalf("ExecuteOps(ops) channel zero 2 should have been signalled, ops: %+v, set: %+v", ops, set) + } +} + +func TestNoWait(t *testing.T) { + ctx := contexttest.Context(t) + set := &Set{ID: 123, sems: make([]sem, 1)} + ops := []linux.Sembuf{ + {SemOp: 1}, + } + executeOps(ctx, t, set, ops, false) + + ops[0].SemOp = -2 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } + + ops[0].SemOp = 0 + ops[0].SemFlg = linux.IPC_NOWAIT + if _, _, err := set.executeOps(ctx, ops, 123); err != syserror.ErrWouldBlock { + t.Fatalf("ExecuteOps(ops) wrong result, got: %v, expected: %v", err, syserror.ErrWouldBlock) + } +} + +func TestUnregister(t *testing.T) { + ctx := contexttest.Context(t) + r := NewRegistry(auth.NewRootUserNamespace()) + set, err := r.FindOrCreate(ctx, 123, 2, linux.FileMode(0x600), true, true, true) + if err != nil { + t.Fatalf("FindOrCreate() failed, err: %v", err) + } + if got := r.FindByID(set.ID); got.ID != set.ID { + t.Fatalf("FindById(%d) failed, got: %+v, expected: %+v", set.ID, got, set) + } + + ops := 
[]linux.Sembuf{ + {SemOp: -1}, + } + chs := make([]chan struct{}, 0, 5) + for i := 0; i < 5; i++ { + ch := executeOps(ctx, t, set, ops, true) + chs = append(chs, ch) + } + + creds := auth.CredentialsFromContext(ctx) + if err := r.RemoveID(set.ID, creds); err != nil { + t.Fatalf("RemoveID(%d) failed, err: %v", set.ID, err) + } + if !set.dead { + t.Fatalf("set is not dead: %+v", set) + } + if got := r.FindByID(set.ID); got != nil { + t.Fatalf("FindById(%d) failed, got: %+v, expected: nil", set.ID, got) + } + for i, ch := range chs { + if !signalled(ch) { + t.Fatalf("channel %d should have been signalled", i) + } + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go new file mode 100644 index 000000000..0e19286de --- /dev/null +++ b/pkg/sentry/kernel/sessions.go @@ -0,0 +1,528 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SessionID is the public identifier. +type SessionID ThreadID + +// ProcessGroupID is the public identifier. +type ProcessGroupID ThreadID + +// Session contains a leader threadgroup and a list of ProcessGroups. +// +// +stateify savable +type Session struct { + refs refs.AtomicRefCount + + // leader is the originator of the Session. + // + // Note that this may no longer be running (and may be reaped), so the + // ID is cached upon initial creation. The leader is still required + // however, since its PIDNamespace defines the scope of the Session. + // + // The leader is immutable. + leader *ThreadGroup + + // id is the cached identifier in the leader's namespace. + // + // The id is immutable. + id SessionID + + // foreground is the foreground process group. + // + // This is protected by TaskSet.mu. + foreground *ProcessGroup + + // ProcessGroups is a list of process groups in this Session. This is + // protected by TaskSet.mu. + processGroups processGroupList + + // sessionEntry is the embed for TaskSet.sessions. This is protected by + // TaskSet.mu. + sessionEntry +} + +// incRef grabs a reference. +func (s *Session) incRef() { + s.refs.IncRef() +} + +// decRef drops a reference. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (s *Session) decRef() { + s.refs.DecRefWithDestructor(func() { + // Remove translations from the leader. + for ns := s.leader.pidns; ns != nil; ns = ns.parent { + id := ns.sids[s] + delete(ns.sids, s) + delete(ns.sessions, id) + } + + // Remove from the list of global Sessions. + s.leader.pidns.owner.sessions.Remove(s) + }) +} + +// ProcessGroup contains an originator threadgroup and a parent Session. +// +// +stateify savable +type ProcessGroup struct { + refs refs.AtomicRefCount // not exported. + + // originator is the originator of the group. + // + // See note re: leader in Session. The same applies here. + // + // The originator is immutable. 
+ originator *ThreadGroup + + // id is the cached identifier in the originator's namespace. + // + // The id is immutable. + id ProcessGroupID + + // Session is the parent Session. + // + // The session is immutable. + session *Session + + // ancestors is the number of thread groups in this process group whose + // parent is in a different process group in the same session. + // + // The name is derived from the fact that process groups where + // ancestors is zero are considered "orphans". + // + // ancestors is protected by TaskSet.mu. + ancestors uint32 + + // processGroupEntry is the embedded entry for Sessions.groups. This is + // protected by TaskSet.mu. + processGroupEntry +} + +// Originator retuns the originator of the process group. +func (pg *ProcessGroup) Originator() *ThreadGroup { + return pg.originator +} + +// IsOrphan returns true if this process group is an orphan. +func (pg *ProcessGroup) IsOrphan() bool { + pg.originator.TaskSet().mu.RLock() + defer pg.originator.TaskSet().mu.RUnlock() + return pg.ancestors == 0 +} + +// incRefWithParent grabs a reference. +// +// This function is called when this ProcessGroup is being associated with some +// new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent +// ThreadGroup. If tg is init, then parentPG may be nil. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) { + // We acquire an "ancestor" reference in the case of a nil parent. + // This is because the process being associated is init, and init can + // never be orphaned (we count it as always having an ancestor). + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors++ + } + + pg.refs.IncRef() +} + +// decRefWithParent drops a reference. +// +// parentPG is per incRefWithParent. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { + // See incRefWithParent regarding parent == nil. + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors-- + } + + alive := true + pg.refs.DecRefWithDestructor(func() { + alive = false // don't bother with handleOrphan. + + // Remove translations from the originator. + for ns := pg.originator.pidns; ns != nil; ns = ns.parent { + id := ns.pgids[pg] + delete(ns.pgids, pg) + delete(ns.processGroups, id) + } + + // Remove the list of process groups. + pg.session.processGroups.Remove(pg) + pg.session.decRef() + }) + if alive { + pg.handleOrphan() + } +} + +// parentPG returns the parent process group. +// +// Precondition: callers must hold TaskSet.mu. +func (tg *ThreadGroup) parentPG() *ProcessGroup { + if tg.leader.parent != nil { + return tg.leader.parent.tg.processGroup + } + return nil +} + +// handleOrphan checks whether the process group is an orphan and has any +// stopped jobs. If yes, then appropriate signals are delivered to each thread +// group within the process group. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) handleOrphan() { + // Check if this process is an orphan. + if pg.ancestors != 0 { + return + } + + // See if there are any stopped jobs. 
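The ancestors counter above is how POSIX orphaned-process-group tracking is implemented: a group is orphaned once no member has a parent in a different process group of the same session. The standalone sketch below simply recomputes that predicate over a toy process list; the kernel code instead keeps the count up to date incrementally through incRefWithParent/decRefWithParent so it never has to walk every task.

package main

import "fmt"

// proc is a toy process with just enough state to evaluate orphan status.
type proc struct {
	pid    int
	parent *proc
	pgid   int // process group ID
	sid    int // session ID
}

// ancestors counts members of process group pgid whose parent lives in a
// different process group of the same session. This is the quantity that
// ProcessGroup.ancestors tracks incrementally.
func ancestors(procs []*proc, pgid int) int {
	n := 0
	for _, p := range procs {
		if p.pgid != pgid || p.parent == nil {
			continue
		}
		if p.parent.pgid != pgid && p.parent.sid == p.sid {
			n++
		}
	}
	return n
}

func main() {
	shell := &proc{pid: 1, pgid: 1, sid: 1}
	job := &proc{pid: 2, parent: shell, pgid: 2, sid: 1}
	child := &proc{pid: 3, parent: job, pgid: 2, sid: 1}
	procs := []*proc{shell, job, child}

	fmt.Println("orphan:", ancestors(procs, 2) == 0) // false: the shell is an outside ancestor

	// If the job leader loses its link to a parent in an outside process
	// group, the group becomes an orphan.
	job.parent = nil
	fmt.Println("orphan:", ancestors(procs, 2) == 0) // true
}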
+ hasStopped := false + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + if tg.groupStopComplete { + hasStopped = true + } + tg.signalHandlers.mu.Unlock() + }) + if !hasStopped { + return + } + + // Deliver appropriate signals to all thread groups. + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGHUP), true /* group */) + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGCONT), true /* group */) + tg.signalHandlers.mu.Unlock() + }) + + return +} + +// Session returns the process group's session without taking a reference. +func (pg *ProcessGroup) Session() *Session { + return pg.session +} + +// SendSignal sends a signal to all processes inside the process group. It is +// analagous to kernel/signal.c:kill_pgrp. +func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { + tasks := pg.originator.TaskSet() + tasks.mu.RLock() + defer tasks.mu.RUnlock() + + var lastErr error + for tg := range tasks.Root.tgids { + if tg.processGroup == pg { + tg.signalHandlers.mu.Lock() + infoCopy := *info + if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + tg.signalHandlers.mu.Unlock() + } + } + return lastErr +} + +// CreateSession creates a new Session, with the ThreadGroup as the leader. +// +// EPERM may be returned if either the given ThreadGroup is already a Session +// leader, or a ProcessGroup already exists for the ThreadGroup's ID. +func (tg *ThreadGroup) CreateSession() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.createSession() +} + +// createSession creates a new session for a threadgroup. +// +// Precondition: callers must hold TaskSet.mu and the signal mutex for writing. +func (tg *ThreadGroup) createSession() error { + // Get the ID for this thread in the current namespace. + id := tg.pidns.tgids[tg] + + // Check if this ThreadGroup already leads a Session, or + // if the proposed group is already taken. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + if s.id == SessionID(id) { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new Session, with a single reference. + s := &Session{ + id: SessionID(id), + leader: tg, + } + s.refs.EnableLeakCheck("kernel.Session") + + // Create a new ProcessGroup, belonging to that Session. + // This also has a single reference (assigned below). + // + // Note that since this is a new session and a new process group, there + // will be zero ancestors for this process group. (It is an orphan at + // this point.) + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: s, + ancestors: 0, + } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") + + // Tie them and return the result. + s.processGroups.PushBack(pg) + tg.pidns.owner.sessions.PushBack(s) + + // Leave the current group, and assign the new one. 
+ if tg.processGroup != nil { + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + // If tg.processGroup is an orphan, decRefWithParent will lock + // the signal mutex of each thread group in tg.processGroup. + // However, tg's signal mutex may already be locked at this + // point. We change tg's process group before calling + // decRefWithParent to avoid locking tg's signal mutex twice. + oldPG := tg.processGroup + tg.processGroup = pg + oldPG.decRefWithParent(oldParentPG) + } else { + // The current process group may be nil only in the case of an + // unparented thread group (i.e. the init process). This would + // not normally occur, but we allow it for the convenience of + // CreateSession working from that point. There will be no + // child processes. We always say that the very first group + // created has ancestors (avoids checks elsewhere). + // + // Note that this mirrors the parent == nil logic in + // incRef/decRef/reparent, which counts nil as an ancestor. + tg.processGroup = pg + tg.processGroup.ancestors++ + } + + // Ensure a translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tgids[tg] + ns.sids[s] = SessionID(local) + ns.sessions[SessionID(local)] = s + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + // Disconnect from the controlling terminal. + tg.tty = nil + + return nil +} + +// CreateProcessGroup creates a new process group. +// +// An EPERM error will be returned if the ThreadGroup belongs to a different +// Session, is a Session leader or the group already exists. +func (tg *ThreadGroup) CreateProcessGroup() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + + // Get the ID for this thread in the current namespace. + id := tg.pidns.tgids[tg] + + // Per above, check for a Session leader or existing group. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new ProcessGroup, belonging to the current Session. + // + // We manually adjust the ancestors if the parent is in the same + // session. + tg.processGroup.session.incRef() + pg := ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: tg.processGroup.session, + } + pg.refs.EnableLeakCheck("kernel.ProcessGroup") + + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { + pg.ancestors++ + } + + // Assign the new process group; adjust children. + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(&pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = &pg + + // Add the new process group to the session. + pg.session.processGroups.PushBack(&pg) + + // Ensure this translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tgids[tg] + ns.pgids[&pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = &pg + } + + return nil +} + +// JoinProcessGroup joins an existing process group. 
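createSession above registers the new session and process group in every PID namespace up the parent chain, using the leader's thread group ID as seen from each namespace, since the same session carries a different numeric ID in each one. A self-contained sketch of that walk, with invented toy types standing in for PIDNamespace and ThreadGroup:

package main

import "fmt"

type threadGroup struct{ name string }

// pidNamespace maps thread groups to the IDs they are known by in this
// namespace, and session IDs to their leaders, as seen from this namespace.
type pidNamespace struct {
	parent   *pidNamespace
	tgids    map[*threadGroup]int
	sessions map[int]*threadGroup
}

// registerSession records the leader as a session in ns and in every
// ancestor namespace, mirroring the loop in createSession: the session ID in
// each namespace is simply the leader's thread group ID in that namespace.
func registerSession(ns *pidNamespace, leader *threadGroup) {
	for ; ns != nil; ns = ns.parent {
		local, ok := ns.tgids[leader]
		if !ok {
			continue // leader not visible in this namespace
		}
		ns.sessions[local] = leader
	}
}

func main() {
	root := &pidNamespace{
		tgids:    map[*threadGroup]int{},
		sessions: map[int]*threadGroup{},
	}
	child := &pidNamespace{
		parent:   root,
		tgids:    map[*threadGroup]int{},
		sessions: map[int]*threadGroup{},
	}

	tg := &threadGroup{name: "leader"}
	root.tgids[tg] = 1234 // ID in the root namespace
	child.tgids[tg] = 1   // ID inside the child namespace

	registerSession(child, tg)
	fmt.Println(child.sessions[1].name, root.sessions[1234].name)
}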
+// +// This function will return EACCES if an exec has been performed since fork +// by the given ThreadGroup, and EPERM if the Sessions are not the same or the +// group does not exist. +// +// If checkExec is set, then the join is not permitted after the process has +// executed exec at least once. +func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error { + pidns.owner.mu.Lock() + defer pidns.owner.mu.Unlock() + + // Lookup the ProcessGroup. + pg := pidns.processGroups[pgid] + if pg == nil { + return syserror.EPERM + } + + // Disallow the join if an execve has performed, per POSIX. + if checkExec && tg.execed { + return syserror.EACCES + } + + // See if it's in the same session as ours. + if pg.session != tg.processGroup.session { + return syserror.EPERM + } + + // Join the group; adjust children. + parentPG := tg.parentPG() + pg.incRefWithParent(parentPG) + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(tg.processGroup) + }) + tg.processGroup.decRefWithParent(parentPG) + tg.processGroup = pg + + return nil +} + +// Session returns the ThreadGroup's Session. +// +// A reference is not taken on the session. +func (tg *ThreadGroup) Session() *Session { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.processGroup.session +} + +// IDOfSession returns the Session assigned to s in PID namespace ns. +// +// If this group isn't visible in this namespace, zero will be returned. It is +// the callers responsibility to check that before using this function. +func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.sids[s] +} + +// SessionWithID returns the Session with the given ID in the PID namespace ns, +// or nil if that given ID is not defined in this namespace. +// +// A reference is not taken on the session. +func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.sessions[id] +} + +// ProcessGroup returns the ThreadGroup's ProcessGroup. +// +// A reference is not taken on the process group. +func (tg *ThreadGroup) ProcessGroup() *ProcessGroup { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.processGroup +} + +// IDOfProcessGroup returns the process group assigned to pg in PID namespace ns. +// +// The same constraints apply as IDOfSession. +func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.pgids[pg] +} + +// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID +// namespace ns, or nil if that given ID is not defined in this namespace. +// +// A reference is not taken on the process group. 
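The syscall layer that calls CreateProcessGroup and JoinProcessGroup is not part of this diff, but the rules documented above (EACCES once the target has execed, EPERM for a missing group or one in another session) suggest roughly the following shape for a setpgid(2) handler. This is a toy model only; the real handler also distinguishes whether the caller is moving itself or one of its children, and the exec check applies only to children.

package main

import (
	"errors"
	"fmt"
)

var (
	errEPERM  = errors.New("EPERM")
	errEACCES = errors.New("EACCES")
)

// group describes an existing process group by the session it belongs to.
type group struct{ sid int }

// target is the thread group whose process group is being changed.
type target struct {
	tgid   int
	sid    int
	execed bool // has performed execve since being forked
}

// setpgid sketches the choice between the CreateProcessGroup and
// JoinProcessGroup paths above: a pgid equal to the target's own ID means
// "become a group leader"; anything else must name an existing group in the
// same session.
func setpgid(t *target, pgid int, groups map[int]group, movingChild bool) error {
	if movingChild && t.execed {
		return errEACCES // cannot move a child that has already execed
	}
	if pgid == t.tgid {
		groups[pgid] = group{sid: t.sid} // CreateProcessGroup path
		return nil
	}
	g, ok := groups[pgid]
	if !ok || g.sid != t.sid {
		return errEPERM // JoinProcessGroup: group missing or in another session
	}
	return nil
}

func main() {
	groups := map[int]group{10: {sid: 1}}
	t := &target{tgid: 42, sid: 1}
	fmt.Println(setpgid(t, 42, groups, false)) // <nil>: new group led by 42
	fmt.Println(setpgid(t, 10, groups, false)) // <nil>: join existing group 10
	fmt.Println(setpgid(t, 99, groups, false)) // EPERM: no such group in this session
}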
+func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.processGroups[id] +} diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD new file mode 100644 index 000000000..bfd779837 --- /dev/null +++ b/pkg/sentry/kernel/shm/BUILD @@ -0,0 +1,29 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "shm", + srcs = [ + "device.go", + "shm.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go new file mode 100644 index 000000000..6b0d5818b --- /dev/null +++ b/pkg/sentry/kernel/shm/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package shm + +import "gvisor.dev/gvisor/pkg/sentry/device" + +// shmDevice is the kernel shm device. +var shmDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go new file mode 100644 index 000000000..f66cfcc7f --- /dev/null +++ b/pkg/sentry/kernel/shm/shm.go @@ -0,0 +1,707 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package shm implements sysv shared memory segments. +// +// Known missing features: +// +// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement +// memory locking in general. +// +// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy +// way to implement hugetlb support on a per-map basis, and it has no impact +// on correctness. +// +// - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap +// so it's meaningless to reserve space for swap. +// +// - No per-process segment size enforcement. This feature probably isn't used +// much anyways, since Linux sets the per-process limits to the system-wide +// limits by default. 
+// +// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock +package shm + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Key represents a shm segment key. Analogous to a file name. +type Key int32 + +// ID represents the opaque handle for a shm segment. Analogous to an fd. +type ID int32 + +// Registry tracks all shared memory segments in an IPC namespace. The registry +// provides the mechanisms for creating and finding segments, and reporting +// global shm parameters. +// +// +stateify savable +type Registry struct { + // userNS owns the IPC namespace this registry belong to. Immutable. + userNS *auth.UserNamespace + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // shms maps segment ids to segments. + // + // shms holds all referenced segments, which are removed on the last + // DecRef. Thus, it cannot itself hold a reference on the Shm. + // + // Since removal only occurs after the last (unlocked) DecRef, there + // exists a short window during which a Shm still exists in Shm, but is + // unreferenced. Users must use TryIncRef to determine if the Shm is + // still valid. + shms map[ID]*Shm + + // keysToShms maps segment keys to segments. + // + // Shms in keysToShms are guaranteed to be referenced, as they are + // removed by disassociateKey before the last DecRef. + keysToShms map[Key]*Shm + + // Sum of the sizes of all existing segments rounded up to page size, in + // units of page size. + totalPages uint64 + + // ID assigned to the last created segment. Used to quickly find the next + // unused ID. + lastIDUsed ID +} + +// NewRegistry creates a new shm registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + shms: make(map[ID]*Shm), + keysToShms: make(map[Key]*Shm), + } +} + +// FindByID looks up a segment given an ID. +// +// FindByID returns a reference on Shm. +func (r *Registry) FindByID(id ID) *Shm { + r.mu.Lock() + defer r.mu.Unlock() + s := r.shms[id] + // Take a reference on s. If TryIncRef fails, s has reached the last + // DecRef, but hasn't quite been removed from r.shms yet. + if s != nil && s.TryIncRef() { + return s + } + return nil +} + +// dissociateKey removes the association between a segment and its key, +// preventing it from being discovered in the registry. This doesn't necessarily +// mean the segment is about to be destroyed. This is analogous to unlinking a +// file; the segment can still be used by a process already referencing it, but +// cannot be discovered by a new process. +func (r *Registry) dissociateKey(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + if s.key != linux.IPC_PRIVATE { + delete(r.keysToShms, s.key) + s.key = linux.IPC_PRIVATE + } +} + +// FindOrCreate looks up or creates a segment in the registry. It's functionally +// analogous to open(2). +// +// FindOrCreate returns a reference on Shm. 
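FindByID above relies on the TryIncRef contract described in the shms comment: between the last unlocked DecRef and the removal from the map there is a window where a dead Shm is still visible, and a lookup must refuse to resurrect it. A minimal stand-in for that pattern (not the real refs.AtomicRefCount implementation):

package main

import (
	"fmt"
	"sync/atomic"
)

// tryRefCount sketches the TryIncRef contract relied on by FindByID above:
// once the count has hit zero the object is dying, and a concurrent lookup
// must fail rather than resurrect it.
type tryRefCount struct {
	n int64
}

func (r *tryRefCount) IncRef() { atomic.AddInt64(&r.n, 1) }

// TryIncRef increments only if the object is still alive (count > 0).
func (r *tryRefCount) TryIncRef() bool {
	for {
		v := atomic.LoadInt64(&r.n)
		if v == 0 {
			return false // already dying; do not resurrect
		}
		if atomic.CompareAndSwapInt64(&r.n, v, v+1) {
			return true
		}
	}
}

// DecRef runs destroy exactly once, on the final reference drop.
func (r *tryRefCount) DecRef(destroy func()) {
	if atomic.AddInt64(&r.n, -1) == 0 {
		destroy()
	}
}

func main() {
	var r tryRefCount
	r.IncRef()                 // initial reference
	fmt.Println(r.TryIncRef()) // true: still alive
	r.DecRef(func() {})
	r.DecRef(func() { fmt.Println("destroyed") })
	fmt.Println(r.TryIncRef()) // false: lookup after the last DecRef fails
}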
+func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { + if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { + // "A new segment was to be created and size is less than SHMMIN or + // greater than SHMMAX." - man shmget(2) + // + // Note that 'private' always implies the creation of a new segment + // whether IPC_CREAT is specified or not. + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if len(r.shms) >= linux.SHMMNI { + // "All possible shared memory IDs have been taken (SHMMNI) ..." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + if !private { + // Look up an existing segment. + if shm := r.keysToShms[key]; shm != nil { + shm.mu.Lock() + defer shm.mu.Unlock() + + // Check that caller can access the segment. + if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { + // "The user does not have permission to access the shared + // memory segment, and does not have the CAP_IPC_OWNER + // capability in the user namespace that governs its IPC + // namespace." - man shmget(2) + return nil, syserror.EACCES + } + + if size > shm.size { + // "A segment for the given key exists, but size is greater than + // the size of that segment." - man shmget(2) + return nil, syserror.EINVAL + } + + if create && exclusive { + // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a + // shared memory segment already exists for key." + // - man shmget(2) + return nil, syserror.EEXIST + } + + shm.IncRef() + return shm, nil + } + + if !create { + // "No segment exists for the given key, and IPC_CREAT was not + // specified." - man shmget(2) + return nil, syserror.ENOENT + } + } + + var sizeAligned uint64 + if val, ok := usermem.Addr(size).RoundUp(); ok { + sizeAligned = uint64(val) + } else { + return nil, syserror.EINVAL + } + + if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL { + // "... allocating a segment of the requested size would cause the + // system to exceed the system-wide limit on shared memory (SHMALL)." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + // Need to create a new segment. + creator := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + s, err := r.newShm(ctx, pid, key, creator, perms, size) + if err != nil { + return nil, err + } + // The initial reference is held by s itself. Take another to return to + // the caller. + s.IncRef() + return s, nil +} + +// newShm creates a new segment in the registry. +// +// Precondition: Caller must hold r.mu. +func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) + } + + effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) + fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) + if err != nil { + return nil, err + } + + shm := &Shm{ + mfp: mfp, + registry: r, + creator: creator, + size: size, + effectiveSize: effectiveSize, + fr: fr, + key: key, + perms: perms, + owner: creator, + creatorPID: pid, + changeTime: ktime.NowFromContext(ctx), + } + shm.EnableLeakCheck("kernel.Shm") + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. 
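The size handling in FindOrCreate and newShm boils down to rounding the request up to whole pages, rejecting overflow with EINVAL, and charging the rounded page count against the system-wide SHMALL budget. The sketch below shows just that arithmetic with made-up constants; it is not the registry code itself.

package main

import (
	"errors"
	"fmt"
)

const (
	pageSize = 4096
	shmAll   = 1 << 20 // system-wide page limit, stand-in for linux.SHMALL
)

var (
	errNoSpace = errors.New("ENOSPC")
	errInval   = errors.New("EINVAL")
)

// accountSegment mirrors the size handling above: round the requested size
// up to a whole number of pages, detect overflow, and charge the
// registry-wide page budget.
func accountSegment(totalPages *uint64, size uint64) (effectiveSize uint64, err error) {
	effectiveSize = (size + pageSize - 1) &^ (pageSize - 1)
	if effectiveSize < size {
		return 0, errInval // rounding overflowed
	}
	pages := effectiveSize / pageSize
	if *totalPages+pages > shmAll {
		return 0, errNoSpace // would exceed the system-wide limit
	}
	*totalPages += pages
	return effectiveSize, nil
}

func main() {
	var totalPages uint64
	eff, err := accountSegment(&totalPages, 4097) // one byte over a page
	fmt.Println(eff, err, totalPages)             // 8192 <nil> 2
}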
+ if id < 0 { + id = 0 + continue + } + if r.shms[id] == nil { + r.lastIDUsed = id + + shm.ID = id + r.shms[id] = shm + r.keysToShms[key] = shm + + r.totalPages += effectiveSize / usermem.PageSize + + return shm, nil + } + } + + log.Warningf("Shm ids exhuasted, they may be leaking") + return nil, syserror.ENOSPC +} + +// IPCInfo reports global parameters for sysv shared memory segments on this +// system. See shmctl(IPC_INFO). +func (r *Registry) IPCInfo() *linux.ShmParams { + return &linux.ShmParams{ + ShmMax: linux.SHMMAX, + ShmMin: linux.SHMMIN, + ShmMni: linux.SHMMNI, + ShmSeg: linux.SHMSEG, + ShmAll: linux.SHMALL, + } +} + +// ShmInfo reports linux-specific global parameters for sysv shared memory +// segments on this system. See shmctl(SHM_INFO). +func (r *Registry) ShmInfo() *linux.ShmInfo { + r.mu.Lock() + defer r.mu.Unlock() + + return &linux.ShmInfo{ + UsedIDs: int32(r.lastIDUsed), + ShmTot: r.totalPages, + ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. + ShmSwp: 0, // No reclaim at the moment. + } +} + +// remove deletes a segment from this registry, deaccounting the memory used by +// the segment. +// +// Precondition: Must follow a call to r.dissociateKey(s). +func (r *Registry) remove(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + + if s.key != linux.IPC_PRIVATE { + panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) + } + + delete(r.shms, s.ID) + r.totalPages -= s.effectiveSize / usermem.PageSize +} + +// Shm represents a single shared memory segment. +// +// Shm segment are backed directly by an allocation from platform memory. +// Segments are always mapped as a whole, greatly simplifying how mappings are +// tracked. However note that mremap and munmap calls may cause the vma for a +// segment to become fragmented; which requires special care when unmapping a +// segment. See mm/shm.go. +// +// Segments persist until they are explicitly marked for destruction via +// MarkDestroyed(). +// +// Shm implements memmap.Mappable and memmap.MappingIdentity. +// +// +stateify savable +type Shm struct { + // AtomicRefCount tracks the number of references to this segment. + // + // A segment holds a reference to itself until it is marked for + // destruction. + // + // In addition to direct users, the MemoryManager will hold references + // via MappingIdentity. + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + + // registry points to the shm registry containing this segment. Immutable. + registry *Registry + + // ID is the kernel identifier for this segment. Immutable. + ID ID + + // creator is the user that created the segment. Immutable. + creator fs.FileOwner + + // size is the requested size of the segment at creation, in + // bytes. Immutable. + size uint64 + + // effectiveSize of the segment, rounding up to the next page + // boundary. Immutable. + // + // Invariant: effectiveSize must be a multiple of usermem.PageSize. + effectiveSize uint64 + + // fr is the offset into mfp.MemoryFile() that backs this contents of this + // segment. Immutable. + fr platform.FileRange + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // key is the public identifier for this segment. + key Key + + // perms is the access permissions for the segment. + perms fs.FilePermissions + + // owner of this segment. + owner fs.FileOwner + // attachTime is updated on every successful shmat. 
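The allocation loop above scans forward from lastIDUsed, wraps the signed counter back to the low end when it overflows, and only reports ENOSPC after a full cycle finds no free slot. A standalone version of the same loop, with a plain map standing in for r.shms:

package main

import (
	"errors"
	"fmt"
)

var errExhausted = errors.New("ENOSPC: ids exhausted")

// nextID mirrors the allocation loop in newShm: scan forward from the last
// ID handed out, wrap around when the int32 counter goes negative, and give
// up only after a full cycle with no free slot.
func nextID(used map[int32]bool, lastIDUsed int32) (int32, error) {
	for id := lastIDUsed + 1; id != lastIDUsed; id++ {
		if id < 0 {
			// The counter overflowed; reset it so the loop's increment
			// continues the scan from the bottom of the range.
			id = 0
			continue
		}
		if !used[id] {
			return id, nil
		}
	}
	return 0, errExhausted
}

func main() {
	used := map[int32]bool{0: true, 1: true, 3: true}
	id, _ := nextID(used, 1)
	fmt.Println(id) // 2
}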
+ attachTime ktime.Time + // detachTime is updated on every successful shmdt. + detachTime ktime.Time + // changeTime is updated on every successful changes to the segment via + // shmctl(IPC_SET). + changeTime ktime.Time + + // creatorPID is the PID of the process that created the segment. + creatorPID int32 + // lastAttachDetachPID is the pid of the process that issued the last shmat + // or shmdt syscall. + lastAttachDetachPID int32 + + // pendingDestruction indicates the segment was marked as destroyed through + // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found + // in the registry and can no longer be attached. When the last user + // detaches from the segment, it is destroyed. + pendingDestruction bool +} + +// Precondition: Caller must hold s.mu. +func (s *Shm) debugLocked() string { + return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", + s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (s *Shm) MappedName(ctx context.Context) string { + s.mu.Lock() + defer s.mu.Unlock() + return fmt.Sprintf("SYSV%08d", s.key) +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (s *Shm) DeviceID() uint64 { + return shmDevice.DeviceID() +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (s *Shm) InodeID() uint64 { + // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use + // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() + return uint64(s.ID) +} + +// DecRef overrides refs.RefCount.DecRef with a destructor. +// +// Precondition: Caller must not hold s.mu. +func (s *Shm) DecRef() { + s.DecRefWithDestructor(s.destroy) +} + +// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm +// segments. +func (s *Shm) Msync(context.Context, memmap.MappableRange) error { + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error { + s.mu.Lock() + defer s.mu.Unlock() + s.attachTime = ktime.NowFromContext(ctx) + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + // AddMapping is called during a syscall, so ctx should always be a task + // context. + log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked()) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { + s.mu.Lock() + defer s.mu.Unlock() + // RemoveMapping may be called during task exit, when ctx + // is context.Background. Gracefully handle missing clocks. Failing to + // update the detach time in these cases is ok, since no one can observe the + // omission. + if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + s.detachTime = clock.Now() + } + + // If called from a non-task context we also won't have a threadgroup + // id. Silently skip updating the lastAttachDetachPid in that case. + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked()) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. 
+func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > s.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: s.mfp.MemoryFile(), + Offset: s.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (s *Shm) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// AttachOpts describes various flags passed to shmat(2). +type AttachOpts struct { + Execute bool + Readonly bool + Remap bool +} + +// ConfigureAttach creates an mmap configuration for the segment with the +// requested attach options. +// +// Postconditions: The returned MMapOpts are valid only as long as a reference +// continues to be held on s. +func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.pendingDestruction && s.ReadRefs() == 0 { + return memmap.MMapOpts{}, syserror.EIDRM + } + + if !s.checkPermissions(ctx, fs.PermMask{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }) { + // "The calling process does not have the required permissions for the + // requested attach type, and does not have the CAP_IPC_OWNER capability + // in the user namespace that governs its IPC namespace." - man shmat(2) + return memmap.MMapOpts{}, syserror.EACCES + } + return memmap.MMapOpts{ + Length: s.size, + Offset: 0, + Addr: addr, + Fixed: opts.Remap, + Perms: usermem.AccessType{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }, + MaxPerms: usermem.AnyAccess, + Mappable: s, + MappingIdentity: s, + }, nil +} + +// EffectiveSize returns the size of the underlying shared memory segment. This +// may be larger than the requested size at creation, due to rounding to page +// boundaries. +func (s *Shm) EffectiveSize() uint64 { + return s.effectiveSize +} + +// IPCStat returns information about a shm. See shmctl(IPC_STAT). +func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The caller must have read permission on the shared memory segment." + // - man shmctl(2) + if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { + // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow + // read access for shmid, and the calling process does not have the + // CAP_IPC_OWNER capability in the user namespace that governs its IPC + // namespace." - man shmctl(2) + return nil, syserror.EACCES + } + + var mode uint16 + if s.pendingDestruction { + mode |= linux.SHM_DEST + } + creds := auth.CredentialsFromContext(ctx) + + // Use the reference count as a rudimentary count of the number of + // attaches. We exclude: + // + // 1. The reference the caller holds. + // 2. The self-reference held by s prior to destruction. + // + // Note that this may still overcount by including transient references + // used in concurrent calls. 
+ nattach := uint64(s.ReadRefs()) - 1 + if !s.pendingDestruction { + nattach-- + } + + ds := &linux.ShmidDS{ + ShmPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: mode | uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequences not supported. + }, + ShmSegsz: s.size, + ShmAtime: s.attachTime.TimeT(), + ShmDtime: s.detachTime.TimeT(), + ShmCtime: s.changeTime.TimeT(), + ShmCpid: s.creatorPID, + ShmLpid: s.lastAttachDetachPID, + ShmNattach: nattach, + } + + return ds, nil +} + +// Set modifies attributes for a segment. See shmctl(IPC_SET). +func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { + s.mu.Lock() + defer s.mu.Unlock() + + if !s.checkOwnership(ctx) { + return syserror.EPERM + } + + creds := auth.CredentialsFromContext(ctx) + uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) + gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) + if !uid.Ok() || !gid.Ok() { + return syserror.EINVAL + } + + // User may only modify the lower 9 bits of the mode. All the other bits are + // always 0 for the underlying inode. + mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) + s.perms = fs.FilePermsFromMode(mode) + + s.owner.UID = uid + s.owner.GID = gid + + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +func (s *Shm) destroy() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) +} + +// MarkDestroyed marks a segment for destruction. The segment is actually +// destroyed once it has no references. MarkDestroyed may be called multiple +// times, and is safe to call after a segment has already been destroyed. See +// shmctl(IPC_RMID). +func (s *Shm) MarkDestroyed() { + s.registry.dissociateKey(s) + + s.mu.Lock() + defer s.mu.Unlock() + if !s.pendingDestruction { + s.pendingDestruction = true + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef() + return + } +} + +// checkOwnership verifies whether a segment may be accessed by ctx as an +// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkOwnership(ctx context.Context) bool { + creds := auth.CredentialsFromContext(ctx) + if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { + return true + } + + // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux + // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented + // for use to "override IPC ownership checks". + return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) +} + +// checkPermissions verifies whether a segment is accessible by ctx for access +// described by req. See ipc/util.c:ipcperms() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { + creds := auth.CredentialsFromContext(ctx) + + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + if p.SupersetOf(req) { + return true + } + + // Tasks with CAP_IPC_OWNER may bypass permission checks. 
+ return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) +} diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go new file mode 100644 index 000000000..e8cce37d0 --- /dev/null +++ b/pkg/sentry/kernel/signal.go @@ -0,0 +1,79 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" +) + +// SignalPanic is used to panic the running threads. It is a signal which +// cannot be used by the application: it must be caught and ignored by the +// runtime (in order to catch possible races). +const SignalPanic = linux.SIGUSR2 + +// sendExternalSignal is called when an asynchronous signal is sent to the +// sentry ("in sentry context"). On some platforms, it may also be called when +// an asynchronous signal is sent to sandboxed application threads ("in +// application context"). +// +// context is used only for debugging to differentiate these cases. +// +// Preconditions: Kernel must have an init process. +func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { + switch linux.Signal(info.Signo) { + case linux.SIGURG: + // Sent by the Go 1.14+ runtime for asynchronous goroutine preemption. + + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case SignalPanic: + // SignalPanic is also specially handled in sentry setup to ensure that + // it causes a panic even after tasks exit, but SignalPanic may also + // be sent here if it is received while in app context. + panic("Signal-induced panic") + + default: + log.Infof("Received external signal %d in %s context", info.Signo, context) + if k.globalInit == nil { + panic(fmt.Sprintf("Received external signal %d before init created", info.Signo)) + } + k.globalInit.SendSignal(info) + } +} + +// SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV. +func SignalInfoPriv(sig linux.Signal) *arch.SignalInfo { + return &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoKernel, + } +} + +// SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO. +func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) + info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + return info +} diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go new file mode 100644 index 000000000..768fda220 --- /dev/null +++ b/pkg/sentry/kernel/signal_handlers.go @@ -0,0 +1,88 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" +) + +// SignalHandlers holds information about signal actions. +// +// +stateify savable +type SignalHandlers struct { + // mu protects actions, as well as the signal state of all tasks and thread + // groups using this SignalHandlers object. (See comment on + // ThreadGroup.signalHandlers.) + mu sync.Mutex `state:"nosave"` + + // actions is the action to be taken upon receiving each signal. + actions map[linux.Signal]arch.SignalAct +} + +// NewSignalHandlers returns a new SignalHandlers specifying all default +// actions. +func NewSignalHandlers() *SignalHandlers { + return &SignalHandlers{ + actions: make(map[linux.Signal]arch.SignalAct), + } +} + +// Fork returns a copy of sh for a new thread group. +func (sh *SignalHandlers) Fork() *SignalHandlers { + sh2 := NewSignalHandlers() + sh.mu.Lock() + defer sh.mu.Unlock() + for sig, act := range sh.actions { + sh2.actions[sig] = act + } + return sh2 +} + +// CopyForExec returns a copy of sh for a thread group that is undergoing an +// execve. (See comments in Task.finishExec.) +func (sh *SignalHandlers) CopyForExec() *SignalHandlers { + sh2 := NewSignalHandlers() + sh.mu.Lock() + defer sh.mu.Unlock() + for sig, act := range sh.actions { + if act.Handler == arch.SignalActIgnore { + sh2.actions[sig] = arch.SignalAct{ + Handler: arch.SignalActIgnore, + } + } + } + return sh2 +} + +// IsIgnored returns true if the signal is ignored. +func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool { + sh.mu.Lock() + defer sh.mu.Unlock() + sa, ok := sh.actions[sig] + return ok && sa.Handler == arch.SignalActIgnore +} + +// dequeueActionLocked returns the SignalAct that should be used to handle sig. +// +// Preconditions: sh.mu must be locked. +func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct { + act := sh.actions[sig] + if act.IsResetHandler() { + delete(sh.actions, sig) + } + return act +} diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD new file mode 100644 index 000000000..3eb78e91b --- /dev/null +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -0,0 +1,22 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "signalfd", + srcs = ["signalfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go new file mode 100644 index 000000000..8243bb93e --- /dev/null +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -0,0 +1,139 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
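CopyForExec above encodes the POSIX execve rule for signal dispositions: ignored signals stay ignored across the exec, while caught signals fall back to the default action because the old handler addresses mean nothing in the new image. A toy version of that copy, using integer constants in place of arch.SignalAct:

package main

import "fmt"

// Dispositions, covering the three cases that matter across fork and exec.
const (
	actDefault = iota
	actIgnore
	actHandler // a user-installed handler address, in the real code
)

// copyForExec keeps only the ignored entries; a missing entry means the
// default action, so dropped handlers revert to default.
func copyForExec(actions map[int]int) map[int]int {
	out := make(map[int]int)
	for sig, act := range actions {
		if act == actIgnore {
			out[sig] = actIgnore
		}
	}
	return out
}

func main() {
	actions := map[int]int{
		1: actIgnore,  // SIGHUP ignored
		2: actHandler, // SIGINT caught
	}
	after := copyForExec(actions)
	fmt.Println(after[1] == actIgnore, after[2] == actDefault) // true true
}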
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package signalfd provides an implementation of signal file descriptors. +package signalfd + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/anon" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalOperations represent a file with signalfd semantics. +// +// +stateify savable +type SignalOperations struct { + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // target is the original task target. + // + // The semantics here are a bit broken. Linux will always use current + // for all reads, regardless of where the signalfd originated. We can't + // do exactly that because we need to plumb the context through + // EventRegister in order to support proper blocking behavior. This + // will undoubtedly become very complicated quickly. + target *kernel.Task + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // mask is the signal mask. Protected by mu. + mask linux.SignalSet +} + +// New creates a new signalfd object with the supplied mask. +func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // No task context? Not valid. + return nil, syserror.EINVAL + } + // name matches fs/signalfd.c:signalfd4. + dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[signalfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &SignalOperations{ + target: t, + mask: mask, + }), nil +} + +// Release implements fs.FileOperations.Release. +func (s *SignalOperations) Release() {} + +// Mask returns the signal mask. +func (s *SignalOperations) Mask() linux.SignalSet { + s.mu.Lock() + mask := s.mask + s.mu.Unlock() + return mask +} + +// SetMask sets the signal mask. +func (s *SignalOperations) SetMask(mask linux.SignalSet) { + s.mu.Lock() + s.mask = mask + s.mu.Unlock() +} + +// Read implements fs.FileOperations.Read. +func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + // Attempt to dequeue relevant signals. + info, err := s.target.Sigtimedwait(s.Mask(), 0) + if err != nil { + // There must be no signal available. + return 0, syserror.ErrWouldBlock + } + + // Copy out the signal info using the specified format. 
+ var buf [128]byte + binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + Signo: uint32(info.Signo), + Errno: info.Errno, + Code: info.Code, + PID: uint32(info.Pid()), + UID: uint32(info.Uid()), + Status: info.Status(), + Overrun: uint32(info.Overrun()), + Addr: info.Addr(), + }) + n, err := dst.CopyOut(ctx, buf[:]) + return int64(n), err +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SignalOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + if mask&waiter.EventIn != 0 && s.target.PendingSignals()&s.Mask() != 0 { + return waiter.EventIn // Pending signals. + } + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SignalOperations) EventRegister(entry *waiter.Entry, _ waiter.EventMask) { + // Register for the signal set; ignore the passed events. + s.target.SignalRegister(entry, waiter.EventMask(s.Mask())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SignalOperations) EventUnregister(entry *waiter.Entry) { + // Unregister the original entry. + s.target.SignalUnregister(entry) +} diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go new file mode 100644 index 000000000..413111faf --- /dev/null +++ b/pkg/sentry/kernel/syscalls.go @@ -0,0 +1,364 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// maxSyscallNum is the highest supported syscall number. +// +// The types below create fast lookup slices for all syscalls. This maximum +// serves as a sanity check that we don't allocate huge slices for a very large +// syscall. This is checked during registration. +const maxSyscallNum = 2000 + +// SyscallSupportLevel is a syscall support levels. +type SyscallSupportLevel int + +// String returns a human readable represetation of the support level. +func (l SyscallSupportLevel) String() string { + switch l { + case SupportUnimplemented: + return "Unimplemented" + case SupportPartial: + return "Partial Support" + case SupportFull: + return "Full Support" + default: + return "Undocumented" + } +} + +const ( + // SupportUndocumented indicates the syscall is not documented yet. + SupportUndocumented = iota + + // SupportUnimplemented indicates the syscall is unimplemented. + SupportUnimplemented + + // SupportPartial indicates the syscall is partially supported. + SupportPartial + + // SupportFull indicates the syscall is fully supported. + SupportFull +) + +// Syscall includes the syscall implementation and compatibility information. +type Syscall struct { + // Name is the syscall name. + Name string + // Fn is the implementation of the syscall. + Fn SyscallFn + // SupportLevel is the level of support implemented in gVisor. 
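The Readiness check above makes the signalfd readable only when some currently pending signal is also present in the descriptor's mask. The sketch below models signal sets as a 64-bit bitmask (bit N-1 for signal N, as in linux.SignalSet) without any of the kernel.Task plumbing:

package main

import "fmt"

// signalSet is a 64-bit signal mask, one bit per signal.
type signalSet uint64

func mask(sigs ...int) signalSet {
	var s signalSet
	for _, sig := range sigs {
		s |= 1 << uint(sig-1)
	}
	return s
}

// readable mirrors the signalfd Readiness check: the descriptor is readable
// only when some pending signal is also present in the signalfd's mask.
func readable(pending, fdMask signalSet) bool {
	return pending&fdMask != 0
}

func main() {
	const sigINT, sigUSR1 = 2, 10
	fdMask := mask(sigUSR1) // the signalfd only accepts SIGUSR1
	fmt.Println(readable(mask(sigINT), fdMask))          // false
	fmt.Println(readable(mask(sigINT, sigUSR1), fdMask)) // true
}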
+ SupportLevel SyscallSupportLevel + // Note describes the compatibility of the syscall. + Note string + // URLs is set of URLs to any relevant bugs or issues. + URLs []string +} + +// SyscallFn is a syscall implementation. +type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) + +// MissingFn is a syscall to be called when an implementation is missing. +type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) + +// Possible flags for SyscallFlagsTable.enable. +const ( + // syscallPresent indicates that this is not a missing syscall. + // + // This flag is used internally in SyscallFlagsTable. + syscallPresent = 1 << iota + + // StraceEnableLog enables syscall log tracing. + StraceEnableLog + + // StraceEnableEvent enables syscall event tracing. + StraceEnableEvent + + // ExternalBeforeEnable enables the external hook before syscall execution. + ExternalBeforeEnable + + // ExternalAfterEnable enables the external hook after syscall execution. + ExternalAfterEnable +) + +// StraceEnableBits combines both strace log and event flags. +const StraceEnableBits = StraceEnableLog | StraceEnableEvent + +// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall +// basis. +type SyscallFlagsTable struct { + // mu protects writes to the fields below. + // + // Atomic loads are always allowed. Atomic stores are allowed only + // while mu is held. + mu sync.Mutex + + // enable contains the enable bits for each syscall. + // + // missing syscalls have the same value in enable as missingEnable to + // avoid an extra branch in Word. + enable []uint32 + + // missingEnable contains the enable bits for missing syscalls. + missingEnable uint32 +} + +// Init initializes the struct, with all syscalls in table set to enable. +// +// max is the largest syscall number in table. +func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) { + e.enable = make([]uint32, max+1) + for num := range table { + e.enable[num] = syscallPresent + } +} + +// Word returns the enable bitfield for sysno. +func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { + if sysno < uintptr(len(e.enable)) { + return atomic.LoadUint32(&e.enable[sysno]) + } + + return atomic.LoadUint32(&e.missingEnable) +} + +// Enable sets enable bit bit for all syscalls based on s. +// +// Syscalls missing from s are disabled. +// +// Syscalls missing from the initial table passed to Init cannot be added as +// individual syscalls. If present in s they will be ignored. +// +// Callers to Word may see either the old or new value while this function +// is executing. +func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + if missingEnable { + missingVal |= bit + } else { + missingVal &^= bit + } + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + if s[uintptr(num)] { + val |= bit + } else { + val &^= bit + } + atomic.StoreUint32(&e.enable[num], val) + } +} + +// EnableAll sets enable bit bit for all syscalls, present and missing. 
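SyscallFlagsTable above uses a deliberate split: writers serialize on mu, while the per-syscall hot path reads the bitfield with a single atomic load and may observe either the old or the new value during an update. A cut-down table with the same discipline, using invented flag names:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Bits mirroring the per-syscall flag scheme above: bit 0 marks the syscall
// as present, the remaining bits toggle optional features.
const (
	flagPresent = 1 << iota
	flagStraceLog
)

// flagsTable follows the SyscallFlagsTable locking discipline: writers take
// mu, readers use only an atomic load.
type flagsTable struct {
	mu     sync.Mutex
	enable []uint32
}

func newFlagsTable(present []uintptr, max uintptr) *flagsTable {
	t := &flagsTable{enable: make([]uint32, max+1)}
	for _, num := range present {
		t.enable[num] = flagPresent
	}
	return t
}

// word is the lock-free read used on every dispatch.
func (t *flagsTable) word(sysno uintptr) uint32 {
	if sysno < uintptr(len(t.enable)) {
		return atomic.LoadUint32(&t.enable[sysno])
	}
	return 0
}

// enableAll sets bit on every present syscall, under the writer lock.
func (t *flagsTable) enableAll(bit uint32) {
	t.mu.Lock()
	defer t.mu.Unlock()
	for num := range t.enable {
		val := atomic.LoadUint32(&t.enable[num])
		if val&flagPresent == 0 {
			continue
		}
		atomic.StoreUint32(&t.enable[num], val|bit)
	}
}

func main() {
	t := newFlagsTable([]uintptr{0, 1, 60}, 60)
	t.enableAll(flagStraceLog)
	fmt.Println(t.word(60)&flagStraceLog != 0) // true
	fmt.Println(t.word(2))                     // 0: not a present syscall
}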
+func (e *SyscallFlagsTable) EnableAll(bit uint32) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + missingVal |= bit + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + val |= bit + atomic.StoreUint32(&e.enable[num], val) + } +} + +// Stracer traces syscall execution. +type Stracer interface { + // SyscallEnter is called on syscall entry. + // + // The returned private data is passed to SyscallExit. + SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} + + // SyscallExit is called on syscall exit. + SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) +} + +// SyscallTable is a lookup table of system calls. +// +// Note that a SyscallTable is not savable directly. Instead, they are saved as +// an OS/Arch pair and lookup happens again on restore. +type SyscallTable struct { + // OS is the operating system that this syscall table implements. + OS abi.OS + + // Arch is the architecture that this syscall table targets. + Arch arch.Arch + + // The OS version that this syscall table implements. + Version Version + + // AuditNumber is a numeric constant that represents the syscall table. If + // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by + // linux/audit.h. + AuditNumber uint32 + + // Table is the collection of functions. + Table map[uintptr]Syscall + + // lookup is a fixed-size array that holds the syscalls (indexed by + // their numbers). It is used for fast look ups. + lookup []SyscallFn + + // Emulate is a collection of instruction addresses to emulate. The + // keys are addresses, and the values are system call numbers. + Emulate map[usermem.Addr]uintptr + + // The function to call in case of a missing system call. + Missing MissingFn + + // Stracer traces this syscall table. + Stracer Stracer + + // External is used to handle an external callback. + External func(*Kernel) + + // ExternalFilterBefore is called before External is called before the syscall is executed. + // External is not called if it returns false. + ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool + + // ExternalFilterAfter is called before External is called after the syscall is executed. + // External is not called if it returns false. + ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool + + // FeatureEnable stores the strace and one-shot enable bits. + FeatureEnable SyscallFlagsTable +} + +// MaxSysno returns the largest system call number. +func (s *SyscallTable) MaxSysno() (max uintptr) { + for num := range s.Table { + if num > max { + max = num + } + } + return max +} + +// allSyscallTables contains all known tables. +var allSyscallTables []*SyscallTable + +// SyscallTables returns a read-only slice of registered SyscallTables. +func SyscallTables() []*SyscallTable { + return allSyscallTables +} + +// LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. +func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { + for _, s := range allSyscallTables { + if s.OS == os && s.Arch == a { + return s, true + } + } + return nil, false +} + +// RegisterSyscallTable registers a new syscall table for use by a Kernel. 
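The Stracer interface above threads per-call private data from SyscallEnter to SyscallExit, which avoids any side table keyed by task. A toy tracer showing that hand-off by timing a fake syscall; the real interface also receives the Task, the argument registers, and the feature flags.

package main

import (
	"fmt"
	"time"
)

// tracer mirrors the shape of the Stracer interface above: whatever Enter
// returns is handed back to Exit, so per-call state (here, a start time)
// needs no extra bookkeeping.
type tracer interface {
	Enter(sysno uintptr) interface{}
	Exit(private interface{}, sysno, rval uintptr, err error)
}

// logTracer is a toy implementation that measures syscall latency.
type logTracer struct{}

func (logTracer) Enter(sysno uintptr) interface{} {
	return time.Now()
}

func (logTracer) Exit(private interface{}, sysno, rval uintptr, err error) {
	start := private.(time.Time)
	fmt.Printf("sysno=%d rval=%d err=%v took=%s\n", sysno, rval, err, time.Since(start))
}

// invoke shows how a dispatcher would bracket execution with the hooks.
func invoke(t tracer, sysno uintptr, fn func() (uintptr, error)) (uintptr, error) {
	private := t.Enter(sysno)
	rval, err := fn()
	t.Exit(private, sysno, rval, err)
	return rval, err
}

func main() {
	invoke(logTracer{}, 39 /* a stand-in syscall number */, func() (uintptr, error) {
		return 1234, nil
	})
}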
+func RegisterSyscallTable(s *SyscallTable) { + if max := s.MaxSysno(); max > maxSyscallNum { + panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) + } + if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { + panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) + } + allSyscallTables = append(allSyscallTables, s) + s.Init() +} + +// Init initializes the system call table. +// +// This should normally be called only during registration. +func (s *SyscallTable) Init() { + if s.Table == nil { + // Ensure non-nil lookup table. + s.Table = make(map[uintptr]Syscall) + } + if s.Emulate == nil { + // Ensure non-nil emulate table. + s.Emulate = make(map[usermem.Addr]uintptr) + } + + max := s.MaxSysno() // Checked during RegisterSyscallTable. + + // Initialize the fast-lookup table. + s.lookup = make([]SyscallFn, max+1) + for num, sc := range s.Table { + s.lookup[num] = sc.Fn + } + + // Initialize all features. + s.FeatureEnable.init(s.Table, max) +} + +// Lookup returns the syscall implementation, if one exists. +func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { + if sysno < uintptr(len(s.lookup)) { + return s.lookup[sysno] + } + + return nil +} + +// LookupName looks up a syscall name. +func (s *SyscallTable) LookupName(sysno uintptr) string { + if sc, ok := s.Table[sysno]; ok { + return sc.Name + } + return fmt.Sprintf("sys_%d", sysno) // Unlikely. +} + +// LookupEmulate looks up an emulation syscall number. +func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { + sysno, ok := s.Emulate[addr] + return sysno, ok +} + +// mapLookup is similar to Lookup, except that it only uses the syscall table, +// that is, it skips the fast look array. This is available for benchmarking. +func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { + if sc, ok := s.Table[sysno]; ok { + return sc.Fn + } + return nil +} diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go new file mode 100644 index 000000000..90f890495 --- /dev/null +++ b/pkg/sentry/kernel/syscalls_state.go @@ -0,0 +1,47 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// syscallTableInfo is used to reload the SyscallTable. +// +// +stateify savable +type syscallTableInfo struct { + OS abi.OS + Arch arch.Arch +} + +// saveSt saves the SyscallTable. +func (tc *TaskContext) saveSt() syscallTableInfo { + return syscallTableInfo{ + OS: tc.st.OS, + Arch: tc.st.Arch, + } +} + +// loadSt loads the SyscallTable. +func (tc *TaskContext) loadSt(sti syscallTableInfo) { + st, ok := LookupSyscallTable(sti.OS, sti.Arch) + if !ok { + panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) + } + tc.st = st // Save the table reference. 
+} diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go new file mode 100644 index 000000000..4607cde2f --- /dev/null +++ b/pkg/sentry/kernel/syslog.go @@ -0,0 +1,108 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math/rand" + + "gvisor.dev/gvisor/pkg/sync" +) + +// syslog represents a sentry-global kernel log. +// +// Currently, it contains only fun messages for a dmesg easter egg. +// +// +stateify savable +type syslog struct { + // mu protects the below. + mu sync.Mutex `state:"nosave"` + + // msg is the syslog message buffer. It is lazily initialized. + msg []byte +} + +// Log returns a copy of the syslog. +func (s *syslog) Log() []byte { + s.mu.Lock() + defer s.mu.Unlock() + + if s.msg != nil { + // Already initialized, just return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o + } + + // Not initialized, create message. + allMessages := []string{ + "Synthesizing system calls...", + "Mounting deweydecimalfs...", + "Moving files to filing cabinet...", + "Digging up root...", + "Constructing home...", + "Segmenting fault lines...", + "Creating bureaucratic processes...", + "Searching for needles in stacks...", + "Preparing for the zombie uprising...", + "Feeding the init monster...", + "Creating cloned children...", + "Daemonizing children...", + "Waiting for children...", + "Gathering forks...", + "Committing treasure map to memory...", + "Reading process obituaries...", + "Searching for socket adapter...", + "Creating process schedule...", + "Generating random numbers by fair dice roll...", + "Rewriting operating system in Javascript...", + "Reticulating splines...", + "Consulting tar man page...", + "Forking spaghetti code...", + "Checking naughty and nice process list...", + "Checking naughty and nice process list...", // Check it up to twice. + "Granting licence to kill(2)...", // British spelling for British movie. + "Letting the watchdogs out...", + } + + selectMessage := func() string { + i := rand.Intn(len(allMessages)) + m := allMessages[i] + + // Delete the selected message. + allMessages[i] = allMessages[len(allMessages)-1] + allMessages = allMessages[:len(allMessages)-1] + + return m + } + + const format = "<6>[%11.6f] %s\n" + + s.msg = append(s.msg, []byte(fmt.Sprintf(format, 0.0, "Starting gVisor..."))...) + + time := 0.1 + for i := 0; i < 10; i++ { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) + } + + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) + + // Return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o +} diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go new file mode 100644 index 000000000..32cf47e05 --- /dev/null +++ b/pkg/sentry/kernel/table_test.go @@ -0,0 +1,110 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +const ( + maxTestSyscall = 1000 +) + +func createSyscallTable() *SyscallTable { + m := make(map[uintptr]Syscall) + for i := uintptr(0); i <= maxTestSyscall; i++ { + j := i + m[i] = Syscall{ + Fn: func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { + return j, nil, nil + }, + } + } + + s := &SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Table: m, + } + + RegisterSyscallTable(s) + return s +} + +func TestTable(t *testing.T) { + table := createSyscallTable() + defer func() { + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} + }() + + // Go through all functions and check that they return the right value. + for i := uintptr(0); i < maxTestSyscall; i++ { + fn := table.Lookup(i) + if fn == nil { + t.Errorf("Syscall %v is set to nil", i) + continue + } + + v, _, _ := fn(nil, arch.SyscallArguments{}) + if v != i { + t.Errorf("Wrong return value for syscall %v: expected %v, got %v", i, i, v) + } + } + + // Check that values outside the range return nil. + for i := uintptr(maxTestSyscall + 1); i < maxTestSyscall+100; i++ { + fn := table.Lookup(i) + if fn != nil { + t.Errorf("Syscall %v is not nil: %v", i, fn) + continue + } + } +} + +func BenchmarkTableLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.Lookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} + +func BenchmarkTableMapLookup(b *testing.B) { + table := createSyscallTable() + + b.ResetTimer() + + j := uintptr(0) + for i := 0; i < b.N; i++ { + table.mapLookup(j) + j = (j + 1) % 310 + } + + b.StopTimer() + // Cleanup registered tables to keep tests separate. + allSyscallTables = []*SyscallTable{} +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go new file mode 100644 index 000000000..f48247c94 --- /dev/null +++ b/pkg/sentry/kernel/task.go @@ -0,0 +1,886 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kernel + +import ( + gocontext "context" + "runtime/trace" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Task represents a thread of execution in the untrusted app. It +// includes registers and any thread-specific state that you would +// normally expect. +// +// Each task is associated with a goroutine, called the task goroutine, that +// executes code (application code, system calls, etc.) on behalf of that task. +// See Task.run (task_run.go). +// +// All fields that are "owned by the task goroutine" can only be mutated by the +// task goroutine while it is running. The task goroutine does not require +// synchronization to read these fields, although it still requires +// synchronization as described for those fields to mutate them. +// +// All fields that are "exclusive to the task goroutine" can only be accessed +// by the task goroutine while it is running. The task goroutine does not +// require synchronization to read or write these fields. +// +// +stateify savable +type Task struct { + taskNode + + // runState is what the task goroutine is executing if it is not stopped. + // If runState is nil, the task goroutine should exit or has exited. + // runState is exclusive to the task goroutine. + runState taskRunState + + // haveSyscallReturn is true if tc.Arch().Return() represents a value + // returned by a syscall (or set by ptrace after a syscall). + // + // haveSyscallReturn is exclusive to the task goroutine. + haveSyscallReturn bool + + // interruptChan is notified whenever the task goroutine is interrupted + // (usually by a pending signal). interruptChan is effectively a condition + // variable that can be used in select statements. + // + // interruptChan is not saved; because saving interrupts all tasks, + // interruptChan is always notified after restore (see Task.run). + interruptChan chan struct{} `state:"nosave"` + + // gosched contains the current scheduling state of the task goroutine. + // + // gosched is protected by goschedSeq. gosched is owned by the task + // goroutine. + goschedSeq sync.SeqCount `state:"nosave"` + gosched TaskGoroutineSchedInfo + + // yieldCount is the number of times the task goroutine has called + // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or + // Task.Yield(), voluntarily ceasing execution. + // + // yieldCount is accessed using atomic memory operations. yieldCount is + // owned by the task goroutine. + yieldCount uint64 + + // pendingSignals is the set of pending signals that may be handled only by + // this task. + // + // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu + // (hereafter "the signal mutex"); see comment on + // ThreadGroup.signalHandlers. 
+ pendingSignals pendingSignals + + // signalMask is the set of signals whose delivery is currently blocked. + // + // signalMask is accessed using atomic memory operations, and is protected + // by the signal mutex (such that reading signalMask is safe if either the + // signal mutex is locked or if atomic memory operations are used, while + // writing signalMask requires both). signalMask is owned by the task + // goroutine. + signalMask linux.SignalSet + + // If the task goroutine is currently executing Task.sigtimedwait, + // realSignalMask is the previous value of signalMask, which has temporarily + // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0. + // + // realSignalMask is exclusive to the task goroutine. + realSignalMask linux.SignalSet + + // If haveSavedSignalMask is true, savedSignalMask is the signal mask that + // should be applied after the task has either delivered one signal to a + // user handler or is about to resume execution in the untrusted + // application. + // + // Both haveSavedSignalMask and savedSignalMask are exclusive to the task + // goroutine. + haveSavedSignalMask bool + savedSignalMask linux.SignalSet + + // signalStack is the alternate signal stack used by signal handlers for + // which the SA_ONSTACK flag is set. + // + // signalStack is exclusive to the task goroutine. + signalStack arch.SignalStack + + // signalQueue is a set of registered waiters for signal-related events. + // + // signalQueue is protected by the signalMutex. Note that the task does + // not implement all queue methods, specifically the readiness checks. + // The task only broadcast a notification on signal delivery. + signalQueue waiter.Queue `state:"zerovalue"` + + // If groupStopPending is true, the task should participate in a group + // stop in the interrupt path. + // + // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. + // + // groupStopPending is protected by the signal mutex. + groupStopPending bool + + // If groupStopAcknowledged is true, the task has already acknowledged that + // it is entering the most recent group stop that has been initiated on its + // thread group. + // + // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. + // + // groupStopAcknowledged is protected by the signal mutex. + groupStopAcknowledged bool + + // If trapStopPending is true, the task goroutine should enter a + // PTRACE_INTERRUPT-induced stop from the interrupt path. + // + // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects + // JOBCTL_STOP_PENDING. + // + // trapStopPending is protected by the signal mutex. + trapStopPending bool + + // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group + // stop has begun or ended since the last time the task entered a + // ptrace-stop from the group-stop path. + // + // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. + // + // trapNotifyPending is protected by the signal mutex. + trapNotifyPending bool + + // If stop is not nil, it is the internally-initiated condition that + // currently prevents the task goroutine from running. + // + // stop is protected by the signal mutex. + stop TaskStop + + // stopCount is the number of active external stops (calls to + // Task.BeginExternalStop that have not been paired with a call to + // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is + // non-zero if the task goroutine should stop. 
+ // + // Mutating stopCount requires both locking the signal mutex and using + // atomic memory operations. Reading stopCount requires either locking the + // signal mutex or using atomic memory operations. This allows Task.doStop + // to require only a single atomic read in the common case where stopCount + // is 0. + // + // stopCount is not saved, because external stops cannot be retained across + // a save/restore cycle. (Suppose a sentryctl command issues an external + // stop; after a save/restore cycle, the restored sentry has no knowledge + // of the pre-save sentryctl command, and the stopped task would remain + // stopped forever.) + stopCount int32 `state:"nosave"` + + // endStopCond is signaled when stopCount transitions to 0. The combination + // of stopCount and endStopCond effectively form a sync.WaitGroup, but + // WaitGroup provides no way to read its counter value. + // + // Invariant: endStopCond.L is the signal mutex. (This is not racy because + // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine + // calls sync.Cond.Wait; and only the task goroutine can change the + // identity of the signal mutex, in Task.finishExec.) + endStopCond sync.Cond `state:"nosave"` + + // exitStatus is the task's exit status. + // + // exitStatus is protected by the signal mutex. + exitStatus ExitStatus + + // syscallRestartBlock represents a custom restart function to run in + // restart_syscall(2) to resume an interrupted syscall. + // + // syscallRestartBlock is exclusive to the task goroutine. + syscallRestartBlock SyscallRestartBlock + + // p provides the mechanism by which the task runs code in userspace. The p + // interface object is immutable. + p platform.Context `state:"nosave"` + + // k is the Kernel that this task belongs to. The k pointer is immutable. + k *Kernel + + // containerID has no equivalent in Linux; it's used by runsc to track all + // tasks that belong to a given containers since cgroups aren't implemented. + // It's inherited by the children, is immutable, and may be empty. + // + // NOTE: cgroups can be used to track this when implemented. + containerID string + + // mu protects some of the following fields. + mu sync.Mutex `state:"nosave"` + + // tc holds task data provided by the ELF loader. + // + // tc is protected by mu, and is owned by the task goroutine. + tc TaskContext + + // fsContext is the task's filesystem context. + // + // fsContext is protected by mu, and is owned by the task goroutine. + fsContext *FSContext + + // fdTable is the task's file descriptor table. + // + // fdTable is protected by mu, and is owned by the task goroutine. + fdTable *FDTable + + // If vforkParent is not nil, it is the task that created this task with + // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when + // this TaskContext is released. + // + // vforkParent is protected by the TaskSet mutex. + vforkParent *Task + + // exitState is the task's progress through the exit path. + // + // exitState is protected by the TaskSet mutex. exitState is owned by the + // task goroutine. + exitState TaskExitState + + // exitTracerNotified is true if the exit path has either signaled the + // task's tracer to indicate the exit, or determined that no such signal is + // needed. exitTracerNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitTracerNotified is protected by the TaskSet mutex. 
+ exitTracerNotified bool + + // exitTracerAcked is true if exitTracerNotified is true and either the + // task's tracer has acknowledged the exit notification, or the exit path + // has determined that no such notification is needed. + // + // exitTracerAcked is protected by the TaskSet mutex. + exitTracerAcked bool + + // exitParentNotified is true if the exit path has either signaled the + // task's parent to indicate the exit, or determined that no such signal is + // needed. exitParentNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitParentNotified is protected by the TaskSet mutex. + exitParentNotified bool + + // exitParentAcked is true if exitParentNotified is true and either the + // task's parent has acknowledged the exit notification, or the exit path + // has determined that no such acknowledgment is needed. + // + // exitParentAcked is protected by the TaskSet mutex. + exitParentAcked bool + + // goroutineStopped is a WaitGroup whose counter value is 1 when the task + // goroutine is running and 0 when the task goroutine is stopped or has + // exited. + goroutineStopped sync.WaitGroup `state:"nosave"` + + // ptraceTracer is the task that is ptrace-attached to this one. If + // ptraceTracer is nil, this task is not being traced. Note that due to + // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil + // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). + // + // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic + // operations. This allows paths that wouldn't otherwise lock the TaskSet + // mutex, notably the syscall path, to check if ptraceTracer is nil without + // additional synchronization. + ptraceTracer atomic.Value `state:".(*Task)"` + + // ptraceTracees is the set of tasks that this task is ptrace-attached to. + // + // ptraceTracees is protected by the TaskSet mutex. + ptraceTracees map[*Task]struct{} + + // ptraceSeized is true if ptraceTracer attached to this task with + // PTRACE_SEIZE. + // + // ptraceSeized is protected by the TaskSet mutex. + ptraceSeized bool + + // ptraceOpts contains ptrace options explicitly set by the tracer. If + // ptraceTracer is nil, ptraceOpts is expected to be the zero value. + // + // ptraceOpts is protected by the TaskSet mutex. + ptraceOpts ptraceOptions + + // ptraceSyscallMode controls ptrace behavior around syscall entry and + // exit. + // + // ptraceSyscallMode is protected by the TaskSet mutex. + ptraceSyscallMode ptraceSyscallMode + + // If ptraceSinglestep is true, the next time the task executes application + // code, single-stepping should be enabled. ptraceSinglestep is stored + // independently of the architecture-specific trap flag because tracer + // detaching (which can happen concurrently with the tracee's execution if + // the tracer exits) must disable single-stepping, and the task's + // architectural state is implicitly exclusive to the task goroutine (no + // synchronization occurs before passing registers to SwitchToApp). + // + // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. + // + // ptraceSinglestep is protected by the TaskSet mutex. + ptraceSinglestep bool + + // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the + // time that t entered the ptrace stop, reset to 0 when the tracer + // acknowledges the stop with a wait*() syscall. Otherwise, it is the + // signal number passed to the ptrace operation that ended the last ptrace + // stop on this task. 
In the latter case, the effect of ptraceCode depends + // on the nature of the ptrace stop; signal-delivery-stop uses it to + // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the + // signal to the task after leaving the stop, and PTRACE_EVENT stops and + // traced group stops ignore it entirely. + // + // Linux contextually stores the equivalent of ptraceCode in + // task_struct::exit_code. + // + // ptraceCode is protected by the TaskSet mutex. + ptraceCode int32 + + // ptraceSiginfo is the value returned to the tracer by + // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). + // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) + // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is + // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which + // is in turn required to distinguish group stops from other ptrace stops, + // per subsection "Group-stop" in ptrace(2)). + // + // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. + // + // ptraceSiginfo is protected by the TaskSet mutex. + ptraceSiginfo *arch.SignalInfo + + // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to + // the tracer by ptrace(PTRACE_GETEVENTMSG). + // + // ptraceEventMsg is protected by the TaskSet mutex. + ptraceEventMsg uint64 + + // The struct that holds the IO-related usage. The ioUsage pointer is + // immutable. + ioUsage *usage.IO + + // logPrefix is a string containing the task's thread ID in the root PID + // namespace, and is prepended to log messages emitted by Task.Infof etc. + logPrefix atomic.Value `state:"nosave"` + + // traceContext and traceTask are both used for tracing, and are + // updated along with the logPrefix in updateInfoLocked. + // + // These are exclusive to the task goroutine. + traceContext gocontext.Context `state:"nosave"` + traceTask *trace.Task `state:"nosave"` + + // creds is the task's credentials. + // + // creds.Load() may be called without synchronization. creds.Store() is + // serialized by mu. creds is owned by the task goroutine. All + // auth.Credentials objects that creds may point to, or have pointed to + // in the past, must be treated as immutable. + creds auth.AtomicPtrCredentials + + // utsns is the task's UTS namespace. + // + // utsns is protected by mu. utsns is owned by the task goroutine. + utsns *UTSNamespace + + // ipcns is the task's IPC namespace. + // + // ipcns is protected by mu. ipcns is owned by the task goroutine. + ipcns *IPCNamespace + + // abstractSockets tracks abstract sockets that are in use. + // + // abstractSockets is protected by mu. + abstractSockets *AbstractSocketNamespace + + // mountNamespaceVFS2 is the task's mount namespace. + // + // It is protected by mu. It is owned by the task goroutine. + mountNamespaceVFS2 *vfs.MountNamespace + + // parentDeathSignal is sent to this task's thread group when its parent exits. + // + // parentDeathSignal is protected by mu. + parentDeathSignal linux.Signal + + // syscallFilters is all seccomp-bpf syscall filters applicable to the + // task, in the order in which they were installed. The type of the atomic + // is []bpf.Program. Writing needs to be protected by the signal mutex. + // + // syscallFilters is owned by the task goroutine. 
+ syscallFilters atomic.Value `state:".([]bpf.Program)"` + + // If cleartid is non-zero, treat it as a pointer to a ThreadID in the + // task's virtual address space; when the task exits, set the pointed-to + // ThreadID to 0, and wake any futex waiters. + // + // cleartid is exclusive to the task goroutine. + cleartid usermem.Addr + + // This is mostly a fake cpumask just for sched_set/getaffinity as we + // don't really control the affinity. + // + // Invariant: allowedCPUMask.Size() == + // sched.CPUMaskSize(Kernel.applicationCores). + // + // allowedCPUMask is protected by mu. + allowedCPUMask sched.CPUSet + + // cpu is the fake cpu number returned by getcpu(2). cpu is ignored + // entirely if Kernel.useHostCores is true. + // + // cpu is accessed using atomic memory operations. + cpu int32 + + // This is used to keep track of changes made to a process' priority/niceness. + // It is mostly used to provide some reasonable return value from + // getpriority(2) after a call to setpriority(2) has been made. + // We currently do not actually modify a process' scheduling priority. + // NOTE: This represents the userspace view of priority (nice). + // This means that the value should be in the range [-20, 19]. + // + // niceness is protected by mu. + niceness int + + // This is used to track the numa policy for the current thread. This can be + // modified through a set_mempolicy(2) syscall. Since we always report a + // single numa node, all policies are no-ops. We only track this information + // so that we can return reasonable values if the application calls + // get_mempolicy(2) after setting a non-default policy. Note that in the + // real syscall, nodemask can be longer than a single unsigned long, but we + // always report a single node so never need to save more than a single + // bit. + // + // numaPolicy and numaNodeMask are protected by mu. + numaPolicy linux.NumaPolicy + numaNodeMask uint64 + + // netns is the task's network namespace. netns is never nil. + // + // netns is protected by mu. + netns *inet.Namespace + + // If rseqPreempted is true, before the next call to p.Switch(), + // interrupt rseq critical regions as defined by rseqAddr and + // tg.oldRSeqCritical and write the task goroutine's CPU number to + // rseqAddr/oldRSeqCPUAddr. + // + // We support two ABIs for restartable sequences: + // + // 1. The upstream interface added in v4.18, + // 2. An "old" interface never merged upstream. In the implementation, + // this is referred to as "old rseq". + // + // rseqPreempted is exclusive to the task goroutine. + rseqPreempted bool `state:"nosave"` + + // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr. + // + // If rseq is unused, rseqCPU is -1 for convenient use in + // platform.Context.Switch. + // + // rseqCPU is exclusive to the task goroutine. + rseqCPU int32 + + // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. + // + // oldRSeqCPUAddr is exclusive to the task goroutine. + oldRSeqCPUAddr usermem.Addr + + // rseqAddr is a pointer to the userspace linux.RSeq structure. + // + // rseqAddr is exclusive to the task goroutine. + rseqAddr usermem.Addr + + // rseqSignature is the signature that the rseq abort IP must be signed + // with. + // + // rseqSignature is exclusive to the task goroutine. + rseqSignature uint32 + + // copyScratchBuffer is a buffer available to CopyIn/CopyOut + // implementations that require an intermediate buffer to copy data + // into/out of. 
It prevents these buffers from being allocated/zeroed in + // each syscall and eventually garbage collected. + // + // copyScratchBuffer is exclusive to the task goroutine. + copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` + + // blockingTimer is used for blocking timeouts. blockingTimerChan is the + // channel that is sent to when blockingTimer fires. + // + // blockingTimer is exclusive to the task goroutine. + blockingTimer *ktime.Timer `state:"nosave"` + blockingTimerChan <-chan struct{} `state:"nosave"` + + // futexWaiter is used for futex(FUTEX_WAIT) syscalls. + // + // futexWaiter is exclusive to the task goroutine. + futexWaiter *futex.Waiter `state:"nosave"` + + // startTime is the real time at which the task started. It is set when + // a Task is created or invokes execve(2). + // + // startTime is protected by mu. + startTime ktime.Time +} + +func (t *Task) savePtraceTracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +func (t *Task) loadPtraceTracer(tracer *Task) { + t.ptraceTracer.Store(tracer) +} + +func (t *Task) saveSyscallFilters() []bpf.Program { + if f := t.syscallFilters.Load(); f != nil { + return f.([]bpf.Program) + } + return nil +} + +func (t *Task) loadSyscallFilters(filters []bpf.Program) { + t.syscallFilters.Store(filters) +} + +// afterLoad is invoked by stateify. +func (t *Task) afterLoad() { + t.updateInfoLocked() + t.interruptChan = make(chan struct{}, 1) + t.gosched.State = TaskGoroutineNonexistent + if t.stop != nil { + t.stopCount = 1 + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.p = t.k.Platform.NewContext() + t.rseqPreempted = true + t.futexWaiter = futex.NewWaiter() +} + +// copyScratchBufferLen is the length of Task.copyScratchBuffer. +const copyScratchBufferLen = 144 // sizeof(struct stat) + +// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut +// functions. It must only be used within those functions and can only be used +// by the task goroutine; it exists to improve performance and thus +// intentionally lacks any synchronization. +// +// Callers should pass a constant value as an argument if possible, which will +// allow the compiler to inline and optimize out the if statement below. +func (t *Task) CopyScratchBuffer(size int) []byte { + if size > copyScratchBufferLen { + return make([]byte, size) + } + return t.copyScratchBuffer[:size] +} + +// FutexWaiter returns the Task's futex.Waiter. +func (t *Task) FutexWaiter() *futex.Waiter { + return t.futexWaiter +} + +// Kernel returns the Kernel containing t. +func (t *Task) Kernel() *Kernel { + return t.k +} + +// Value implements context.Context.Value. +// +// Preconditions: The caller must be running on the task goroutine (as implied +// by the requirements of context.Context). 
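+//
+// For illustration only (ctx stands for any context backed by a Task; this
+// example is not part of the original change), sentry code typically recovers
+// kernel objects through the keys handled below:
+//
+//	if k, ok := ctx.Value(CtxKernel).(*Kernel); ok {
+//		_ = k.RealtimeClock()
+//	}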
+func (t *Task) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + return t.utsns + case CtxIPCNamespace: + return t.ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.Credentials() + case context.CtxThreadGroupID: + return int32(t.ThreadGroup().ID()) + case fs.CtxRoot: + return t.fsContext.RootDirectory() + case vfs.CtxRoot: + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter + case inet.CtxStack: + return t.NetworkContext() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return t.k + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k + default: + return nil + } +} + +// SetClearTID sets t's cleartid. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetClearTID(addr usermem.Addr) { + t.cleartid = addr +} + +// SetSyscallRestartBlock sets the restart block for use in +// restart_syscall(2). After registering a restart block, a syscall should +// return ERESTART_RESTARTBLOCK to request a restart using the block. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { + t.syscallRestartBlock = r +} + +// SyscallRestartBlock returns the currently registered restart block for use in +// restart_syscall(2). This function is *not* idempotent and may be called once +// per syscall. This function must not be called if a restart block has not been +// registered for the current syscall. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SyscallRestartBlock() SyscallRestartBlock { + r := t.syscallRestartBlock + // Explicitly set the restart block to nil so that a future syscall can't + // accidentally reuse it. + t.syscallRestartBlock = nil + return r +} + +// IsChrooted returns true if the root directory of t's FSContext is not the +// root directory of t's MountNamespace. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) IsChrooted() bool { + if VFS2Enabled { + realRoot := t.mountNamespaceVFS2.Root() + defer realRoot.DecRef() + root := t.fsContext.RootDirectoryVFS2() + defer root.DecRef() + return root != realRoot + } + + realRoot := t.tg.mounts.Root() + defer realRoot.DecRef() + root := t.fsContext.RootDirectory() + if root != nil { + defer root.DecRef() + } + return root != realRoot +} + +// TaskContext returns t's TaskContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskContext() *TaskContext { + return &t.tc +} + +// FSContext returns t's FSContext. FSContext does not take an additional +// reference on the returned FSContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) FSContext() *FSContext { + return t.fsContext +} + +// FDTable returns t's FDTable. 
FDTable does not take an additional reference
+// on the returned FDTable.
+//
+// Precondition: The caller must be running on the task goroutine, or t.mu must
+// be locked.
+func (t *Task) FDTable() *FDTable {
+	return t.fdTable
+}
+
+// GetFile is a convenience wrapper for t.FDTable().Get.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) GetFile(fd int32) *fs.File {
+	f, _ := t.fdTable.Get(fd)
+	return f
+}
+
+// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription {
+	f, _ := t.fdTable.GetVFS2(fd)
+	return f
+}
+
+// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
+	return t.fdTable.NewFDs(t, fd, files, flags)
+}
+
+// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
+	return t.fdTable.NewFDsVFS2(t, fd, files, flags)
+}
+
+// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) {
+	fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags)
+	if err != nil {
+		return 0, err
+	}
+	return fds[0], nil
+}
+
+// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.Get.
+func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
+	return t.fdTable.NewFDVFS2(t, fd, file, flags)
+}
+
+// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
+	return t.fdTable.NewFDAt(t, fd, file, flags)
+}
+
+// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
+	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
+}
+
+// WithMuLocked executes f with t.mu locked.
+func (t *Task) WithMuLocked(f func(*Task)) {
+	t.mu.Lock()
+	f(t)
+	t.mu.Unlock()
+}
+
+// MountNamespace returns t's MountNamespace. MountNamespace does not take an
+// additional reference on the returned MountNamespace.
+func (t *Task) MountNamespace() *fs.MountNamespace {
+	return t.tg.mounts
+}
+
+// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
+// returned mount namespace.
+func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mountNamespaceVFS2.IncRef()
+	return t.mountNamespaceVFS2
+}
+
+// AbstractSockets returns t's AbstractSocketNamespace.
+func (t *Task) AbstractSockets() *AbstractSocketNamespace {
+	return t.abstractSockets
+}
+
+// ContainerID returns t's container ID.
+func (t *Task) ContainerID() string {
+	return t.containerID
+}
+
+// OOMScoreAdj gets the task's thread group's OOM score adjustment.
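+//
+// Illustrative sketch (hypothetical caller, not part of this change); values
+// outside [-1000, 1000] are rejected, mirroring Linux:
+//
+//	_ = t.SetOOMScoreAdj(500)     // accepted
+//	err := t.SetOOMScoreAdj(2000) // rejected with EINVAL
+//	adj := t.OOMScoreAdj()        // still 500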
+func (t *Task) OOMScoreAdj() int32 { + return atomic.LoadInt32(&t.tg.oomScoreAdj) +} + +// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The +// value should be between -1000 and 1000 inclusive. +func (t *Task) SetOOMScoreAdj(adj int32) error { + if adj > 1000 || adj < -1000 { + return syserror.EINVAL + } + atomic.StoreInt32(&t.tg.oomScoreAdj, adj) + return nil +} + +// UID returns t's uid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. +func (t *Task) UID() uint32 { + return uint32(t.Credentials().EffectiveKUID) +} + +// GID returns t's gid. +// TODO(gvisor.dev/issue/170): This method is not namespaced yet. +func (t *Task) GID() uint32 { + return uint32(t.Credentials().EffectiveKGID) +} diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go new file mode 100644 index 000000000..5f3e60fe8 --- /dev/null +++ b/pkg/sentry/kernel/task_acct.go @@ -0,0 +1,196 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// Accounting, limits, timers. + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getitimer implements getitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { + var tm ktime.Time + var s ktime.Setting + switch id { + case linux.ITIMER_REAL: + tm, s = t.tg.itimerRealTimer.Get() + case linux.ITIMER_VIRTUAL: + tm = t.tg.UserCPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerVirtSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + case linux.ITIMER_PROF: + tm = t.tg.CPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerProfSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + default: + return linux.ItimerVal{}, syserror.EINVAL + } + val, iv := ktime.SpecFromSetting(tm, s) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(val), + Interval: linux.DurationToTimeval(iv), + }, nil +} + +// Setitimer implements setitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. 
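+//
+// A hedged usage sketch (the values are arbitrary and not part of this
+// change): arm a one-shot 100ms ITIMER_REAL and keep the previous setting,
+// as setitimer(2) does:
+//
+//	old, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{
+//		Value: linux.DurationToTimeval(100 * time.Millisecond),
+//	})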
+func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) { + var tm ktime.Time + var olds ktime.Setting + switch id { + case linux.ITIMER_REAL: + news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock()) + if err != nil { + return linux.ItimerVal{}, err + } + tm, olds = t.tg.itimerRealTimer.Swap(news) + case linux.ITIMER_VIRTUAL: + c := t.tg.UserCPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerVirtSetting + t.tg.itimerVirtSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + case linux.ITIMER_PROF: + c := t.tg.CPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerProfSetting + t.tg.itimerProfSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + default: + return linux.ItimerVal{}, syserror.EINVAL + } + oldval, oldiv := ktime.SpecFromSetting(tm, olds) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(oldval), + Interval: linux.DurationToTimeval(oldiv), + }, nil +} + +// IOUsage returns the io usage of the thread. +func (t *Task) IOUsage() *usage.IO { + return t.ioUsage +} + +// IOUsage returns the total io usage of all dead and live threads in the group. +func (tg *ThreadGroup) IOUsage() *usage.IO { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + io := *tg.ioUsage + // Account for active tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + io.Accumulate(t.IOUsage()) + } + return &io +} + +// Name returns t's name. +func (t *Task) Name() string { + t.mu.Lock() + defer t.mu.Unlock() + return t.tc.Name +} + +// SetName changes t's name. +func (t *Task) SetName(name string) { + t.mu.Lock() + defer t.mu.Unlock() + t.tc.Name = name + t.Debugf("Set thread name to %q", name) +} + +// Limits implements context.Context.Limits. +func (t *Task) Limits() *limits.LimitSet { + return t.ThreadGroup().Limits() +} + +// StartTime returns t's start time. +func (t *Task) StartTime() ktime.Time { + t.mu.Lock() + defer t.mu.Unlock() + return t.startTime +} + +// MaxRSS returns the maximum resident set size of the task in bytes. which +// should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or +// RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these +// flags. +func (t *Task) MaxRSS(which int32) uint64 { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + + switch which { + case linux.RUSAGE_SELF, linux.RUSAGE_THREAD: + // If there's an active mm we can use its value. 
+ if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS { + return mmMaxRSS + } + } + return t.tg.maxRSS + case linux.RUSAGE_CHILDREN: + return t.tg.childMaxRSS + case linux.RUSAGE_BOTH: + maxRSS := t.tg.maxRSS + if maxRSS < t.tg.childMaxRSS { + maxRSS = t.tg.childMaxRSS + } + if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS { + return mmMaxRSS + } + } + return maxRSS + default: + // We'll only get here if which is invalid. + return 0 + } +} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go new file mode 100644 index 000000000..4a4a69ee2 --- /dev/null +++ b/pkg/sentry/kernel/task_block.go @@ -0,0 +1,230 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "runtime" + "runtime/trace" + "time" + + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" +) + +// BlockWithTimeout blocks t until an event is received from C, the application +// monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), +// or t is interrupted. It returns: +// +// - The remaining timeout, which is guaranteed to be 0 if the timeout expired, +// and is unspecified if haveTimeout is false. +// +// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout +// expired, and syserror.ErrInterrupted if t is interrupted. +func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { + if !haveTimeout { + return timeout, t.block(C, nil) + } + + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(timeout) + err := t.BlockWithDeadline(C, true, deadline) + + // Timeout, explicitly return a remaining duration of 0. + if err == syserror.ETIMEDOUT { + return 0, err + } + + // Compute the remaining timeout. Note that even if block() above didn't + // return due to a timeout, we may have used up any of the remaining time + // since then. We cap the remaining timeout to 0 to make it easier to + // directly use the returned duration. + end := t.Kernel().MonotonicClock().Now() + remainingTimeout := timeout - end.Sub(start) + if remainingTimeout < 0 { + remainingTimeout = 0 + } + + return remainingTimeout, err +} + +// BlockWithDeadline blocks t until an event is received from C, the +// application monotonic clock indicates a time of deadline (only if +// haveDeadline is true), or t is interrupted. It returns nil if an event is +// received from C, ETIMEDOUT if the deadline expired, and +// syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { + if !haveDeadline { + return t.block(C, nil) + } + + // Start the timeout timer. 
+	t.blockingTimer.Swap(ktime.Setting{
+		Enabled: true,
+		Next:    deadline,
+	})
+
+	err := t.block(C, t.blockingTimerChan)
+
+	// Stop the timeout timer and drain the channel.
+	t.blockingTimer.Swap(ktime.Setting{})
+	select {
+	case <-t.blockingTimerChan:
+	default:
+	}
+
+	return err
+}
+
+// BlockWithTimer blocks t until an event is received from C or tchan, or t is
+// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an
+// event is received from tchan, and syserror.ErrInterrupted if t is
+// interrupted.
+//
+// Most clients should use BlockWithDeadline or BlockWithTimeout instead.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error {
+	return t.block(C, tchan)
+}
+
+// Block blocks t until an event is received from C or t is interrupted. It
+// returns nil if an event is received from C and syserror.ErrInterrupted if t
+// is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Block(C <-chan struct{}) error {
+	return t.block(C, nil)
+}
+
+// block blocks a task on one of many events.
+// N.B. defer is too expensive to be used here.
+func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
+	// Fast path if the request is already done.
+	select {
+	case <-C:
+		return nil
+	default:
+	}
+
+	// Deactivate our address space; we don't need it.
+	interrupt := t.SleepStart()
+
+	// If the request is not completed, but the timer has already expired,
+	// then ensure that we run through a scheduler cycle. This is because
+	// we may see applications relying on timer slack to yield the thread.
+	// For example, they may attempt to sleep for some number of nanoseconds,
+	// and expect that this will actually yield the CPU and sleep for at
+	// least microseconds, e.g.:
+	// https://github.com/LMAX-Exchange/disruptor/commit/6ca210f2bcd23f703c479804d583718e16f43c07
+	if len(timerChan) > 0 {
+		runtime.Gosched()
+	}
+
+	region := trace.StartRegion(t.traceContext, blockRegion)
+	select {
+	case <-C:
+		region.End()
+		t.SleepFinish(true)
+		// Woken by event.
+		return nil
+
+	case <-interrupt:
+		region.End()
+		t.SleepFinish(false)
+		// Return the indicated error on interrupt.
+		return syserror.ErrInterrupted
+
+	case <-timerChan:
+		region.End()
+		t.SleepFinish(true)
+		// We've timed out.
+		return syserror.ETIMEDOUT
+	}
+}
+
+// SleepStart implements amutex.Sleeper.SleepStart.
+func (t *Task) SleepStart() <-chan struct{} {
+	t.Deactivate()
+	t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
+	return t.interruptChan
+}
+
+// SleepFinish implements amutex.Sleeper.SleepFinish.
+func (t *Task) SleepFinish(success bool) {
+	if !success {
+		// The interrupted notification is consumed only at the top-level
+		// (Run). Therefore we attempt to reset the pending notification.
+		// This will also elide our next entry back into the task, so we
+		// will process signals, state changes, etc.
+		t.interruptSelf()
+	}
+	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
+	t.Activate()
+}
+
+// Interrupted implements amutex.Sleeper.Interrupted.
+func (t *Task) Interrupted() bool {
+	return len(t.interruptChan) != 0
+}
+
+// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
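+//
+// The expected bracketing looks roughly like the following sketch
+// (doHostOperation is a placeholder, not a real function in this package):
+//
+//	t.UninterruptibleSleepStart(true)  // deactivate the address space
+//	doHostOperation()
+//	t.UninterruptibleSleepFinish(true) // reactivate it afterwards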
+func (t *Task) UninterruptibleSleepStart(deactivate bool) { + if deactivate { + t.Deactivate() + } + t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible) +} + +// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish. +func (t *Task) UninterruptibleSleepFinish(activate bool) { + t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible) + if activate { + t.Activate() + } +} + +// interrupted returns true if interrupt or interruptSelf has been called at +// least once since the last call to interrupted. +func (t *Task) interrupted() bool { + select { + case <-t.interruptChan: + return true + default: + return false + } +} + +// interrupt unblocks the task and interrupts it if it's currently running in +// userspace. +func (t *Task) interrupt() { + t.interruptSelf() + t.p.Interrupt() +} + +// interruptSelf is like Interrupt, but can only be called by the task +// goroutine. +func (t *Task) interruptSelf() { + select { + case t.interruptChan <- struct{}{}: + t.Debugf("Interrupt queued") + default: + t.Debugf("Dropping duplicate interrupt") + } + // platform.Context.Interrupt() is unnecessary since a task goroutine + // calling interruptSelf() cannot also be blocked in + // platform.Context.Switch(). +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go new file mode 100644 index 000000000..e1ecca99e --- /dev/null +++ b/pkg/sentry/kernel/task_clone.go @@ -0,0 +1,540 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// SharingOptions controls what resources are shared by a new task created by +// Task.Clone, or an existing task affected by Task.Unshare. +type SharingOptions struct { + // If NewAddressSpace is true, the task should have an independent virtual + // address space. + NewAddressSpace bool + + // If NewSignalHandlers is true, the task should use an independent set of + // signal handlers. + NewSignalHandlers bool + + // If NewThreadGroup is true, the task should be the leader of its own + // thread group. TerminationSignal is the signal that the thread group + // will send to its parent when it exits. If NewThreadGroup is false, + // TerminationSignal is ignored. + NewThreadGroup bool + TerminationSignal linux.Signal + + // If NewPIDNamespace is true: + // + // - In the context of Task.Clone, the new task should be the init task + // (TID 1) in a new PID namespace. + // + // - In the context of Task.Unshare, the task should create a new PID + // namespace, and all subsequent clones of the task should be members of + // the new PID namespace. + NewPIDNamespace bool + + // If NewUserNamespace is true, the task should have an independent user + // namespace. 
+ NewUserNamespace bool + + // If NewNetworkNamespace is true, the task should have an independent + // network namespace. + NewNetworkNamespace bool + + // If NewFiles is true, the task should use an independent file descriptor + // table. + NewFiles bool + + // If NewFSContext is true, the task should have an independent FSContext. + NewFSContext bool + + // If NewUTSNamespace is true, the task should have an independent UTS + // namespace. + NewUTSNamespace bool + + // If NewIPCNamespace is true, the task should have an independent IPC + // namespace. + NewIPCNamespace bool +} + +// CloneOptions controls the behavior of Task.Clone. +type CloneOptions struct { + // SharingOptions defines the set of resources that the new task will share + // with its parent. + SharingOptions + + // Stack is the initial stack pointer of the new task. If Stack is 0, the + // new task will start with the same stack pointer as its parent. + Stack usermem.Addr + + // If SetTLS is true, set the new task's TLS (thread-local storage) + // descriptor to TLS. If SetTLS is false, TLS is ignored. + SetTLS bool + TLS usermem.Addr + + // If ChildClearTID is true, when the child exits, 0 is written to the + // address ChildTID in the child's memory, and if the write is successful a + // futex wake on the same address is performed. + // + // If ChildSetTID is true, the child's thread ID (in the child's PID + // namespace) is written to address ChildTID in the child's memory. (As in + // Linux, failed writes are silently ignored.) + ChildClearTID bool + ChildSetTID bool + ChildTID usermem.Addr + + // If ParentSetTID is true, the child's thread ID (in the parent's PID + // namespace) is written to address ParentTID in the parent's memory. (As + // in Linux, failed writes are silently ignored.) + // + // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID + // causes the child's thread ID to be written to ptid in both the parent + // and child's memory, but this is a documentation error fixed by + // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). + ParentSetTID bool + ParentTID usermem.Addr + + // If Vfork is true, place the parent in vforkStop until the cloned task + // releases its TaskContext. + Vfork bool + + // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for + // this clone(), and do not ptrace-attach the caller's tracer to the new + // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). + Untraced bool + + // If InheritTracer is true, ptrace-attach the caller's tracer to the new + // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported + // for it. If both Untraced and InheritTracer are true, no event will be + // reported, but tracer inheritance will still occur. + InheritTracer bool +} + +// Clone implements the clone(2) syscall and returns the thread ID of the new +// task in t's PID namespace. Clone may return both a non-zero thread ID and a +// non-nil error. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { + // Since signal actions may refer to application signal handlers by virtual + // address, any set of signal handlers must refer to the same address + // space. + if !opts.NewSignalHandlers && opts.NewAddressSpace { + return 0, nil, syserror.EINVAL + } + // In order for the behavior of thread-group-directed signals to be sane, + // all tasks in a thread group must share signal handlers. 
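These options are filled in from clone(2) flag bits by the syscall layer. The sketch below is editorial: it assumes the CLONE_* constants in pkg/abi/linux, omits the TID/TLS addresses and several flags, and uses a hypothetical exitSignalMask in place of Linux's CSIGNAL. It highlights the inversion: flags like CLONE_VM mean "share", so the corresponding New* field is set when the flag is absent, while the CLONE_NEW* namespace flags map directly.

package sketch

import (
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/sentry/kernel"
    "gvisor.dev/gvisor/pkg/usermem"
)

// exitSignalMask is a hypothetical stand-in for Linux's CSIGNAL (low byte).
const exitSignalMask = 0xff

// cloneOptsFromFlags is an illustrative, incomplete translation of clone(2)
// flags into the options defined above.
func cloneOptsFromFlags(flags uint64, stack usermem.Addr) kernel.CloneOptions {
    return kernel.CloneOptions{
        SharingOptions: kernel.SharingOptions{
            NewAddressSpace:     flags&linux.CLONE_VM == 0,
            NewSignalHandlers:   flags&linux.CLONE_SIGHAND == 0,
            NewThreadGroup:      flags&linux.CLONE_THREAD == 0,
            TerminationSignal:   linux.Signal(flags & exitSignalMask),
            NewPIDNamespace:     flags&linux.CLONE_NEWPID != 0,
            NewUserNamespace:    flags&linux.CLONE_NEWUSER != 0,
            NewNetworkNamespace: flags&linux.CLONE_NEWNET != 0,
            NewFiles:            flags&linux.CLONE_FILES == 0,
            NewFSContext:        flags&linux.CLONE_FS == 0,
            NewUTSNamespace:     flags&linux.CLONE_NEWUTS != 0,
            NewIPCNamespace:     flags&linux.CLONE_NEWIPC != 0,
        },
        Stack:  stack,
        SetTLS: flags&linux.CLONE_SETTLS != 0,
        Vfork:  flags&linux.CLONE_VFORK != 0,
    }
}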
+ if !opts.NewThreadGroup && opts.NewSignalHandlers { + return 0, nil, syserror.EINVAL + } + // All tasks in a thread group must be in the same PID namespace. + if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + return 0, nil, syserror.EINVAL + } + // The two different ways of specifying a new PID namespace are + // incompatible. + if opts.NewPIDNamespace && t.childPIDNamespace != nil { + return 0, nil, syserror.EINVAL + } + // Thread groups and FS contexts cannot span user namespaces. + if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + return 0, nil, syserror.EINVAL + } + + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to + // be created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call." - + // user_namespaces(7) + creds := t.Credentials() + userns := creds.UserNamespace + if opts.NewUserNamespace { + var err error + // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and + // the caller is in a chroot environment (i.e., the caller's root + // directory does not match the root directory of the mount namespace + // in which it resides)." - clone(2). Neither chroot(2) nor + // user_namespaces(7) document this. + if t.IsChrooted() { + return 0, nil, syserror.EPERM + } + userns, err = creds.NewChildUserNamespace() + if err != nil { + return 0, nil, err + } + } + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { + return 0, nil, syserror.EPERM + } + + utsns := t.UTSNamespace() + if opts.NewUTSNamespace { + // Note that this must happen after NewUserNamespace so we get + // the new userns if there is one. + utsns = t.UTSNamespace().Clone(userns) + } + + ipcns := t.IPCNamespace() + if opts.NewIPCNamespace { + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + ipcns = NewIPCNamespace(userns) + } + + netns := t.NetworkNamespace() + if opts.NewNetworkNamespace { + netns = inet.NewNamespace(netns) + } + + // TODO(b/63601033): Implement CLONE_NEWNS. + mntnsVFS2 := t.mountNamespaceVFS2 + if mntnsVFS2 != nil { + mntnsVFS2.IncRef() + } + + tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) + if err != nil { + return 0, nil, err + } + // clone() returns 0 in the child. 
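To make the return-value asymmetry concrete: only the parent sees Clone's ThreadID result, while the child resumes in userspace with the 0 written into its registers just below. A fork(2)-style caller would look roughly like this (editorial sketch, not the actual syscall wrapper).

package sketch

import (
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/sentry/kernel"
)

// forkLike drives Clone with fork(2)-like options. The return statement is
// only ever executed by the parent; the child's user-visible return value is
// the 0 installed via tc.Arch.SetReturn(0).
func forkLike(t *kernel.Task) (uintptr, error) {
    opts := kernel.CloneOptions{
        SharingOptions: kernel.SharingOptions{
            NewAddressSpace:   true,
            NewSignalHandlers: true,
            NewThreadGroup:    true,
            TerminationSignal: linux.SIGCHLD,
        },
    }
    ntid, _, err := t.Clone(&opts)
    return uintptr(ntid), err
}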
+ tc.Arch.SetReturn(0) + if opts.Stack != 0 { + tc.Arch.SetStack(uintptr(opts.Stack)) + } + if opts.SetTLS { + if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + return 0, nil, syserror.EPERM + } + } + + var fsContext *FSContext + if opts.NewFSContext { + fsContext = t.fsContext.Fork() + } else { + fsContext = t.fsContext + fsContext.IncRef() + } + + var fdTable *FDTable + if opts.NewFiles { + fdTable = t.fdTable.Fork() + } else { + fdTable = t.fdTable + fdTable.IncRef() + } + + pidns := t.tg.pidns + if t.childPIDNamespace != nil { + pidns = t.childPIDNamespace + } else if opts.NewPIDNamespace { + pidns = pidns.NewChild(userns) + } + + tg := t.tg + rseqAddr := usermem.Addr(0) + rseqSignature := uint32(0) + if opts.NewThreadGroup { + if tg.mounts != nil { + tg.mounts.IncRef() + } + sh := t.tg.signalHandlers + if opts.NewSignalHandlers { + sh = sh.Fork() + } + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) + rseqAddr = t.rseqAddr + rseqSignature = t.rseqSignature + } + + cfg := &TaskConfig{ + Kernel: t.k, + ThreadGroup: tg, + SignalMask: t.SignalMask(), + TaskContext: tc, + FSContext: fsContext, + FDTable: fdTable, + Credentials: creds, + Niceness: t.Niceness(), + NetworkNamespace: netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + AbstractSocketNamespace: t.abstractSockets, + MountNamespaceVFS2: mntnsVFS2, + RSeqAddr: rseqAddr, + RSeqSignature: rseqSignature, + ContainerID: t.ContainerID(), + } + if opts.NewThreadGroup { + cfg.Parent = t + } else { + cfg.InheritParent = t + } + nt, err := t.tg.pidns.owner.NewTask(cfg) + if err != nil { + if opts.NewThreadGroup { + tg.release() + } + return 0, nil, err + } + + // "A child process created via fork(2) inherits a copy of its parent's + // alternate signal stack settings" - sigaltstack(2). + // + // However kernel/fork.c:copy_process() adds a limitation to this: + // "sigaltstack should be cleared when sharing the same VM". + if opts.NewAddressSpace || opts.Vfork { + nt.SetSignalStack(t.SignalStack()) + } + + if userns != creds.UserNamespace { + if err := nt.SetUserNamespace(userns); err != nil { + // This shouldn't be possible: userns was created from nt.creds, so + // nt should have CAP_SYS_ADMIN in userns. + panic("Task.Clone: SetUserNamespace failed: " + err.Error()) + } + } + + // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to + // nt that it must receive before its task goroutine starts running. + tid := nt.k.tasks.Root.IDOfTask(nt) + defer nt.Start(tid) + t.traceCloneEvent(tid) + + // "If fork/clone and execve are allowed by @prog, any child processes will + // be constrained to the same filters and system call ABI as the parent." - + // Documentation/prctl/seccomp_filter.txt + if f := t.syscallFilters.Load(); f != nil { + copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) + nt.syscallFilters.Store(copiedFilters) + } + if opts.Vfork { + nt.vforkParent = t + } + + if opts.ChildClearTID { + nt.SetClearTID(opts.ChildTID) + } + if opts.ChildSetTID { + // Can't use Task.CopyOut, which assumes AddressSpaceActive. 
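ChildSetTID and ChildClearTID exist for the benefit of userspace threading libraries: the child's TID is published at ChildTID, and at exit the kernel writes 0 there and performs a futex wake, which is what makes a join operation possible. A rough sketch of the joining side follows (editorial; it assumes the SYS_FUTEX and FUTEX_WAIT identifiers exported by golang.org/x/sys/unix).

package sketch

import (
    "sync/atomic"
    "unsafe"

    "golang.org/x/sys/unix"
)

// joinThread blocks until the kernel clears *tidAddr and wakes the futex,
// which happens when a thread created with CLONE_CHILD_CLEARTID pointing at
// tidAddr exits.
func joinThread(tidAddr *int32) {
    for {
        tid := atomic.LoadInt32(tidAddr)
        if tid == 0 {
            return // already exited and cleared
        }
        // FUTEX_WAIT returns once *tidAddr no longer equals tid (or on a
        // spurious wake-up, hence the loop).
        unix.Syscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(tidAddr)),
            uintptr(unix.FUTEX_WAIT), uintptr(tid), 0, 0, 0)
    }
}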
+ usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + } + ntid := t.tg.pidns.IDOfTask(nt) + if opts.ParentSetTID { + t.CopyOut(opts.ParentTID, ntid) + } + + kind := ptraceCloneKindClone + if opts.Vfork { + kind = ptraceCloneKindVfork + } else if opts.TerminationSignal == linux.SIGCHLD { + kind = ptraceCloneKindFork + } + if t.ptraceClone(kind, nt, opts) { + if opts.Vfork { + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil + } + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil + } + if opts.Vfork { + t.maybeBeginVforkStop(nt) + return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil + } + return ntid, nil, nil +} + +// maybeBeginVforkStop checks if a previously-started vfork child is still +// running and has not yet released its MM, such that its parent t should enter +// a vforkStop. +// +// Preconditions: The caller must be running on t's task goroutine. +func (t *Task) maybeBeginVforkStop(child *Task) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.killedLocked() { + child.vforkParent = nil + return + } + if child.vforkParent == t { + t.beginInternalStopLocked((*vforkStop)(nil)) + } +} + +func (t *Task) unstopVforkParent() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if p := t.vforkParent; p != nil { + p.tg.signalHandlers.mu.Lock() + defer p.tg.signalHandlers.mu.Unlock() + if _, ok := p.stop.(*vforkStop); ok { + p.endInternalStopLocked() + } + // Parent no longer needs to be unstopped. + t.vforkParent = nil + } +} + +// +stateify savable +type runSyscallAfterPtraceEventClone struct { + vforkChild *Task + + // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's + // PID namespace. vforkChildTID must be stored since the child may exit and + // release its TID before the PTRACE_EVENT stop ends. + vforkChildTID ThreadID +} + +func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { + if r.vforkChild != nil { + t.maybeBeginVforkStop(r.vforkChild) + return &runSyscallAfterVforkStop{r.vforkChildTID} + } + return (*runSyscallExit)(nil) +} + +// +stateify savable +type runSyscallAfterVforkStop struct { + // childTID has the same meaning as + // runSyscallAfterPtraceEventClone.vforkChildTID. + childTID ThreadID +} + +func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { + t.ptraceVforkDone(r.childTID) + return (*runSyscallExit)(nil) +} + +// Unshare changes the set of resources t shares with other tasks, as specified +// by opts. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Unshare(opts *SharingOptions) error { + // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and + // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if + // t is the only task using its MM, which due to clone(2)'s rules imply + // that it is also the only task using its signal handlers / in its thread + // group, and cause EINVAL to be returned otherwise. + // + // Since we don't count the number of tasks using each address space or set + // of signal handlers, we reject NewSignalHandlers and NewAddressSpace + // altogether, and interpret NewThreadGroup as requiring that t be the only + // member of its thread group. 
This seems to be logically coherent, in the + // sense that clone(2) allows a task to share signal handlers and address + // spaces with tasks in other thread groups. + if opts.NewAddressSpace || opts.NewSignalHandlers { + return syserror.EINVAL + } + creds := t.Credentials() + if opts.NewThreadGroup { + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return syserror.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + // This isn't racy because we're the only living task, and therefore + // the only task capable of creating new ones, in our thread group. + } + if opts.NewUserNamespace { + if t.IsChrooted() { + return syserror.EPERM + } + newUserNS, err := creds.NewChildUserNamespace() + if err != nil { + return err + } + err = t.SetUserNamespace(newUserNS) + if err != nil { + return err + } + // Need to reload creds, becaue t.SetUserNamespace() changed task credentials. + creds = t.Credentials() + } + haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) + if opts.NewPIDNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) + } + t.mu.Lock() + // Can't defer unlock: DecRefs must occur without holding t.mu. + if opts.NewNetworkNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + t.netns = inet.NewNamespace(t.netns) + } + if opts.NewUTSNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + // Note that this must happen after NewUserNamespace, so the + // new user namespace is used if there is one. + t.utsns = t.utsns.Clone(creds.UserNamespace) + } + if opts.NewIPCNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + t.ipcns = NewIPCNamespace(creds.UserNamespace) + } + var oldFDTable *FDTable + if opts.NewFiles { + oldFDTable = t.fdTable + t.fdTable = oldFDTable.Fork() + } + var oldFSContext *FSContext + if opts.NewFSContext { + oldFSContext = t.fsContext + t.fsContext = oldFSContext.Fork() + } + t.mu.Unlock() + if oldFDTable != nil { + oldFDTable.DecRef() + } + if oldFSContext != nil { + oldFSContext.DecRef() + } + return nil +} + +// vforkStop is a TaskStop imposed on a task that creates a child with +// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its +// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so +// that the child and parent share mappings until the child execve()s into a +// new process image or exits.) +// +// +stateify savable +type vforkStop struct{} + +// StopIgnoresKill implements TaskStop.Killable. +func (*vforkStop) Killable() bool { return true } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go new file mode 100644 index 000000000..9fa528384 --- /dev/null +++ b/pkg/sentry/kernel/task_context.go @@ -0,0 +1,169 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" +) + +var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskContext is the subset of a task's data that is provided by the loader. +// +// +stateify savable +type TaskContext struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable `state:".(syscallTableInfo)"` +} + +// release releases all resources held by the TaskContext. release is called by +// the task when it execs into a new TaskContext or exits. +func (tc *TaskContext) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if tc.MemoryManager != nil { + tc.MemoryManager.DecUsers(context.Background()) + tc.MemoryManager = nil + } + tc.fu = nil +} + +// Fork returns a duplicate of tc. The copied TaskContext always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskContext shares an address space with the original; otherwise, the copied +// TaskContext has an independent address space that is initially a duplicate +// of the original's. +func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { + newTC := &TaskContext{ + Name: tc.Name, + Arch: tc.Arch.Fork(), + st: tc.st, + } + if shareAddressSpace { + newTC.MemoryManager = tc.MemoryManager + if newTC.MemoryManager != nil { + if !newTC.MemoryManager.IncUsers() { + // Shouldn't be possible since tc.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) + } + } + newTC.fu = tc.fu + } else { + newMM, err := tc.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newTC.MemoryManager = newMM + newTC.fu = k.futexes.Fork() + } + return newTC, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.tc.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.tc.MemoryManager +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) SyscallTable() *SyscallTable { + return t.tc.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. 
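The share-versus-copy logic in TaskContext.Fork leans on the MemoryManager's user count: IncUsers may fail once the count has reached zero, because a released object must not be revived. A minimal sketch of that pattern (editorial; the type and method names are illustrative):

package sketch

import "sync/atomic"

// usersCounted models a resource whose user count, once it drops to zero,
// can never grow again. A failed IncUsers therefore means the caller held a
// stale reference, which is why TaskContext.Fork treats it as a bug.
type usersCounted struct {
    users int64 // starts at 1 for the creator
}

func (u *usersCounted) IncUsers() bool {
    for {
        n := atomic.LoadInt64(&u.users)
        if n == 0 {
            return false // already released; cannot be revived
        }
        if atomic.CompareAndSwapInt64(&u.users, n, n+1) {
            return true
        }
    }
}

func (u *usersCounted) DecUsers(release func()) {
    if atomic.AddInt64(&u.users, -1) == 0 {
        release()
    }
}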
+func (t *Task) Stack() *arch.Stack { + return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} +} + +// LoadTaskImage loads a specified file into a new TaskContext. +// +// args.MemoryManager does not need to be set by the caller. +func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { + // If File is not nil, we should load that instead of resolving Filename. + if args.File != nil { + args.Filename = args.File.PathnameWithDeleted(ctx) + } + + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + defer m.DecUsers(ctx) + args.MemoryManager = m + + os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. This means that the ELF binary does not match + // the architecture. + return nil, errNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskContext{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: k.futexes.Fork(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go new file mode 100644 index 000000000..9b69f3cbe --- /dev/null +++ b/pkg/sentry/kernel/task_exec.go @@ -0,0 +1,277 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the machinery behind the execve() syscall. In brief, a +// thread executes an execve() by killing all other threads in its thread +// group, assuming the leader's identity, and then switching process images. +// +// This design is effectively mandated by Linux. From ptrace(2): +// +// """ +// execve(2) under ptrace +// When one thread in a multithreaded process calls execve(2), the +// kernel destroys all other threads in the process, and resets the +// thread ID of the execing thread to the thread group ID (process ID). +// (Or, to put things another way, when a multithreaded process does an +// execve(2), at completion of the call, it appears as though the +// execve(2) occurred in the thread group leader, regardless of which +// thread did the execve(2).) This resetting of the thread ID looks +// very confusing to tracers: +// +// * All other threads stop in PTRACE_EVENT_EXIT stop, if the +// PTRACE_O_TRACEEXIT option was turned on. Then all other threads +// except the thread group leader report death as if they exited via +// _exit(2) with exit code 0. +// +// * The execing tracee changes its thread ID while it is in the +// execve(2). (Remember, under ptrace, the "pid" returned from +// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.) +// That is, the tracee's thread ID is reset to be the same as its +// process ID, which is the same as the thread group leader's thread +// ID. 
+// +// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC +// option was turned on. +// +// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop +// by this time, it appears to the tracer that the dead thread leader +// "reappears from nowhere". (Note: the thread group leader does not +// report death via WIFEXITED(status) until there is at least one +// other live thread. This eliminates the possibility that the +// tracer will see it dying and then reappearing.) If the thread +// group leader was still alive, for the tracer this may look as if +// thread group leader returns from a different system call than it +// entered, or even "returned from a system call even though it was +// not in any system call". If the thread group leader was not +// traced (or was traced by a different tracer), then during +// execve(2) it will appear as if it has become a tracee of the +// tracer of the execing tracee. +// +// All of the above effects are the artifacts of the thread ID change in +// the tracee. +// """ + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// execStop is a TaskStop that a task sets on itself when it wants to execve +// and is waiting for the other tasks in its thread group to exit first. +// +// +stateify savable +type execStop struct{} + +// Killable implements TaskStop.Killable. +func (*execStop) Killable() bool { return true } + +// Execve implements the execve(2) syscall by killing all other tasks in its +// thread group and switching to newTC. Execve always takes ownership of newTC. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + if t.tg.exiting || t.tg.execing != nil { + // We lost to a racing group-exit, kill, or exec from another thread + // and should just exit. + newTC.release() + return nil, syserror.EINTR + } + + // Cancel any racing group stops. + t.tg.endGroupStopLocked(false) + + // If the task has any siblings, they have to exit before the exec can + // continue. + t.tg.execing = t + if t.tg.tasks.Front() != t.tg.tasks.Back() { + // "[All] other threads except the thread group leader report death as + // if they exited via _exit(2) with exit code 0." - ptrace(2) + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if t != sibling { + sibling.killLocked() + } + } + // The last sibling to exit will wake t. + t.beginInternalStopLocked((*execStop)(nil)) + } + + return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil +} + +// The runSyscallAfterExecStop state continues execve(2) after all siblings of +// a thread in the execve syscall have exited. +// +// +stateify savable +type runSyscallAfterExecStop struct { + tc *TaskContext +} + +func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.traceExecEvent(r.tc) + t.tg.pidns.owner.mu.Lock() + t.tg.execing = nil + if t.killed() { + t.tg.pidns.owner.mu.Unlock() + r.tc.release() + return (*runInterrupt)(nil) + } + // We are the thread group leader now. Save our old thread ID for + // PTRACE_EVENT_EXEC. 
This is racy in that if a tracer attaches after this + // point it will get a PID of 0, but this is consistent with Linux. + oldTID := ThreadID(0) + if tracer := t.Tracer(); tracer != nil { + oldTID = tracer.tg.pidns.tids[t] + } + t.promoteLocked() + // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle + // this first since POSIX timers are protected by the signal mutex, which + // we're about to change. Note that we have to stop and destroy timers + // without holding any mutexes to avoid circular lock ordering. + var its []*IntervalTimer + t.tg.signalHandlers.mu.Lock() + for _, it := range t.tg.timers { + its = append(its, it) + } + t.tg.timers = make(map[linux.TimerID]*IntervalTimer) + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } + t.tg.pidns.owner.mu.Lock() + // "During an execve(2), the dispositions of handled signals are reset to + // the default; the dispositions of ignored signals are left unchanged. ... + // [The] signal mask is preserved across execve(2). ... [The] pending + // signal set is preserved across an execve(2)." - signal(7) + // + // Details: + // + // - If the thread group is sharing its signal handlers with another thread + // group via CLONE_SIGHAND, execve forces the signal handlers to be copied + // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal + // handlers, so we always make a copy. + // + // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags, + // restorer (if present), and mask are always reset. (See Linux's + // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.) + t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() + t.endStopCond.L = &t.tg.signalHandlers.mu + // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) + t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable} + // "The termination signal is reset to SIGCHLD (see clone(2))." + t.tg.terminationSignal = linux.SIGCHLD + // execed indicates that the process can no longer join a process group + // in some scenarios (namely, the parent call setpgid(2) on the child). + // See the JoinProcessGroup function in sessions.go for more context. + t.tg.execed = true + // Maximum RSS is preserved across execve(2). + t.updateRSSLocked() + // Restartable sequence state is discarded. + t.rseqPreempted = false + t.rseqCPU = -1 + t.rseqAddr = 0 + t.rseqSignature = 0 + t.oldRSeqCPUAddr = 0 + t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) + t.tg.pidns.owner.mu.Unlock() + + oldFDTable := t.fdTable + t.fdTable = t.fdTable.Fork() + oldFDTable.DecRef() + + // Remove FDs with the CloseOnExec flag set. + t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { + return flags.CloseOnExec + }) + + // NOTE(b/30815691): We currently do not implement privileged + // executables (set-user/group-ID bits and file capabilities). This + // allows us to unconditionally enable user dumpability on the new mm. + // See fs/exec.c:setup_new_exec. + r.tc.MemoryManager.SetDumpability(mm.UserDumpable) + + // Switch to the new process. + t.MemoryManager().Deactivate() + t.mu.Lock() + // Update credentials to reflect the execve. This should precede switching + // MMs to ensure that dumpability has been reset first, if needed. + t.updateCredsForExecLocked() + t.tc.release() + t.tc = *r.tc + t.mu.Unlock() + t.unstopVforkParent() + // NOTE(b/30316266): All locks must be dropped prior to calling Activate. 
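The RemoveIf call above is the exec-time enforcement of the close-on-exec flag. Stripped of the dual fs/vfs plumbing, the filtering amounts to the following (editorial sketch; fdFlags is a local mirror of the kernel's FDFlags). Note that the table is forked before filtering, so the execing task ends up with a private descriptor table.

package sketch

// fdFlags mirrors the per-descriptor flags kept by the FD table; only
// CloseOnExec matters here.
type fdFlags struct {
    CloseOnExec bool
}

// dropCloseOnExec removes every descriptor whose close-on-exec flag is set,
// which is the effect execve has on the (freshly forked) FD table.
func dropCloseOnExec(fds map[int]fdFlags) {
    for fd, flags := range fds {
        if flags.CloseOnExec {
            delete(fds, fd)
        }
    }
}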
+ t.MemoryManager().Activate(t) + + t.ptraceExec(oldTID) + return (*runSyscallExit)(nil) +} + +// promoteLocked makes t the leader of its thread group. If t is already the +// thread group leader, promoteLocked is a no-op. +// +// Preconditions: All other tasks in t's thread group, including the existing +// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must +// be locked for writing. +func (t *Task) promoteLocked() { + oldLeader := t.tg.leader + if t == oldLeader { + return + } + // Swap the leader's TIDs with the execing task's. The latter will be + // released when the old leader is reaped below. + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] + ns.tids[oldLeader] = oldTID + ns.tids[t] = leaderTID + ns.tasks[oldTID] = oldLeader + ns.tasks[leaderTID] = t + // Neither the ThreadGroup nor TGID change, so no need to + // update ns.tgids. + } + + // Inherit the old leader's start time. + oldStartTime := oldLeader.StartTime() + t.mu.Lock() + t.startTime = oldStartTime + t.mu.Unlock() + + t.tg.leader = t + t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) + t.updateInfoLocked() + // Reap the original leader. If it has a tracer, detach it instead of + // waiting for it to acknowledge the original leader's death. + oldLeader.exitParentNotified = true + oldLeader.exitParentAcked = true + if tracer := oldLeader.Tracer(); tracer != nil { + delete(tracer.ptraceTracees, oldLeader) + oldLeader.forgetTracerLocked() + // Notify the tracer that it will no longer be receiving these events + // from the tracee. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) + } + oldLeader.exitNotifyLocked(false) +} diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go new file mode 100644 index 000000000..c4ade6e8e --- /dev/null +++ b/pkg/sentry/kernel/task_exit.go @@ -0,0 +1,1167 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the task exit cycle: +// +// - Tasks are asynchronously requested to exit with Task.Kill. +// +// - When able, the task goroutine enters the exit path starting from state +// runExit. +// +// - Other tasks observe completed exits with Task.Wait (which implements the +// wait*() family of syscalls). + +import ( + "errors" + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// An ExitStatus is a value communicated from an exiting task or thread group +// to the party that reaps it. +// +// +stateify savable +type ExitStatus struct { + // Code is the numeric value passed to the call to exit or exit_group that + // caused the exit. If the exit was not caused by such a call, Code is 0. + Code int + + // Signo is the signal that caused the exit. 
If the exit was not caused by + // a signal, Signo is 0. + Signo int +} + +// Signaled returns true if the ExitStatus indicates that the exiting task or +// thread group was killed by a signal. +func (es ExitStatus) Signaled() bool { + return es.Signo != 0 +} + +// Status returns the numeric representation of the ExitStatus returned by e.g. +// the wait4() system call. +func (es ExitStatus) Status() uint32 { + return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) +} + +// ShellExitCode returns the numeric exit code that Bash would return for an +// exit status of es. +func (es ExitStatus) ShellExitCode() int { + if es.Signaled() { + return 128 + es.Signo + } + return es.Code +} + +// TaskExitState represents a step in the task exit path. +// +// "Exiting" and "exited" are often ambiguous; prefer to name specific states. +type TaskExitState int + +const ( + // TaskExitNone indicates that the task has not begun exiting. + TaskExitNone TaskExitState = iota + + // TaskExitInitiated indicates that the task goroutine has entered the exit + // path, and the task is no longer eligible to participate in group stops + // or group signal handling. TaskExitInitiated is analogous to Linux's + // PF_EXITING. + TaskExitInitiated + + // TaskExitZombie indicates that the task has released its resources, and + // the task no longer prevents a sibling thread from completing execve. + TaskExitZombie + + // TaskExitDead indicates that the task's thread IDs have been released, + // and the task no longer prevents its thread group leader from being + // reaped. ("Reaping" refers to the transitioning of a task from + // TaskExitZombie to TaskExitDead.) + TaskExitDead +) + +// String implements fmt.Stringer. +func (t TaskExitState) String() string { + switch t { + case TaskExitNone: + return "TaskExitNone" + case TaskExitInitiated: + return "TaskExitInitiated" + case TaskExitZombie: + return "TaskExitZombie" + case TaskExitDead: + return "TaskExitDead" + default: + return strconv.Itoa(int(t)) + } +} + +// killLocked marks t as killed by enqueueing a SIGKILL, without causing the +// thread-group-affecting side effects SIGKILL usually has. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) killLocked() { + // Clear killable stops. + if t.stop != nil && t.stop.Killable() { + t.endInternalStopLocked() + } + t.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + // Linux just sets SIGKILL in the pending signal bitmask without + // enqueueing an actual siginfo, such that + // kernel/signal.c:collect_signal() initializes si_code to SI_USER. + Code: arch.SignalInfoUser, + }, nil) + t.interrupt() +} + +// killed returns true if t has a SIGKILL pending. killed is analogous to +// Linux's fatal_signal_pending(). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) killed() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.killedLocked() +} + +func (t *Task) killedLocked() bool { + return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 +} + +// PrepareExit indicates an exit with status es. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) PrepareExit(es ExitStatus) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.exitStatus = es +} + +// PrepareGroupExit indicates a group exit with status es to t's thread group. 
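The Status() packing above is the classic wait-status layout, so the Go standard library's syscall.WaitStatus decodes it directly. A small self-contained check (editorial, not part of this change):

package main

import (
    "fmt"
    "syscall"
)

func main() {
    // ExitStatus{Code: 2}: (2&0xff)<<8 | 0.
    exited := syscall.WaitStatus(uint32(2&0xff) << 8)
    fmt.Println(exited.Exited(), exited.ExitStatus()) // true 2

    // ExitStatus{Signo: 9} (killed by SIGKILL): 0<<8 | 9.
    killed := syscall.WaitStatus(uint32(9 & 0xff))
    fmt.Println(killed.Signaled(), killed.Signal()) // true killed

    // ShellExitCode for the signaled case would be 128+9 = 137, matching
    // shell convention.
}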
+// +// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it +// does not tail-call do_exit(), except that it *does* set Task.exitStatus. +// (Linux does not do so until within do_exit(), since it reuses exit_code for +// ptrace.) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) PrepareGroupExit(es ExitStatus) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.tg.exiting || t.tg.execing != nil { + // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e. + // this "group exit" is being executed by the killed sibling of an + // execing task, then Task.Execve never set t.tg.exitStatus, so it's + // still the zero value. This is consistent with Linux, both in intent + // ("all other threads ... report death as if they exited via _exit(2) + // with exit code 0" - ptrace(2), "execve under ptrace") and in + // implementation (compare fs/exec.c:de_thread() => + // kernel/signal.c:zap_other_threads() and + // kernel/exit.c:do_group_exit() => + // include/linux/sched.h:signal_group_exit()). + t.exitStatus = t.tg.exitStatus + return + } + t.tg.exiting = true + t.tg.exitStatus = es + t.exitStatus = es + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if sibling != t { + sibling.killLocked() + } + } +} + +// Kill requests that all tasks in ts exit as if group exiting with status es. +// Kill does not wait for tasks to exit. +// +// Kill has no analogue in Linux; it's provided for save/restore only. +func (ts *TaskSet) Kill(es ExitStatus) { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.Root.exiting = true + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + if !t.tg.exiting { + t.tg.exiting = true + t.tg.exitStatus = es + } + t.killLocked() + t.tg.signalHandlers.mu.Unlock() + } +} + +// advanceExitStateLocked checks that t's current exit state is oldExit, then +// sets it to newExit. If t's current exit state is not oldExit, +// advanceExitStateLocked panics. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { + if t.exitState != oldExit { + panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState)) + } + t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit) + t.exitState = newExit +} + +// runExit is the entry point into the task exit path. +// +// +stateify savable +type runExit struct{} + +func (*runExit) execute(t *Task) taskRunState { + t.ptraceExit() + return (*runExitMain)(nil) +} + +// +stateify savable +type runExitMain struct{} + +func (*runExitMain) execute(t *Task) taskRunState { + t.traceExitEvent() + lastExiter := t.exitThreadGroup() + + // If the task has a cleartid, and the thread group wasn't killed by a + // signal, handle that before releasing the MM. + if t.cleartid != 0 { + t.tg.signalHandlers.mu.Lock() + signaled := t.tg.exiting && t.tg.exitStatus.Signaled() + t.tg.signalHandlers.mu.Unlock() + if !signaled { + if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) + } + // If the CopyOut fails, there's nothing we can do. + } + } + + // Deactivate the address space and update max RSS before releasing the + // task's MM. + t.Deactivate() + t.tg.pidns.owner.mu.Lock() + t.updateRSSLocked() + t.tg.pidns.owner.mu.Unlock() + t.mu.Lock() + t.tc.release() + t.mu.Unlock() + + // Releasing the MM unblocks a blocked CLONE_VFORK parent. 
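The vfork handshake completed by this exit path (and by the exec path) can be pictured as a one-shot gate: the parent parks in vforkStop after clone(CLONE_VFORK) and is released the first time the child stops using the shared TaskContext. A channel-based analogy (editorial; this is not how the sentry implements it):

package sketch

// vforkGate is a one-shot gate. The parent waits on it after spawning a
// vfork child; the child releases it exactly once, either when it execve()s
// or when it exits.
type vforkGate struct {
    released chan struct{}
}

func newVforkGate() *vforkGate {
    return &vforkGate{released: make(chan struct{})}
}

// parentWait corresponds to the parent entering vforkStop.
func (g *vforkGate) parentWait() { <-g.released }

// childRelease corresponds to unstopVforkParent; it must be called at most
// once.
func (g *vforkGate) childRelease() { close(g.released) }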
+ t.unstopVforkParent() + + t.fsContext.DecRef() + t.fdTable.DecRef() + + t.mu.Lock() + if t.mountNamespaceVFS2 != nil { + t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2 = nil + } + t.mu.Unlock() + + // If this is the last task to exit from the thread group, release the + // thread group's resources. + if lastExiter { + t.tg.release() + } + + // Detach tracees. + t.exitPtrace() + + // Reparent the task's children. + t.exitChildren() + + // Don't tail-call runExitNotify, as exitChildren may have initiated a stop + // to wait for a PID namespace to die. + return (*runExitNotify)(nil) +} + +// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread +// group that it is no longer eligible to participate in group activities. It +// returns true if t is the last task in its thread group to call +// exitThreadGroup. +func (t *Task) exitThreadGroup() bool { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + // Can't defer unlock: see below. + + t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated) + t.tg.activeTasks-- + last := t.tg.activeTasks == 0 + + // Ensure that someone will handle the signals we can't. + t.setSignalMaskLocked(^linux.SignalSet(0)) + + // Check if this task's exit interacts with an initiated group stop. + if !t.groupStopPending { + t.tg.signalHandlers.mu.Unlock() + return last + } + t.groupStopPending = false + sig := t.tg.groupStopSignal + notifyParent := t.participateGroupStopLocked() + // signalStop must be called with t's signal mutex unlocked. + t.tg.signalHandlers.mu.Unlock() + if notifyParent && t.tg.leader.parent != nil { + t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + return last +} + +func (t *Task) exitChildren() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + newParent := t.findReparentTargetLocked() + if newParent == nil { + // "If the init process of a PID namespace terminates, the kernel + // terminates all of the processes in the namespace via a SIGKILL + // signal." - pid_namespaces(7) + t.Debugf("Init process terminating, killing namespace") + t.tg.pidns.exiting = true + for other := range t.tg.pidns.tgids { + if other == t.tg { + continue + } + other.signalHandlers.mu.Lock() + other.leader.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, true /* group */) + other.signalHandlers.mu.Unlock() + } + // TODO(b/37722272): The init process waits for all processes in the + // namespace to exit before completing its own exit + // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all + // other tasks in the namespace are dead, except possibly for this + // thread group's leader (which can't be reaped until this task exits). + } + // This is correct even if newParent is nil (it ensures that children don't + // wait for a parent to reap them.) + for c := range t.children { + if sig := c.ParentDeathSignal(); sig != 0 { + siginfo := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + siginfo.SetPid(int32(c.tg.pidns.tids[t])) + siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) + c.tg.signalHandlers.mu.Lock() + c.sendSignalLocked(siginfo, true /* group */) + c.tg.signalHandlers.mu.Unlock() + } + c.reparentLocked(newParent) + if newParent != nil { + newParent.children[c] = struct{}{} + } + } +} + +// findReparentTargetLocked returns the task to which t's children should be +// reparented. 
If no such task exists, findNewParentLocked returns nil. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) findReparentTargetLocked() *Task { + // Reparent to any sibling in the same thread group that hasn't begun + // exiting. + if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil { + return t2 + } + // "A child process that is orphaned within the namespace will be + // reparented to [the init process for the namespace] ..." - + // pid_namespaces(7) + if init := t.tg.pidns.tasks[InitTID]; init != nil { + return init.tg.anyNonExitingTaskLocked() + } + return nil +} + +func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.exitState == TaskExitNone { + return t + } + } + return nil +} + +// reparentLocked changes t's parent. The new parent may be nil. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) reparentLocked(parent *Task) { + oldParent := t.parent + t.parent = parent + // If a thread group leader's parent changes, reset the thread group's + // termination signal to SIGCHLD and re-check exit notification. (Compare + // kernel/exit.c:reparent_leader().) + if t != t.tg.leader { + return + } + if oldParent == nil && parent == nil { + return + } + if oldParent != nil && parent != nil && oldParent.tg == parent.tg { + return + } + t.tg.terminationSignal = linux.SIGCHLD + if t.exitParentNotified && !t.exitParentAcked { + t.exitParentNotified = false + t.exitNotifyLocked(false) + } +} + +// When a task exits, other tasks in the system, notably the task's parent and +// ptracer, may want to be notified. The exit notification system ensures that +// interested tasks receive signals and/or are woken from blocking calls to +// wait*() syscalls; these notifications must be resolved before exiting tasks +// can be reaped and disappear from the system. +// +// Each task may have a parent task and/or a tracer task. If both a parent and +// a tracer exist, they may be the same task, different tasks in the same +// thread group, or tasks in different thread groups. (In the last case, Linux +// refers to the task as being ptrace-reparented due to an implementation +// detail; we avoid this terminology to avoid confusion.) +// +// A thread group is *empty* if all non-leader tasks in the thread group are +// dead, and the leader is either a zombie or dead. The exit of a thread group +// leader is never waitable - by either the parent or tracer - until the thread +// group is empty. +// +// There are a few ways for an exit notification to be resolved: +// +// - The exit notification may be acknowledged by a call to Task.Wait with +// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall). +// +// - If the notified party is the parent, and the parent thread group is not +// also the tracer thread group, and the notification signal is SIGCHLD, the +// parent may explicitly ignore the notification (see quote in exitNotify). +// Note that it's possible for the notified party to ignore the signal in other +// cases, but the notification is only resolved under the above conditions. +// (Actually, there is one exception; see the last paragraph of the "leader, +// has tracer, tracer thread group is parent thread group" case below.) +// +// - If the notified party is the parent, and the parent does not exist, the +// notification is resolved as if ignored. (This is only possible in the +// sentry. 
In Linux, the only task / thread group without a parent is global +// init, and killing global init causes a kernel panic.) +// +// - If the notified party is a tracer, the tracer may detach the traced task. +// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) +// +// In addition, if the notified party is the parent, the parent may exit and +// cause the notifying task to be reparented to another thread group. This does +// not resolve the notification; instead, the notification must be resent to +// the new parent. +// +// The series of notifications generated for a given task's exit depend on +// whether it is a thread group leader; whether the task is ptraced; and, if +// so, whether the tracer thread group is the same as the parent thread group. +// +// - Non-leader, no tracer: No notification is generated; the task is reaped +// immediately. +// +// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer +// notification is resolved (by waiting or detaching), the task is reaped. (For +// non-leaders, whether the tracer and parent thread groups are the same is +// irrelevant.) +// +// - Leader, no tracer: The task remains a zombie, with no notification sent, +// until all other tasks in the thread group are dead. (In Linux terms, this +// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks +// are removed from their thread_group list in kernel/exit.c:release_task() => +// __exit_signal() => __unhash_process().) Then the thread group's termination +// signal is sent to the parent. When the parent notification is resolved (by +// waiting or ignoring), the task is reaped. +// +// - Leader, has tracer, tracer thread group is not parent thread group: +// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by +// waiting or detaching), and all other tasks in the thread group are dead, the +// thread group's termination signal is sent to the parent. (Note that the +// tracer cannot resolve the exit notification by waiting until the thread +// group is empty.) When the parent notification is resolved, the task is +// reaped. +// +// - Leader, has tracer, tracer thread group is parent thread group: +// +// If all other tasks in the thread group are dead, the thread group's +// termination signal is sent to the parent. At this point, the notification +// can only be resolved by waiting. If the parent detaches from the task as a +// tracer, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// If at least one task in the thread group is not dead, SIGCHLD is sent to the +// parent. At this point, the notification cannot be resolved at all; once the +// thread group becomes empty, it can be resolved only by waiting. If the +// parent detaches from the task as a tracer before all remaining tasks die, +// then exit notification proceeds as in the case where the leader never had a +// tracer. If the parent detaches from the task as a tracer after all remaining +// tasks die, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. 
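The cases above condense into a small decision table. The sketch below is an editorial distillation, deliberately simplified: it ignores the empty-versus-non-empty thread group timing in the last case and records only who is notified once a task reaches TaskExitZombie.

package sketch

// exitNotifications summarizes which notifications are generated for an
// exiting task, per the cases described in the surrounding comment.
type exitNotifications struct {
    tracerSIGCHLD     bool // SIGCHLD to the tracer thread group
    parentTermSignal  bool // termination signal to the parent once the group is empty
    reapedImmediately bool // nothing to resolve; the task is reaped right away
}

func notificationsFor(leader, traced, tracerIsParent bool) exitNotifications {
    switch {
    case !leader && !traced:
        return exitNotifications{reapedImmediately: true}
    case !leader && traced:
        return exitNotifications{tracerSIGCHLD: true}
    case leader && !traced:
        return exitNotifications{parentTermSignal: true}
    case leader && !tracerIsParent:
        return exitNotifications{tracerSIGCHLD: true, parentTermSignal: true}
    default: // leader whose tracer is in the parent thread group
        return exitNotifications{parentTermSignal: true}
    }
}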
+// +// In both of the above cases, when the parent detaches from the task as a +// tracer while the thread group is empty, whether or not the parent resolves +// the notification by ignoring it is based on the parent's SIGCHLD signal +// action, whether or not the thread group's termination signal is SIGCHLD +// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). +// +// There is one final wrinkle: A leader can become a non-leader due to a +// sibling execve. In this case, the execing thread detaches the leader's +// tracer (if one exists) and reaps the leader immediately. In Linux, this is +// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). + +// +stateify savable +type runExitNotify struct{} + +func (*runExitNotify) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie) + t.tg.liveTasks-- + // Check if this completes a sibling's execve. + if t.tg.execing != nil && t.tg.liveTasks == 1 { + // execing blocks the addition of new tasks to the thread group, so + // the sole living task must be the execing one. + e := t.tg.execing + e.tg.signalHandlers.mu.Lock() + if _, ok := e.stop.(*execStop); ok { + e.endInternalStopLocked() + } + e.tg.signalHandlers.mu.Unlock() + } + t.exitNotifyLocked(false) + // The task goroutine will now exit. + return nil +} + +// exitNotifyLocked is called after changes to t's state that affect exit +// notification. +// +// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace; +// thanks to Linux's haphazard implementation of this functionality, such cases +// determine whether parent notifications are ignored based on the parent's +// handling of SIGCHLD, regardless of what the exited task's thread group's +// termination signal is. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { + if t.exitState != TaskExitZombie { + return + } + if !t.exitTracerNotified { + t.exitTracerNotified = true + tracer := t.Tracer() + if tracer == nil { + t.exitTracerAcked = true + } else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg { + // Don't set exitParentNotified if t is non-leader, even if the + // tracer is in the parent thread group, so that if the parent + // detaches the following call to exitNotifyLocked passes through + // the !exitParentNotified case below and causes t to be reaped + // immediately. + // + // Tracer notification doesn't care about about + // SIG_IGN/SA_NOCLDWAIT. + tracer.tg.signalHandlers.mu.Lock() + tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */) + tracer.tg.signalHandlers.mu.Unlock() + // Wake EventTraceeStop waiters as well since this task will never + // ptrace-stop again. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop) + } else { + // t is a leader and the tracer is in the parent thread group. + t.exitParentNotified = true + sig := linux.SIGCHLD + if t.tg.tasksCount == 1 { + sig = t.tg.terminationSignal + } + // This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either + // (in Linux, the check in do_notify_parent() is gated by + // !tsk->ptrace.) + t.parent.tg.signalHandlers.mu.Lock() + t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */) + t.parent.tg.signalHandlers.mu.Unlock() + // See below for rationale for this event mask. 
+ t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + if t.exitTracerAcked && !t.exitParentNotified { + if t != t.tg.leader { + t.exitParentNotified = true + t.exitParentAcked = true + } else if t.tg.tasksCount == 1 { + t.exitParentNotified = true + if t.parent == nil { + t.exitParentAcked = true + } else { + // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is + // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see + // sigaction(2)), then children that terminate do not become + // zombies and a call to wait() or waitpid() will block until all + // children have terminated, and then fail with errno set to + // ECHILD. (The original POSIX standard left the behavior of + // setting SIGCHLD to SIG_IGN unspecified. Note that even though + // the default disposition of SIGCHLD is "ignore", explicitly + // setting the disposition to SIG_IGN results in different + // treatment of zombie process children.) Linux 2.6 conforms to + // this specification." - wait(2) + // + // Some undocumented Linux-specific details: + // + // - All of the above is ignored if the termination signal isn't + // SIGCHLD. + // + // - SA_NOCLDWAIT causes the leader to be immediately reaped, but + // does not suppress the SIGCHLD. + signalParent := t.tg.terminationSignal.IsValid() + t.parent.tg.signalHandlers.mu.Lock() + if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach { + if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok { + if act.Handler == arch.SignalActIgnore { + t.exitParentAcked = true + signalParent = false + } else if act.Flags&arch.SignalFlagNoCldWait != 0 { + t.exitParentAcked = true + } + } + } + if signalParent { + t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */) + } + t.parent.tg.signalHandlers.mu.Unlock() + // If a task in the parent was waiting for a child group stop + // or continue, it needs to be notified of the exit, because + // there may be no remaining eligible tasks (so that wait + // should return ECHILD). + t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + } + if t.exitTracerAcked && t.exitParentAcked { + t.advanceExitStateLocked(TaskExitZombie, TaskExitDead) + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid := ns.tids[t] + delete(ns.tasks, tid) + delete(ns.tids, t) + if t == t.tg.leader { + delete(ns.tgids, t.tg) + } + } + t.tg.exitedCPUStats.Accumulate(t.CPUStats()) + t.tg.ioUsage.Accumulate(t.ioUsage) + t.tg.signalHandlers.mu.Lock() + t.tg.tasks.Remove(t) + t.tg.tasksCount-- + tc := t.tg.tasksCount + t.tg.signalHandlers.mu.Unlock() + if tc == 1 && t != t.tg.leader { + // Our fromPtraceDetach doesn't matter here (in Linux terms, this + // is via a call to release_task()). + t.tg.leader.exitNotifyLocked(false) + } else if tc == 0 { + t.tg.processGroup.decRefWithParent(t.tg.parentPG()) + } + if t.parent != nil { + delete(t.parent.children, t) + t.parent = nil + } + } +} + +// Preconditions: The TaskSet mutex must be locked. 
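The SIG_IGN/SA_NOCLDWAIT handling above decides two independent things: whether the parent is signaled at all, and whether the zombie is reaped automatically. Distilled (editorial sketch; the real code also folds in the ptrace-detach and invalid-termination-signal cases):

package sketch

// parentExitDisposition reports whether the parent should be signaled and
// whether the child is auto-reaped, given the parent's SIGCHLD action.
func parentExitDisposition(sigchldIgnored, noCldWait, termSigIsSIGCHLD bool) (signalParent, autoReap bool) {
    if !termSigIsSIGCHLD {
        // The special casing only applies when the termination signal is
        // SIGCHLD (or the notification comes from a ptrace detach).
        return true, false
    }
    if sigchldIgnored {
        // Explicit SIG_IGN: no signal, and the child never becomes a
        // waitable zombie.
        return false, true
    }
    if noCldWait {
        // SA_NOCLDWAIT: SIGCHLD is still delivered, but the child is
        // reaped without a wait().
        return true, true
    }
    return true, false
}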
+func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + } + info.SetPid(int32(receiver.tg.pidns.tids[t])) + info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + if t.exitStatus.Signaled() { + info.Code = arch.CLD_KILLED + info.SetStatus(int32(t.exitStatus.Signo)) + } else { + info.Code = arch.CLD_EXITED + info.SetStatus(int32(t.exitStatus.Code)) + } + // TODO(b/72102453): Set utime, stime. + return info +} + +// ExitStatus returns t's exit status, which is only guaranteed to be +// meaningful if t.ExitState() != TaskExitNone. +func (t *Task) ExitStatus() ExitStatus { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.exitStatus +} + +// ExitStatus returns the exit status that would be returned by a consuming +// wait*() on tg. +func (tg *ThreadGroup) ExitStatus() ExitStatus { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting { + return tg.exitStatus + } + return tg.leader.exitStatus +} + +// TerminationSignal returns the thread group's termination signal. +func (tg *ThreadGroup) TerminationSignal() linux.Signal { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.terminationSignal +} + +// Task events that can be waited for. +const ( + // EventExit represents an exit notification generated for a child thread + // group leader or a tracee under the conditions specified in the comment + // above runExitNotify. + EventExit waiter.EventMask = 1 << iota + + // EventChildGroupStop occurs when a child thread group completes a group + // stop (i.e. all tasks in the child thread group have entered a stopped + // state as a result of a group stop). + EventChildGroupStop + + // EventTraceeStop occurs when a task that is ptraced by a task in the + // notified thread group enters a ptrace stop (see ptrace(2)). + EventTraceeStop + + // EventGroupContinue occurs when a child thread group, or a thread group + // whose leader is ptraced by a task in the notified thread group, that had + // initiated or completed a group stop leaves the group stop, due to the + // child thread group or any task in the child thread group being sent + // SIGCONT. + EventGroupContinue +) + +// WaitOptions controls the behavior of Task.Wait. +type WaitOptions struct { + // If SpecificTID is non-zero, only events from the task with thread ID + // SpecificTID are eligible to be waited for. SpecificTID is resolved in + // the PID namespace of the waiter (the method receiver of Task.Wait). If + // no such task exists, or that task would not otherwise be eligible to be + // waited for by the waiting task, then there are no waitable tasks and + // Wait will return ECHILD. + SpecificTID ThreadID + + // If SpecificPGID is non-zero, only events from ThreadGroups with a + // matching ProcessGroupID are eligible to be waited for. (Same + // constraints as SpecificTID apply.) + SpecificPGID ProcessGroupID + + // Terminology note: Per waitpid(2), "a clone child is one which delivers + // no signal, or a signal other than SIGCHLD to its parent upon + // termination." In Linux, termination signal is technically a per-task + // property rather than a per-thread-group property. 
However, clone() + // forces no termination signal for tasks created with CLONE_THREAD, and + // execve() resets the termination signal to SIGCHLD, so all + // non-group-leader threads have no termination signal and are therefore + // "clone tasks". + + // If NonCloneTasks is true, events from non-clone tasks are eligible to be + // waited for. + NonCloneTasks bool + + // If CloneTasks is true, events from clone tasks are eligible to be waited + // for. + CloneTasks bool + + // If SiblingChildren is true, events from children tasks of any task + // in the thread group of the waiter are eligible to be waited for. + SiblingChildren bool + + // Events is a bitwise combination of the events defined above that specify + // what events are of interest to the call to Wait. + Events waiter.EventMask + + // If ConsumeEvent is true, the Wait should consume the event such that it + // cannot be returned by a future Wait. Note that if a task exit is + // consumed in this way, in most cases the task will be reaped. + ConsumeEvent bool + + // If BlockInterruptErr is not nil, Wait will block until either an event + // is available or there are no tasks that could produce a waitable event; + // if that blocking is interrupted, Wait returns BlockInterruptErr. If + // BlockInterruptErr is nil, Wait will not block. + BlockInterruptErr error +} + +// Preconditions: The TaskSet mutex must be locked (for reading or writing). +func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool { + if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { + return false + } + if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { + return false + } + // Tracees are always eligible. + if tracee { + return true + } + if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { + return o.NonCloneTasks + } + return o.CloneTasks +} + +// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. +// waitpid(WNOHANG)) that find no waitable events, but determine that waitable +// events may exist in the future. (In contrast, if a non-blocking or blocking +// Wait determines that there are no tasks that can produce a waitable event, +// Task.Wait returns ECHILD.) +var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events") + +// WaitResult contains information about a waited-for event. +type WaitResult struct { + // Task is the task that reported the event. + Task *Task + + // TID is the thread ID of Task in the PID namespace of the task that + // called Wait (that is, the method receiver of the call to Task.Wait). TID + // is provided because consuming exit waits cause the thread ID to be + // deallocated. + TID ThreadID + + // UID is the real UID of Task in the user namespace of the task that + // called Wait. + UID auth.UID + + // Event is exactly one of the events defined above. + Event waiter.EventMask + + // Status is the numeric status associated with the event. + Status uint32 +} + +// Wait waits for an event from a thread group that is a child of t's thread +// group, or a task in such a thread group, or a task that is ptraced by t, +// subject to the options specified in opts. 
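To show how these fields compose, the following hypothetical helper (not part of this change, assumed to sit in this package, and only handling the pid > 0 case of a wait4-style call) sketches how classic wait flags might be translated into a WaitOptions. The lowercase flag constants are the standard <sys/wait.h> values; how the blocking error is chosen is left to the caller.

// Standard wait flag values from <sys/wait.h>; lowercase names are local to
// this sketch.
const (
	wNOHANG    = 0x1
	wUNTRACED  = 0x2
	wCONTINUED = 0x8
	wALL       = 0x40000000
	wCLONE     = 0x80000000
)

// waitOptionsFromFlags is a hypothetical helper showing one way a wait4-style
// caller could populate WaitOptions for a specific child TID.
func waitOptionsFromFlags(pid ThreadID, flags uint32, blockErr error) *WaitOptions {
	opts := &WaitOptions{
		SpecificTID:  pid,
		Events:       EventExit,
		ConsumeEvent: true, // a WNOWAIT-style caller would leave this false
	}
	switch {
	case flags&wALL != 0:
		opts.NonCloneTasks, opts.CloneTasks = true, true
	case flags&wCLONE != 0:
		opts.CloneTasks = true
	default:
		opts.NonCloneTasks = true
	}
	if flags&wUNTRACED != 0 {
		opts.Events |= EventChildGroupStop
	}
	if flags&wCONTINUED != 0 {
		opts.Events |= EventGroupContinue
	}
	if flags&wNOHANG == 0 {
		opts.BlockInterruptErr = blockErr // e.g. an ERESTARTSYS-style error in a syscall path
	}
	return opts
}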
+func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { + if opts.BlockInterruptErr == nil { + return t.waitOnce(opts) + } + w, ch := waiter.NewChannelEntry(nil) + t.tg.eventQueue.EventRegister(&w, opts.Events) + defer t.tg.eventQueue.EventUnregister(&w) + for { + wr, err := t.waitOnce(opts) + if err != ErrNoWaitableEvent { + // This includes err == nil. + return wr, err + } + if err := t.Block(ch); err != nil { + return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) + } + } +} + +func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { + anyWaitableTasks := false + + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + + if opts.SiblingChildren { + // We can wait on the children and tracees of any task in the + // same thread group. + for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { + wr, any := t.waitParentLocked(opts, parent) + if wr != nil { + return wr, nil + } + anyWaitableTasks = anyWaitableTasks || any + } + } else { + // We can only wait on this task. + var wr *WaitResult + wr, anyWaitableTasks = t.waitParentLocked(opts, t) + if wr != nil { + return wr, nil + } + } + + if anyWaitableTasks { + return nil, ErrNoWaitableEvent + } + return nil, syserror.ECHILD +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) { + anyWaitableTasks := false + + for child := range parent.children { + if !opts.matchesTask(child, parent.tg.pidns, false) { + continue + } + // Non-leaders don't notify parents on exit and aren't eligible to + // be waited on. + if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { + return wr, anyWaitableTasks + } + } + // Check for group stops and continues. Tasks that have passed + // TaskExitInitiated can no longer participate in group stops. + if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { + continue + } + if child.exitState >= TaskExitInitiated { + continue + } + // If the waiter is in the same thread group as the task's + // tracer, do not report its group stops; they will be reported + // as ptrace stops instead. This also skips checking for group + // continues, but they'll be checked for when scanning tracees + // below. (Per kernel/exit.c:wait_consider_task(): "If a + // ptracer wants to distinguish the two events for its own + // children, it should create a separate process which takes + // the role of real parent.") + if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { + continue + } + anyWaitableTasks = true + if opts.Events&EventChildGroupStop != 0 { + if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { + return wr, anyWaitableTasks + } + } + } + for tracee := range parent.ptraceTracees { + if !opts.matchesTask(tracee, parent.tg.pidns, true) { + continue + } + // Non-leaders do notify tracers on exit. 
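The Status values carried in WaitResult use the classic wait(2) encodings: the collectors below build group-stop statuses as (sig&0xff)<<8|0x7f and ptrace-stop statuses as code<<8|0x7f, and report continues as 0xffff. A short standalone sketch of the standard glibc-style decode (helper names invented here):

package main

import "fmt"

// Standard wait status predicates, as defined by glibc's <sys/wait.h>.
func wIfStopped(s uint32) bool    { return s&0xff == 0x7f }
func wStopSig(s uint32) uint32    { return (s >> 8) & 0xff }
func wIfContinued(s uint32) bool  { return s == 0xffff }
func wIfExited(s uint32) bool     { return s&0x7f == 0 }
func wExitStatus(s uint32) uint32 { return (s >> 8) & 0xff }

func main() {
	const sigstop = 19 // SIGSTOP on x86/ARM Linux
	stop := (uint32(sigstop)&0xff)<<8 | 0x7f
	fmt.Println(wIfStopped(stop), wStopSig(stop)) // true 19
	fmt.Println(wIfContinued(0xffff))             // true
	exit := uint32(2) << 8                        // exit status 2, standard encoding
	fmt.Println(wIfExited(exit), wExitStatus(exit)) // true 2
}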
+ if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { + continue + } + if tracee.exitState >= TaskExitInitiated { + continue + } + anyWaitableTasks = true + if opts.Events&EventTraceeStop != 0 { + if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + } + + return nil, anyWaitableTasks +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { + if asPtracer && !target.exitTracerNotified { + return nil + } + if !asPtracer && !target.exitParentNotified { + return nil + } + // Zombied thread group leaders are never waitable until their thread group + // is otherwise empty. Usually this is caught by the + // target.exitParentNotified check above, but if t is both (in the thread + // group of) target's tracer and parent, asPtracer may be true. + if target == target.tg.leader && target.tg.tasksCount != 1 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + status := target.exitStatus.Status() + if !opts.ConsumeEvent { + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } + } + // Surprisingly, the exit status reported by a non-consuming wait can + // differ from that reported by a consuming wait; the latter will return + // the group exit code if one is available. + if target.tg.exiting { + status = target.tg.exitStatus.Status() + } + // t may be (in the thread group of) target's parent, tracer, or both. We + // don't need to check for !exitTracerAcked because tracees are detached + // here, and we don't need to check for !exitParentAcked because zombies + // will be reaped here. + if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { + target.exitTracerAcked = true + target.ptraceTracer.Store((*Task)(nil)) + delete(t.ptraceTracees, target) + } + if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { + target.exitParentAcked = true + if target == target.tg.leader { + // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, + // and won't until after target.exitNotifyLocked() (maybe). Include + // target.CPUStats() explicitly. This is consistent with Linux, + // which accounts an exited task's cputime to its thread group in + // kernel/exit.c:release_task() => __exit_signal(), and uses + // thread_group_cputime_adjusted() in wait_task_zombie(). + t.tg.childCPUStats.Accumulate(target.CPUStats()) + t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) + t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) + // Update t's child max resident set size. The size will be the maximum + // of this thread's size and all its childrens' sizes. 
+ if t.tg.childMaxRSS < target.tg.maxRSS { + t.tg.childMaxRSS = target.tg.maxRSS + } + if t.tg.childMaxRSS < target.tg.childMaxRSS { + t.tg.childMaxRSS = target.tg.childMaxRSS + } + } + } + target.exitNotifyLocked(false) + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } +} + +// updateRSSLocked updates t.tg.maxRSS. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) updateRSSLocked() { + if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { + t.tg.maxRSS = mmMaxRSS + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupStopWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + sig := target.tg.groupStopSignal + if opts.ConsumeEvent { + target.tg.groupStopWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventChildGroupStop, + // There is no name for these status constants. + Status: (uint32(sig)&0xff)<<8 | 0x7f, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupContWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + if opts.ConsumeEvent { + target.tg.groupContWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventGroupContinue, + Status: 0xffff, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + if target.ptraceCode == 0 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + code := target.ptraceCode + if opts.ConsumeEvent { + target.ptraceCode = 0 + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventTraceeStop, + Status: uint32(code)<<8 | 0x7f, + } +} + +// ExitState returns t's current progress through the exit path. +func (t *Task) ExitState() TaskExitState { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.exitState +} + +// ParentDeathSignal returns t's parent death signal. +func (t *Task) ParentDeathSignal() linux.Signal { + t.mu.Lock() + defer t.mu.Unlock() + return t.parentDeathSignal +} + +// SetParentDeathSignal sets t's parent death signal. +func (t *Task) SetParentDeathSignal(sig linux.Signal) { + t.mu.Lock() + defer t.mu.Unlock() + t.parentDeathSignal = sig +} diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go new file mode 100644 index 000000000..a53e77c9f --- /dev/null +++ b/pkg/sentry/kernel/task_futex.go @@ -0,0 +1,54 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Futex returns t's futex manager. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Futex() *futex.Manager { + return t.tc.fu +} + +// SwapUint32 implements futex.Target.SwapUint32. +func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32. +func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// LoadUint32 implements futex.Target.LoadUint32. +func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { + return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// GetSharedKey implements futex.Target.GetSharedKey. +func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { + return t.MemoryManager().GetSharedFutexKey(t, addr) +} diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go new file mode 100644 index 000000000..0325967e4 --- /dev/null +++ b/pkg/sentry/kernel/task_identity.go @@ -0,0 +1,606 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Credentials returns t's credentials. +// +// This value must be considered immutable. +func (t *Task) Credentials() *auth.Credentials { + return t.creds.Load() +} + +// UserNamespace returns the user namespace associated with the task. +func (t *Task) UserNamespace() *auth.UserNamespace { + return t.Credentials().UserNamespace +} + +// HasCapabilityIn checks if the task has capability cp in user namespace ns. +func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { + return t.Credentials().HasCapabilityIn(cp, ns) +} + +// HasCapability checks if the task has capability cp in its user namespace. +func (t *Task) HasCapability(cp linux.Capability) bool { + return t.Credentials().HasCapability(cp) +} + +// SetUID implements the semantics of setuid(2). +func (t *Task) SetUID(uid auth.UID) error { + // setuid considers -1 to be invalid. 
+ if !uid.Ok() { + return syserror.EINVAL + } + + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + kuid := creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + // "setuid() sets the effective user ID of the calling process. If the + // effective UID of the caller is root (more precisely: if the caller has + // the CAP_SETUID capability), the real UID and saved set-user-ID are also + // set." - setuid(2) + if creds.HasCapability(linux.CAP_SETUID) { + t.setKUIDsUncheckedLocked(kuid, kuid, kuid) + return nil + } + // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID + // capability) and uid does not match the real UID or saved set-user-ID of + // the calling process." + if kuid != creds.RealKUID && kuid != creds.SavedKUID { + return syserror.EPERM + } + t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) + return nil +} + +// SetREUID implements the semantics of setreuid(2). +func (t *Task) SetREUID(r, e auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Supplying a value of -1 for either the real or effective user ID forces + // the system to leave that ID unchanged." - setreuid(2) + creds := t.Credentials() + newR := creds.RealKUID + if r.Ok() { + newR = creds.UserNamespace.MapToKUID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := creds.EffectiveKUID + if e.Ok() { + newE = creds.UserNamespace.MapToKUID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !creds.HasCapability(linux.CAP_SETUID) { + // "Unprivileged processes may only set the effective user ID to the + // real user ID, the effective user ID, or the saved set-user-ID." + if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { + return syserror.EPERM + } + // "Unprivileged users may only set the real user ID to the real user + // ID or the effective user ID." + if newR != creds.RealKUID && newR != creds.EffectiveKUID { + return syserror.EPERM + } + } + // "If the real user ID is set (i.e., ruid is not -1) or the effective user + // ID is set to a value not equal to the previous real user ID, the saved + // set-user-ID will be set to the new effective user ID." + newS := creds.SavedKUID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { + newS = newE + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESUID implements the semantics of the setresuid(2) syscall. +func (t *Task) SetRESUID(r, e, s auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Unprivileged user processes may change the real UID, effective UID, and + // saved set-user-ID, each to one of: the current real UID, the current + // effective UID or the current saved set-user-ID. Privileged processes (on + // Linux, those having the CAP_SETUID capability) may set the real UID, + // effective UID, and saved set-user-ID to arbitrary values. If one of the + // arguments equals -1, the corresponding value is not changed." - + // setresuid(2) + var err error + creds := t.Credentials() + newR := creds.RealKUID + if r.Ok() { + newR, err = creds.UseUID(r) + if err != nil { + return err + } + } + newE := creds.EffectiveKUID + if e.Ok() { + newE, err = creds.UseUID(e) + if err != nil { + return err + } + } + newS := creds.SavedKUID + if s.Ok() { + newS, err = creds.UseUID(s) + if err != nil { + return err + } + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// Preconditions: t.mu must be locked. 
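The saved set-user-ID rule quoted from setreuid(2) above is the subtle part, so here is a standalone sketch of just that rule, using plain ints with -1 meaning "leave unchanged" and omitting all capability and permission checks (names invented for illustration):

package main

import "fmt"

// applySetREUID mimics the setreuid(2) rule quoted above: -1 leaves an ID
// unchanged, and the saved set-user-ID is replaced with the new effective UID
// whenever the real UID is set, or the effective UID is set to something
// other than the previous real UID.
func applySetREUID(realUID, effUID, savedUID, r, e int) (int, int, int) {
	newR, newE, newS := realUID, effUID, savedUID
	if r != -1 {
		newR = r
	}
	if e != -1 {
		newE = e
	}
	if r != -1 || (e != -1 && newE != realUID) {
		newS = newE
	}
	return newR, newE, newS
}

func main() {
	// Temporary drop: effective becomes 1000, saved stays 0, so the process
	// can later restore root with setreuid(-1, 0).
	fmt.Println(applySetREUID(1000, 0, 0, -1, 1000)) // 1000 1000 0
	// Permanent drop: setting the real UID also rewrites the saved set-user-ID.
	fmt.Println(applySetREUID(1000, 0, 0, 1000, 1000)) // 1000 1000 1000
}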
+func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + root := creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID + creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS + + // "1. If one or more of the real, effective or saved set user IDs was + // previously 0, and as a result of the UID changes all of these IDs have a + // nonzero value, then all capabilities are cleared from the permitted and + // effective capability sets." - capabilities(7) + if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { + // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's + // "keep capabilities" flag, which determines whether the thread's permitted + // capability set is cleared when a change is made to the + // thread's user IDs such that the thread's real UID, effective + // UID, and saved set-user-ID all become nonzero when at least + // one of them previously had the value 0. By default, the + // permitted capability set is cleared when such a change is + // made; setting the "keep capabilities" flag prevents it from + // being cleared." (A thread's effective capability set is always + // cleared when such a credential change is made, + // regardless of the setting of the "keep capabilities" flag.) + if !creds.KeepCaps { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 + } + } + // """ + // 2. If the effective user ID is changed from 0 to nonzero, then all + // capabilities are cleared from the effective set. + // + // 3. If the effective user ID is changed from nonzero to 0, then the + // permitted set is copied to the effective set. + // """ + if oldE == root && newE != root { + creds.EffectiveCaps = 0 + } else if oldE != root && newE == root { + creds.EffectiveCaps = creds.PermittedCaps + } + // "4. If the filesystem user ID is changed from 0 to nonzero (see + // setfsuid(2)), then the following capabilities are cleared from the + // effective set: ..." + // (filesystem UIDs aren't implemented, nor are any of the capabilities in + // question) + + if oldE != newE { + // "[dumpability] is reset to the current value contained in + // the file /proc/sys/fs/suid_dumpable (which by default has + // the value 0), in the following circumstances: The process's + // effective user or group ID is changed." - prctl(2) + // + // (suid_dumpable isn't implemented, so we just use the + // default. + t.MemoryManager().SetDumpability(mm.NotDumpable) + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + t.parentDeathSignal = 0 + } + t.creds.Store(creds) +} + +// SetGID implements the semantics of setgid(2). +func (t *Task) SetGID(gid auth.GID) error { + if !gid.Ok() { + return syserror.EINVAL + } + + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + kgid := creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + if creds.HasCapability(linux.CAP_SETGID) { + t.setKGIDsUncheckedLocked(kgid, kgid, kgid) + return nil + } + if kgid != creds.RealKGID && kgid != creds.SavedKGID { + return syserror.EPERM + } + t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) + return nil +} + +// SetREGID implements the semantics of setregid(2). 
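The UID-change rules quoted from capabilities(7) above can be made concrete with a standalone sketch that treats capability sets as plain bitmasks (these are not the sentry's auth types; root is modeled as UID 0, and the capability bit values are only illustrative):

package main

import "fmt"

// capTransitionOnUIDChange applies the quoted rules: rule 1 clears permitted
// and effective (unless "keep capabilities" is set) when root is lost from
// all of real/effective/saved, and rules 2 and 3 adjust the effective set
// when the effective UID leaves or becomes root.
func capTransitionOnUIDChange(permitted, effective uint64, keepCaps bool, oldR, oldE, oldS, newR, newE, newS uint32) (uint64, uint64) {
	const root = 0
	if (oldR == root || oldE == root || oldS == root) && newR != root && newE != root && newS != root {
		if !keepCaps {
			permitted = 0
			effective = 0
		}
	}
	if oldE == root && newE != root {
		effective = 0
	} else if oldE != root && newE == root {
		effective = permitted
	}
	return permitted, effective
}

func main() {
	const someCaps = 0x1 | 0x200000 // CAP_CHOWN (bit 0) and CAP_SYS_ADMIN (bit 21)
	// setuid(1000) from root without PR_SET_KEEPCAPS: everything is cleared.
	fmt.Println(capTransitionOnUIDChange(someCaps, someCaps, false, 0, 0, 0, 1000, 1000, 1000)) // 0 0
	// The same transition with "keep capabilities": permitted survives, but
	// the effective set is still cleared by rule 2.
	fmt.Println(capTransitionOnUIDChange(someCaps, someCaps, true, 0, 0, 0, 1000, 1000, 1000)) // 2097153 0
}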
+func (t *Task) SetREGID(r, e auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + newR := creds.RealKGID + if r.Ok() { + newR = creds.UserNamespace.MapToKGID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := creds.EffectiveKGID + if e.Ok() { + newE = creds.UserNamespace.MapToKGID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !creds.HasCapability(linux.CAP_SETGID) { + if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { + return syserror.EPERM + } + if newR != creds.RealKGID && newR != creds.EffectiveKGID { + return syserror.EPERM + } + } + newS := creds.SavedKGID + if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { + newS = newE + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESGID implements the semantics of the setresgid(2) syscall. +func (t *Task) SetRESGID(r, e, s auth.GID) error { + var err error + + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + newR := creds.RealKGID + if r.Ok() { + newR, err = creds.UseGID(r) + if err != nil { + return err + } + } + newE := creds.EffectiveKGID + if e.Ok() { + newE, err = creds.UseGID(e) + if err != nil { + return err + } + } + newS := creds.SavedKGID + if s.Ok() { + newS, err = creds.UseGID(s) + if err != nil { + return err + } + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + oldE := creds.EffectiveKGID + creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS + + if oldE != newE { + // "[dumpability] is reset to the current value contained in + // the file /proc/sys/fs/suid_dumpable (which by default has + // the value 0), in the following circumstances: The process's + // effective user or group ID is changed." - prctl(2) + // + // (suid_dumpable isn't implemented, so we just use the + // default. + t.MemoryManager().SetDumpability(mm.NotDumpable) + + // Not documented, but compare Linux's + // kernel/cred.c:commit_creds(). + t.parentDeathSignal = 0 + } + t.creds.Store(creds) +} + +// SetExtraGIDs attempts to change t's supplemental groups. All IDs are +// interpreted as being in t's user namespace. +func (t *Task) SetExtraGIDs(gids []auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETGID) { + return syserror.EPERM + } + kgids := make([]auth.KGID, len(gids)) + for i, gid := range gids { + kgid := creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + kgids[i] = kgid + } + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.ExtraKGIDs = kgids + t.creds.Store(creds) + return nil +} + +// SetCapabilitySets attempts to change t's permitted, inheritable, and +// effective capability sets. +func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Permitted: This is a limiting superset for the effective capabilities + // that the thread may assume." - capabilities(7) + if effective & ^permitted != 0 { + return syserror.EPERM + } + creds := t.Credentials() + // "It is also a limiting superset for the capabilities that may be added + // to the inheritable set by a thread that does not have the CAP_SETPCAP + // capability in its effective set." 
+ if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { + return syserror.EPERM + } + // "If a thread drops a capability from its permitted set, it can never + // reacquire that capability (unless it execve(2)s ..." + if permitted & ^creds.PermittedCaps != 0 { + return syserror.EPERM + } + // "... if a capability is not in the bounding set, then a thread can't add + // this capability to its inheritable set, even if it was in its permitted + // capabilities ..." + if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { + return syserror.EPERM + } + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.PermittedCaps = permitted + creds.InheritableCaps = inheritable + creds.EffectiveCaps = effective + t.creds.Store(creds) + return nil +} + +// DropBoundingCapability attempts to drop capability cp from t's capability +// bounding set. +func (t *Task) DropBoundingCapability(cp linux.Capability) error { + t.mu.Lock() + defer t.mu.Unlock() + creds := t.Credentials() + if !creds.HasCapability(linux.CAP_SETPCAP) { + return syserror.EPERM + } + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + t.creds.Store(creds) + return nil +} + +// SetUserNamespace attempts to move c into ns. +func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { + t.mu.Lock() + defer t.mu.Unlock() + + creds := t.Credentials() + // "A process reassociating itself with a user namespace must have the + // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) + // + // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN + // in ns (by rule 3 in auth.Credentials.HasCapability). + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + return syserror.EPERM + } + + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + creds.UserNamespace = ns + // "The child process created by clone(2) with the CLONE_NEWUSER flag + // starts out with a complete set of capabilities in the new user + // namespace. Likewise, a process that creates a new user namespace using + // unshare(2) or joins an existing user namespace using setns(2) gains a + // full set of capabilities in that namespace." + creds.PermittedCaps = auth.AllCapabilities + creds.InheritableCaps = 0 + creds.EffectiveCaps = auth.AllCapabilities + creds.BoundingCaps = auth.AllCapabilities + // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER + // flag sets the "securebits" flags (see capabilities(7)) to their default + // values (all flags disabled) in the child (for clone(2)) or caller (for + // unshare(2), or setns(2)." - user_namespaces(7) + creds.KeepCaps = false + t.creds.Store(creds) + + return nil +} + +// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. +func (t *Task) SetKeepCaps(k bool) { + t.mu.Lock() + defer t.mu.Unlock() + creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. + creds.KeepCaps = k + t.creds.Store(creds) +} + +// updateCredsForExecLocked updates t.creds to reflect an execve(). +// +// NOTE(b/30815691): We currently do not implement privileged executables +// (set-user/group-ID bits and file capabilities). This allows us to make a lot +// of simplifying assumptions: +// +// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which +// disables the features we don't support anyway, is always set. 
This +// drastically simplifies this function. +// +// - We don't set AT_SECURE = 1, because no_new_privs always being set means +// that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's +// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) +// +// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since +// seccomp-bpf is also allowed if the task has no_new_privs set. +// +// - Task.ptraceAttach does not serialize with execve as it does in Linux, +// since no_new_privs being set has the same effect as the presence of an +// unprivileged tracer. +// +// Preconditions: t.mu must be locked. +func (t *Task) updateCredsForExecLocked() { + // """ + // During an execve(2), the kernel calculates the new capabilities of + // the process using the following algorithm: + // + // P'(permitted) = (P(inheritable) & F(inheritable)) | + // (F(permitted) & cap_bset) + // + // P'(effective) = F(effective) ? P'(permitted) : 0 + // + // P'(inheritable) = P(inheritable) [i.e., unchanged] + // + // where: + // + // P denotes the value of a thread capability set before the + // execve(2) + // + // P' denotes the value of a thread capability set after the + // execve(2) + // + // F denotes a file capability set + // + // cap_bset is the value of the capability bounding set + // + // ... + // + // In order to provide an all-powerful root using capability sets, during + // an execve(2): + // + // 1. If a set-user-ID-root program is being executed, or the real user ID + // of the process is 0 (root) then the file inheritable and permitted sets + // are defined to be all ones (i.e. all capabilities enabled). + // + // 2. If a set-user-ID-root program is being executed, then the file + // effective bit is defined to be one (enabled). + // + // The upshot of the above rules, combined with the capabilities + // transformations described above, is that when a process execve(2)s a + // set-user-ID-root program, or when a process with an effective UID of 0 + // execve(2)s a program, it gains all capabilities in its permitted and + // effective capability sets, except those masked out by the capability + // bounding set. + // """ - capabilities(7) + // (ambient capability sets omitted) + // + // As the last paragraph implies, the case of "a set-user-ID root program + // is being executed" also includes the case where (namespace) root is + // executing a non-set-user-ID program; the actual check is just based on + // the effective user ID. + var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 + fileEffective := false + creds := t.Credentials() + root := creds.UserNamespace.MapToKUID(auth.RootUID) + if creds.EffectiveKUID == root || creds.RealKUID == root { + newPermitted = creds.InheritableCaps | creds.BoundingCaps + if creds.EffectiveKUID == root { + fileEffective = true + } + } + + creds = creds.Fork() // The credentials object is immutable. See doc for creds. + + // Now we enter poorly-documented, somewhat confusing territory. (The + // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds + // is not very helpful.) My reading of it is: + // + // If at least one of the following is true: + // + // A1. The execing task is ptraced, and the tracer did not have + // CAP_SYS_PTRACE in the execing task's user namespace at the time of + // PTRACE_ATTACH. + // + // A2. The execing task shares its FS context with at least one task in + // another thread group. + // + // A3. The execing task has no_new_privs set. 
+ // + // AND at least one of the following is true: + // + // B1. The new effective user ID (which may come from set-user-ID, or be the + // execing task's existing effective user ID) is not equal to the task's + // real UID. + // + // B2. The new effective group ID (which may come from set-group-ID, or be + // the execing task's existing effective group ID) is not equal to the + // task's real GID. + // + // B3. The new permitted capability set contains capabilities not in the + // task's permitted capability set. + // + // Then: + // + // C1. Limit the new permitted capability set to the task's permitted + // capability set. + // + // C2. If either the task does not have CAP_SETUID in its user namespace, or + // the task has no_new_privs set, force the new effective UID and GID to + // the task's real UID and GID. + // + // But since no_new_privs is always set (A3 is always true), this becomes + // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 + // is a no-op. So we can just do C1 and C2 unconditionally. + if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID + t.parentDeathSignal = 0 + } + // (Saved set-user-ID is always set to the new effective user ID, and saved + // set-group-ID is always set to the new effective group ID, regardless of + // the above.) + creds.SavedKUID = creds.RealKUID + creds.SavedKGID = creds.RealKGID + creds.PermittedCaps &= newPermitted + if fileEffective { + creds.EffectiveCaps = creds.PermittedCaps + } else { + creds.EffectiveCaps = 0 + } + + // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent + // calls to execve(2). + creds.KeepCaps = false + + // "The bounding set is inherited at fork(2) from the thread's parent, and + // is preserved across an execve(2)". So we're done. + t.creds.Store(creds) +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go new file mode 100644 index 000000000..eeccaa197 --- /dev/null +++ b/pkg/sentry/kernel/task_log.go @@ -0,0 +1,208 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "runtime/trace" + "sort" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + // maxStackDebugBytes is the maximum number of user stack bytes that may be + // printed by debugDumpStack. + maxStackDebugBytes = 1024 +) + +// Infof logs an formatted info message by calling log.Infof. +func (t *Task) Infof(fmt string, v ...interface{}) { + if log.IsLogging(log.Info) { + log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) + } +} + +// Warningf logs a warning string by calling log.Warningf. +func (t *Task) Warningf(fmt string, v ...interface{}) { + if log.IsLogging(log.Warning) { + log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) + } +} + +// Debugf creates a debug string that includes the task ID. 
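The execve() capability calculation in updateCredsForExecLocked above reduces to a few lines once file capabilities are assumed absent. The following standalone sketch (invented names, illustrative bounding set) shows that transformation, including the no_new_privs limiting step:

package main

import "fmt"

// execCaps mirrors the transformation above when F(permitted) and
// F(inheritable) are zero: only a root real or effective UID yields a nonzero
// permitted set, the file effective bit follows the effective UID, and the
// result is additionally limited to the pre-exec permitted set because
// no_new_privs is assumed set.
func execCaps(oldPermitted, inheritable, bounding uint64, realIsRoot, effIsRoot bool) (permitted, effective uint64) {
	var newPermitted uint64
	fileEffective := false
	if realIsRoot || effIsRoot {
		newPermitted = inheritable | bounding
		fileEffective = effIsRoot
	}
	permitted = oldPermitted & newPermitted
	if fileEffective {
		effective = permitted
	}
	return permitted, effective
}

func main() {
	const fullSet = 0x1fffffffff // an illustrative "all capabilities" mask
	fmt.Println(execCaps(fullSet, 0, fullSet, true, true))   // root keeps everything
	fmt.Println(execCaps(fullSet, 0, fullSet, false, false)) // everyone else gets nothing
}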
+func (t *Task) Debugf(fmt string, v ...interface{}) { + if log.IsLogging(log.Debug) { + log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) + } +} + +// IsLogging returns true iff this level is being logged. +func (t *Task) IsLogging(level log.Level) bool { + return log.IsLogging(level) +} + +// DebugDumpState logs task state at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) DebugDumpState() { + t.debugDumpRegisters() + t.debugDumpStack() + if mm := t.MemoryManager(); mm != nil { + t.Debugf("Mappings:\n%s", mm) + } + t.Debugf("FDTable:\n%s", t.fdTable) +} + +// debugDumpRegisters logs register state at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpRegisters() { + if !t.IsLogging(log.Debug) { + return + } + regmap, err := t.Arch().RegisterMap() + if err != nil { + t.Debugf("Registers: %v", err) + } else { + t.Debugf("Registers:") + var regs []string + for reg := range regmap { + regs = append(regs, reg) + } + sort.Strings(regs) + for _, reg := range regs { + t.Debugf("%-8s = %016x", reg, regmap[reg]) + } + } +} + +// debugDumpStack logs user stack contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpStack() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application stack dump.") + return + } + t.Debugf("Stack:") + start := usermem.Addr(t.Arch().Stack()) + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + +// trace definitions. +// +// Note that all region names are prefixed by ':' in order to ensure that they +// are lexically ordered before all system calls, which use the naked system +// call name (e.g. "read") for maximum clarity. +const ( + traceCategory = "task" + runRegion = ":run" + blockRegion = ":block" + cpuidRegion = ":cpuid" + faultRegion = ":fault" +) + +// updateInfoLocked updates the task's cached log prefix and tracing +// information to reflect its current thread ID. +// +// Preconditions: The task's owning TaskSet.mu must be locked. +func (t *Task) updateInfoLocked() { + // Use the task's TID in the root PID namespace for logging. + tid := t.tg.pidns.owner.Root.tids[t] + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) + t.rebuildTraceContext(tid) +} + +// rebuildTraceContext rebuilds the trace context. +// +// Precondition: the passed tid must be the tid in the root namespace. +func (t *Task) rebuildTraceContext(tid ThreadID) { + // Re-initialize the trace context. + if t.traceTask != nil { + t.traceTask.End() + } + + // Note that we define the "task type" to be the dynamic TID. This does + // not align perfectly with the documentation for "tasks" in the + // tracing package. Tasks may be assumed to be bounded by analysis + // tools. 
However, if we just use a generic "task" type here, then the + // "user-defined tasks" page on the tracing dashboard becomes nearly + // unusable, as it loads all traces from all tasks. + // + // We can assume that the number of tasks in the system is not + // arbitrarily large (in general it won't be, especially for cases + // where we're collecting a brief profile), so using the TID is a + // reasonable compromise in this case. + t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid)) +} + +// traceCloneEvent is called when a new task is spawned. +// +// ntid must be the new task's ThreadID in the root namespace. +func (t *Task) traceCloneEvent(ntid ThreadID) { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid) +} + +// traceExitEvent is called when a task exits. +func (t *Task) traceExitEvent() { + if !trace.IsEnabled() { + return + } + trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status()) +} + +// traceExecEvent is called when a task calls exec. +func (t *Task) traceExecEvent(tc *TaskContext) { + if !trace.IsEnabled() { + return + } + file := tc.MemoryManager.Executable() + if file == nil { + trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") + return + } + defer file.DecRef() + trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) +} diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go new file mode 100644 index 000000000..f7711232c --- /dev/null +++ b/pkg/sentry/kernel/task_net.go @@ -0,0 +1,44 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/inet" +) + +// IsNetworkNamespaced returns true if t is in a non-root network namespace. +func (t *Task) IsNetworkNamespaced() bool { + t.mu.Lock() + defer t.mu.Unlock() + return !t.netns.IsRoot() +} + +// NetworkContext returns the network stack used by the task. NetworkContext +// may return nil if no network stack is available. +// +// TODO(gvisor.dev/issue/1833): Migrate callers of this method to +// NetworkNamespace(). +func (t *Task) NetworkContext() inet.Stack { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns.Stack() +} + +// NetworkNamespace returns the network namespace observed by the task. +func (t *Task) NetworkNamespace() *inet.Namespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns +} diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go new file mode 100644 index 000000000..d654dd997 --- /dev/null +++ b/pkg/sentry/kernel/task_run.go @@ -0,0 +1,380 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"bytes"
+	"runtime"
+	"runtime/trace"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// A taskRunState is a reified state in the task state machine. See README.md
+// for details. The canonical list of all run states, as well as transitions
+// between them, is given in run_states.dot.
+//
+// The set of possible states is enumerable and completely defined by the
+// kernel package, so taskRunState would ideally be represented by a
+// discriminated union. However, Go does not support sum types.
+//
+// Hence, as with TaskStop, data-free taskRunStates should be represented as
+// typecast nils to avoid unnecessary allocation.
+type taskRunState interface {
+	// execute executes the code associated with this state over the given task
+	// and returns the following state. If execute returns nil, the task
+	// goroutine should exit.
+	//
+	// It is valid to tail-call a following state's execute to avoid the
+	// overhead of converting the following state to an interface object and
+	// checking for stops, provided that the tail-call cannot recurse.
+	execute(*Task) taskRunState
+}
+
+// run runs the task goroutine.
+//
+// threadID is a dummy value set to the task's TID in the root PID namespace
+// to make it visible in stack dumps. A goroutine for a given task can be
+// identified by searching for Task.run()'s argument value.
+func (t *Task) run(threadID uintptr) {
+	// Construct t.blockingTimer here. We do this here because we can't
+	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
+	// kernel.timekeeper.SetClocks() hasn't been called yet.
+	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
+	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
+	defer t.blockingTimer.Destroy()
+	t.blockingTimerChan = blockingTimerChan
+
+	// Activate our address space.
+	t.Activate()
+	// The corresponding t.Deactivate occurs in the exit path
+	// (runExitMain.execute) so that when
+	// Platform.CooperativelySharesAddressSpace() == true, we give up the
+	// AddressSpace before the task goroutine finishes executing.
+
+	// If this is a newly-started task, it should check for participation in
+	// group stops. If this is a task resuming after restore, it was
+	// interrupted by saving. In either case, the task is initially
+	// interrupted.
+	t.interruptSelf()
+
+	for {
+		// Explanation for this ordering:
+		//
+		// - A freshly-started task that is stopped should not do anything
+		// before it enters the stop.
+		//
+		// - If taskRunState.execute returns nil, the task goroutine should
+		// exit without checking for a stop.
+		//
+		// - Task.Start won't start Task.run if t.runState is nil, so this
+		// ordering is safe.
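The reified-state pattern described above, including the "typecast nils" trick, can be illustrated with a self-contained toy loop (unrelated to the sentry's actual states; names invented here):

package main

import "fmt"

// runState is a toy version of taskRunState: each state runs and returns the
// next state, and a nil return ends the loop.
type runState interface {
	execute() runState
}

type stateA struct{}
type stateB struct{}

// The states carry no data, so typed nil pointers are enough; the methods
// never dereference their receivers.
func (*stateA) execute() runState { fmt.Println("A"); return (*stateB)(nil) }
func (*stateB) execute() runState { fmt.Println("B"); return nil }

func main() {
	var s runState = (*stateA)(nil)
	for s != nil {
		s = s.execute()
	}
}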
+ t.doStop() + t.runState = t.runState.execute(t) + if t.runState == nil { + t.accountTaskGoroutineEnter(TaskGoroutineNonexistent) + t.goroutineStopped.Done() + t.tg.liveGoroutines.Done() + t.tg.pidns.owner.liveGoroutines.Done() + t.tg.pidns.owner.runningGoroutines.Done() + t.p.Release() + + // Keep argument alive because stack trace for dead variables may not be correct. + runtime.KeepAlive(threadID) + return + } + } +} + +// doStop is called by Task.run to block until the task is not stopped. +func (t *Task) doStop() { + if atomic.LoadInt32(&t.stopCount) == 0 { + return + } + t.Deactivate() + // NOTE(b/30316266): t.Activate() must be called without any locks held, so + // this defer must precede the defer for unlocking the signal mutex. + defer t.Activate() + t.accountTaskGoroutineEnter(TaskGoroutineStopped) + defer t.accountTaskGoroutineLeave(TaskGoroutineStopped) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.runningGoroutines.Add(-1) + defer t.tg.pidns.owner.runningGoroutines.Add(1) + t.goroutineStopped.Add(-1) + defer t.goroutineStopped.Add(1) + for t.stopCount > 0 { + t.endStopCond.Wait() + } +} + +func (*runApp) handleCPUIDInstruction(t *Task) error { + if len(arch.CPUIDInstruction) == 0 { + // CPUID emulation isn't supported, but this code can be + // executed, because the ptrace platform returns + // ErrContextSignalCPUID on page faults too. Look at + // pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more + // details. + return platform.ErrContextSignal + } + // Is this a CPUID instruction? + region := trace.StartRegion(t.traceContext, cpuidRegion) + expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() + + return nil + } + region.End() // Not an actual CPUID, but required copy-in. + return platform.ErrContextSignal +} + +// The runApp state checks for interrupts before executing untrusted +// application code. +// +// +stateify savable +type runApp struct{} + +func (app *runApp) execute(t *Task) taskRunState { + if t.interrupted() { + // Checkpointing instructs tasks to stop by sending an interrupt, so we + // must check for stops before entering runInterrupt (instead of + // tail-calling it). + return (*runInterrupt)(nil) + } + + // We're about to switch to the application again. If there's still a + // unhandled SyscallRestartErrno that wasn't translated to an EINTR, + // restart the syscall that was interrupted. If there's a saved signal + // mask, restore it. (Note that restoring the saved signal mask may unblock + // a pending signal, causing another interruption, but that signal should + // not interact with the interrupted syscall.) 
+ if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre == ERESTART_RESTARTBLOCK { + t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscallWithRestartBlock() + } else { + t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscall() + } + } + t.haveSyscallReturn = false + } + if t.haveSavedSignalMask { + t.SetSignalMask(t.savedSignalMask) + t.haveSavedSignalMask = false + if t.interrupted() { + return (*runInterrupt)(nil) + } + } + + // Apply restartable sequences. + if t.rseqPreempted { + t.rseqPreempted = false + if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 { + // Linux writes the CPU on every preemption. We only do + // so if it changed. Thus we may delay delivery of + // SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid. + cpu := int32(hostcpu.GetCPU()) + if t.rseqCPU != cpu { + t.rseqCPU = cpu + if err := t.rseqCopyOutCPU(); err != nil { + t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + if err := t.oldRSeqCopyOutCPU(); err != nil { + t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + } + } + t.rseqInterrupt() + } + + // Check if we need to enable single-stepping. Tracers expect that the + // kernel preserves the value of the single-step flag set by PTRACE_SETREGS + // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this + // includes our ptrace platform, by the way), so we should only clear the + // single-step flag if we're responsible for setting it. (clearSinglestep + // is therefore analogous to Linux's TIF_FORCED_TF.) + // + // Strictly speaking, we should also not clear the single-step flag if we + // single-step through an instruction that sets the single-step flag + // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their + // own TF. (Famous last words, I know.) + clearSinglestep := false + if t.hasTracer() { + t.tg.pidns.owner.mu.RLock() + if t.ptraceSinglestep { + clearSinglestep = !t.Arch().SingleStep() + t.Arch().SetSingleStep() + } + t.tg.pidns.owner.mu.RUnlock() + } + + region := trace.StartRegion(t.traceContext, runRegion) + t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) + info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + region.End() + + if clearSinglestep { + t.Arch().ClearSingleStep() + } + + switch err { + case nil: + // Handle application system call. + return t.doSyscall() + + case platform.ErrContextInterrupt: + // Interrupted by platform.Context.Interrupt(). Re-enter the run + // loop to figure out why. + return (*runApp)(nil) + + case platform.ErrContextSignalCPUID: + if err := app.handleCPUIDInstruction(t); err == nil { + // Resume execution. + return (*runApp)(nil) + } + + // The instruction at the given RIP was not a CPUID, and we + // fallthrough to the default signal deliver behavior below. + fallthrough + + case platform.ErrContextSignal: + // Looks like a signal has been delivered to us. 
If it's a synchronous + // signal (SEGV, SIGBUS, etc.), it should be sent to the application + // thread that received it. + sig := linux.Signal(info.Signo) + + // Was it a fault that we should handle internally? If so, this wasn't + // an application-generated signal and we should continue execution + // normally. + if at.Any() { + region := trace.StartRegion(t.traceContext, faultRegion) + addr := usermem.Addr(info.Addr()) + err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + region.End() + if err == nil { + // The fault was handled appropriately. + // We can resume running the application. + return (*runApp)(nil) + } + + // Is this a vsyscall that we need emulate? + // + // Note that we don't track vsyscalls as part of a + // specific trace region. This is because regions don't + // stack, and the actual system call will count as a + // region. We should be able to easily identify + // vsyscalls by having a <fault><syscall> pair. + if at.Execute { + if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + return t.doVsyscall(addr, sysno) + } + } + + // Faults are common, log only at debug level. + t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) + t.DebugDumpState() + + // Continue to signal handling. + // + // Convert a BusError error to a SIGBUS from a SIGSEGV. All + // other info bits stay the same (address, etc.). + if _, ok := err.(*memmap.BusError); ok { + sig = linux.SIGBUS + info.Signo = int32(linux.SIGBUS) + } + } + + switch sig { + case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: + // Synchronous signal. Send it to ourselves. Assume the signal is + // legitimate and force it (work around the signal being ignored or + // blocked) like Linux does. Conveniently, this is even the correct + // behavior for SIGTRAP from single-stepping. + t.forceSignal(linux.Signal(sig), false /* unconditional */) + t.SendSignal(info) + + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case linux.SIGPROF: + // It's a profiling interrupt: there's not much + // we can do. We've already paid a decent cost + // by intercepting the signal, at this point we + // simply ignore it. + + default: + // Asynchronous signal. Let the system deal with it. + t.k.sendExternalSignal(info, "application") + } + + return (*runApp)(nil) + + case platform.ErrContextCPUPreempted: + // Ensure that rseq critical sections are interrupted and per-thread + // CPU values are updated before the next platform.Context.Switch(). + t.rseqPreempted = true + return (*runApp)(nil) + + default: + // What happened? Can't continue. + t.Warningf("Unexpected SwitchToApp error: %v", err) + t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)}) + return (*runExit)(nil) + } +} + +// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. +func (t *Task) waitGoroutineStoppedOrExited() { + t.goroutineStopped.Wait() +} + +// WaitExited blocks until all task goroutines in tg have exited. +// +// WaitExited does not correspond to anything in Linux; it's provided so that +// external callers of Kernel.CreateProcess can wait for the created thread +// group to terminate. +func (tg *ThreadGroup) WaitExited() { + tg.liveGoroutines.Wait() +} + +// Yield yields the processor for the calling task. 
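+//
+// The yield is recorded as a voluntary context switch in the task's CPU stats
+// (see CPUStats), and the underlying runtime.Gosched() call gives other task
+// goroutines a chance to run; sched_yield(2) can be implemented as
+// essentially just a call to t.Yield().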
+func (t *Task) Yield() { + atomic.AddUint64(&t.yieldCount, 1) + runtime.Gosched() +} diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go new file mode 100644 index 000000000..09366b60c --- /dev/null +++ b/pkg/sentry/kernel/task_sched.go @@ -0,0 +1,668 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// CPU scheduling, real and fake. + +import ( + "fmt" + "math/rand" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TaskGoroutineState is a coarse representation of the current execution +// status of a kernel.Task goroutine. +type TaskGoroutineState int + +const ( + // TaskGoroutineNonexistent indicates that the task goroutine has either + // not yet been created by Task.Start() or has returned from Task.run(). + // This must be the zero value for TaskGoroutineState. + TaskGoroutineNonexistent TaskGoroutineState = iota + + // TaskGoroutineRunningSys indicates that the task goroutine is executing + // sentry code. + TaskGoroutineRunningSys + + // TaskGoroutineRunningApp indicates that the task goroutine is executing + // application code. + TaskGoroutineRunningApp + + // TaskGoroutineBlockedInterruptible indicates that the task goroutine is + // blocked in Task.block(), and hence may be woken by Task.interrupt() + // (e.g. due to signal delivery). + TaskGoroutineBlockedInterruptible + + // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is + // stopped outside of Task.block() and Task.doStop(), and hence cannot be + // woken by Task.interrupt(). + TaskGoroutineBlockedUninterruptible + + // TaskGoroutineStopped indicates that the task goroutine is blocked in + // Task.doStop(). TaskGoroutineStopped is similar to + // TaskGoroutineBlockedUninterruptible, but is a separate state to make it + // possible to determine when Task.stop is meaningful. + TaskGoroutineStopped +) + +// TaskGoroutineSchedInfo contains task goroutine scheduling state which must +// be read and updated atomically. +// +// +stateify savable +type TaskGoroutineSchedInfo struct { + // Timestamp was the value of Kernel.cpuClock when this + // TaskGoroutineSchedInfo was last updated. + Timestamp uint64 + + // State is the current state of the task goroutine. + State TaskGoroutineState + + // UserTicks is the amount of time the task goroutine has spent executing + // its associated Task's application code, in units of linux.ClockTick. + UserTicks uint64 + + // SysTicks is the amount of time the task goroutine has spent executing in + // the sentry, in units of linux.ClockTick. + SysTicks uint64 +} + +// userTicksAt returns the extrapolated value of ts.UserTicks after +// Kernel.CPUClockNow() indicates a time of now. 
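+//
+// For example, with UserTicks=10, Timestamp=95, State=TaskGoroutineRunningApp
+// and now=100, userTicksAt(now) returns 15: the 5 ticks elapsed since the
+// last update are attributed to application time without mutating ts.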
+// +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause userTicksAt to adjust stats by too much, +// making the observed stats non-monotonic. +func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp { + // Update stats to reflect execution since the last update. + return ts.UserTicks + (now - ts.Timestamp) + } + return ts.UserTicks +} + +// sysTicksAt returns the extrapolated value of ts.SysTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: As for userTicksAt. +func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys { + return ts.SysTicks + (now - ts.Timestamp) + } + return ts.SysTicks +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.gosched.State = state + t.goschedSeq.EndWrite() + + if state != TaskGoroutineRunningApp { + // Task is blocking/stopping. + t.k.decRunningTasks() + } +} + +// Preconditions: The caller must be running on the task goroutine, and leaving +// a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). +func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + if state != TaskGoroutineRunningApp { + // Task is unblocking/continuing. + t.k.incRunningTasks() + } + + now := t.k.CPUClockNow() + if t.gosched.State != state { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + if state == TaskGoroutineRunningApp { + t.gosched.UserTicks += now - t.gosched.Timestamp + } + t.gosched.Timestamp = now + t.gosched.State = TaskGoroutineRunningSys + t.goschedSeq.EndWrite() +} + +// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. +// Most clients should use t.CPUStats() instead. +func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { + return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) +} + +// CPUStats returns the CPU usage statistics of t. +func (t *Task) CPUStats() usage.CPUStats { + return t.cpuStatsAt(t.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. +func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { + tsched := t.TaskGoroutineSchedInfo() + return usage.CPUStats{ + UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)), + VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), + } +} + +// CPUStats returns the combined CPU usage statistics of all past and present +// threads in tg. +func (tg *ThreadGroup) CPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + // Hack to get a pointer to the Kernel. 
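+ // (The thread group itself does not hold a Kernel reference, so it is read
+ // through the leader task; a nil leader means no task has ever run in this
+ // thread group and all of its CPU stats are zero.)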
+ if tg.leader == nil { + // Per comment on tg.leader, this is only possible if nothing in the + // ThreadGroup has ever executed anyway. + return usage.CPUStats{} + } + return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex +// must be locked. +func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { + stats := tg.exitedCPUStats + // Account for live tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + stats.Accumulate(t.cpuStatsAt(now)) + } + return stats +} + +// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return +// resource usage statistics for all children of [tg] that have terminated and +// been waited for. These statistics will include the resources used by +// grandchildren, and further removed descendants, if all of the intervening +// descendants waited on their terminated children." +func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.childCPUStats +} + +// taskClock is a ktime.Clock that measures the time that a task has spent +// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID. +// +// +stateify savable +type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. +func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID. +// +// +stateify savable +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// Now implements ktime.Clock.Now. 
+func (tgc *tgClock) Now() ktime.Time { + stats := tgc.tg.CPUStats() + if tgc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// WallTimeUntil implements ktime.Clock.WallTimeUntil. +func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { + // Thread group CPU time should not exceed wall time * live tasks, since + // task goroutines exit after the transition to TaskExitZombie in + // runExitNotify. + tgc.tg.pidns.owner.mu.RLock() + n := tgc.tg.liveTasks + tgc.tg.pidns.owner.mu.RUnlock() + if n == 0 { + if t.Before(now) { + return 0 + } + // The timer tick raced with thread group exit, after which no more + // tasks can enter the thread group. So tgc.Now() will never advance + // again. Return a large delay; the timer should be stopped long before + // it comes again anyway. + return time.Hour + } + // This is a lower bound on the amount of time that can elapse before an + // associated timer expires, so returning this value tends to result in a + // sequence of closely-spaced ticks just before timer expiry. To avoid + // this, round up to the nearest ClockTick; CPU usage measurements are + // limited to this resolution anyway. + remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond + return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. +func (tg *ThreadGroup) UserCPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: false} +} + +// CPUClock returns a ktime.Clock that measures the time that a thread group +// has spent executing, including sentry time. +func (tg *ThreadGroup) CPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: true} +} + +type kernelCPUClockTicker struct { + k *Kernel + + // These are essentially kernelCPUClockTicker.Notify local variables that + // are cached between calls to reduce allocations. + rng *rand.Rand + tgs []*ThreadGroup +} + +func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { + return &kernelCPUClockTicker{ + k: k, + rng: rand.New(rand.NewSource(rand.Int63())), + } +} + +// Notify implements ktime.TimerListener.Notify. +func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + // Only increment cpuClock by 1 regardless of the number of expirations. + // This approximately compensates for cases where thread throttling or bad + // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and + // presumably task goroutines as well, from executing for a long period of + // time. It's also necessary to prevent CPU clocks from seeing large + // discontinuous jumps. + now := atomic.AddUint64(&ticker.k.cpuClock, 1) + + // Check thread group CPU timers. + tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs) + for _, tg := range tgs { + if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 { + continue + } + + ticker.k.tasks.mu.RLock() + if tg.leader == nil { + // No tasks have ever run in this thread group. + ticker.k.tasks.mu.RUnlock() + continue + } + // Accumulate thread group CPU stats, and randomly select running tasks + // using reservoir sampling to receive CPU timer signals. 
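+ // Reservoir sampling of size one: candidate i replaces the current pick
+ // with probability 1/i, so after the loop every eligible task has been
+ // chosen with probability 1/N without needing to know N in advance. A
+ // minimal sketch of the same idea (illustrative names):
+ //
+ //	pick, seen := (*Task)(nil), int32(0)
+ //	for _, c := range candidates {
+ //		seen++
+ //		if randInt31n(rng, seen) == 0 {
+ //			pick = c
+ //		}
+ //	}
+ //
+ // The loop below runs this selection twice in one pass: once for
+ // ITIMER_VIRTUAL candidates and once for ITIMER_PROF/RLIMIT_CPU candidates.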
+ var virtReceiver *Task + nrVirtCandidates := 0 + var profReceiver *Task + nrProfCandidates := 0 + tgUserTime := tg.exitedCPUStats.UserTime + tgSysTime := tg.exitedCPUStats.SysTime + for t := tg.tasks.Front(); t != nil; t = t.Next() { + tsched := t.TaskGoroutineSchedInfo() + tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)) + tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)) + switch tsched.State { + case TaskGoroutineRunningApp: + // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU + // timers. + nrVirtCandidates++ + if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 { + virtReceiver = t + } + fallthrough + case TaskGoroutineRunningSys: + // Considered by ITIMER_PROF and RLIMIT_CPU timers. + nrProfCandidates++ + if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 { + profReceiver = t + } + } + } + tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds()) + tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds()) + + // All of the following are standard (not real-time) signals, which are + // automatically deduplicated, so we ignore the number of expirations. + tg.signalHandlers.mu.Lock() + // It should only be possible for these timers to advance if we found + // at least one running task. + if virtReceiver != nil { + // ITIMER_VIRTUAL + newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) + tg.itimerVirtSetting = newItimerVirtSetting + if exp != 0 { + virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true) + } + } + if profReceiver != nil { + // ITIMER_PROF + newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) + tg.itimerProfSetting = newItimerProfSetting + if exp != 0 { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true) + } + // RLIMIT_CPU soft limit + newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) + tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting + if exp != 0 { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true) + } + // RLIMIT_CPU hard limit + rlimitCPUMax := tg.limits.Get(limits.CPU).Max + if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) + } + } + tg.signalHandlers.mu.Unlock() + + ticker.k.tasks.mu.RUnlock() + } + + // Retain tgs between calls to Notify to reduce allocations. + for i := range tgs { + tgs[i] = nil + } + ticker.tgs = tgs[:0] + + // If nothing is running, we can disable the timer. + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks == 0 { + ticker.k.runningTasksMu.Lock() + defer ticker.k.runningTasksMu.Unlock() + tasks := atomic.LoadInt64(&ticker.k.runningTasks) + if tasks != 0 { + // Raced with a 0 -> 1 transition. + return setting, false + } + + // Stop the timer. We must cache the current setting so the + // kernel can access it without violating the lock order. + ticker.k.cpuClockTickerSetting = setting + ticker.k.cpuClockTickerDisabled = true + setting.Enabled = false + return setting, true + } + + return setting, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (ticker *kernelCPUClockTicker) Destroy() { +} + +// randInt31n returns a random integer in [0, n). +// +// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported. +// See that function for details. 
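+//
+// In short: the high 32 bits of the 64-bit product v*n land in [0, n); draws
+// whose low 32 bits fall below 2^32 mod n are resampled to remove the bias,
+// and the initial low < n test merely avoids computing that modulus in the
+// common case.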
+func randInt31n(rng *rand.Rand, n int32) int32 { + v := rng.Uint32() + prod := uint64(v) * uint64(n) + low := uint32(prod) + if low < uint32(n) { + thresh := uint32(-n) % uint32(n) + for low < thresh { + v = rng.Uint32() + prod = uint64(v) * uint64(n) + low = uint32(prod) + } + } + return int32(prod >> 32) +} + +// NotifyRlimitCPUUpdated is called by setrlimit. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) NotifyRlimitCPUUpdated() { + t.k.cpuClockTicker.Atomically(func() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + rlimitCPU := t.tg.limits.Get(limits.CPU) + t.tg.rlimitCPUSoftSetting = ktime.Setting{ + Enabled: rlimitCPU.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + } + if rlimitCPU.Max != limits.Infinity { + // Check if tg is already over the hard limit. + tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) + tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) + if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { + t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) + } + } + t.tg.updateCPUTimersEnabledLocked() + }) +} + +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { + rlimitCPU := tg.limits.Get(limits.CPU) + if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity { + atomic.StoreUint32(&tg.cpuTimersEnabled, 1) + } else { + atomic.StoreUint32(&tg.cpuTimersEnabled, 0) + } +} + +// StateStatus returns a string representation of the task's current state, +// appropriate for /proc/[pid]/status. +func (t *Task) StateStatus() string { + switch s := t.TaskGoroutineSchedInfo().State; s { + case TaskGoroutineNonexistent: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.exitState { + case TaskExitZombie: + return "Z (zombie)" + case TaskExitDead: + return "X (dead)" + default: + // The task goroutine can't exit before passing through + // runExitNotify, so this indicates that the task has been created, + // but the task goroutine hasn't yet started. The Linux equivalent + // is struct task_struct::state == TASK_NEW + // (kernel/fork.c:copy_process() => + // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is + // masked out by TASK_REPORT for /proc/[pid]/status, leaving only + // TASK_RUNNING. + return "R (running)" + } + case TaskGoroutineRunningSys, TaskGoroutineRunningApp: + return "R (running)" + case TaskGoroutineBlockedInterruptible: + return "S (sleeping)" + case TaskGoroutineStopped: + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + switch t.stop.(type) { + case *groupStop: + return "T (stopped)" + case *ptraceStop: + return "t (tracing stop)" + } + fallthrough + case TaskGoroutineBlockedUninterruptible: + // This is the name Linux uses for TASK_UNINTERRUPTIBLE and + // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): + // fs/proc/array.c:task_state_array. + return "D (disk sleep)" + default: + panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) + } +} + +// CPUMask returns a copy of t's allowed CPU mask. +func (t *Task) CPUMask() sched.CPUSet { + t.mu.Lock() + defer t.mu.Unlock() + return t.allowedCPUMask.Copy() +} + +// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of +// mask. 
+// +// Preconditions: mask.Size() == +// sched.CPUSetSize(t.Kernel().ApplicationCores()). +func (t *Task) SetCPUMask(mask sched.CPUSet) error { + if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { + panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) + } + + // Remove CPUs in mask above Kernel.applicationCores. + mask.ClearAbove(t.k.applicationCores) + + // Ensure that at least 1 CPU is still allowed. + if mask.NumCPUs() == 0 { + return syserror.EINVAL + } + + if t.k.useHostCores { + // No-op; pretend the mask was immediately changed back. + return nil + } + + t.tg.pidns.owner.mu.RLock() + rootTID := t.tg.pidns.owner.Root.tids[t] + t.tg.pidns.owner.mu.RUnlock() + + t.mu.Lock() + defer t.mu.Unlock() + t.allowedCPUMask = mask + atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID)) + return nil +} + +// CPU returns the cpu id for a given task. +func (t *Task) CPU() int32 { + if t.k.useHostCores { + return int32(hostcpu.GetCPU()) + } + + return atomic.LoadInt32(&t.cpu) +} + +// assignCPU returns the virtualized CPU number for the task with global TID +// tid and allowedCPUMask allowed. +func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { + // To pretend that threads are evenly distributed to allowed CPUs, choose n + // to be less than the number of CPUs in allowed ... + n := int(tid) % int(allowed.NumCPUs()) + // ... then pick the nth CPU in allowed. + allowed.ForEachCPU(func(c uint) { + if n--; n == 0 { + cpu = int32(c) + } + }) + return cpu +} + +// Niceness returns t's niceness. +func (t *Task) Niceness() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness +} + +// Priority returns t's priority. +func (t *Task) Priority() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness + 20 +} + +// SetNiceness sets t's niceness to n. +func (t *Task) SetNiceness(n int) { + t.mu.Lock() + defer t.mu.Unlock() + t.niceness = n +} + +// NumaPolicy returns t's current numa policy. +func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) { + t.mu.Lock() + defer t.mu.Unlock() + return t.numaPolicy, t.numaNodeMask +} + +// SetNumaPolicy sets t's numa policy. +func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) { + t.mu.Lock() + defer t.mu.Unlock() + t.numaPolicy = policy + t.numaNodeMask = nodeMask +} diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go new file mode 100644 index 000000000..79766cafe --- /dev/null +++ b/pkg/sentry/kernel/task_signals.go @@ -0,0 +1,1139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file defines the behavior of task signal handling. 
+ +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalAction is an internal signal action. +type SignalAction int + +// Available signal actions. +// Note that although we refer the complete set internally, +// the application is only capable of using the Default and +// Ignore actions from the system call interface. +const ( + SignalActionTerm SignalAction = iota + SignalActionCore + SignalActionStop + SignalActionIgnore + SignalActionHandler +) + +// Default signal handler actions. Note that for most signals, +// (except SIGKILL and SIGSTOP) these can be overridden by the app. +var defaultActions = map[linux.Signal]SignalAction{ + // POSIX.1-1990 standard. + linux.SIGHUP: SignalActionTerm, + linux.SIGINT: SignalActionTerm, + linux.SIGQUIT: SignalActionCore, + linux.SIGILL: SignalActionCore, + linux.SIGABRT: SignalActionCore, + linux.SIGFPE: SignalActionCore, + linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects + linux.SIGSEGV: SignalActionCore, + linux.SIGPIPE: SignalActionTerm, + linux.SIGALRM: SignalActionTerm, + linux.SIGTERM: SignalActionTerm, + linux.SIGUSR1: SignalActionTerm, + linux.SIGUSR2: SignalActionTerm, + linux.SIGCHLD: SignalActionIgnore, + linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects + linux.SIGSTOP: SignalActionStop, + linux.SIGTSTP: SignalActionStop, + linux.SIGTTIN: SignalActionStop, + linux.SIGTTOU: SignalActionStop, + // POSIX.1-2001 standard. + linux.SIGBUS: SignalActionCore, + linux.SIGPROF: SignalActionTerm, + linux.SIGSYS: SignalActionCore, + linux.SIGTRAP: SignalActionCore, + linux.SIGURG: SignalActionIgnore, + linux.SIGVTALRM: SignalActionTerm, + linux.SIGXCPU: SignalActionCore, + linux.SIGXFSZ: SignalActionCore, + // The rest on linux. + linux.SIGSTKFLT: SignalActionTerm, + linux.SIGIO: SignalActionTerm, + linux.SIGPWR: SignalActionTerm, + linux.SIGWINCH: SignalActionIgnore, +} + +// computeAction figures out what to do given a signal number +// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop, +// and SIGKILL always results in a SignalActionTerm. +// Signal 0 is always ignored as many programs use it for various internal functions +// and don't expect it to do anything. +// +// In the event the signal is not one of these, act.Handler determines what +// happens next. +// If act.Handler is: +// 0, the default action is taken; +// 1, the signal is ignored; +// anything else, the function returns SignalActionHandler. +func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { + switch sig { + case linux.SIGSTOP: + return SignalActionStop + case linux.SIGKILL: + return SignalActionTerm + case linux.Signal(0): + return SignalActionIgnore + } + + switch act.Handler { + case arch.SignalActDefault: + return defaultActions[sig] + case arch.SignalActIgnore: + return SignalActionIgnore + default: + return SignalActionHandler + } +} + +// UnblockableSignals contains the set of signals which cannot be blocked. +var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) + +// StopSignals is the set of signals whose default action is SignalActionStop. 
+var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) + +// dequeueSignalLocked returns a pending signal that is *not* included in mask. +// If there are no pending unmasked signals, dequeueSignalLocked returns nil. +// +// Preconditions: t.tg.signalHandlers.mu must be locked. +func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo { + if info := t.pendingSignals.dequeue(mask); info != nil { + return info + } + return t.tg.pendingSignals.dequeue(mask) +} + +// discardSpecificLocked removes all instances of the given signal from all +// signal queues in tg. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) { + tg.pendingSignals.discardSpecific(sig) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.pendingSignals.discardSpecific(sig) + } +} + +// PendingSignals returns the set of pending signals. +func (t *Task) PendingSignals() linux.SignalSet { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet +} + +// deliverSignal delivers the given signal and returns the following run state. +func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState { + sigact := computeAction(linux.Signal(info.Signo), act) + + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + // Signals that are ignored, cause a thread group stop, or + // terminate the thread group do not interact with interrupted + // syscalls; in Linux terms, they are never returned to the signal + // handling path from get_signal => get_signal_to_deliver. The + // behavior of an interrupted syscall is determined by the first + // signal that is actually handled (by userspace). + if sigact == SignalActionHandler { + switch { + case sre == ERESTARTNOHAND: + fallthrough + case sre == ERESTART_RESTARTBLOCK: + fallthrough + case (sre == ERESTARTSYS && !act.IsRestart()): + t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) + default: + t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().RestartSyscall() + } + } + } + } + + switch sigact { + case SignalActionTerm, SignalActionCore: + // "Default action is to terminate the process." - signal(7) + t.Debugf("Signal %d: terminating thread group", info.Signo) + + // Emit an event channel messages related to this uncaught signal. + ucs := &ucspb.UncaughtSignal{ + Tid: int32(t.Kernel().TaskSet().Root.IDOfTask(t)), + Pid: int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())), + Registers: t.Arch().StateData().Proto(), + SignalNumber: info.Signo, + } + + // Attach an fault address if appropriate. + switch linux.Signal(info.Signo) { + case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS: + ucs.FaultAddr = info.Addr() + } + + eventchannel.Emit(ucs) + + t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + return (*runExit)(nil) + + case SignalActionStop: + // "Default action is to stop the process." + t.initiateGroupStop(info) + + case SignalActionIgnore: + // "Default action is to ignore the signal." 
+ t.Debugf("Signal %d: ignored", info.Signo) + + case SignalActionHandler: + // Try to deliver the signal to the user-configured handler. + t.Debugf("Signal %d: delivering to handler", info.Signo) + if err := t.deliverSignalToHandler(info, act); err != nil { + // This is not a warning, it can occur during normal operation. + t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err) + + // Send a forced SIGSEGV. If the signal that couldn't be delivered + // was a SIGSEGV, force the handler to SIG_DFL. + t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + } + + default: + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + } + return (*runInterrupt)(nil) +} + +// deliverSignalToHandler changes the task's userspace state to enter the given +// user-configured handler for the given signal. +func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error { + // Signal delivery to an application handler interrupts restartable + // sequences. + t.rseqInterrupt() + + // Are executing on the main stack, + // or the provided alternate stack? + sp := usermem.Addr(t.Arch().Stack()) + + // N.B. This is a *copy* of the alternate stack that the user's signal + // handler expects to see in its ucontext (even if it's not in use). + alt := t.signalStack + if act.IsOnStack() && alt.IsEnabled() { + alt.SetOnStack() + if !alt.Contains(sp) { + sp = usermem.Addr(alt.Top()) + } + } + + // Set up the signal handler. If we have a saved signal mask, the signal + // handler should run with the current mask, but sigreturn should restore + // the saved one. + st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + mask := t.signalMask + if t.haveSavedSignalMask { + mask = t.savedSignalMask + } + + // Set up the restorer. + // x86-64 should always uses SA_RESTORER, but this flag is optional on other platforms. + // Please see the linux code as reference: + // linux/arch/x86/kernel/signal.c:__setup_rt_frame() + // If SA_RESTORER is not configured, we can use the sigreturn trampolines + // the vdso provides instead. + // Please see the linux code as reference: + // linux/arch/arm64/kernel/signal.c:setup_return() + if act.Flags&linux.SA_RESTORER == 0 { + act.Restorer = t.MemoryManager().VDSOSigReturn() + } + + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { + return err + } + t.haveSavedSignalMask = false + + // Add our signal mask. + newMask := t.signalMask | act.Mask + if !act.IsNoDefer() { + newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) + } + t.SetSignalMask(newMask) + + return nil +} + +var ctrlResume = &SyscallControl{ignoreReturn: true} + +// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if +// rt is true). +func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { + st := t.Stack() + sigset, alt, err := t.Arch().SignalRestore(st, rt) + if err != nil { + return nil, err + } + + // Attempt to record the given signal stack. Note that we silently + // ignore failures here, as does Linux. Only an EFAULT may be + // generated, but SignalRestore has already deserialized the entire + // frame successfully. + t.SetSignalStack(alt) + + // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. + t.SetSignalMask(sigset &^ UnblockableSignals) + + return ctrlResume, nil +} + +// Sigtimedwait implements the semantics of sigtimedwait(2). 
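+//
+// A typical caller converts the user-provided sigset_t and timeout and then
+// maps syserror.EAGAIN to "no signal arrived in time", e.g. (sketch):
+//
+//	info, err := t.Sigtimedwait(set, timeout)
+//	if err == syserror.EAGAIN {
+//		// Timed out (or timeout was 0) with no matching signal pending.
+//	}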
+// +// Preconditions: The caller must be running on the task goroutine. t.exitState +// < TaskExitZombie. +func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { + // set is the set of signals we're interested in; invert it to get the set + // of signals to block. + mask := ^(set &^ UnblockableSignals) + + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + + if timeout == 0 { + return nil, syserror.EAGAIN + } + + // Unblock signals we're waiting for. Remember the original signal mask so + // that Task.sendSignalTimerLocked doesn't discard ignored signals that + // we're temporarily unblocking. + t.realSignalMask = t.signalMask + t.setSignalMaskLocked(t.signalMask & mask) + + // Wait for a timeout or new signal. + t.tg.signalHandlers.mu.Unlock() + _, err := t.BlockWithTimeout(nil, true, timeout) + t.tg.signalHandlers.mu.Lock() + + // Restore the original signal mask. + t.setSignalMaskLocked(t.realSignalMask) + t.realSignalMask = 0 + + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + if err == syserror.ETIMEDOUT { + return nil, syserror.EAGAIN + } + return nil, err +} + +// SendSignal sends the given signal to t. +// +// The following errors may be returned: +// +// syserror.ESRCH - The task has exited. +// syserror.EINVAL - The signal is not valid. +// syserror.EAGAIN - THe signal is realtime, and cannot be queued. +// +func (t *Task) SendSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, false /* group */) +} + +// SendGroupSignal sends the given signal to t's thread group. +func (t *Task) SendGroupSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, true /* group */) +} + +// SendSignal sends the given signal to tg, using tg's leader to determine if +// the signal is blocked. +func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.leader.sendSignalLocked(info, true /* group */) +} + +func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { + return t.sendSignalTimerLocked(info, group, nil) +} + +func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error { + if t.exitState == TaskExitDead { + return syserror.ESRCH + } + sig := linux.Signal(info.Signo) + if sig == 0 { + return nil + } + if !sig.IsValid() { + return syserror.EINVAL + } + + // Signal side effects apply even if the signal is ultimately discarded. + t.tg.applySignalSideEffectsLocked(sig) + + // TODO: "Only signals for which the "init" process has established a + // signal handler can be sent to the "init" process by other members of the + // PID namespace. This restriction applies even to privileged processes, + // and prevents other members of the PID namespace from accidentally + // killing the "init" process." - pid_namespaces(7). 
We don't currently do + // this for child namespaces, though we should; we also don't do this for + // the root namespace (the same restriction applies to global init on + // Linux), where whether or not we should is much murkier. In practice, + // most sandboxed applications are not prepared to function as an init + // process. + + // Unmasked, ignored signals are discarded without being queued, unless + // they will be visible to a tracer. Even for group signals, it's the + // originally-targeted task's signal mask and tracer that matter; compare + // Linux's kernel/signal.c:__send_signal() => prepare_signal() => + // sig_ignored(). + ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore + if sigset := linux.SignalSetOf(sig); sigset&t.signalMask == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() { + t.Debugf("Discarding ignored signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } + return nil + } + + q := &t.pendingSignals + if group { + q = &t.tg.pendingSignals + } + if !q.enqueue(info, timer) { + if sig.IsRealtime() { + return syserror.EAGAIN + } + t.Debugf("Discarding duplicate signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } + return nil + } + + // Find a receiver to notify. Note that the task we choose to notify, if + // any, may not be the task that actually dequeues and handles the signal; + // e.g. a racing signal mask change may cause the notified task to become + // ineligible, or a racing sibling task may dequeue the signal first. + if t.canReceiveSignalLocked(sig) { + t.Debugf("Notified of signal %d", sig) + t.interrupt() + return nil + } + if group { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.Debugf("Notified of group signal %d", sig) + nt.interrupt() + return nil + } + } + t.Debugf("No task notified of signal %d", sig) + return nil +} + +func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { + switch { + case linux.SignalSetOf(sig)&StopSignals != 0: + // Stop signals cause all prior SIGCONT to be discarded. (This is + // despite the fact this has little effect since SIGCONT's most + // important effect is applied when the signal is sent in the branch + // below, not when the signal is delivered.) + tg.discardSpecificLocked(linux.SIGCONT) + case sig == linux.SIGCONT: + // "The SIGCONT signal has a side effect of waking up (all threads of) + // a group-stopped process. This side effect happens before + // signal-delivery-stop. The tracer can't suppress this side effect (it + // can only suppress signal injection, which only causes the SIGCONT + // handler to not be executed in the tracee, if such a handler is + // installed." - ptrace(2) + tg.endGroupStopLocked(true) + case sig == linux.SIGKILL: + // "SIGKILL does not generate signal-delivery-stop and therefore the + // tracer can't suppress it. SIGKILL kills even within system calls + // (syscall-exit-stop is not generated prior to death by SIGKILL)." - + // ptrace(2) + // + // Note that this differs from ThreadGroup.requestExit in that it + // ignores tg.execing. + if !tg.exiting { + tg.exiting = true + tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + } + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.killLocked() + } + } +} + +// canReceiveSignalLocked returns true if t should be interrupted to receive +// the given signal. canReceiveSignalLocked is analogous to Linux's +// kernel/signal.c:wants_signal(), but see below for divergences. 
+// +// Preconditions: The signal mutex must be locked. +func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // Notify that the signal is queued. + t.signalQueue.Notify(waiter.EventMask(linux.MakeSignalSet(sig))) + + // - Do not choose tasks that are blocking the signal. + if linux.SignalSetOf(sig)&t.signalMask != 0 { + return false + } + // - No need to check Task.exitState, as the exit path sets every bit in the + // signal mask when it transitions from TaskExitNone to TaskExitInitiated. + // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the + // task group via applySignalSideEffects => killLocked. + // - Do not choose stopped tasks, which cannot handle signals. + if t.stop != nil { + return false + } + // - Do not choose tasks that have already been interrupted, as they may be + // busy handling another signal. + if len(t.interruptChan) != 0 { + return false + } + return true +} + +// findSignalReceiverLocked returns a task in tg that should be interrupted to +// receive the given signal. If no such task exists, findSignalReceiverLocked +// returns nil. +// +// Linux actually records curr_target to balance the group signal targets. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.canReceiveSignalLocked(sig) { + return t + } + } + return nil +} + +// forceSignal ensures that the task is not ignoring or blocking the given +// signal. If unconditional is true, forceSignal takes action even if the +// signal isn't being ignored or blocked. +func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.forceSignalLocked(sig, unconditional) +} + +func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { + blocked := linux.SignalSetOf(sig)&t.signalMask != 0 + act := t.tg.signalHandlers.actions[sig] + ignored := act.Handler == arch.SignalActIgnore + if blocked || ignored || unconditional { + act.Handler = arch.SignalActDefault + t.tg.signalHandlers.actions[sig] = act + if blocked { + t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig)) + } + } +} + +// SignalMask returns a copy of t's signal mask. +func (t *Task) SignalMask() linux.SignalSet { + return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.signalMask))) +} + +// SetSignalMask sets t's signal mask. +// +// Preconditions: SetSignalMask can only be called by the task goroutine. +// t.exitState < TaskExitZombie. +func (t *Task) SetSignalMask(mask linux.SignalSet) { + // By precondition, t prevents t.tg from completing an execve and mutating + // t.tg.signalHandlers, so we can skip the TaskSet mutex. + t.tg.signalHandlers.mu.Lock() + t.setSignalMaskLocked(mask) + t.tg.signalHandlers.mu.Unlock() +} + +// Preconditions: The signal mutex must be locked. +func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { + oldMask := t.signalMask + atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask)) + + // If the new mask blocks any signals that were not blocked by the old + // mask, and at least one such signal is pending in tg.pendingSignals, and + // t has been woken, it could be the case that t was woken to handle that + // signal, but will no longer do so as a result of its new signal mask, so + // we have to pick a replacement. 
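+ // (&^ is Go's bit-clear operator, so "blocked" below is exactly the set of
+ // signals newly blocked by this change: e.g. going from {SIGUSR1} to
+ // {SIGUSR1, SIGCHLD} gives blocked = {SIGCHLD}.)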
+ blocked := mask &^ oldMask + blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet + if blockedGroupPending != 0 && t.interrupted() { + linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.interrupt() + return + } + }) + // We have to re-issue the interrupt consumed by t.interrupted() since + // it might have been for a different reason. + t.interruptSelf() + } + + // Conversely, if the new mask unblocks any signals that were blocked by + // the old mask, and at least one such signal is pending, we may now need + // to handle that signal. + unblocked := oldMask &^ mask + unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) + if unblockedPending != 0 { + t.interruptSelf() + } +} + +// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's +// comment). +// +// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { + t.savedSignalMask = mask + t.haveSavedSignalMask = true +} + +// SignalStack returns the task-private signal stack. +func (t *Task) SignalStack() arch.SignalStack { + alt := t.signalStack + if t.onSignalStack(alt) { + alt.Flags |= arch.SignalStackFlagOnStack + } + return alt +} + +// onSignalStack returns true if the task is executing on the given signal stack. +func (t *Task) onSignalStack(alt arch.SignalStack) bool { + sp := usermem.Addr(t.Arch().Stack()) + return alt.Contains(sp) +} + +// SetSignalStack sets the task-private signal stack. +// +// This value may not be changed if the task is currently executing on the +// signal stack, i.e. if t.onSignalStack returns true. In this case, this +// function will return false. Otherwise, true is returned. +func (t *Task) SetSignalStack(alt arch.SignalStack) bool { + // Check that we're not executing on the stack. + if t.onSignalStack(t.signalStack) { + return false + } + + if alt.Flags&arch.SignalStackFlagDisable != 0 { + // Don't record anything beyond the flags. + t.signalStack = arch.SignalStack{ + Flags: arch.SignalStackFlagDisable, + } + } else { + // Mask out irrelevant parts: only disable matters. + alt.Flags &= arch.SignalStackFlagDisable + t.signalStack = alt + } + return true +} + +// SetSignalAct atomically sets the thread group's signal action for signal sig +// to *actptr (if actptr is not nil) and returns the old signal action. +func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) { + if !sig.IsValid() { + return arch.SignalAct{}, syserror.EINVAL + } + + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + sh := tg.signalHandlers + sh.mu.Lock() + defer sh.mu.Unlock() + oldact := sh.actions[sig] + if actptr != nil { + if sig == linux.SIGKILL || sig == linux.SIGSTOP { + return oldact, syserror.EINVAL + } + + act := *actptr + act.Mask &^= UnblockableSignals + sh.actions[sig] = act + // From POSIX, by way of Linux: + // + // "Setting a signal action to SIG_IGN for a signal that is pending + // shall cause the pending signal to be discarded, whether or not it is + // blocked." + // + // "Setting a signal action to SIG_DFL for a signal that is pending and + // whose default action is to ignore the signal (for example, SIGCHLD), + // shall cause the pending signal to be discarded, whether or not it is + // blocked." 
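+ // Both rules reduce to the check below: if the new action computes to
+ // SignalActionIgnore, pending instances of sig are dropped from the thread
+ // group's queue and from every member task's queue.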
+ if computeAction(sig, act) == SignalActionIgnore { + tg.discardSpecificLocked(sig) + } + } + return oldact, nil +} + +// CopyOutSignalAct converts the given SignalAct into an architecture-specific +// type and then copies it out to task memory. +func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { + n := t.Arch().NewSignalAct() + n.SerializeFrom(s) + _, err := n.CopyOut(t, addr) + return err +} + +// CopyInSignalAct copies an architecture-specific sigaction type from task +// memory and then converts it into a SignalAct. +func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { + n := t.Arch().NewSignalAct() + var s arch.SignalAct + if _, err := n.CopyIn(t, addr); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// CopyOutSignalStack converts the given SignalStack into an +// architecture-specific type and then copies it out to task memory. +func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { + n := t.Arch().NewSignalStack() + n.SerializeFrom(s) + _, err := n.CopyOut(t, addr) + return err +} + +// CopyInSignalStack copies an architecture-specific stack_t from task memory +// and then converts it into a SignalStack. +func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { + n := t.Arch().NewSignalStack() + var s arch.SignalStack + if _, err := n.CopyIn(t, addr); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// groupStop is a TaskStop placed on tasks that have received a stop signal +// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from +// the ptrace man page.) +// +// +stateify savable +type groupStop struct{} + +// Killable implements TaskStop.Killable. +func (*groupStop) Killable() bool { return true } + +// initiateGroupStop attempts to initiate a group stop based on a +// previously-dequeued stop signal. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.groupStopPending { + t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo) + return + } + if !t.tg.groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo) + return + } + if t.tg.exiting { + t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) + return + } + if t.tg.execing != nil { + t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) + return + } + if !t.tg.groupStopComplete { + t.tg.groupStopSignal = linux.Signal(info.Signo) + } + t.tg.groupStopPendingCount = 0 + for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { + if t2.killedLocked() || t2.exitState >= TaskExitInitiated { + t2.groupStopPending = false + continue + } + t2.groupStopPending = true + t2.groupStopAcknowledged = false + if t2.ptraceSeized { + t2.trapNotifyPending = true + if s, ok := t2.stop.(*ptraceStop); ok && s.listen { + t2.endInternalStopLocked() + } + } + t2.interrupt() + t.tg.groupStopPendingCount++ + } + t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount) +} + +// endGroupStopLocked ensures that all prior stop signals received by tg are +// not stopping tg and will not stop tg in the future. 
If broadcast is true, +// parent and tracer notification will be scheduled if appropriate. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { + // Discard all previously-queued stop signals. + linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) + + if tg.groupStopPendingCount == 0 && !tg.groupStopComplete { + return + } + + completeStr := "incomplete" + if tg.groupStopComplete { + completeStr = "complete" + } + tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.groupStopPending = false + if t.ptraceSeized { + t.trapNotifyPending = true + if s, ok := t.stop.(*ptraceStop); ok && s.listen { + t.endInternalStopLocked() + } + } else { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() + } + } + } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so that + // one of the continuing tasks does so. (Linux does something similar.) + // The reason we do this is to keep locking sane. In order to send a + // signal to the parent, we need to lock its signal mutex, but we're + // already holding tg's signal mutex, and the TaskSet mutex must be + // locked for writing for us to hold two signal mutexes. Since we don't + // want to require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by releasing + // the mutexes we're already holding, just let the continuing thread + // group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = !tg.groupStopComplete + tg.groupContWaitable = true + } + // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop + // to recognize that the group stop has been cancelled. + tg.groupStopDequeued = false + tg.groupStopSignal = 0 + tg.groupStopPendingCount = 0 + tg.groupStopComplete = false + tg.groupStopWaitable = false +} + +// participateGroupStopLocked is called to handle thread group side effects +// after t unsets t.groupStopPending. The caller must handle task side effects +// (e.g. placing the task goroutine into the group stop). It returns true if +// the caller must notify t.tg.leader's parent of a completed group stop (which +// participateGroupStopLocked cannot do due to holding the wrong locks). +// +// Preconditions: The signal mutex must be locked. +func (t *Task) participateGroupStopLocked() bool { + if t.groupStopAcknowledged { + return false + } + t.groupStopAcknowledged = true + t.tg.groupStopPendingCount-- + if t.tg.groupStopPendingCount != 0 { + return false + } + if t.tg.groupStopComplete { + return false + } + t.Debugf("Completing group stop") + t.tg.groupStopComplete = true + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + return true +} + +// signalStop sends a signal to t's thread group of a new group stop, group +// continue, or ptrace stop, if appropriate. code and status are set in the +// signal sent to tg, if any. +// +// Preconditions: The TaskSet mutex must be locked (for reading or writing). 
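+//
+// code is a SIGCHLD si_code such as arch.CLD_STOPPED or arch.CLD_CONTINUED,
+// and status carries the number of the stop or continue signal being
+// reported.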
+func (t *Task) signalStop(target *Task, code int32, status int32) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] + if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) { + sigchld := &arch.SignalInfo{ + Signo: int32(linux.SIGCHLD), + Code: code, + } + sigchld.SetPid(int32(t.tg.pidns.tids[target])) + sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetStatus(status) + // TODO(b/72102453): Set utime, stime. + t.sendSignalLocked(sigchld, true /* group */) + } +} + +// The runInterrupt state handles conditions indicated by interrupts. +// +// +stateify savable +type runInterrupt struct{} + +func (*runInterrupt) execute(t *Task) taskRunState { + // Interrupts are de-duplicated (if t is interrupted twice before + // t.interrupted() is called, t.interrupted() will only return true once), + // so early exits from this function must re-enter the runInterrupt state + // to check for more interrupt-signaled conditions. + + t.tg.signalHandlers.mu.Lock() + + // Did we just leave a group stop? + if t.tg.groupContNotify { + t.tg.groupContNotify = false + sig := t.tg.groupStopSignal + intr := t.tg.groupContInterrupted + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.RLock() + // For consistency with Linux, if the parent and (thread group + // leader's) tracer are in the same thread group, deduplicate + // notifications. + notifyParent := t.tg.leader.parent != nil + if tracer := t.tg.leader.Tracer(); tracer != nil { + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + } + // Sending CLD_STOPPED to the tracer doesn't really make any sense; + // the thread group leader may have already entered the stop and + // notified its tracer accordingly. But it's consistent with + // Linux... + if intr { + tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + if !notifyParent { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) + } else { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) + } + } else { + tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + tracer.tg.eventQueue.Notify(EventGroupContinue) + } + } + if notifyParent { + // If groupContInterrupted, do as Linux does and pretend the group + // stop completed just before it ended. The theoretical behavior in + // this case would be to send a SIGCHLD indicating the completed + // stop, followed by a SIGCHLD indicating the continue. However, + // SIGCHLD is a standard signal, so the latter would always be + // dropped. Hence sending only the former is equivalent. + if intr { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) + } else { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) + } + } + t.tg.pidns.owner.mu.RUnlock() + return (*runInterrupt)(nil) + } + + // Do we need to enter a group stop or related ptrace stop? This path is + // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop() + // (with ptrace enabled) and do_jobctl_trap(). 
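+	// Here:
+	//
+	// - groupStopPending indicates that a dequeued stop signal requires this
+	// task to participate in a group stop (see initiateGroupStop above).
+	//
+	// - trapStopPending indicates that a ptrace-requested stop (e.g. from
+	// PTRACE_INTERRUPT) is pending for this task.
+	//
+	// - trapNotifyPending indicates that a PTRACE_SEIZE'd tracee must
+	// re-trap to notify its tracer of a change in group stop state.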
+ if t.groupStopPending || t.trapStopPending || t.trapNotifyPending { + sig := t.tg.groupStopSignal + notifyParent := false + if t.groupStopPending { + t.groupStopPending = false + // We care about t.tg.groupStopSignal (for tracer notification) + // even if this doesn't complete a group stop, so keep the + // value of sig we've already read. + notifyParent = t.participateGroupStopLocked() + } + t.trapStopPending = false + t.trapNotifyPending = false + // Drop the signal mutex so we can take the TaskSet mutex. + t.tg.signalHandlers.mu.Unlock() + + t.tg.pidns.owner.mu.RLock() + if t.tg.leader.parent == nil { + notifyParent = false + } + if tracer := t.Tracer(); tracer != nil { + if t.ptraceSeized { + if sig == 0 { + sig = linux.SIGTRAP + } + // "If tracee was attached using PTRACE_SEIZE, group-stop is + // indicated by PTRACE_EVENT_STOP: status>>16 == + // PTRACE_EVENT_STOP. This allows detection of group-stops + // without requiring an extra PTRACE_GETSIGINFO call." - + // "Group-stop", ptrace(2) + t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(sig), + Code: t.ptraceCode, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } else { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + } + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) + // For consistency with Linux, if the parent and tracer are in the + // same thread group, deduplicate notification signals. + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) + } else { + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + } + } else { + t.tg.signalHandlers.mu.Lock() + if !t.killedLocked() { + t.beginInternalStopLocked((*groupStop)(nil)) + } + t.tg.signalHandlers.mu.Unlock() + } + if notifyParent { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + t.tg.pidns.owner.mu.RUnlock() + + return (*runInterrupt)(nil) + } + + // Are there signals pending? + if info := t.dequeueSignalLocked(t.signalMask); info != nil { + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { + // Indicate that we've dequeued a stop signal before unlocking the + // signal mutex; initiateGroupStop will check for races with + // endGroupStopLocked after relocking it. + t.tg.groupStopDequeued = true + } + if t.ptraceSignalLocked(info) { + // Dequeueing the signal action must wait until after the + // signal-delivery-stop ends since the tracer can change or + // suppress the signal. + t.tg.signalHandlers.mu.Unlock() + return (*runInterruptAfterSignalDeliveryStop)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) + } + + t.tg.signalHandlers.mu.Unlock() + return (*runApp)(nil) +} + +// +stateify savable +type runInterruptAfterSignalDeliveryStop struct{} + +func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + // Can't defer unlock: deliverSignal must be called without holding TaskSet + // mutex. 
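+	// The tracer may have changed or suppressed the signal while the tracee
+	// was in signal-delivery-stop (e.g. via the signal argument to a ptrace
+	// restart command), so the signal to deliver is re-read from
+	// t.ptraceCode.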
+ sig := linux.Signal(t.ptraceCode) + defer func() { + t.ptraceSiginfo = nil + }() + if !sig.IsValid() { + t.tg.pidns.owner.mu.Unlock() + return (*runInterrupt)(nil) + } + info := t.ptraceSiginfo + if sig != linux.Signal(info.Signo) { + info.Signo = int32(sig) + info.Errno = 0 + info.Code = arch.SignalInfoUser + // pid isn't a valid field for all signal numbers, but Linux + // doesn't care (kernel/signal.c:ptrace_signal()). + // + // Linux uses t->parent for the tid and uid here, which is the tracer + // if it hasn't detached or the real parent otherwise. + parent := t.parent + if tracer := t.Tracer(); tracer != nil { + parent = tracer + } + if parent == nil { + // Tracer has detached and t was created by Kernel.CreateProcess(). + // Pretend the parent is in an ancestor PID + user namespace. + info.SetPid(0) + info.SetUid(int32(auth.OverflowUID)) + } else { + info.SetPid(int32(t.tg.pidns.tids[parent])) + info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } + } + t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.Unlock() + // If the signal is masked, re-queue it. + if linux.SignalSetOf(sig)&t.signalMask != 0 { + t.sendSignalLocked(info, false /* group */) + t.tg.signalHandlers.mu.Unlock() + return (*runInterrupt)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) +} + +// SignalRegister registers a waiter for pending signals. +func (t *Task) SignalRegister(e *waiter.Entry, mask waiter.EventMask) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventRegister(e, mask) + t.tg.signalHandlers.mu.Unlock() +} + +// SignalUnregister unregisters a waiter for pending signals. +func (t *Task) SignalUnregister(e *waiter.Entry) { + t.tg.signalHandlers.mu.Lock() + t.signalQueue.EventUnregister(e) + t.tg.signalHandlers.mu.Unlock() +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go new file mode 100644 index 000000000..8485fb4b6 --- /dev/null +++ b/pkg/sentry/kernel/task_start.go @@ -0,0 +1,319 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// TaskConfig defines the configuration of a new Task (see below). +type TaskConfig struct { + // Kernel is the owning Kernel. + Kernel *Kernel + + // Parent is the new task's parent. Parent may be nil. + Parent *Task + + // If InheritParent is not nil, use InheritParent's parent as the new + // task's parent. + InheritParent *Task + + // ThreadGroup is the ThreadGroup the new task belongs to. 
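+	// If ThreadGroup does not yet have a leader (i.e. it is newly created),
+	// the new task becomes its leader.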
+ ThreadGroup *ThreadGroup + + // SignalMask is the new task's initial signal mask. + SignalMask linux.SignalSet + + // TaskContext is the TaskContext of the new task. Ownership of the + // TaskContext is transferred to TaskSet.NewTask, whether or not it + // succeeds. + TaskContext *TaskContext + + // FSContext is the FSContext of the new task. A reference must be held on + // FSContext, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FSContext *FSContext + + // FDTable is the FDTableof the new task. A reference must be held on + // FDMap, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FDTable *FDTable + + // Credentials is the Credentials of the new task. + Credentials *auth.Credentials + + // Niceness is the niceness of the new task. + Niceness int + + // NetworkNamespace is the network namespace to be used for the new task. + NetworkNamespace *inet.Namespace + + // AllowedCPUMask contains the cpus that this task can run on. + AllowedCPUMask sched.CPUSet + + // UTSNamespace is the UTSNamespace of the new task. + UTSNamespace *UTSNamespace + + // IPCNamespace is the IPCNamespace of the new task. + IPCNamespace *IPCNamespace + + // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. + AbstractSocketNamespace *AbstractSocketNamespace + + // MountNamespaceVFS2 is the MountNamespace of the new task. + MountNamespaceVFS2 *vfs.MountNamespace + + // RSeqAddr is a pointer to the the userspace linux.RSeq structure. + RSeqAddr usermem.Addr + + // RSeqSignature is the signature that the rseq abort IP must be signed + // with. + RSeqSignature uint32 + + // ContainerID is the container the new task belongs to. + ContainerID string +} + +// NewTask creates a new task defined by cfg. +// +// NewTask does not start the returned task; the caller must call Task.Start. +func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { + t, err := ts.newTask(cfg) + if err != nil { + cfg.TaskContext.release() + cfg.FSContext.DecRef() + cfg.FDTable.DecRef() + if cfg.MountNamespaceVFS2 != nil { + cfg.MountNamespaceVFS2.DecRef() + } + return nil, err + } + return t, nil +} + +// newTask is a helper for TaskSet.NewTask that only takes ownership of parts +// of cfg if it succeeds. +func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { + tg := cfg.ThreadGroup + tc := cfg.TaskContext + t := &Task{ + taskNode: taskNode{ + tg: tg, + parent: cfg.Parent, + children: make(map[*Task]struct{}), + }, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: cfg.SignalMask, + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespace, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + abstractSockets: cfg.AbstractSocketNamespace, + mountNamespaceVFS2: cfg.MountNamespaceVFS2, + rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, + futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, + } + t.creds.Store(cfg.Credentials) + t.endStopCond.L = &t.tg.signalHandlers.mu + t.ptraceTracer.Store((*Task)(nil)) + // We don't construct t.blockingTimer until Task.run(); see that function + // for justification. 
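+
+	// t is not yet visible to any other goroutine, so none of the
+	// initialization above requires locking.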
+ + // Make the new task (and possibly thread group) visible to the rest of + // the system atomically. + ts.mu.Lock() + defer ts.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting || tg.execing != nil { + // If the caller is in the same thread group, then what we return + // doesn't matter too much since the caller will exit before it returns + // to userspace. If the caller isn't in the same thread group, then + // we're in uncharted territory and can return whatever we want. + return nil, syserror.EINTR + } + if err := ts.assignTIDsLocked(t); err != nil { + return nil, err + } + // Below this point, newTask is expected not to fail (there is no rollback + // of assignTIDsLocked or any of the following). + + // Logging on t's behalf will panic if t.logPrefix hasn't been + // initialized. This is the earliest point at which we can do so + // (since t now has thread IDs). + t.updateInfoLocked() + + if cfg.InheritParent != nil { + t.parent = cfg.InheritParent.parent + } + if t.parent != nil { + t.parent.children[t] = struct{}{} + } + + if tg.leader == nil { + // New thread group. + tg.leader = t + if parentPG := tg.parentPG(); parentPG == nil { + tg.createSession() + } else { + // Inherit the process group and terminal. + parentPG.incRefWithParent(parentPG) + tg.processGroup = parentPG + tg.tty = t.parent.tg.tty + } + } + tg.tasks.PushBack(t) + tg.tasksCount++ + tg.liveTasks++ + tg.activeTasks++ + + // Propagate external TaskSet stops to the new task. + t.stopCount = ts.stopCount + + t.mu.Lock() + defer t.mu.Unlock() + + t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) + + t.startTime = t.k.RealtimeClock().Now() + + return t, nil +} + +// assignTIDsLocked ensures that new task t is visible in all PID namespaces in +// which it should be visible. +// +// Preconditions: ts.mu must be locked for writing. +func (ts *TaskSet) assignTIDsLocked(t *Task) error { + type allocatedTID struct { + ns *PIDNamespace + tid ThreadID + } + var allocatedTIDs []allocatedTID + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid, err := ns.allocateTID() + if err != nil { + // Failure. Remove the tids we already allocated in descendant + // namespaces. + for _, a := range allocatedTIDs { + delete(a.ns.tasks, a.tid) + delete(a.ns.tids, t) + if t.tg.leader == nil { + delete(a.ns.tgids, t.tg) + } + } + return err + } + ns.tasks[tid] = t + ns.tids[t] = tid + if t.tg.leader == nil { + // New thread group. + ns.tgids[t.tg] = tid + } + allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) + } + return nil +} + +// allocateTID returns an unused ThreadID from ns. +// +// Preconditions: ns.owner.mu must be locked for writing. +func (ns *PIDNamespace) allocateTID() (ThreadID, error) { + if ns.exiting { + // "In this case, a subsequent fork(2) into this PID namespace will + // fail with the error ENOMEM; it is not possible to create a new + // processes [sic] in a PID namespace whose init process has + // terminated." - pid_namespaces(7) + return 0, syserror.ENOMEM + } + tid := ns.last + for { + // Next. + tid++ + if tid > TasksLimit { + tid = InitTID + 1 + } + + // Is it available? + tidInUse := func() bool { + if _, ok := ns.tasks[tid]; ok { + return true + } + if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { + return true + } + if _, ok := ns.sessions[SessionID(tid)]; ok { + return true + } + return false + }() + + if !tidInUse { + ns.last = tid + return tid, nil + } + + // Did we do a full cycle? + if tid == ns.last { + // No tid available. 
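+			// Linux's fork(2) likewise fails with EAGAIN when no PID
+			// can be allocated.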
+ return 0, syserror.EAGAIN + } + } +} + +// Start starts the task goroutine. Start must be called exactly once for each +// task returned by NewTask. +// +// 'tid' must be the task's TID in the root PID namespace and it's used for +// debugging purposes only (set as parameter to Task.run to make it visible +// in stack dumps). +func (t *Task) Start(tid ThreadID) { + // If the task was restored, it may be "starting" after having already exited. + if t.runState == nil { + return + } + t.goroutineStopped.Add(1) + t.tg.liveGoroutines.Add(1) + t.tg.pidns.owner.liveGoroutines.Add(1) + t.tg.pidns.owner.runningGoroutines.Add(1) + + // Task is now running in system mode. + t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) + + // Use the task's TID in the root PID namespace to make it visible in stack dumps. + go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops +} diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go new file mode 100644 index 000000000..10c6e455c --- /dev/null +++ b/pkg/sentry/kernel/task_stop.go @@ -0,0 +1,226 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements task stops, which represent the equivalent of Linux's +// uninterruptible sleep states in a way that is compatible with save/restore. +// Task stops comprise both internal stops (which form part of the task's +// "normal" control flow) and external stops (which do not); see README.md for +// details. +// +// There are multiple interfaces for interacting with stops because there are +// multiple cases to consider: +// +// - A task goroutine can begin a stop on its associated task (e.g. a +// vfork() syscall stopping the calling task until the child task releases its +// MM). In this case, calling Task.interrupt is both unnecessary (the task +// goroutine obviously cannot be blocked in Task.block or executing application +// code) and undesirable (as it may spuriously interrupt a in-progress +// syscall). +// +// Beginning internal stops in this case is implemented by +// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing, +// there are no instances of this case that begin external stops, except for +// autosave; however, autosave terminates the sentry without ending the +// external stop, so the spurious interrupt is moot. +// +// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all +// tasks being stopped in preparation for state checkpointing). If the task +// goroutine may be in Task.block or executing application code, it must be +// interrupted by Task.interrupt for it to actually enter the stop; since, +// strictly speaking, we have no way of determining this, we call +// Task.interrupt unconditionally. +// +// Beginning external stops in this case is implemented by +// Task.BeginExternalStop. As of this writing, there are no instances of this +// case that begin internal stops. 
+// +// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an +// exiting task resuming a sibling task that has been blocked in an execve() +// syscall waiting for other tasks to exit). In this case, Task.endStopCond +// must be notified to kick the task goroutine out of Task.doStop. +// +// Ending internal stops in this case is implemented by +// Task.endInternalStopLocked. Ending external stops in this case is +// implemented by Task.EndExternalStop. +// +// - Hypothetically, a task goroutine can end an internal stop on its +// associated task. As of this writing, there are no instances of this case. +// However, any instances of this case could still use the above functions, +// since notifying Task.endStopCond would be unnecessary but harmless. + +import ( + "fmt" + "sync/atomic" +) + +// A TaskStop is a condition visible to the task control flow graph that +// prevents a task goroutine from running or exiting, i.e. an internal stop. +// +// NOTE(b/30793614): Most TaskStops don't contain any data; they're +// distinguished by their type. The obvious way to implement such a TaskStop +// is: +// +// type groupStop struct{} +// func (groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop(groupStop{}) +// +// However, this doesn't work because the state package can't serialize values, +// only pointers. Furthermore, the correctness of save/restore depends on the +// ability to pass a TaskStop to endInternalStop that will compare equal to the +// TaskStop that was passed to beginInternalStop, even if a save/restore cycle +// occurred between the two. As a result, the current idiom is to always use a +// typecast nil for data-free TaskStops: +// +// type groupStop struct{} +// func (*groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop((*groupStop)(nil)) +// +// This is pretty gross, but the alternatives seem grosser. +type TaskStop interface { + // Killable returns true if Task.Kill should end the stop prematurely. + // Killable is analogous to Linux's TASK_WAKEKILL. + Killable() bool +} + +// beginInternalStop indicates the start of an internal stop that applies to t. +// +// Preconditions: The task must not already be in an internal stop (i.e. t.stop +// == nil). The caller must be running on the task goroutine. +func (t *Task) beginInternalStop(s TaskStop) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginInternalStopLocked(s) +} + +// Preconditions: The signal mutex must be locked. All preconditions for +// Task.beginInternalStop also apply. +func (t *Task) beginInternalStopLocked(s TaskStop) { + if t.stop != nil { + panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) + } + t.Debugf("Entering internal stop %#v", s) + t.stop = s + t.beginStopLocked() +} + +// endInternalStopLocked indicates the end of an internal stop that applies to +// t. endInternalStopLocked does not wait for the task to resume. +// +// The caller is responsible for ensuring that the internal stop they expect +// actually applies to t; this requires holding the signal mutex which protects +// t.stop, which is why there is no endInternalStop that locks the signal mutex +// for you. +// +// Preconditions: The signal mutex must be locked. The task must be in an +// internal stop (i.e. t.stop != nil). 
+func (t *Task) endInternalStopLocked() { + if t.stop == nil { + panic("Attempting to leave non-existent internal stop") + } + t.Debugf("Leaving internal stop %#v", t.stop) + t.stop = nil + t.endStopLocked() +} + +// BeginExternalStop indicates the start of an external stop that applies to t. +// BeginExternalStop does not wait for t's task goroutine to stop. +func (t *Task) BeginExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginStopLocked() + t.interrupt() +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task +// goroutine to resume. +func (t *Task) EndExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endStopLocked() +} + +// beginStopLocked increments t.stopCount to indicate that a new internal or +// external stop applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) beginStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 { + // Most likely overflow. + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } +} + +// endStopLocked decrements t.stopCount to indicate that an existing internal +// or external stop no longer applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) endStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } else if newval == 0 { + t.endStopCond.Signal() + } +} + +// BeginExternalStop indicates the start of an external stop that applies to +// all current and future tasks in ts. BeginExternalStop does not wait for +// task goroutines to stop. +func (ts *TaskSet) BeginExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount++ + if ts.stopCount <= 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.beginStopLocked() + t.tg.signalHandlers.mu.Unlock() + t.interrupt() + } +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task +// goroutines to resume. +func (ts *TaskSet) EndExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount-- + if ts.stopCount < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.endStopLocked() + t.tg.signalHandlers.mu.Unlock() + } +} diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go new file mode 100644 index 000000000..a5903b0b5 --- /dev/null +++ b/pkg/sentry/kernel/task_syscall.go @@ -0,0 +1,469 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "os" + "runtime/trace" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel +// include/linux/errno.h. These errnos are never returned to userspace +// directly, but are used to communicate the expected behavior of an +// interrupted syscall from the syscall to signal handling. +type SyscallRestartErrno int + +// These numeric values are significant because ptrace syscall exit tracing can +// observe them. +// +// For all of the following errnos, if the syscall is not interrupted by a +// signal delivered to a user handler, the syscall is restarted. +const ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = SyscallRestartErrno(512) + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = SyscallRestartErrno(513) + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = SyscallRestartErrno(514) + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) +) + +var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application") + +// Error implements error.Error. +func (e SyscallRestartErrno) Error() string { + // Descriptions are borrowed from strace. + switch e { + case ERESTARTSYS: + return "to be restarted if SA_RESTART is set" + case ERESTARTNOINTR: + return "to be restarted" + case ERESTARTNOHAND: + return "to be restarted if no handler" + case ERESTART_RESTARTBLOCK: + return "interrupted by signal" + default: + return "(unknown interrupt error)" + } +} + +// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by +// rv, the value in a syscall return register. +func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { + switch int(rv) { + case -int(ERESTARTSYS): + return ERESTARTSYS, true + case -int(ERESTARTNOINTR): + return ERESTARTNOINTR, true + case -int(ERESTARTNOHAND): + return ERESTARTNOHAND, true + case -int(ERESTART_RESTARTBLOCK): + return ERESTART_RESTARTBLOCK, true + default: + return 0, false + } +} + +// SyscallRestartBlock represents the restart block for a syscall restartable +// with a custom function. It encapsulates the state required to restart a +// syscall across a S/R. +type SyscallRestartBlock interface { + Restart(t *Task) (uintptr, error) +} + +// SyscallControl is returned by syscalls to control the behavior of +// Task.doSyscallInvoke. +type SyscallControl struct { + // next is the state that the task goroutine should switch to. 
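+	// (For example, CtrlDoExit below specifies the runExit state.)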
If next is + // nil, the task goroutine should continue to syscall exit as usual. + next taskRunState + + // If ignoreReturn is true, Task.doSyscallInvoke should not store any value + // in the task's syscall return value register. + ignoreReturn bool +} + +var ( + // CtrlDoExit is returned by the implementations of the exit and exit_group + // syscalls to enter the task exit path directly, skipping syscall exit + // tracing. + CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} + + // ctrlStopAndReinvokeSyscall is returned by syscalls using the external + // feature before syscall execution. This causes Task.doSyscallInvoke + // to return runSyscallReinvoke, allowing Task.run to check for stops + // before immediately re-invoking the syscall (skipping the re-checking + // of seccomp filters and ptrace which would confuse userspace + // tracing). + ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} + + // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at + // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather + // than tail-calling it, allowing stops to be checked before syscall exit. + ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} +) + +func (t *Task) invokeExternal() { + t.BeginExternalStop() + go func() { // S/R-SAFE: External control flow. + defer t.EndExternalStop() + t.SyscallTable().External(t.Kernel()) + }() +} + +func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { + s := t.SyscallTable() + + fe := s.FeatureEnable.Word(sysno) + + var straceContext interface{} + if bits.IsAnyOn32(fe, StraceEnableBits) { + straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) + } + + if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { + t.invokeExternal() + // Ensure we check for stops, then invoke the syscall again. + ctrl = ctrlStopAndReinvokeSyscall + } else { + fn := s.Lookup(sysno) + var region *trace.Region // Only non-nil if tracing == true. + if trace.IsEnabled() { + region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) + } + if fn != nil { + // Call our syscall implementation. + rval, ctrl, err = fn(t, args) + } else { + // Use the missing function if not found. + rval, err = t.SyscallTable().Missing(t, sysno, args) + } + if region != nil { + region.End() + } + } + + if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { + t.invokeExternal() + // Don't reinvoke the syscall. + } + + if bits.IsAnyOn32(fe, StraceEnableBits) { + s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) + } + + return +} + +// doSyscall is the entry point for an invocation of a system call specified by +// the current state of t's registers. +// +// The syscall path is very hot; avoid defer. +func (t *Task) doSyscall() taskRunState { + // Save value of the register which is clobbered in the following + // t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64. + // + // On x86, register rax was shared by syscall number and return + // value, and at the entry of the syscall handler, the rax was + // saved to regs.orig_rax which was exposed to userspace. + // But on arm64, syscall number was passed through X8, and the X0 + // was shared by the first syscall argument and return value. 
The + // X0 was saved to regs.orig_x0 which was not exposed to userspace. + // So we have to do the same operation here to save the X0 value + // into the task context. + t.Arch().SyscallSaveOrig() + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + + // Tracers expect to see this between when the task traps into the kernel + // to perform a syscall and when the syscall is actually invoked. + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + + // Check seccomp filters. The nil check is for performance (as seccomp use + // is rare), not needed for correctness. + if t.syscallFilters.Load() != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: + t.Debugf("Syscall %d: denied by seccomp", sysno) + return (*runSyscallExit)(nil) + case linux.SECCOMP_RET_ALLOW: + // ok + case linux.SECCOMP_RET_KILL_THREAD: + t.Debugf("Syscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + case linux.SECCOMP_RET_TRACE: + t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) + return (*runSyscallAfterPtraceEventSeccomp)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doSyscallEnter(sysno, args) +} + +type runSyscallAfterPtraceEventSeccomp struct{} + +func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." - + // ptrace(2) + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "The tracer can skip the system call by changing the syscall number to + // -1." - Documentation/prctl/seccomp_filter.txt + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil).execute(t) + } + args := t.Arch().SyscallArgs() + return t.doSyscallEnter(sysno, args) +} + +func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { + if next, ok := t.ptraceSyscallEnter(); ok { + return next + } + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallAfterSyscallEnterStop struct{} + +func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil) + } + args := t.Arch().SyscallArgs() + + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallAfterSysemuStop struct{} + +func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + return (*runSyscallExit)(nil).execute(t) +} + +func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + + if ctrl != nil { + if !ctrl.ignoreReturn { + t.Arch().SetReturn(rval) + } + if ctrl.next != nil { + return ctrl.next + } + } else if err != nil { + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) + t.haveSyscallReturn = true + } else { + 
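+		// The syscall succeeded with no control override; hand its return
+		// value back to the application.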
t.Arch().SetReturn(rval) + } + + return (*runSyscallExit)(nil).execute(t) +} + +// +stateify savable +type runSyscallReinvoke struct{} + +func (*runSyscallReinvoke) execute(t *Task) taskRunState { + if t.killed() { + // It's possible that since the last execution, the task has + // been forcible killed. Invoking the system call here could + // result in an infinite loop if it is again preempted by an + // external stop and reinvoked. + return (*runInterrupt)(nil) + } + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallExit struct{} + +func (*runSyscallExit) execute(t *Task) taskRunState { + t.ptraceSyscallExit() + return (*runApp)(nil) +} + +// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as +// indicated by an execution fault at address addr. doVsyscall returns the +// task's next run state. +func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { + vsyscallCount.Increment() + + // Grab the caller up front, to make sure there's a sensible stack. + caller := t.Arch().Native(uintptr(0)) + if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return (*runApp)(nil) + } + + // For _vsyscalls_, there is no need to translate System V calling convention + // to syscall ABI because they both use RDI, RSI, and RDX for the first three + // arguments and none of the vsyscalls uses more than two arguments. + args := t.Arch().SyscallArgs() + if t.syscallFilters.Load() != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: + t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) + return (*runApp)(nil) + case linux.SECCOMP_RET_ALLOW: + // ok + case linux.SECCOMP_RET_TRACE: + t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) + return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} + case linux.SECCOMP_RET_KILL_THREAD: + t.Debugf("vsyscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doVsyscallInvoke(sysno, args, caller) +} + +type runVsyscallAfterPtraceEventSeccomp struct { + addr usermem.Addr + sysno uintptr + caller interface{} +} + +func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "... the syscall may not be changed to another system call using the + // orig_rax register. It may only be changed to -1 order [sic] to skip the + // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - + // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip + // causes do_exit(SIGSYS), and changing sp is ignored. 
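+	// Enforce the same rules here: any change other than skipping the
+	// emulated call (sysno == -1) terminates the task with SIGSYS.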
+ if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + } + if sysno == ^uintptr(0) { + return (*runApp)(nil) + } + return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) +} + +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + if ctrl != nil { + t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(0) + } else if err == nil { + t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(uintptr(rval)) + } else { + t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) + if err == syserror.EFAULT { + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // A return is not emulated in this case. + return (*runApp)(nil) + } + t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) + } + t.Arch().SetIP(t.Arch().Value(caller)) + t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) + return (*runApp)(nil) +} + +// ExtractErrno extracts an integer error number from the error. +// The syscall number is purely for context in the error case. Use -1 if +// syscall number is unknown. +func ExtractErrno(err error, sysno int) int { + switch err := err.(type) { + case nil: + return 0 + case syscall.Errno: + return int(err) + case SyscallRestartErrno: + return int(err) + case *memmap.BusError: + // Bus errors may generate SIGBUS, but for syscalls they still + // return EFAULT. See case in task_run.go where the fault is + // handled (and the SIGBUS is delivered). + return int(syscall.EFAULT) + case *os.PathError: + return ExtractErrno(err.Err, sysno) + case *os.LinkError: + return ExtractErrno(err.Err, sysno) + case *os.SyscallError: + return ExtractErrno(err.Err, sysno) + default: + if errno, ok := syserror.TranslateError(err); ok { + return int(errno) + } + } + panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) +} diff --git a/pkg/sentry/kernel/task_test.go b/pkg/sentry/kernel/task_test.go new file mode 100644 index 000000000..cfcde9a7a --- /dev/null +++ b/pkg/sentry/kernel/task_test.go @@ -0,0 +1,69 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" +) + +func TestTaskCPU(t *testing.T) { + for _, test := range []struct { + mask sched.CPUSet + tid ThreadID + cpu int32 + }{ + { + mask: []byte{0xff}, + tid: 1, + cpu: 0, + }, + { + mask: []byte{0xff}, + tid: 10, + cpu: 1, + }, + { + // more than 8 cpus. 
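+			// assignCPU distributes TIDs round-robin over the allowed
+			// CPUs, so TID 10 is expected to land on CPU 9 here.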
+ mask: []byte{0xff, 0xff}, + tid: 10, + cpu: 9, + }, + { + // missing the first cpu. + mask: []byte{0xfe}, + tid: 1, + cpu: 1, + }, + { + mask: []byte{0xfe}, + tid: 10, + cpu: 3, + }, + { + // missing the fifth cpu. + mask: []byte{0xef}, + tid: 10, + cpu: 2, + }, + } { + assigned := assignCPU(test.mask, test.tid) + if test.cpu != assigned { + t.Errorf("assignCPU(%v, %v) got %v, want %v", test.mask, test.tid, assigned, test.cpu) + } + } + +} diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go new file mode 100644 index 000000000..b02044ad2 --- /dev/null +++ b/pkg/sentry/kernel/task_usermem.go @@ -0,0 +1,301 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// Reads and writes that exceed this size may be silently truncated. +// (Linux: include/linux/fs.h:MAX_RW_COUNT) +var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) + +// Activate ensures that the task has an active address space. +func (t *Task) Activate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Activate(t); err != nil { + panic("unable to activate mm: " + err.Error()) + } + } +} + +// Deactivate relinquishes the task's active address space. +func (t *Task) Deactivate() { + if mm := t.MemoryManager(); mm != nil { + mm.Deactivate() + } +} + +// CopyIn copies a fixed-size value or slice of fixed-size values in from the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { + return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInBytes is a fast version of CopyIn if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOut copies a fixed-size value or slice of fixed-size values out to the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not writeable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) { + return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOutBytes is a fast version of CopyOut if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. 
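+//
+// For example, a syscall implementation that has already serialized a value
+// into a byte slice buf could write it to a user address uaddr (both names
+// illustrative) with:
+//
+//	if _, err := t.CopyOutBytes(uaddr, buf); err != nil {
+//		return 0, nil, err
+//	}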
+func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInString copies a NUL-terminated string of length at most maxlen in from +// the task's memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { + return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInVector copies a NULL-terminated vector of strings from the task's +// memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// maxElemSize is the maximum size of each individual element. +// +// maxTotalSize is the maximum total length of all elements plus the total +// number of elements. For example, the following strings correspond to +// the following set of sizes: +// +// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) +// { "abc" } => 4 (3 for length, 1 for elements) +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { + var v []string + for { + argAddr := t.Arch().Native(0) + if _, err := t.CopyIn(addr, argAddr); err != nil { + return v, err + } + if t.Arch().Value(argAddr) == 0 { + break + } + // Each string has a zero terminating byte counted, so copying out a string + // requires at least one byte of space. Also, see the calculation below. + if maxTotalSize <= 0 { + return nil, syserror.ENOMEM + } + thisMax := maxElemSize + if maxTotalSize < thisMax { + thisMax = maxTotalSize + } + arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + if err != nil { + return v, err + } + v = append(v, arg) + addr += usermem.Addr(t.Arch().Width()) + maxTotalSize -= len(arg) + 1 + } + return v, nil +} + +// CopyOutIovecs converts src to an array of struct iovecs and copies it to the +// memory mapped at addr. +// +// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for ; !src.IsEmpty(); src = src.Tail() { + ar := src.Head() + usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + default: + return syserror.ENOSYS + } + + return nil +} + +// CopyInIovecs copies an array of numIovecs struct iovecs from the memory +// mapped at addr, converts them to usermem.AddrRanges, and returns them as a +// usermem.AddrRangeSeq. +// +// CopyInIovecs shares the following properties with Linux's +// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): +// +// - If the length of any AddrRange would exceed the range of an ssize_t, +// CopyInIovecs returns EINVAL. +// +// - If the length of any AddrRange would cause its end to overflow, +// CopyInIovecs returns EFAULT. 
+// +// - If any AddrRange would include addresses outside the application address +// range, CopyInIovecs returns EFAULT. +// +// - The combined length of all AddrRanges is limited to MAX_RW_COUNT. If the +// combined length of all AddrRanges would otherwise exceed this amount, ranges +// beyond MAX_RW_COUNT are silently truncated. +// +// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { + if numIovecs == 0 { + return usermem.AddrRangeSeq{}, nil + } + + var dst []usermem.AddrRange + if numIovecs > 1 { + dst = make([]usermem.AddrRange, 0, numIovecs) + } + + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := 0; i < numIovecs; i++ { + if _, err := t.CopyInBytes(addr, b); err != nil { + return usermem.AddrRangeSeq{}, err + } + + base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) + length := usermem.ByteOrder.Uint64(b[8:16]) + if length > math.MaxInt64 { + return usermem.AddrRangeSeq{}, syserror.EINVAL + } + ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) + if !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + if numIovecs == 1 { + // Special case to avoid allocating dst. + return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil + } + dst = append(dst, ar) + + addr += itemLen + } + + default: + return usermem.AddrRangeSeq{}, syserror.ENOSYS + } + + // Truncate to MAX_RW_COUNT. + var total uint64 + for i := range dst { + dstlen := uint64(dst[i].Length()) + if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { + dst[i].End -= usermem.Addr(dstlen - rem) + dstlen = rem + } + total += dstlen + } + + return usermem.AddrRangeSeqFromSlice(dst), nil +} + +// SingleIOSequence returns a usermem.IOSequence representing [addr, +// addr+length) in t's address space. If this contains addresses outside the +// application address range, it returns EFAULT. If length exceeds +// MAX_RW_COUNT, the range is silently truncated. +// +// SingleIOSequence is analogous to Linux's +// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and +// write syscalls in Linux do not use import_single_range(). However they check +// access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address +// ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) +func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if length > MAX_RW_COUNT { + length = MAX_RW_COUNT + } + ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) + if !ok { + return usermem.IOSequence{}, syserror.EFAULT + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: usermem.AddrRangeSeqOf(ar), + Opts: opts, + }, nil +} + +// IovecsIOSequence returns a usermem.IOSequence representing the array of +// iovcnt struct iovecs at addr in t's address space. opts applies to the +// returned IOSequence, not the reading of the struct iovec array. +// +// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). +// +// Preconditions: As for Task.CopyInIovecs. 
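+//
+// For example, a writev-style implementation could build its source sequence
+// with (iovAddr and iovcnt illustrative):
+//
+//	src, err := t.IovecsIOSequence(iovAddr, iovcnt, usermem.IOOpts{
+//		AddressSpaceActive: true,
+//	})
+//	if err != nil {
+//		return 0, nil, err
+//	}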
+func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { + return usermem.IOSequence{}, syserror.EINVAL + } + ars, err := t.CopyInIovecs(addr, iovcnt) + if err != nil { + return usermem.IOSequence{}, err + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: ars, + Opts: opts, + }, nil +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go new file mode 100644 index 000000000..4dfd2c990 --- /dev/null +++ b/pkg/sentry/kernel/thread_group.go @@ -0,0 +1,531 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A ThreadGroup is a logical grouping of tasks that has widespread +// significance to other kernel features (e.g. signal handling). ("Thread +// groups" are usually called "processes" in userspace documentation.) +// +// ThreadGroup is a superset of Linux's struct signal_struct. +// +// +stateify savable +type ThreadGroup struct { + threadGroupNode + + // signalHandlers is the set of signal handlers used by every task in this + // thread group. (signalHandlers may also be shared with other thread + // groups.) + // + // signalHandlers.mu (hereafter "the signal mutex") protects state related + // to signal handling, as well as state that usually needs to be atomic + // with signal handling, for all ThreadGroups and Tasks using + // signalHandlers. (This is analogous to Linux's use of struct + // sighand_struct::siglock.) + // + // The signalHandlers pointer can only be mutated during an execve + // (Task.finishExec). Consequently, when it's possible for a task in the + // thread group to be completing an execve, signalHandlers is protected by + // the owning TaskSet.mu. Otherwise, it is possible to read the + // signalHandlers pointer without synchronization. In particular, + // completing an execve requires that all other tasks in the thread group + // have exited, so task goroutines do not need the owning TaskSet.mu to + // read the signalHandlers pointer of their thread groups. + signalHandlers *SignalHandlers + + // pendingSignals is the set of pending signals that may be handled by any + // task in this thread group. + // + // pendingSignals is protected by the signal mutex. + pendingSignals pendingSignals + + // If groupStopDequeued is true, a task in the thread group has dequeued a + // stop signal, but has not yet initiated the group stop. + // + // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED. + // + // groupStopDequeued is protected by the signal mutex. 
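+	//
+	// groupStopDequeued is set by Task.runInterrupt when a stop signal is
+	// dequeued, and checked by initiateGroupStop to detect races with
+	// SIGCONT (endGroupStopLocked clears it).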
+ groupStopDequeued bool + + // groupStopSignal is the signal that caused a group stop to be initiated. + // + // groupStopSignal is protected by the signal mutex. + groupStopSignal linux.Signal + + // groupStopPendingCount is the number of active tasks in the thread group + // for which Task.groupStopPending is set. + // + // groupStopPendingCount is analogous to Linux's + // signal_struct::group_stop_count. + // + // groupStopPendingCount is protected by the signal mutex. + groupStopPendingCount int + + // If groupStopComplete is true, groupStopPendingCount transitioned from + // non-zero to zero without an intervening SIGCONT. + // + // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED. + // + // groupStopComplete is protected by the signal mutex. + groupStopComplete bool + + // If groupStopWaitable is true, the thread group is indicating a waitable + // group stop event (as defined by EventChildGroupStop). + // + // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set + // and group_exit_code being non-zero. + // + // groupStopWaitable is protected by the signal mutex. + groupStopWaitable bool + + // If groupContNotify is true, then a SIGCONT has recently ended a group + // stop on this thread group, and the first task to observe it should + // notify its parent. groupContInterrupted is true iff SIGCONT ended an + // incomplete group stop. If groupContNotify is false, groupContInterrupted is + // meaningless. + // + // Analogues in Linux: + // + // - groupContNotify && groupContInterrupted is represented by + // SIGNAL_CLD_STOPPED. + // + // - groupContNotify && !groupContInterrupted is represented by + // SIGNAL_CLD_CONTINUED. + // + // - !groupContNotify is represented by neither flag being set. + // + // groupContNotify and groupContInterrupted are protected by the signal + // mutex. + groupContNotify bool + groupContInterrupted bool + + // If groupContWaitable is true, the thread group is indicating a waitable + // continue event (as defined by EventGroupContinue). + // + // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED. + // + // groupContWaitable is protected by the signal mutex. + groupContWaitable bool + + // exiting is true if all tasks in the ThreadGroup should exit. exiting is + // analogous to Linux's SIGNAL_GROUP_EXIT. + // + // exiting is protected by the signal mutex. exiting can only transition + // from false to true. + exiting bool + + // exitStatus is the thread group's exit status. + // + // While exiting is false, exitStatus is protected by the signal mutex. + // When exiting becomes true, exitStatus becomes immutable. + exitStatus ExitStatus + + // terminationSignal is the signal that this thread group's leader will + // send to its parent when it exits. + // + // terminationSignal is protected by the TaskSet mutex. + terminationSignal linux.Signal + + // liveGoroutines is the number of non-exited task goroutines in the thread + // group. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + timerMu sync.Mutex `state:"nosave"` + + // itimerRealTimer implements ITIMER_REAL for the thread group. + itimerRealTimer *ktime.Timer + + // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group. + // + // itimerVirtSetting is protected by the signal mutex. + itimerVirtSetting ktime.Setting + + // itimerProfSetting is the ITIMER_PROF setting for the thread group. 
+ // + // itimerProfSetting is protected by the signal mutex. + itimerProfSetting ktime.Setting + + // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit + // notifications for the thread group. + // + // rlimitCPUSoftSetting is protected by the signal mutex. + rlimitCPUSoftSetting ktime.Setting + + // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true, + // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true, + // or limits.Get(CPU) is finite. + // + // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is + // accessed using atomic memory operations. + cpuTimersEnabled uint32 + + // timers is the thread group's POSIX interval timers. nextTimerID is the + // TimerID at which allocation should begin searching for an unused ID. + // + // timers and nextTimerID are protected by timerMu. + timers map[linux.TimerID]*IntervalTimer + nextTimerID linux.TimerID + + // exitedCPUStats is the CPU usage for all exited tasks in the thread + // group. exitedCPUStats is protected by the TaskSet mutex. + exitedCPUStats usage.CPUStats + + // childCPUStats is the CPU usage of all joined descendants of this thread + // group. childCPUStats is protected by the TaskSet mutex. + childCPUStats usage.CPUStats + + // ioUsage is the I/O usage for all exited tasks in the thread group. + // The ioUsage pointer is immutable. + ioUsage *usage.IO + + // maxRSS is the historical maximum resident set size of the thread group, updated when: + // + // - A task in the thread group exits, since after all tasks have + // exited the MemoryManager is no longer reachable. + // + // - The thread group completes an execve, since this changes + // MemoryManagers. + // + // maxRSS is protected by the TaskSet mutex. + maxRSS uint64 + + // childMaxRSS is the maximum resident set size in bytes of all joined + // descendants of this thread group. + // + // childMaxRSS is protected by the TaskSet mutex. + childMaxRSS uint64 + + // Resource limits for this ThreadGroup. The limits pointer is immutable. + limits *limits.LimitSet + + // processGroup is the processGroup for this thread group. + // + // processGroup is protected by the TaskSet mutex. + processGroup *ProcessGroup + + // execed indicates an exec has occurred since creation. This will be + // set by finishExec, and new TheadGroups will have this field cleared. + // When execed is set, the processGroup may no longer be changed. + // + // execed is protected by the TaskSet mutex. + execed bool + + // oldRSeqCritical is the thread group's old rseq critical region. + oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"` + + // mounts is the thread group's mount namespace. This does not really + // correspond to a "mount namespace" in Linux, but is more like a + // complete VFS that need not be shared between processes. See the + // comment in mounts.go for more information. + // + // mounts is immutable. + mounts *fs.MountNamespace + + // tty is the thread group's controlling terminal. If nil, there is no + // controlling terminal. + // + // tty is protected by the signal mutex. + tty *TTY + + // oomScoreAdj is the thread group's OOM score adjustment. This is + // currently not used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is accessed using atomic memory operations. + oomScoreAdj int32 +} + +// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The +// thread group leader will send its parent terminationSignal when it exits. 
+// The new thread group isn't visible to the system until a task has been +// created inside of it by a successful call to TaskSet.NewTask. +func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup { + tg := &ThreadGroup{ + threadGroupNode: threadGroupNode{ + pidns: pidns, + }, + signalHandlers: sh, + terminationSignal: terminationSignal, + ioUsage: &usage.IO{}, + limits: limits, + mounts: mntns, + } + tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) + tg.timers = make(map[linux.TimerID]*IntervalTimer) + tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) + return tg +} + +// saveOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion { + return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) +} + +// loadOldRSeqCritical is invoked by stateify. +func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) { + tg.oldRSeqCritical.Store(r) +} + +// SignalHandlers returns the signal handlers used by tg. +// +// Preconditions: The caller must provide the synchronization required to read +// tg.signalHandlers, as described in the field's comment. +func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { + return tg.signalHandlers +} + +// Limits returns tg's limits. +func (tg *ThreadGroup) Limits() *limits.LimitSet { + return tg.limits +} + +// release releases the thread group's resources. +func (tg *ThreadGroup) release() { + // Timers must be destroyed without holding the TaskSet or signal mutexes + // since timers send signals with Timer.mu locked. + tg.itimerRealTimer.Destroy() + var its []*IntervalTimer + tg.pidns.owner.mu.Lock() + tg.signalHandlers.mu.Lock() + for _, it := range tg.timers { + its = append(its, it) + } + tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved + tg.signalHandlers.mu.Unlock() + tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } + if tg.mounts != nil { + tg.mounts.DecRef() + } +} + +// forEachChildThreadGroupLocked indicates over all child ThreadGroups. +// +// Precondition: TaskSet.mu must be held. +func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + for child := range t.children { + if child == child.tg.leader { + fn(child.tg) + } + } + } +} + +// SetControllingTTY sets tty as the controlling terminal of tg. +func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "The calling process must be a session leader and not have a + // controlling terminal already." - tty_ioctl(4) + if tg.processGroup.session.leader != tg || tg.tty != nil { + return syserror.EINVAL + } + + // "If this terminal is already the controlling terminal of a different + // session group, then the ioctl fails with EPERM, unless the caller + // has the CAP_SYS_ADMIN capability and arg equals 1, in which case the + // terminal is stolen, and all processes that had it as controlling + // terminal lose it." 
- tty_ioctl(4) + if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { + // Stealing requires CAP_SYS_ADMIN in the root user namespace. + if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 { + return syserror.EPERM + } + // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. + for othertg := range tg.pidns.owner.Root.tgids { + // This won't deadlock by locking tg.signalHandlers + // because at this point: + // - We only lock signalHandlers if it's in the same + // session as the tty's controlling thread group. + // - We know that the calling thread group is not in + // the same session as the tty's controlling thread + // group. + if othertg.processGroup.session == tty.tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + othertg.signalHandlers.mu.Unlock() + } + } + } + + // Set the controlling terminal and foreground process group. + tg.tty = tty + tg.processGroup.session.foreground = tg.processGroup + // Set this as the controlling process of the terminal. + tty.tg = tg + + return nil +} + +// ReleaseControllingTTY gives up tty as the controlling tty of tg. +func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { + tty.mu.Lock() + defer tty.mu.Unlock() + + // We might be asked to set the controlling terminal of multiple + // processes, so we lock both the TaskSet and SignalHandlers. + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + // Just below, we may re-lock signalHandlers in order to send signals. + // Thus we can't defer Unlock here. + tg.signalHandlers.mu.Lock() + + if tg.tty == nil || tg.tty != tty { + tg.signalHandlers.mu.Unlock() + return syserror.ENOTTY + } + + // "If the process was session leader, then send SIGHUP and SIGCONT to + // the foreground process group and all processes in the current + // session lose their controlling terminal." - tty_ioctl(4) + // Remove tty as the controlling tty for each process in the session, + // then send them SIGHUP and SIGCONT. + + // If we're not the session leader, we don't have to do much. + if tty.tg != tg { + tg.tty = nil + tg.signalHandlers.mu.Unlock() + return nil + } + + tg.signalHandlers.mu.Unlock() + + // We're the session leader. SIGHUP and SIGCONT the foreground process + // group and remove all controlling terminals in the session. + var lastErr error + for othertg := range tg.pidns.owner.Root.tgids { + if othertg.processGroup.session == tg.processGroup.session { + othertg.signalHandlers.mu.Lock() + othertg.tty = nil + if othertg.processGroup == tg.processGroup.session.foreground { + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil { + lastErr = err + } + if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil { + lastErr = err + } + } + othertg.signalHandlers.mu.Unlock() + } + } + + return lastErr +} + +// ForegroundProcessGroup returns the process group ID of the foreground +// process group. 
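A hedged sketch of how an ioctl path might drive the two methods above. handleTIOCSCTTY and handleTIOCNOTTY are illustrative names, not the actual fs/tty wiring (which lives outside this package), and the kernel import is assumed:

// Assumed import: "gvisor.dev/gvisor/pkg/sentry/kernel"

// handleTIOCSCTTY makes tty the controlling terminal of the calling task's
// thread group. SetControllingTTY enforces that the caller is a session
// leader with no controlling terminal; stealing another session's terminal
// additionally requires CAP_SYS_ADMIN in the root user namespace and arg == 1.
func handleTIOCSCTTY(t *kernel.Task, tty *kernel.TTY, arg int32) error {
	return t.ThreadGroup().SetControllingTTY(tty, arg)
}

// handleTIOCNOTTY gives up the controlling terminal. If the caller is the
// session leader, the foreground process group receives SIGHUP and SIGCONT
// and every thread group in the session loses the terminal.
func handleTIOCNOTTY(t *kernel.Task, tty *kernel.TTY) error {
	return t.ThreadGroup().ReleaseControllingTTY(tty)
}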
+func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // "When fd does not refer to the controlling terminal of the calling + // process, -1 is returned" - tcgetpgrp(3) + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + return int32(tg.processGroup.session.foreground.id), nil +} + +// SetForegroundProcessGroup sets the foreground process group of tty to pgid. +func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) { + tty.mu.Lock() + defer tty.mu.Unlock() + + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + + // TODO(b/129283598): "If tcsetpgrp() is called by a member of a + // background process group in its session, and the calling process is + // not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all + // members of this background process group." + + // tty must be the controlling terminal. + if tg.tty != tty { + return -1, syserror.ENOTTY + } + + // pgid must be positive. + if pgid < 0 { + return -1, syserror.EINVAL + } + + // pg must not be empty. Empty process groups are removed from their + // pid namespaces. + pg, ok := tg.pidns.processGroups[pgid] + if !ok { + return -1, syserror.ESRCH + } + + // pg must be part of this process's session. + if tg.processGroup.session != pg.session { + return -1, syserror.EPERM + } + + tg.processGroup.session.foreground.id = pgid + return 0, nil +} + +// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. +// +// +stateify savable +type itimerRealListener struct { + tg *ThreadGroup +} + +// Notify implements ktime.TimerListener.Notify. +func (l *itimerRealListener) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM)) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l *itimerRealListener) Destroy() { +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go new file mode 100644 index 000000000..872e1a82d --- /dev/null +++ b/pkg/sentry/kernel/threads.go @@ -0,0 +1,478 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TasksLimit is the maximum number of threads for untrusted application. +// Linux doesn't really limit this directly, rather it is limited by total +// memory size, stacks allocated and a global maximum. There's no real reason +// for us to limit it either, (esp. since threads are backed by go routines), +// and we would expect to hit resource limits long before hitting this number. 
+// However, for correctness, we still check that the user doesn't exceed this +// number. +// +// Note that because of the way futexes are implemented, there *are* in fact +// serious restrictions on valid thread IDs. They are limited to 2^30 - 1 +// (kernel/fork.c:MAX_THREADS). +const TasksLimit = (1 << 16) + +// ThreadID is a generic thread identifier. +type ThreadID int32 + +// String returns a decimal representation of the ThreadID. +func (tid ThreadID) String() string { + return fmt.Sprintf("%d", tid) +} + +// InitTID is the TID given to the first task added to each PID namespace. The +// thread group led by InitTID is called the namespace's init process. The +// death of a PID namespace's init process causes all tasks visible in that +// namespace to be killed. +const InitTID ThreadID = 1 + +// A TaskSet comprises all tasks in a system. +// +// +stateify savable +type TaskSet struct { + // mu protects all relationships betweens tasks and thread groups in the + // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) + mu sync.RWMutex `state:"nosave"` + + // Root is the root PID namespace, in which all tasks in the TaskSet are + // visible. The Root pointer is immutable. + Root *PIDNamespace + + // sessions is the set of all sessions. + sessions sessionList + + // stopCount is the number of active external stops applicable to all tasks + // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been + // paired with a call to TaskSet.EndExternalStop). stopCount is protected + // by mu. + // + // stopCount is not saved for the same reason as Task.stopCount; it is + // always reset to zero after restore. + stopCount int32 `state:"nosave"` + + // liveGoroutines is the number of non-exited task goroutines in the + // TaskSet. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // runningGoroutines is the number of running task goroutines in the + // TaskSet. + // + // runningGoroutines is not saved; its counter value is required to be zero + // at time of save (but note that this is not necessarily the same thing as + // sync.WaitGroup's zero value). + runningGoroutines sync.WaitGroup `state:"nosave"` + + // aioGoroutines is the number of goroutines running async I/O + // callbacks. + // + // aioGoroutines is not saved but is required to be zero at the time of + // save. + aioGoroutines sync.WaitGroup `state:"nosave"` +} + +// newTaskSet returns a new, empty TaskSet. +func newTaskSet(pidns *PIDNamespace) *TaskSet { + ts := &TaskSet{Root: pidns} + pidns.owner = ts + return ts +} + +// forEachThreadGroupLocked applies f to each thread group in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { + for tg := range ts.Root.tgids { + f(tg) + } +} + +// A PIDNamespace represents a PID namespace, a bimap between thread IDs and +// tasks. See the pid_namespaces(7) man page for further details. +// +// N.B. A task is said to be visible in a PID namespace if the PID namespace +// contains a thread ID that maps to that task. +// +// +stateify savable +type PIDNamespace struct { + // owner is the TaskSet that this PID namespace belongs to. The owner + // pointer is immutable. + owner *TaskSet + + // parent is the PID namespace of the process that created this one. If + // this is the root PID namespace, parent is nil. The parent pointer is + // immutable. 
+ // + // Invariant: All tasks that are visible in this namespace are also visible + // in all ancestor namespaces. + parent *PIDNamespace + + // userns is the user namespace with which this PID namespace is + // associated. Privileged operations on this PID namespace must have + // appropriate capabilities in userns. The userns pointer is immutable. + userns *auth.UserNamespace + + // The following fields are protected by owner.mu. + + // last is the last ThreadID to be allocated in this namespace. + last ThreadID + + // tasks is a mapping from ThreadIDs in this namespace to tasks visible in + // the namespace. + tasks map[ThreadID]*Task + + // tids is a mapping from tasks visible in this namespace to their + // identifiers in this namespace. + tids map[*Task]ThreadID + + // tgids is a mapping from thread groups visible in this namespace to + // their identifiers in this namespace. + // + // The content of tgids is equivalent to tids[tg.leader]. This exists + // primarily as an optimization to quickly find all thread groups. + tgids map[*ThreadGroup]ThreadID + + // sessions is a mapping from SessionIDs in this namespace to sessions + // visible in the namespace. + sessions map[SessionID]*Session + + // sids is a mapping from sessions visible in this namespace to their + // identifiers in this namespace. + sids map[*Session]SessionID + + // processGroups is a mapping from ProcessGroupIDs in this namespace to + // process groups visible in the namespace. + processGroups map[ProcessGroupID]*ProcessGroup + + // pgids is a mapping from process groups visible in this namespace to + // their identifiers in this namespace. + pgids map[*ProcessGroup]ProcessGroupID + + // exiting indicates that the namespace's init process is exiting or has + // exited. + exiting bool +} + +func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { + return &PIDNamespace{ + owner: ts, + parent: parent, + userns: userns, + tasks: make(map[ThreadID]*Task), + tids: make(map[*Task]ThreadID), + tgids: make(map[*ThreadGroup]ThreadID), + sessions: make(map[SessionID]*Session), + sids: make(map[*Session]SessionID), + processGroups: make(map[ProcessGroupID]*ProcessGroup), + pgids: make(map[*ProcessGroup]ProcessGroupID), + } +} + +// NewRootPIDNamespace creates the root PID namespace. 'owner' is not available +// yet when root namespace is created and must be set by caller. +func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(nil, nil, userns) +} + +// NewChild returns a new, empty PID namespace that is a child of ns. Authority +// over the new PID namespace is controlled by userns. +func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(ns.owner, ns, userns) +} + +// TaskWithID returns the task with thread ID tid in PID namespace ns. If no +// task has that TID, TaskWithID returns nil. +func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { + ns.owner.mu.RLock() + t := ns.tasks[tid] + ns.owner.mu.RUnlock() + return t +} + +// ThreadGroupWithID returns the thread group lead by the task with thread ID +// tid in PID namespace ns. If no task has that TID, or if the task with that +// TID is not a thread group leader, ThreadGroupWithID returns nil. 
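A hedged sketch of the TID bimap in use: translating a thread ID from one PID namespace into another. translateTID is an illustrative helper (IDOfTask, used here, is defined just below), and the kernel import is assumed:

// Assumed import: "gvisor.dev/gvisor/pkg/sentry/kernel"

// translateTID returns the ID that the task known as tid in the "from"
// namespace has in the "to" namespace, or 0 if the task is not visible
// there, matching the IDOfTask convention.
func translateTID(tid kernel.ThreadID, from, to *kernel.PIDNamespace) kernel.ThreadID {
	t := from.TaskWithID(tid)
	if t == nil {
		return 0 // no task with this TID in the source namespace
	}
	return to.IDOfTask(t)
}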
+func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + t := ns.tasks[tid] + if t == nil { + return nil + } + if t != t.tg.leader { + return nil + } + return t.tg +} + +// IDOfTask returns the TID assigned to the given task in PID namespace ns. If +// the task is not visible in that namespace, IDOfTask returns 0. (This return +// value is significant in some cases, e.g. getppid() is documented as +// returning 0 if the caller's parent is in an ancestor namespace and +// consequently not visible to the caller.) If the task is nil, IDOfTask returns +// 0. +func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { + ns.owner.mu.RLock() + id := ns.tids[t] + ns.owner.mu.RUnlock() + return id +} + +// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. +// If the task is not visible in that namespace, IDOfThreadGroup returns 0. +func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { + ns.owner.mu.RLock() + id := ns.tgids[tg] + ns.owner.mu.RUnlock() + return id +} + +// Tasks returns a snapshot of the tasks in ns. +func (ns *PIDNamespace) Tasks() []*Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + tasks := make([]*Task, 0, len(ns.tasks)) + for t := range ns.tids { + tasks = append(tasks, t) + } + return tasks +} + +// ThreadGroups returns a snapshot of the thread groups in ns. +func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + return ns.ThreadGroupsAppend(nil) +} + +// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. +func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + for tg := range ns.tgids { + tgs = append(tgs, tg) + } + return tgs +} + +// UserNamespace returns the user namespace associated with PID namespace ns. +func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { + return ns.userns +} + +// A threadGroupNode defines the relationship between a thread group and the +// rest of the system. Conceptually, threadGroupNode is data belonging to the +// owning TaskSet, as if TaskSet contained a field `nodes +// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, +// threadGroupNode is embedded in the ThreadGroup it represents. +// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose +// threadGroupEntry's methods on ThreadGroup to make it implement +// threadGroupLinker.) +// +// +stateify savable +type threadGroupNode struct { + // pidns is the PID namespace containing the thread group and all of its + // member tasks. The pidns pointer is immutable. + pidns *PIDNamespace + + // eventQueue is notified whenever a event of interest to Task.Wait occurs + // in a child of this thread group, or a ptrace tracee of a task in this + // thread group. Events are defined in task_exit.go. + // + // Note that we cannot check and save this wait queue similarly to other + // wait queues, as the queue will not be empty by the time of saving, due + // to the wait sourced from Exec(). + eventQueue waiter.Queue `state:"nosave"` + + // leader is the thread group's leader, which is the oldest task in the + // thread group; usually the last task in the thread group to call + // execve(), or if no such task exists then the first task in the thread + // group, which was created by a call to fork() or clone() without + // CLONE_THREAD. Once a thread group has been made visible to the rest of + // the system by TaskSet.newTask, leader is never nil. 
+ // + // Note that it's possible for the leader to exit without causing the rest + // of the thread group to exit; in such a case, leader will still be valid + // and non-nil, but leader will not be in tasks. + // + // leader is protected by the TaskSet mutex. + leader *Task + + // If execing is not nil, it is a task in the thread group that has killed + // all other tasks so that it can become the thread group leader and + // perform an execve. (execing may already be the thread group leader.) + // + // execing is analogous to Linux's signal_struct::group_exit_task. + // + // execing is protected by the TaskSet mutex. + execing *Task + + // tasks is all tasks in the thread group that have not yet been reaped. + // + // tasks is protected by both the TaskSet mutex and the signal mutex: + // Mutating tasks requires locking the TaskSet mutex for writing *and* + // locking the signal mutex. Reading tasks requires locking the TaskSet + // mutex *or* locking the signal mutex. + tasks taskList + + // tasksCount is the number of tasks in the thread group that have not yet + // been reaped; equivalently, tasksCount is the number of tasks in tasks. + // + // tasksCount is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + tasksCount int + + // liveTasks is the number of tasks in the thread group that have not yet + // reached TaskExitZombie. + // + // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). + liveTasks int + + // activeTasks is the number of tasks in the thread group that have not yet + // reached TaskExitInitiated. + // + // activeTasks is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + activeTasks int +} + +// PIDNamespace returns the PID namespace containing tg. +func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { + return tg.pidns +} + +// TaskSet returns the TaskSet containing tg. +func (tg *ThreadGroup) TaskSet() *TaskSet { + return tg.pidns.owner +} + +// Leader returns tg's leader. +func (tg *ThreadGroup) Leader() *Task { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.leader +} + +// Count returns the number of non-exited threads in the group. +func (tg *ThreadGroup) Count() int { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + var count int + for t := tg.tasks.Front(); t != nil; t = t.Next() { + count++ + } + return count +} + +// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for +// all tasks in tg. +func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + var tasks []ThreadID + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if id, ok := pidns.tids[t]; ok { + tasks = append(tasks, id) + } + } + return tasks +} + +// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader +// is dead, ID returns 0. +func (tg *ThreadGroup) ID() ThreadID { + tg.pidns.owner.mu.RLock() + id := tg.pidns.tgids[tg] + tg.pidns.owner.mu.RUnlock() + return id +} + +// A taskNode defines the relationship between a task and the rest of the +// system. The comments on threadGroupNode also apply to taskNode. +// +// +stateify savable +type taskNode struct { + // tg is the thread group that this task belongs to. The tg pointer is + // immutable. + tg *ThreadGroup `state:"wait"` + + // taskEntry links into tg.tasks. Note that this means that + // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread + // group. 
See threadGroupNode.tasks for synchronization info. + taskEntry + + // parent is the task's parent. parent may be nil. + // + // parent is protected by the TaskSet mutex. + parent *Task + + // children is this task's children. + // + // children is protected by the TaskSet mutex. + children map[*Task]struct{} + + // If childPIDNamespace is not nil, all new tasks created by this task will + // be members of childPIDNamespace rather than this one. (As a corollary, + // this task becomes unable to create sibling tasks in the same thread + // group.) + // + // childPIDNamespace is exclusive to the task goroutine. + childPIDNamespace *PIDNamespace +} + +// ThreadGroup returns the thread group containing t. +func (t *Task) ThreadGroup() *ThreadGroup { + return t.tg +} + +// PIDNamespace returns the PID namespace containing t. +func (t *Task) PIDNamespace() *PIDNamespace { + return t.tg.pidns +} + +// TaskSet returns the TaskSet containing t. +func (t *Task) TaskSet() *TaskSet { + return t.tg.pidns.owner +} + +// Timekeeper returns the system Timekeeper. +func (t *Task) Timekeeper() *Timekeeper { + return t.k.timekeeper +} + +// Parent returns t's parent. +func (t *Task) Parent() *Task { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.parent +} + +// ThreadID returns t's thread ID in its own PID namespace. If the task is +// dead, ThreadID returns 0. +func (t *Task) ThreadID() ThreadID { + return t.tg.pidns.IDOfTask(t) +} diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD new file mode 100644 index 000000000..7ba7dc50c --- /dev/null +++ b/pkg/sentry/kernel/time/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "time", + srcs = [ + "context.go", + "time.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sync", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go new file mode 100644 index 000000000..00b729d88 --- /dev/null +++ b/pkg/sentry/kernel/time/context.go @@ -0,0 +1,44 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +// contextID is the time package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxRealtimeClock is a Context.Value key for the current real time. + CtxRealtimeClock contextID = iota +) + +// RealtimeClockFromContext returns the real time clock associated with context +// ctx. +func RealtimeClockFromContext(ctx context.Context) Clock { + if v := ctx.Value(CtxRealtimeClock); v != nil { + return v.(Clock) + } + return nil +} + +// NowFromContext returns the current real time associated with context ctx. 
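A hedged sketch of a consumer of the helpers above. stampNow is an illustrative function that prefers the non-panicking lookup for contexts that may not carry a realtime clock; imports of pkg/context and of this package (as ktime) are assumed:

// stampNow returns the current real time from ctx, or the zero time if the
// context carries no CtxRealtimeClock value (NowFromContext would panic in
// that case).
func stampNow(ctx context.Context) ktime.Time {
	if clk := ktime.RealtimeClockFromContext(ctx); clk != nil {
		return clk.Now()
	}
	return ktime.ZeroTime
}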
+func NowFromContext(ctx context.Context) Time { + if clk := RealtimeClockFromContext(ctx); clk != nil { + return clk.Now() + } + panic("encountered context without RealtimeClock") +} diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go new file mode 100644 index 000000000..e959700f2 --- /dev/null +++ b/pkg/sentry/kernel/time/time.go @@ -0,0 +1,709 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package time defines the Timer type, which provides a periodic timer that +// works by sampling a user-provided clock. +package time + +import ( + "fmt" + "math" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Events that may be generated by a Clock. +const ( + // ClockEventSet occurs when a Clock undergoes a discontinuous change. + ClockEventSet waiter.EventMask = 1 << iota + + // ClockEventRateIncrease occurs when the rate at which a Clock advances + // increases significantly, such that values returned by previous calls to + // Clock.WallTimeUntil may be too large. + ClockEventRateIncrease +) + +// Time represents an instant in time with nanosecond precision. +// +// Time may represent time with respect to any clock and may not have any +// meaning in the real world. +// +// +stateify savable +type Time struct { + ns int64 +} + +var ( + // MinTime is the zero time instant, the lowest possible time that can + // be represented by Time. + MinTime = Time{ns: math.MinInt64} + + // MaxTime is the highest possible time that can be represented by + // Time. + MaxTime = Time{ns: math.MaxInt64} + + // ZeroTime represents the zero time in an unspecified Clock's domain. + ZeroTime = Time{ns: 0} +) + +const ( + // MinDuration is the minimum duration representable by time.Duration. + MinDuration = time.Duration(math.MinInt64) + + // MaxDuration is the maximum duration representable by time.Duration. + MaxDuration = time.Duration(math.MaxInt64) +) + +// FromNanoseconds returns a Time representing the point ns nanoseconds after +// an unspecified Clock's zero time. +func FromNanoseconds(ns int64) Time { + return Time{ns} +} + +// FromSeconds returns a Time representing the point s seconds after an +// unspecified Clock's zero time. +func FromSeconds(s int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + return Time{s * 1e9} +} + +// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real +// time Unix clock domain. +func FromUnix(s int64, ns int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + t := s * 1e9 + if t > math.MaxInt64-ns { + return MaxTime + } + return Time{t + ns} +} + +// FromTimespec converts from Linux Timespec to Time. +func FromTimespec(ts linux.Timespec) Time { + return Time{ts.ToNsecCapped()} +} + +// FromTimeval converts a Linux Timeval to Time. 
+func FromTimeval(tv linux.Timeval) Time { + return Time{tv.ToNsecCapped()} +} + +// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock +// domain. If t represents walltime, this is nanoseconds since the Unix epoch. +func (t Time) Nanoseconds() int64 { + return t.ns +} + +// Seconds returns seconds elapsed since the zero time in t's Clock domain. If +// t represents walltime, this is seconds since Unix epoch. +func (t Time) Seconds() int64 { + return t.Nanoseconds() / time.Second.Nanoseconds() +} + +// Timespec converts Time to a Linux timespec. +func (t Time) Timespec() linux.Timespec { + return linux.NsecToTimespec(t.Nanoseconds()) +} + +// Unix returns the (seconds, nanoseconds) representation of t such that +// seconds*1e9 + nanoseconds = t. +func (t Time) Unix() (s int64, ns int64) { + s = t.ns / 1e9 + ns = t.ns % 1e9 + return +} + +// TimeT converts Time to a Linux time_t. +func (t Time) TimeT() linux.TimeT { + return linux.NsecToTimeT(t.Nanoseconds()) +} + +// Timeval converts Time to a Linux timeval. +func (t Time) Timeval() linux.Timeval { + return linux.NsecToTimeval(t.Nanoseconds()) +} + +// StatxTimestamp converts Time to a Linux statx_timestamp. +func (t Time) StatxTimestamp() linux.StatxTimestamp { + return linux.NsecToStatxTimestamp(t.Nanoseconds()) +} + +// Add adds the duration of d to t. +func (t Time) Add(d time.Duration) Time { + if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { + return MaxTime + } + if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { + return MinTime + } + return Time{int64(t.ns) + d.Nanoseconds()} +} + +// AddTime adds the duration of u to t. +func (t Time) AddTime(u Time) Time { + return t.Add(time.Duration(u.ns)) +} + +// Equal reports whether the two times represent the same instant in time. +func (t Time) Equal(u Time) bool { + return t.ns == u.ns +} + +// Before reports whether the instant t is before the instant u. +func (t Time) Before(u Time) bool { + return t.ns < u.ns +} + +// After reports whether the instant t is after the instant u. +func (t Time) After(u Time) bool { + return t.ns > u.ns +} + +// Sub returns the duration of t - u. +// +// N.B. This measure may not make sense for every Time returned by ktime.Clock. +// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to +// estimate that wall time. +func (t Time) Sub(u Time) time.Duration { + dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond + switch { + case u.Add(dur).Equal(t): + return dur + case t.Before(u): + return MinDuration + default: + return MaxDuration + } +} + +// IsMin returns whether t represents the lowest possible time instant. +func (t Time) IsMin() bool { + return t == MinTime +} + +// IsZero returns whether t represents the zero time instant in t's Clock domain. +func (t Time) IsZero() bool { + return t == ZeroTime +} + +// String returns the time represented in nanoseconds as a string. +func (t Time) String() string { + return fmt.Sprintf("%dns", t.Nanoseconds()) +} + +// A Clock is an abstract time source. +type Clock interface { + // Now returns the current time in nanoseconds according to the Clock. + Now() Time + + // WallTimeUntil returns the estimated wall time until Now will return a + // value greater than or equal to t, given that a recent call to Now + // returned now. If t has already passed, WallTimeUntil may return 0 or a + // negative value. + // + // WallTimeUntil must be abstract to support Clocks that do not represent + // wall time (e.g. thread group execution timers). 
Clocks that represent + // wall times may embed the WallRateClock type to obtain an appropriate + // trivial implementation of WallTimeUntil. + // + // WallTimeUntil is used to determine when associated Timers should next + // check for expirations. Returning too small a value may result in + // spurious Timer goroutine wakeups, while returning too large a value may + // result in late expirations. Implementations should usually err on the + // side of underestimating. + WallTimeUntil(t, now Time) time.Duration + + // Waitable methods may be used to subscribe to Clock events. Waiters will + // not be preserved by Save and must be re-established during restore. + // + // Since Clock events are transient, implementations of + // waiter.Waitable.Readiness should return 0. + waiter.Waitable +} + +// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the +// same rate as wall time. +type WallRateClock struct{} + +// WallTimeUntil implements Clock.WallTimeUntil. +func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration { + return t.Sub(now) +} + +// NoClockEvents implements waiter.Waitable for Clocks that do not generate +// events. +type NoClockEvents struct{} + +// Readiness implements waiter.Waitable.Readiness. +func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (*NoClockEvents) EventUnregister(e *waiter.Entry) { +} + +// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and +// defining waiter.Waitable.Readiness as required by Clock. +type ClockEventsQueue struct { + waiter.Queue +} + +// Readiness implements waiter.Waitable.Readiness. +func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// A TimerListener receives expirations from a Timer. +type TimerListener interface { + // Notify is called when its associated Timer expires. exp is the number of + // expirations. setting is the next timer Setting. + // + // Notify is called with the associated Timer's mutex locked, so Notify + // must not take any locks that precede Timer.mu in lock order. + // + // If Notify returns true, the timer will use the returned setting + // rather than the passed one. + // + // Preconditions: exp > 0. + Notify(exp uint64, setting Setting) (newSetting Setting, update bool) + + // Destroy is called when the timer is destroyed. + Destroy() +} + +// Setting contains user-controlled mutable Timer properties. +// +// +stateify savable +type Setting struct { + // Enabled is true if the timer is running. + Enabled bool + + // Next is the time in nanoseconds of the next expiration. + Next Time + + // Period is the time in nanoseconds between expirations. If Period is + // zero, the timer will not automatically restart after expiring. + // + // Invariant: Period >= 0. + Period time.Duration +} + +// SettingFromSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as a time relative to c.Now(). +func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + return SettingFromSpecAt(value, interval, c.Now()) +} + +// SettingFromSpecAt converts a (value, interval) pair to a Setting. value is +// interpreted as a time relative to now. 
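A hedged sketch of the two extension points described above: a Clock assembled from the helper types (it elapses at wall rate and emits no events) and a TimerListener that only counts expirations. hostMonotonicClock and countingListener are illustrative types, with imports of time and of this package (as ktime) assumed:

// hostMonotonicClock satisfies Clock: Now measures time elapsed since start,
// WallTimeUntil is promoted from the embedded WallRateClock, and the waiter
// methods come from NoClockEvents.
type hostMonotonicClock struct {
	ktime.WallRateClock
	ktime.NoClockEvents
	start time.Time
}

func (c *hostMonotonicClock) Now() ktime.Time {
	return ktime.FromNanoseconds(time.Since(c.start).Nanoseconds())
}

// countingListener records how many expirations it has been notified of and
// never overrides the Timer's Setting.
type countingListener struct{ n uint64 }

func (l *countingListener) Notify(exp uint64, _ ktime.Setting) (ktime.Setting, bool) {
	l.n += exp
	return ktime.Setting{}, false
}

func (l *countingListener) Destroy() {}

// Typical wiring: ktime.NewTimer(&hostMonotonicClock{start: time.Now()}, &countingListener{}).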
+func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { + if value < 0 { + return Setting{}, syserror.EINVAL + } + if value == 0 { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: now.Add(value), + Period: interval, + }, nil +} + +// SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is +// interpreted as an absolute time. +func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { + if value.Before(ZeroTime) { + return Setting{}, syserror.EINVAL + } + if value.IsZero() { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: value, + Period: interval, + }, nil +} + +// SettingFromItimerspec converts a linux.Itimerspec to a Setting. If abs is +// true, its.Value is interpreted as an absolute time. Otherwise, it is +// interpreted as a time relative to c.Now(). +func SettingFromItimerspec(its linux.Itimerspec, abs bool, c Clock) (Setting, error) { + if abs { + return SettingFromAbsSpec(FromTimespec(its.Value), its.Interval.ToDuration()) + } + return SettingFromSpec(its.Value.ToDuration(), its.Interval.ToDuration(), c) +} + +// SpecFromSetting converts a timestamp and a Setting to a (relative value, +// interval) pair, as used by most Linux syscalls that return a struct +// itimerval or struct itimerspec. +func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { + if !s.Enabled { + return 0, s.Period + } + return s.Next.Sub(now), s.Period +} + +// ItimerspecFromSetting converts a Setting to a linux.Itimerspec. +func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { + val, iv := SpecFromSetting(now, s) + return linux.Itimerspec{ + Interval: linux.DurationToTimespec(iv), + Value: linux.DurationToTimespec(val), + } +} + +// At returns an updated Setting and a number of expirations after the +// associated Clock indicates a time of now. +// +// Settings may be created by successive calls to At with decreasing +// values of now (i.e. time may appear to go backward). Supporting this is +// required to support non-monotonic clocks, as well as allowing +// Timer.clock.Now() to be called without holding Timer.mu. +func (s Setting) At(now Time) (Setting, uint64) { + if !s.Enabled { + return s, 0 + } + if s.Next.After(now) { + return s, 0 + } + if s.Period == 0 { + s.Enabled = false + return s, 1 + } + exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) + s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) + return s, exp +} + +// Timer is an optionally-periodic timer driven by sampling a user-specified +// Clock. Timer's semantics support the requirements of Linux's interval timers +// (setitimer(2), timer_create(2), timerfd_create(2)). +// +// Timers should be created using NewTimer and must be cleaned up by calling +// Timer.Destroy when no longer used. +// +// +stateify savable +type Timer struct { + // clock is the time source. clock is immutable. + clock Clock + + // listener is notified of expirations. listener is immutable. + listener TimerListener + + // mu protects the following mutable fields. + mu sync.Mutex `state:"nosave"` + + // setting is the timer setting. setting is protected by mu. + setting Setting + + // paused is true if the Timer is paused. paused is protected by mu. + paused bool + + // kicker is used to wake the Timer goroutine. The kicker pointer is + // immutable, but its state is protected by mu. 
+ kicker *time.Timer `state:"nosave"` + + // entry is registered with clock.EventRegister. entry is immutable. + // + // Per comment in Clock, entry must be re-registered after restore; per + // comment in Timer.Load, this is done in Timer.Resume. + entry waiter.Entry `state:"nosave"` + + // events is the channel that will be notified whenever entry receives an + // event. It is also closed by Timer.Destroy to instruct the Timer + // goroutine to exit. + events chan struct{} `state:"nosave"` +} + +// timerTickEvents are Clock events that require the Timer goroutine to Tick +// prematurely. +const timerTickEvents = ClockEventSet | ClockEventRateIncrease + +// NewTimer returns a new Timer that will obtain time from clock and send +// expirations to listener. The Timer is initially stopped and has no first +// expiration or period configured. +func NewTimer(clock Clock, listener TimerListener) *Timer { + t := &Timer{ + clock: clock, + listener: listener, + } + t.init() + return t +} + +// After waits for the duration to elapse according to clock and then sends a +// notification on the returned channel. The timer is started immediately and +// will fire exactly once. The second return value is the start time used with +// the duration. +// +// Callers must call Timer.Destroy. +func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) { + notifier, tchan := NewChannelNotifier() + t := NewTimer(clock, notifier) + now := clock.Now() + + t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: now.Add(duration), + }) + return t, now, tchan +} + +// init initializes Timer state that is not preserved across save/restore. If +// init has already been called, calling it again is a no-op. +// +// Preconditions: t.mu must be locked, or the caller must have exclusive access +// to t. +func (t *Timer) init() { + if t.kicker != nil { + return + } + // If t.kicker is nil, the Timer goroutine can't be running, so we can't + // race with it. + t.kicker = time.NewTimer(0) + t.entry, t.events = waiter.NewChannelEntry(nil) + t.clock.EventRegister(&t.entry, timerTickEvents) + go t.runGoroutine() // S/R-SAFE: synchronized by t.mu +} + +// Destroy releases resources owned by the Timer. A Destroyed Timer must not be +// used again; in particular, a Destroyed Timer should not be Saved. +func (t *Timer) Destroy() { + // Stop the Timer, ensuring that the Timer goroutine will not call + // t.kicker.Reset, before calling t.kicker.Stop. + t.mu.Lock() + t.setting.Enabled = false + t.mu.Unlock() + t.kicker.Stop() + // Unregister t.entry, ensuring that the Clock will not send to t.events, + // before closing t.events to instruct the Timer goroutine to exit. + t.clock.EventUnregister(&t.entry) + close(t.events) + t.listener.Destroy() +} + +func (t *Timer) runGoroutine() { + for { + select { + case <-t.kicker.C: + case _, ok := <-t.events: + if !ok { + // Channel closed by Destroy. + return + } + } + t.Tick() + } +} + +// Tick requests that the Timer immediately check for expirations and +// re-evaluate when it should next check for expirations. +func (t *Timer) Tick() { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + return + } + s, exp := t.setting.At(now) + t.setting = s + if exp > 0 { + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } + } + t.resetKickerLocked(now) +} + +// Pause pauses the Timer, ensuring that it does not generate any further +// expirations until Resume is called. If the Timer is already paused, Pause +// has no effect. 
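A hedged sketch of the one-shot helper above in use. sleepOn is an illustrative function, assuming this package is imported as ktime along with time:

// sleepOn blocks until d has elapsed according to clk. After starts the
// Timer immediately and its channel fires exactly once; the caller must
// destroy the Timer when done.
func sleepOn(clk ktime.Clock, d time.Duration) {
	t, _, tchan := ktime.After(clk, d)
	defer t.Destroy()
	<-tchan
}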
+func (t *Timer) Pause() { + t.mu.Lock() + defer t.mu.Unlock() + t.paused = true + // t.kicker may be nil if we were restored but never resumed. + if t.kicker != nil { + t.kicker.Stop() + } +} + +// Resume ends the effect of Pause. If the Timer is not paused, Resume has no +// effect. +func (t *Timer) Resume() { + t.mu.Lock() + defer t.mu.Unlock() + if !t.paused { + return + } + t.paused = false + + // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume + // because save/restore will restore Timers before + // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed + // by a kernel.Timekeeper then the Timer goroutine will panic if it calls + // t.clock.Now(). + t.init() + + // Kick the Timer goroutine in case it was already initialized, but the + // Timer goroutine was sleeping. + t.kicker.Reset(0) +} + +// Get returns a snapshot of the Timer's current Setting and the time +// (according to the Timer's Clock) at which the snapshot was taken. +// +// Preconditions: The Timer must not be paused (since its Setting cannot +// be advanced to the current time while it is paused.) +func (t *Timer) Get() (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) + } + s, exp := t.setting.At(now) + t.setting = s + if exp > 0 { + if newS, ok := t.listener.Notify(exp, t.setting); ok { + t.setting = newS + } + } + t.resetKickerLocked(now) + return now, s +} + +// Swap atomically changes the Timer's Setting and returns the Timer's previous +// Setting and the time (according to the Timer's Clock) at which the snapshot +// was taken. Setting s.Enabled to true starts the Timer, while setting +// s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. +func (t *Timer) Swap(s Setting) (Time, Setting) { + return t.SwapAnd(s, nil) +} + +// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, +// and returns the Timer's previous Setting and the time (according to the +// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true +// starts the timer, while setting s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. f cannot call any Timer methods +// since it is called with the Timer mutex locked. +func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) + } + oldS, oldExp := t.setting.At(now) + if oldExp > 0 { + t.listener.Notify(oldExp, oldS) + // N.B. The returned Setting doesn't matter because we're about + // to overwrite. + } + if f != nil { + f() + } + newS, newExp := s.At(now) + t.setting = newS + if newExp > 0 { + if newS, ok := t.listener.Notify(newExp, t.setting); ok { + t.setting = newS + } + } + t.resetKickerLocked(now) + return now, oldS +} + +// Atomically invokes f atomically with respect to expirations of t; that is, t +// cannot generate expirations while f is being called. +// +// Preconditions: f cannot call any Timer methods since it is called with the +// Timer mutex locked. +func (t *Timer) Atomically(f func()) { + t.mu.Lock() + defer t.mu.Unlock() + f() +} + +// Preconditions: t.mu must be locked. +func (t *Timer) resetKickerLocked(now Time) { + if t.setting.Enabled { + // Clock.WallTimeUntil may return a negative value. This is fine; + // time.when treats negative Durations as 0. 
+ t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) + } + // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases + // resetKickerLocked will be called from the Timer goroutine itself, in + // which case t.kicker has already fired and t.kicker.Stop will be an + // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer + // => runtime.deltimer). +} + +// Clock returns the Clock used by t. +func (t *Timer) Clock() Clock { + return t.clock +} + +// ChannelNotifier is a TimerListener that sends a message on an empty struct +// channel. +// +// ChannelNotifier cannot be saved or loaded. +type ChannelNotifier struct { + // tchan must be a buffered channel. + tchan chan struct{} +} + +// NewChannelNotifier creates a new channel notifier. +// +// If the notifier is used with a timer, Timer.Destroy will close the channel +// returned here. +func NewChannelNotifier() (TimerListener, <-chan struct{}) { + tchan := make(chan struct{}, 1) + return &ChannelNotifier{tchan}, tchan +} + +// Notify implements ktime.TimerListener.Notify. +func (c *ChannelNotifier) Notify(uint64, Setting) (Setting, bool) { + select { + case c.tchan <- struct{}{}: + default: + } + + return Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy and will close the channel. +func (c *ChannelNotifier) Destroy() { + close(c.tchan) +} diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go new file mode 100644 index 000000000..0adf25691 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper.go @@ -0,0 +1,325 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.dev/gvisor/pkg/log" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sync" +) + +// Timekeeper manages all of the kernel clocks. +// +// +stateify savable +type Timekeeper struct { + // clocks are the clock sources. + // + // These are not saved directly, as the new machine's clock may behave + // differently. + // + // It is set only once, by SetClocks. + clocks sentrytime.Clocks `state:"nosave"` + + // bootTime is the realtime when the system "booted". i.e., when + // SetClocks was called in the initial (not restored) run. + bootTime ktime.Time + + // monotonicOffset is the offset to apply to the monotonic clock output + // from clocks. + // + // It is set only once, by SetClocks. + monotonicOffset int64 `state:"nosave"` + + // monotonicLowerBound is the lowerBound for monotonic time. + monotonicLowerBound int64 `state:"nosave"` + + // restored, if non-nil, indicates that this Timekeeper was restored + // from a state file. The clocks are not set until restored is closed. + restored chan struct{} `state:"nosave"` + + // saveMonotonic is the (offset) value of the monotonic clock at the + // time of save. 
+ // + // It is only valid if restored is non-nil. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveMonotonic int64 + + // saveRealtime is the value of the realtime clock at the time of save. + // + // It is only valid if restored is non-nil. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveRealtime int64 + + // params manages the parameter page. + params *VDSOParamPage + + // mu protects destruction with stop and wg. + mu sync.Mutex `state:"nosave"` + + // stop is used to tell the update goroutine to exit. + stop chan struct{} `state:"nosave"` + + // wg is used to indicate that the update goroutine has exited. + wg sync.WaitGroup `state:"nosave"` +} + +// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. +// NewTimekeeper does not take ownership of paramPage. +// +// SetClocks must be called on the returned Timekeeper before it is usable. +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { + return &Timekeeper{ + params: NewVDSOParamPage(mfp, paramPage), + }, nil +} + +// SetClocks the backing clock source. +// +// SetClocks must be called before the Timekeeper is used, and it may not be +// called more than once, as changing the clock source without extra correction +// could cause time discontinuities. +// +// It must also be called after Load. +func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { + // Update the params, marking them "not ready", as we may need to + // restart calibration on this new machine. + if t.restored != nil { + if err := t.params.Write(func() vdsoParams { + return vdsoParams{} + }); err != nil { + panic("unable to reset VDSO params: " + err.Error()) + } + } + + if t.clocks != nil { + panic("SetClocks called on previously-initialized Timekeeper") + } + + t.clocks = c + + // Compute the offset of the monotonic clock from the base Clocks. + // + // In a fresh (not restored) sentry, monotonic time starts at zero. + // + // In a restored sentry, monotonic time jumps forward by approximately + // the same amount as real time. There are no guarantees here, we are + // just making a best-effort attempt to make it appear that the app + // was simply not scheduled for a long period, rather than that the + // real time clock was changed. + // + // If real time went backwards, it remains the same. + wantMonotonic := int64(0) + + nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Unable to get current monotonic time: " + err.Error()) + } + + nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime) + if err != nil { + panic("Unable to get current realtime: " + err.Error()) + } + + if t.restored != nil { + wantMonotonic = t.saveMonotonic + elapsed := nowRealtime - t.saveRealtime + if elapsed > 0 { + wantMonotonic += elapsed + } + } + + t.monotonicOffset = wantMonotonic - nowMonotonic + + if t.restored == nil { + // Hold on to the initial "boot" time. + t.bootTime = ktime.FromNanoseconds(nowRealtime) + } + + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() + + if t.restored != nil { + close(t.restored) + } +} + +// startUpdater starts an update goroutine that keeps the clocks updated. +// +// mu must be held. +func (t *Timekeeper) startUpdater() { + if t.stop != nil { + // Timekeeper already started + return + } + t.stop = make(chan struct{}) + + // Keep the clocks up to date. 
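
The restore arithmetic in SetClocks is easier to follow when factored out as a pure function. The sketch below restates it; monotonicOffset is a hypothetical name used only for illustration.

package offsetexample

// monotonicOffset mirrors Timekeeper.SetClocks: a restored sentry advances
// monotonic time by the real time that elapsed across save/restore, but never
// moves it backwards if realtime regressed.
func monotonicOffset(restored bool, saveMonotonic, saveRealtime, nowMonotonic, nowRealtime int64) int64 {
	wantMonotonic := int64(0)
	if restored {
		wantMonotonic = saveMonotonic
		if elapsed := nowRealtime - saveRealtime; elapsed > 0 {
			wantMonotonic += elapsed
		}
	}
	return wantMonotonic - nowMonotonic
}

For example, monotonicOffset(true, 100000, 400000, 900000, 600000) is -600000, so a raw monotonic reading of 900000 is reported as 300000, matching TestTimekeeperMonotonicForward further down in this change.
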
+ // + // Note that the Go runtime uses host CLOCK_MONOTONIC to service the + // timer, so it may run at a *slightly* different rate from the + // application CLOCK_MONOTONIC. That is fine, as we only need to update + // at approximately this rate. + timer := time.NewTicker(sentrytime.ApproxUpdateInterval) + t.wg.Add(1) + go func() { // S/R-SAFE: stopped during save. + defer t.wg.Done() + for { + // Start with an update immediately, so the clocks are + // ready ASAP. + + // Call Update within a Write block to prevent the VDSO + // from using the old params between Update and + // Write. + if err := t.params.Write(func() vdsoParams { + monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() + + var p vdsoParams + if monotonicOk { + p.monotonicReady = 1 + p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) + p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset + p.monotonicFrequency = monotonicParams.Frequency + } + if realtimeOk { + p.realtimeReady = 1 + p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) + p.realtimeBaseRef = int64(realtimeParams.BaseRef) + p.realtimeFrequency = realtimeParams.Frequency + } + + log.Debugf("Updating VDSO parameters: %+v", p) + + return p + }); err != nil { + log.Warningf("Unable to update VDSO parameter page: %v", err) + } + + select { + case <-timer.C: + case <-t.stop: + return + } + } + }() +} + +// stopUpdater stops the update goroutine, blocking until it exits. +// +// mu must be held. +func (t *Timekeeper) stopUpdater() { + if t.stop == nil { + // Updater not running. + return + } + + close(t.stop) + t.wg.Wait() + t.stop = nil +} + +// Destroy destroys the Timekeeper, freeing all associated resources. +func (t *Timekeeper) Destroy() { + t.mu.Lock() + defer t.mu.Unlock() + + t.stopUpdater() +} + +// PauseUpdates stops clock parameter updates. This should only be used when +// Tasks are not running and thus cannot access the clock. +func (t *Timekeeper) PauseUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.stopUpdater() +} + +// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. +func (t *Timekeeper) ResumeUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// GetTime returns the current time in nanoseconds. +func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { + if t.clocks == nil { + if t.restored == nil { + panic("Timekeeper used before initialized with SetClocks") + } + <-t.restored + } + now, err := t.clocks.GetTime(c) + if err == nil && c == sentrytime.Monotonic { + now += t.monotonicOffset + for { + // It's possible that the clock is shaky. This may be due to + // platform issues, e.g. the KVM platform relies on the guest + // TSC and host TSC, which may not be perfectly in sync. To + // work around this issue, ensure that the monotonic time is + // always bounded by the last time read. + oldLowerBound := atomic.LoadInt64(&t.monotonicLowerBound) + if now < oldLowerBound { + now = oldLowerBound + break + } + if atomic.CompareAndSwapInt64(&t.monotonicLowerBound, oldLowerBound, now) { + break + } + } + } + return now, err +} + +// BootTime returns the system boot real time. +func (t *Timekeeper) BootTime() ktime.Time { + return t.bootTime +} + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +// +// +stateify savable +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. 
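
The lower-bound loop in GetTime is a small lock-free idiom: readers never observe the monotonic clock moving backwards even if the underlying clock source is slightly shaky. A standalone restatement, with a package-level variable standing in for monotonicLowerBound, purely for illustration:

package clampexample

import "sync/atomic"

// lowerBound stands in for Timekeeper.monotonicLowerBound.
var lowerBound int64

// clampMonotonic returns now, unless a larger value was already reported, in
// which case that larger value is returned instead.
func clampMonotonic(now int64) int64 {
	for {
		old := atomic.LoadInt64(&lowerBound)
		if now < old {
			return old
		}
		if atomic.CompareAndSwapInt64(&lowerBound, old, now) {
			return now
		}
	}
}
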
+ ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go new file mode 100644 index 000000000..8e961c832 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/time" +) + +// beforeSave is invoked by stateify. +func (t *Timekeeper) beforeSave() { + if t.stop != nil { + panic("pauseUpdates must be called before Save") + } + + // N.B. we want the *offset* monotonic time. + var err error + if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil { + panic("unable to get current monotonic time: " + err.Error()) + } + + if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil { + panic("unable to get current realtime: " + err.Error()) + } +} + +// afterLoad is invoked by stateify. +func (t *Timekeeper) afterLoad() { + t.restored = make(chan struct{}) +} diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go new file mode 100644 index 000000000..cf2f7ca72 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -0,0 +1,156 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// mockClocks is a sentrytime.Clocks that simply returns the times in the +// struct. +type mockClocks struct { + monotonic int64 + realtime int64 +} + +// Update implements sentrytime.Clocks.Update. It does nothing. +func (*mockClocks) Update() (monotonicParams sentrytime.Parameters, monotonicOk bool, realtimeParam sentrytime.Parameters, realtimeOk bool) { + return +} + +// Update implements sentrytime.Clocks.GetTime. 
+func (c *mockClocks) GetTime(id sentrytime.ClockID) (int64, error) { + switch id { + case sentrytime.Monotonic: + return c.monotonic, nil + case sentrytime.Realtime: + return c.realtime, nil + default: + return 0, syserror.EINVAL + } +} + +// stateTestClocklessTimekeeper returns a test Timekeeper which has not had +// SetClocks called. +func stateTestClocklessTimekeeper(tb testing.TB) *Timekeeper { + ctx := contexttest.Context(tb) + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + fr, err := mfp.MemoryFile().Allocate(usermem.PageSize, usage.Anonymous) + if err != nil { + tb.Fatalf("failed to allocate memory: %v", err) + } + return &Timekeeper{ + params: NewVDSOParamPage(mfp, fr), + } +} + +func stateTestTimekeeper(tb testing.TB) *Timekeeper { + t := stateTestClocklessTimekeeper(tb) + t.SetClocks(sentrytime.NewCalibratedClocks()) + return t +} + +// TestTimekeeperMonotonicZero tests that monotonic time starts at zero. +func TestTimekeeperMonotonicZero(t *testing.T) { + c := &mockClocks{ + monotonic: 100000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.SetClocks(c) + defer tk.Destroy() + + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 0 { + t.Errorf("GetTime got %d want 0", now) + } + + c.monotonic += 10 + + now, err = tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 10 { + t.Errorf("GetTime got %d want 10", now) + } +} + +// TestTimekeeperMonotonicJumpForward tests that monotonic time jumps forward +// after restore. +func TestTimekeeperMonotonicForward(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 600000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = make(chan struct{}) + tk.saveMonotonic = 100000 + tk.saveRealtime = 400000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should jump ahead by 200000 to 300000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 300000 { + t.Errorf("GetTime got %d want 300000", now) + } +} + +// TestTimekeeperMonotonicJumpBackwards tests that monotonic time does not jump +// backwards when realtime goes backwards. +func TestTimekeeperMonotonicJumpBackwards(t *testing.T) { + c := &mockClocks{ + monotonic: 900000, + realtime: 400000, + } + + tk := stateTestClocklessTimekeeper(t) + tk.restored = make(chan struct{}) + tk.saveMonotonic = 100000 + tk.saveRealtime = 600000 + tk.SetClocks(c) + defer tk.Destroy() + + // The monotonic clock should remain at 100000. + // + // The new system monotonic time (900000) is irrelevant to what the app + // sees and we don't want to jump the monotonic clock backwards like + // realtime did. + now, err := tk.GetTime(sentrytime.Monotonic) + if err != nil { + t.Errorf("GetTime err got %v want nil", err) + } + if now != 100000 { + t.Errorf("GetTime got %d want 100000", now) + } +} diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go new file mode 100644 index 000000000..d0e0810e8 --- /dev/null +++ b/pkg/sentry/kernel/tty.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "gvisor.dev/gvisor/pkg/sync" + +// TTY defines the relationship between a thread group and its controlling +// terminal. +// +// +stateify savable +type TTY struct { + // Index is the terminal index. It is immutable. + Index uint32 + + mu sync.Mutex `state:"nosave"` + + // tg is protected by mu. + tg *ThreadGroup +} + +// TTY returns the thread group's controlling terminal. If nil, there is no +// controlling terminal. +func (tg *ThreadGroup) TTY() *TTY { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.tty +} diff --git a/pkg/sentry/kernel/uncaught_signal.proto b/pkg/sentry/kernel/uncaught_signal.proto new file mode 100644 index 000000000..0bdb062cb --- /dev/null +++ b/pkg/sentry/kernel/uncaught_signal.proto @@ -0,0 +1,37 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package gvisor; + +import "pkg/sentry/arch/registers.proto"; + +message UncaughtSignal { + // Thread ID. + int32 tid = 1; + + // Process ID. + int32 pid = 2; + + // Registers at the time of the fault or signal. + Registers registers = 3; + + // Signal number. + int32 signal_number = 4; + + // The memory location which caused the fault (set if applicable, 0 + // otherwise). This will be set for SIGILL, SIGFPE, SIGSEGV, and SIGBUS. + uint64 fault_addr = 5; +} diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go new file mode 100644 index 000000000..8ccf04bd1 --- /dev/null +++ b/pkg/sentry/kernel/uts_namespace.go @@ -0,0 +1,101 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" +) + +// UTSNamespace represents a UTS namespace, a holder of two system identifiers: +// the hostname and domain name. +// +// +stateify savable +type UTSNamespace struct { + // mu protects all fields below. 
+ mu sync.Mutex `state:"nosave"` + hostName string + domainName string + + // userns is the user namespace associated with the UTSNamespace. + // Privileged operations on this UTSNamespace must have appropriate + // capabilities in userns. + // + // userns is immutable. + userns *auth.UserNamespace +} + +// NewUTSNamespace creates a new UTS namespace. +func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { + return &UTSNamespace{ + hostName: hostName, + domainName: domainName, + userns: userns, + } +} + +// UTSNamespace returns the task's UTS namespace. +func (t *Task) UTSNamespace() *UTSNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.utsns +} + +// HostName returns the host name of this UTS namespace. +func (u *UTSNamespace) HostName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.hostName +} + +// SetHostName sets the host name of this UTS namespace. +func (u *UTSNamespace) SetHostName(host string) { + u.mu.Lock() + defer u.mu.Unlock() + u.hostName = host +} + +// DomainName returns the domain name of this UTS namespace. +func (u *UTSNamespace) DomainName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.domainName +} + +// SetDomainName sets the domain name of this UTS namespace. +func (u *UTSNamespace) SetDomainName(domain string) { + u.mu.Lock() + defer u.mu.Unlock() + u.domainName = domain +} + +// UserNamespace returns the user namespace associated with this UTS namespace. +func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return u.userns +} + +// Clone makes a copy of this UTS namespace, associating the given user +// namespace. +func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return &UTSNamespace{ + hostName: u.hostName, + domainName: u.domainName, + userns: userns, + } +} diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go new file mode 100644 index 000000000..f1b3c212c --- /dev/null +++ b/pkg/sentry/kernel/vdso.go @@ -0,0 +1,148 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/usermem" +) + +// vdsoParams are the parameters exposed to the VDSO. +// +// They are exposed to the VDSO via a parameter page managed by VDSOParamPage, +// which also includes a sequence counter. +type vdsoParams struct { + monotonicReady uint64 + monotonicBaseCycles int64 + monotonicBaseRef int64 + monotonicFrequency uint64 + + realtimeReady uint64 + realtimeBaseCycles int64 + realtimeBaseRef int64 + realtimeFrequency uint64 +} + +// VDSOParamPage manages a VDSO parameter page. +// +// Its memory layout looks like: +// +// type page struct { +// // seq is a sequence counter that protects the fields below. 
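
Because the UTSNamespace accessors above are all exported, a caller such as sethostname or unshare emulation can be sketched briefly. newSandboxUTS below is a hypothetical helper, not an API of this package; only Clone, SetHostName, and auth.UserNamespace from the code above are assumed.

package utsexample

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

// newSandboxUTS gives a child its own UTS namespace with a fresh hostname,
// leaving the parent namespace untouched.
func newSandboxUTS(parent *kernel.UTSNamespace, userns *auth.UserNamespace) *kernel.UTSNamespace {
	child := parent.Clone(userns)
	child.SetHostName("sandbox")
	return child
}
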
+// seq uint64 +// vdsoParams +// } +// +// Everything in the struct is 8 bytes for easy alignment. +// +// It must be kept in sync with params in vdso/vdso_time.cc. +// +// +stateify savable +type VDSOParamPage struct { + // The parameter page is fr, allocated from mfp.MemoryFile(). + mfp pgalloc.MemoryFileProvider + fr platform.FileRange + + // seq is the current sequence count written to the page. + // + // A write is in progress if bit 1 of the counter is set. + // + // Timekeeper's updater goroutine may call Write before equality is + // checked in state_test_util tests, causing this field to change across + // save / restore. + seq uint64 +} + +// NewVDSOParamPage returns a VDSOParamPage. +// +// Preconditions: +// +// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does +// not take ownership of fr; it must remain allocated for the lifetime of the +// VDSOParamPage. +// +// * VDSOParamPage must be the only writer to fr. +// +// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{mfp: mfp, fr: fr} +} + +// access returns a mapping of the param page. +func (v *VDSOParamPage) access() (safemem.Block, error) { + bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite) + if err != nil { + return safemem.Block{}, err + } + if bs.NumBlocks() != 1 { + panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks())) + } + return bs.Head(), nil +} + +// incrementSeq increments the sequence counter in the param page. +func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { + next := v.seq + 1 + old, err := safemem.SwapUint64(paramPage, next) + if err != nil { + return err + } + + if old != v.seq { + return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq) + } + + v.seq = next + return nil +} + +// Write updates the VDSO parameters. +// +// Write starts a write block, calls f to get the new parameters, writes +// out the new parameters, then ends the write block. +func (v *VDSOParamPage) Write(f func() vdsoParams) error { + paramPage, err := v.access() + if err != nil { + return err + } + + // Write begin. + next := v.seq + 1 + if next%2 != 1 { + panic("Out-of-order sequence count") + } + + err = v.incrementSeq(paramPage) + if err != nil { + return err + } + + // Get the new params. + p := f() + buf := binary.Marshal(nil, usermem.ByteOrder, p) + + // Skip the sequence counter. + if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { + panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err)) + } + + // Write end. + return v.incrementSeq(paramPage) +} diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go new file mode 100644 index 000000000..5640dd71d --- /dev/null +++ b/pkg/sentry/kernel/version.go @@ -0,0 +1,33 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
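
Write above is the writer half of a sequence-lock protocol: the counter is moved to an odd value before the parameters change and back to an even value afterwards. The real reader is the VDSO itself (vdso/vdso_time.cc); the Go sketch below only makes the reader's side of the protocol concrete and is not code from this change.

package seqexample

import "sync/atomic"

// readConsistent retries until it observes the protected data under an even,
// unchanged sequence count, i.e. with no write in progress and no write having
// raced with the read.
func readConsistent(seq *uint64, read func() []byte) []byte {
	for {
		begin := atomic.LoadUint64(seq)
		if begin%2 != 0 {
			continue // Odd count: a write is in progress.
		}
		data := read()
		if atomic.LoadUint64(seq) == begin {
			return data
		}
	}
}
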
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// Version defines the application-visible system version.
+type Version struct {
+	// Operating system name (e.g. "Linux").
+	Sysname string
+
+	// Operating system release (e.g. "4.4-amd64").
+	Release string
+
+	// Operating system version. On Linux this takes the shape
+	// "#VERSION CONFIG_FLAGS TIMESTAMP"
+	// where:
+	// - VERSION is a sequence counter incremented on every successful build
+	// - CONFIG_FLAGS is a space-separated list of major enabled kernel features
+	//   (e.g. "SMP" and "PREEMPT")
+	// - TIMESTAMP is the build timestamp as returned by `date`
+	Version string
+}
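
For completeness, a Version value in the shape documented above might look like the following. The concrete strings are illustrative stand-ins, not necessarily the defaults gVisor reports.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

func main() {
	v := kernel.Version{
		Sysname: "Linux",
		Release: "4.4.0",
		Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016",
	}
	// These are the strings an application would see in the sysname, release,
	// and version fields of uname(2) inside the sandbox.
	fmt.Printf("%s %s %s\n", v.Sysname, v.Release, v.Version)
}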