summary refs log tree commit diff homepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r-- pkg/sentry/kernel/kernel.go 15
-rw-r--r-- pkg/sentry/kernel/kernel_state_autogen.go 97
-rw-r--r-- pkg/sentry/kernel/ptrace.go 175
-rw-r--r-- pkg/sentry/kernel/task.go 7
-rw-r--r-- pkg/sentry/kernel/task_exit.go 3
-rw-r--r-- pkg/sentry/kernel/threads.go 2
6 files changed, 238 insertions, 61 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index ef4e934a1..43065b45a 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -282,6 +282,18 @@ type Kernel struct {
// If set to true, report address space activation waits as if the task is in
// external wait so that the watchdog doesn't report the task stuck.
SleepForAddressSpaceActivation bool
+
+ // Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
+ // tracee-tracer relationship. The key is a process (technically, the thread
+ // group leader) that can be traced by any thread that is a descendant of the
+ // value. If the value is nil, then anyone can trace the process represented by
+ // the key.
+ //
+ // ptraceExceptions is protected by the TaskSet mutex.
+ ptraceExceptions map[*Task]*Task
+
+ // YAMAPtraceScope is the current level of YAMA ptrace restrictions.
+ YAMAPtraceScope int32
}
// InitKernelArgs holds arguments to Init.
@@ -382,6 +394,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
+ k.ptraceExceptions = make(map[*Task]*Task)
+ k.YAMAPtraceScope = linux.YAMA_SCOPE_RELATIONAL
if VFS2Enabled {
ctx := k.SupervisorContext()
@@ -425,7 +439,6 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)
}
-
return nil
}
diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go
index fc4843f36..12b076fc6 100644
--- a/pkg/sentry/kernel/kernel_state_autogen.go
+++ b/pkg/sentry/kernel/kernel_state_autogen.go
@@ -331,6 +331,8 @@ func (k *Kernel) StateFields() []string {
"shmMount",
"socketMount",
"SleepForAddressSpaceActivation",
+ "ptraceExceptions",
+ "YAMAPtraceScope",
}
}
@@ -377,6 +379,8 @@ func (k *Kernel) StateSave(stateSinkObject state.Sink) {
stateSinkObject.Save(34, &k.shmMount)
stateSinkObject.Save(35, &k.socketMount)
stateSinkObject.Save(36, &k.SleepForAddressSpaceActivation)
+ stateSinkObject.Save(37, &k.ptraceExceptions)
+ stateSinkObject.Save(38, &k.YAMAPtraceScope)
}
func (k *Kernel) afterLoad() {}
@@ -417,6 +421,8 @@ func (k *Kernel) StateLoad(stateSourceObject state.Source) {
stateSourceObject.Load(34, &k.shmMount)
stateSourceObject.Load(35, &k.socketMount)
stateSourceObject.Load(36, &k.SleepForAddressSpaceActivation)
+ stateSourceObject.Load(37, &k.ptraceExceptions)
+ stateSourceObject.Load(38, &k.YAMAPtraceScope)
stateSourceObject.LoadValue(24, new([]tcpip.Endpoint), func(y interface{}) { k.loadDanglingEndpoints(y.([]tcpip.Endpoint)) })
stateSourceObject.LoadValue(28, new(*device.Registry), func(y interface{}) { k.loadDeviceRegistry(y.(*device.Registry)) })
}
@@ -1179,6 +1185,7 @@ func (t *Task) StateFields() []string {
"ptraceCode",
"ptraceSiginfo",
"ptraceEventMsg",
+ "ptraceYAMAExceptionAdded",
"ioUsage",
"creds",
"utsns",
@@ -1214,7 +1221,7 @@ func (t *Task) StateSave(stateSinkObject state.Sink) {
var ptraceTracerValue *Task = t.savePtraceTracer()
stateSinkObject.SaveValue(31, ptraceTracerValue)
var syscallFiltersValue []bpf.Program = t.saveSyscallFilters()
- stateSinkObject.SaveValue(47, syscallFiltersValue)
+ stateSinkObject.SaveValue(48, syscallFiltersValue)
stateSinkObject.Save(0, &t.taskNode)
stateSinkObject.Save(1, &t.runState)
stateSinkObject.Save(2, &t.taskWorkCount)
@@ -1254,27 +1261,28 @@ func (t *Task) StateSave(stateSinkObject state.Sink) {
stateSinkObject.Save(37, &t.ptraceCode)
stateSinkObject.Save(38, &t.ptraceSiginfo)
stateSinkObject.Save(39, &t.ptraceEventMsg)
- stateSinkObject.Save(40, &t.ioUsage)
- stateSinkObject.Save(41, &t.creds)
- stateSinkObject.Save(42, &t.utsns)
- stateSinkObject.Save(43, &t.ipcns)
- stateSinkObject.Save(44, &t.abstractSockets)
- stateSinkObject.Save(45, &t.mountNamespaceVFS2)
- stateSinkObject.Save(46, &t.parentDeathSignal)
- stateSinkObject.Save(48, &t.cleartid)
- stateSinkObject.Save(49, &t.allowedCPUMask)
- stateSinkObject.Save(50, &t.cpu)
- stateSinkObject.Save(51, &t.niceness)
- stateSinkObject.Save(52, &t.numaPolicy)
- stateSinkObject.Save(53, &t.numaNodeMask)
- stateSinkObject.Save(54, &t.netns)
- stateSinkObject.Save(55, &t.rseqCPU)
- stateSinkObject.Save(56, &t.oldRSeqCPUAddr)
- stateSinkObject.Save(57, &t.rseqAddr)
- stateSinkObject.Save(58, &t.rseqSignature)
- stateSinkObject.Save(59, &t.robustList)
- stateSinkObject.Save(60, &t.startTime)
- stateSinkObject.Save(61, &t.kcov)
+ stateSinkObject.Save(40, &t.ptraceYAMAExceptionAdded)
+ stateSinkObject.Save(41, &t.ioUsage)
+ stateSinkObject.Save(42, &t.creds)
+ stateSinkObject.Save(43, &t.utsns)
+ stateSinkObject.Save(44, &t.ipcns)
+ stateSinkObject.Save(45, &t.abstractSockets)
+ stateSinkObject.Save(46, &t.mountNamespaceVFS2)
+ stateSinkObject.Save(47, &t.parentDeathSignal)
+ stateSinkObject.Save(49, &t.cleartid)
+ stateSinkObject.Save(50, &t.allowedCPUMask)
+ stateSinkObject.Save(51, &t.cpu)
+ stateSinkObject.Save(52, &t.niceness)
+ stateSinkObject.Save(53, &t.numaPolicy)
+ stateSinkObject.Save(54, &t.numaNodeMask)
+ stateSinkObject.Save(55, &t.netns)
+ stateSinkObject.Save(56, &t.rseqCPU)
+ stateSinkObject.Save(57, &t.oldRSeqCPUAddr)
+ stateSinkObject.Save(58, &t.rseqAddr)
+ stateSinkObject.Save(59, &t.rseqSignature)
+ stateSinkObject.Save(60, &t.robustList)
+ stateSinkObject.Save(61, &t.startTime)
+ stateSinkObject.Save(62, &t.kcov)
}
func (t *Task) StateLoad(stateSourceObject state.Source) {
@@ -1317,29 +1325,30 @@ func (t *Task) StateLoad(stateSourceObject state.Source) {
stateSourceObject.Load(37, &t.ptraceCode)
stateSourceObject.Load(38, &t.ptraceSiginfo)
stateSourceObject.Load(39, &t.ptraceEventMsg)
- stateSourceObject.Load(40, &t.ioUsage)
- stateSourceObject.Load(41, &t.creds)
- stateSourceObject.Load(42, &t.utsns)
- stateSourceObject.Load(43, &t.ipcns)
- stateSourceObject.Load(44, &t.abstractSockets)
- stateSourceObject.Load(45, &t.mountNamespaceVFS2)
- stateSourceObject.Load(46, &t.parentDeathSignal)
- stateSourceObject.Load(48, &t.cleartid)
- stateSourceObject.Load(49, &t.allowedCPUMask)
- stateSourceObject.Load(50, &t.cpu)
- stateSourceObject.Load(51, &t.niceness)
- stateSourceObject.Load(52, &t.numaPolicy)
- stateSourceObject.Load(53, &t.numaNodeMask)
- stateSourceObject.Load(54, &t.netns)
- stateSourceObject.Load(55, &t.rseqCPU)
- stateSourceObject.Load(56, &t.oldRSeqCPUAddr)
- stateSourceObject.Load(57, &t.rseqAddr)
- stateSourceObject.Load(58, &t.rseqSignature)
- stateSourceObject.Load(59, &t.robustList)
- stateSourceObject.Load(60, &t.startTime)
- stateSourceObject.Load(61, &t.kcov)
+ stateSourceObject.Load(40, &t.ptraceYAMAExceptionAdded)
+ stateSourceObject.Load(41, &t.ioUsage)
+ stateSourceObject.Load(42, &t.creds)
+ stateSourceObject.Load(43, &t.utsns)
+ stateSourceObject.Load(44, &t.ipcns)
+ stateSourceObject.Load(45, &t.abstractSockets)
+ stateSourceObject.Load(46, &t.mountNamespaceVFS2)
+ stateSourceObject.Load(47, &t.parentDeathSignal)
+ stateSourceObject.Load(49, &t.cleartid)
+ stateSourceObject.Load(50, &t.allowedCPUMask)
+ stateSourceObject.Load(51, &t.cpu)
+ stateSourceObject.Load(52, &t.niceness)
+ stateSourceObject.Load(53, &t.numaPolicy)
+ stateSourceObject.Load(54, &t.numaNodeMask)
+ stateSourceObject.Load(55, &t.netns)
+ stateSourceObject.Load(56, &t.rseqCPU)
+ stateSourceObject.Load(57, &t.oldRSeqCPUAddr)
+ stateSourceObject.Load(58, &t.rseqAddr)
+ stateSourceObject.Load(59, &t.rseqSignature)
+ stateSourceObject.Load(60, &t.robustList)
+ stateSourceObject.Load(61, &t.startTime)
+ stateSourceObject.Load(62, &t.kcov)
stateSourceObject.LoadValue(31, new(*Task), func(y interface{}) { t.loadPtraceTracer(y.(*Task)) })
- stateSourceObject.LoadValue(47, new([]bpf.Program), func(y interface{}) { t.loadSyscallFilters(y.([]bpf.Program)) })
+ stateSourceObject.LoadValue(48, new([]bpf.Program), func(y interface{}) { t.loadSyscallFilters(y.([]bpf.Program)) })
stateSourceObject.AfterLoad(t.afterLoad)
}
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index cef58a590..c3980350a 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -16,6 +16,7 @@ package kernel
import (
"fmt"
+ "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/marshal/primitive"
@@ -95,7 +96,11 @@ const (
// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access
// mode PTRACE_MODE_READ.
//
-// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a
+// In Linux, ptrace access restrictions may be configured by LSMs. While we do
+// not support LSMs, we do add additional restrictions based on the commoncap
+// and YAMA LSMs.
+//
+// TODO(gvisor.dev/issue/212): The result of CanTrace is immediately stale (e.g., a
// racing setuid(2) may change traceability). This may pose a risk when a task
// changes from traceable to not traceable. This is only problematic across
// execve, where privileges may increase.
@@ -103,7 +108,7 @@ const (
// We currently do not implement privileged executables (set-user/group-ID bits
// and file capabilities), so that case is not reachable.
func (t *Task) CanTrace(target *Task, attach bool) bool {
- // "1. If the calling thread and the target thread are in the same thread
+ // "If the calling thread and the target thread are in the same thread
// group, access is always allowed." - ptrace(2)
//
// Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access()
@@ -115,9 +120,57 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
return true
}
+ if !t.canTraceStandard(target, attach) {
+ return false
+ }
+
+ // YAMA only supported for vfs2.
+ if !VFS2Enabled {
+ return true
+ }
+
+ if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ if !t.canTraceYAMALocked(target) {
+ return false
+ }
+ }
+ return true
+}
+
+// canTraceLocked is the same as CanTrace, except the caller must already hold
+// the TaskSet mutex (for reading or writing). Unlike CanTrace, it does not
+// acquire the mutex itself before calling canTraceYAMALocked.
+//
+// It returns true if t may trace target. attach is passed through unchanged to
+// canTraceStandard.
+func (t *Task) canTraceLocked(target *Task, attach bool) bool {
+ // Tasks in the same thread group are always allowed.
+ if t.tg == target.tg {
+ return true
+ }
+
+ if !t.canTraceStandard(target, attach) {
+ return false
+ }
+
+ // YAMA only supported for vfs2.
+ if !VFS2Enabled {
+ return true
+ }
+
+ // The YAMA check is applied only when the scope, read atomically, is
+ // YAMA_SCOPE_RELATIONAL.
+ if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL {
+ if !t.canTraceYAMALocked(target) {
+ return false
+ }
+ }
+ return true
+}
+
+// canTraceStandard performs standard ptrace access checks as defined by
+// kernel/ptrace.c:__ptrace_may_access as well as the commoncap LSM
+// implementation of the security_ptrace_access_check() interface, which is
+// always invoked.
+func (t *Task) canTraceStandard(target *Task, attach bool) bool {
// """
- // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped,
- // doesn't exist until Linux 4.5).
+ // TODO(gvisor.dev/issue/260): 1. If the access mode specifies
+ // PTRACE_MODE_FSCREDS (ED: snipped, doesn't exist until Linux 4.5).
//
// Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the
// caller's real UID and GID for the checks in the next step. (Most APIs
@@ -125,7 +178,7 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
// historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs
// instead.)
//
- // 3. Deny access if neither of the following is true:
+ // 2. Deny access if neither of the following is true:
//
// - The real, effective, and saved-set user IDs of the target match the
// caller's user ID, *and* the real, effective, and saved-set group IDs of
@@ -134,15 +187,12 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
// - The caller has the CAP_SYS_PTRACE capability in the user namespace of
// the target.
//
- // 4. Deny access if the target process "dumpable" attribute has a value
+ // 3. Deny access if the target process "dumpable" attribute has a value
// other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in
// prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in
// the user namespace of the target process.
//
- // 5. The kernel LSM security_ptrace_access_check() interface is invoked to
- // see if ptrace access is permitted. The results depend on the LSM(s). The
- // implementation of this interface in the commoncap LSM performs the
- // following steps:
+ // 4. The commoncap LSM performs the following steps:
//
// a) If the access mode includes PTRACE_MODE_FSCREDS, then use the
// caller's effective capability set; otherwise (the access mode specifies
@@ -188,6 +238,94 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
return true
}
+// canTraceYAMALocked performs ptrace access checks as defined by the YAMA LSM
+// implementation of the security_ptrace_access_check() interface, with YAMA
+// configured to mode 1. This is a common default among various Linux
+// distributions.
+//
+// It reports whether t (the tracer) may trace target. The tracer is only
+// permitted to proceed if one of the following conditions is met, checked in
+// this order:
+//
+// a) The target already has a tracer, and that tracer is in the same thread
+// group as t.
+//
+// b) The target is a descendant of the tracer, as determined by
+// isYAMADescendantOfLocked.
+//
+// c) The target has explicitly given permission to the tracer through the
+// PR_SET_PTRACER prctl, as determined by hasYAMAExceptionForLocked.
+//
+// d) The tracer has CAP_SYS_PTRACE in the target's user namespace.
+//
+// See security/yama/yama_lsm.c:yama_ptrace_access_check.
+//
+// Precondition: the TaskSet mutex must be locked (for reading or writing).
+func (t *Task) canTraceYAMALocked(target *Task) bool {
+ // (a) An existing tracer in t's thread group.
+ if tracer := target.Tracer(); tracer != nil {
+ if tracer.tg == t.tg {
+ return true
+ }
+ }
+ // (b) Ancestry.
+ if target.isYAMADescendantOfLocked(t) {
+ return true
+ }
+ // (c) Explicit exception.
+ if target.hasYAMAExceptionForLocked(t) {
+ return true
+ }
+ // (d) Capability in the target's user namespace.
+ if t.HasCapabilityIn(linux.CAP_SYS_PTRACE, target.UserNamespace()) {
+ return true
+ }
+ return false
+}
+
+// isYAMADescendantOfLocked determines whether t is considered a descendant of
+// ancestor for the purposes of YAMA permissions (specifically, whether t's
+// thread group is descended from ancestor's).
+//
+// The walk starts at t itself and follows parent links, so a task whose thread
+// group leader is the same as ancestor's is also reported as a descendant.
+//
+// Precondition: the TaskSet mutex must be locked (for reading or writing).
+func (t *Task) isYAMADescendantOfLocked(ancestor *Task) bool {
+ walker := t
+ for walker != nil {
+ // Compare thread group leaders rather than tasks, so that any thread in
+ // ancestor's thread group matches.
+ if walker.tg.leader == ancestor.tg.leader {
+ return true
+ }
+ walker = walker.parent
+ }
+ return false
+}
+
+// hasYAMAExceptionForLocked reports whether a YAMA exception exists that
+// permits tracer to trace t. It returns false if no exception is registered
+// for t's thread group. Otherwise it returns true if the exception allows
+// anyone (nil value), or if tracer is a descendant of the allowed task as
+// determined by isYAMADescendantOfLocked.
+//
+// Precondition: the TaskSet mutex must be locked (for reading or writing).
+func (t *Task) hasYAMAExceptionForLocked(tracer *Task) bool {
+ // Exceptions are keyed by the tracee's thread group leader (see
+ // SetYAMAException and ClearYAMAException), so look up the leader rather
+ // than t itself; otherwise a non-leader target thread would never match.
+ allowed, ok := t.k.ptraceExceptions[t.tg.leader]
+ if !ok {
+ return false
+ }
+ return allowed == nil || tracer.isYAMADescendantOfLocked(allowed)
+}
+
+// ClearYAMAException removes any YAMA exception with t as the tracee.
+//
+// Exceptions are stored under the thread group leader, so the entry for t's
+// leader is the one removed. The TaskSet mutex is acquired for writing for the
+// duration of the call; callers must not already hold it.
+func (t *Task) ClearYAMAException() {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ tracee := t.tg.leader
+ delete(t.k.ptraceExceptions, tracee)
+}
+
+// SetYAMAException creates a YAMA exception allowing all descendants of tracer
+// to trace t. If tracer is nil, then any task is allowed to trace t.
+//
+// The exception is recorded under t's thread group leader. If there was an
+// existing exception, it is overwritten with the new one.
+//
+// The TaskSet mutex is acquired for writing for the duration of the call;
+// callers must not already hold it.
+func (t *Task) SetYAMAException(tracer *Task) {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+
+ // Flag both sides of the relationship so that task exit knows it may need
+ // to clean up entries in ptraceExceptions.
+ tracee := t.tg.leader
+ tracee.ptraceYAMAExceptionAdded = true
+ if tracer != nil {
+ tracer.ptraceYAMAExceptionAdded = true
+ }
+
+ t.k.ptraceExceptions[tracee] = tracer
+}
+
// Tracer returns t's ptrace Tracer.
func (t *Task) Tracer() *Task {
return t.ptraceTracer.Load().(*Task)
@@ -358,7 +496,7 @@ func (t *Task) ptraceTraceme() error {
// returning nil here is correct.
return nil
}
- if !t.parent.CanTrace(t, true) {
+ if !t.parent.canTraceLocked(t, true) {
return syserror.EPERM
}
if t.parent.exitState != TaskExitNone {
@@ -377,11 +515,11 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error {
if t.tg == target.tg {
return syserror.EPERM
}
- if !t.CanTrace(target, true) {
- return syserror.EPERM
- }
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
+ if !t.canTraceLocked(target, true) {
+ return syserror.EPERM
+ }
if target.hasTracer() {
return syserror.EPERM
}
@@ -459,6 +597,15 @@ func (t *Task) exitPtrace() {
}
// "nil maps cannot be saved"
t.ptraceTracees = make(map[*Task]struct{})
+
+ if t.ptraceYAMAExceptionAdded {
+ delete(t.k.ptraceExceptions, t)
+ for tracee, tracer := range t.k.ptraceExceptions {
+ if tracer == t {
+ delete(t.k.ptraceExceptions, tracee)
+ }
+ }
+ }
}
// forgetTracerLocked detaches t's tracer and ensures that t is no longer
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index c0ab53c94..36141dd09 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -403,6 +403,13 @@ type Task struct {
// ptraceEventMsg is protected by the TaskSet mutex.
ptraceEventMsg uint64
+ // ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has
+ // been added before. This is used during task exit to decide whether we need
+ // to clean up YAMA exceptions.
+ //
+ // ptraceYAMAExceptionAdded is protected by the TaskSet mutex.
+ ptraceYAMAExceptionAdded bool
+
// The struct that holds the IO-related usage. The ioUsage pointer is
// immutable.
ioUsage *usage.IO
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index f7765fa3a..ad59e4f60 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -694,7 +694,8 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
}
if t.parent != nil {
delete(t.parent.children, t)
- t.parent = nil
+ // Do not clear t.parent. It may still be needed after the task has exited
+ // (for example, to perform ptrace access checks on /proc/[pid] files).
}
}
}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index e9da99067..09d070ec8 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -55,7 +55,7 @@ const InitTID ThreadID = 1
//
// +stateify savable
type TaskSet struct {
- // mu protects all relationships betweens tasks and thread groups in the
+ // mu protects all relationships between tasks and thread groups in the
// TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.)
mu sync.RWMutex `state:"nosave"`