diff options
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 15 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel_state_autogen.go | 97 | ||||
-rw-r--r-- | pkg/sentry/kernel/ptrace.go | 175 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 7 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_exit.go | 3 | ||||
-rw-r--r-- | pkg/sentry/kernel/threads.go | 2 |
6 files changed, 238 insertions, 61 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index ef4e934a1..43065b45a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -282,6 +282,18 @@ type Kernel struct { // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool + + // Exceptions to YAMA ptrace restrictions. Each key-value pair represents a + // tracee-tracer relationship. The key is a process (technically, the thread + // group leader) that can be traced by any thread that is a descendant of the + // value. If the value is nil, then anyone can trace the process represented by + // the key. + // + // ptraceExceptions is protected by the TaskSet mutex. + ptraceExceptions map[*Task]*Task + + // YAMAPtraceScope is the current level of YAMA ptrace restrictions. + YAMAPtraceScope int32 } // InitKernelArgs holds arguments to Init. @@ -382,6 +394,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() + k.ptraceExceptions = make(map[*Task]*Task) + k.YAMAPtraceScope = linux.YAMA_SCOPE_RELATIONAL if VFS2Enabled { ctx := k.SupervisorContext() @@ -425,7 +439,6 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord) } - return nil } diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go index fc4843f36..12b076fc6 100644 --- a/pkg/sentry/kernel/kernel_state_autogen.go +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -331,6 +331,8 @@ func (k *Kernel) StateFields() []string { "shmMount", "socketMount", "SleepForAddressSpaceActivation", + "ptraceExceptions", + "YAMAPtraceScope", } } @@ -377,6 +379,8 @@ func (k *Kernel) StateSave(stateSinkObject state.Sink) { stateSinkObject.Save(34, &k.shmMount) 
stateSinkObject.Save(35, &k.socketMount) stateSinkObject.Save(36, &k.SleepForAddressSpaceActivation) + stateSinkObject.Save(37, &k.ptraceExceptions) + stateSinkObject.Save(38, &k.YAMAPtraceScope) } func (k *Kernel) afterLoad() {} @@ -417,6 +421,8 @@ func (k *Kernel) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(34, &k.shmMount) stateSourceObject.Load(35, &k.socketMount) stateSourceObject.Load(36, &k.SleepForAddressSpaceActivation) + stateSourceObject.Load(37, &k.ptraceExceptions) + stateSourceObject.Load(38, &k.YAMAPtraceScope) stateSourceObject.LoadValue(24, new([]tcpip.Endpoint), func(y interface{}) { k.loadDanglingEndpoints(y.([]tcpip.Endpoint)) }) stateSourceObject.LoadValue(28, new(*device.Registry), func(y interface{}) { k.loadDeviceRegistry(y.(*device.Registry)) }) } @@ -1179,6 +1185,7 @@ func (t *Task) StateFields() []string { "ptraceCode", "ptraceSiginfo", "ptraceEventMsg", + "ptraceYAMAExceptionAdded", "ioUsage", "creds", "utsns", @@ -1214,7 +1221,7 @@ func (t *Task) StateSave(stateSinkObject state.Sink) { var ptraceTracerValue *Task = t.savePtraceTracer() stateSinkObject.SaveValue(31, ptraceTracerValue) var syscallFiltersValue []bpf.Program = t.saveSyscallFilters() - stateSinkObject.SaveValue(47, syscallFiltersValue) + stateSinkObject.SaveValue(48, syscallFiltersValue) stateSinkObject.Save(0, &t.taskNode) stateSinkObject.Save(1, &t.runState) stateSinkObject.Save(2, &t.taskWorkCount) @@ -1254,27 +1261,28 @@ func (t *Task) StateSave(stateSinkObject state.Sink) { stateSinkObject.Save(37, &t.ptraceCode) stateSinkObject.Save(38, &t.ptraceSiginfo) stateSinkObject.Save(39, &t.ptraceEventMsg) - stateSinkObject.Save(40, &t.ioUsage) - stateSinkObject.Save(41, &t.creds) - stateSinkObject.Save(42, &t.utsns) - stateSinkObject.Save(43, &t.ipcns) - stateSinkObject.Save(44, &t.abstractSockets) - stateSinkObject.Save(45, &t.mountNamespaceVFS2) - stateSinkObject.Save(46, &t.parentDeathSignal) - stateSinkObject.Save(48, &t.cleartid) - 
stateSinkObject.Save(49, &t.allowedCPUMask) - stateSinkObject.Save(50, &t.cpu) - stateSinkObject.Save(51, &t.niceness) - stateSinkObject.Save(52, &t.numaPolicy) - stateSinkObject.Save(53, &t.numaNodeMask) - stateSinkObject.Save(54, &t.netns) - stateSinkObject.Save(55, &t.rseqCPU) - stateSinkObject.Save(56, &t.oldRSeqCPUAddr) - stateSinkObject.Save(57, &t.rseqAddr) - stateSinkObject.Save(58, &t.rseqSignature) - stateSinkObject.Save(59, &t.robustList) - stateSinkObject.Save(60, &t.startTime) - stateSinkObject.Save(61, &t.kcov) + stateSinkObject.Save(40, &t.ptraceYAMAExceptionAdded) + stateSinkObject.Save(41, &t.ioUsage) + stateSinkObject.Save(42, &t.creds) + stateSinkObject.Save(43, &t.utsns) + stateSinkObject.Save(44, &t.ipcns) + stateSinkObject.Save(45, &t.abstractSockets) + stateSinkObject.Save(46, &t.mountNamespaceVFS2) + stateSinkObject.Save(47, &t.parentDeathSignal) + stateSinkObject.Save(49, &t.cleartid) + stateSinkObject.Save(50, &t.allowedCPUMask) + stateSinkObject.Save(51, &t.cpu) + stateSinkObject.Save(52, &t.niceness) + stateSinkObject.Save(53, &t.numaPolicy) + stateSinkObject.Save(54, &t.numaNodeMask) + stateSinkObject.Save(55, &t.netns) + stateSinkObject.Save(56, &t.rseqCPU) + stateSinkObject.Save(57, &t.oldRSeqCPUAddr) + stateSinkObject.Save(58, &t.rseqAddr) + stateSinkObject.Save(59, &t.rseqSignature) + stateSinkObject.Save(60, &t.robustList) + stateSinkObject.Save(61, &t.startTime) + stateSinkObject.Save(62, &t.kcov) } func (t *Task) StateLoad(stateSourceObject state.Source) { @@ -1317,29 +1325,30 @@ func (t *Task) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(37, &t.ptraceCode) stateSourceObject.Load(38, &t.ptraceSiginfo) stateSourceObject.Load(39, &t.ptraceEventMsg) - stateSourceObject.Load(40, &t.ioUsage) - stateSourceObject.Load(41, &t.creds) - stateSourceObject.Load(42, &t.utsns) - stateSourceObject.Load(43, &t.ipcns) - stateSourceObject.Load(44, &t.abstractSockets) - stateSourceObject.Load(45, &t.mountNamespaceVFS2) - 
stateSourceObject.Load(46, &t.parentDeathSignal) - stateSourceObject.Load(48, &t.cleartid) - stateSourceObject.Load(49, &t.allowedCPUMask) - stateSourceObject.Load(50, &t.cpu) - stateSourceObject.Load(51, &t.niceness) - stateSourceObject.Load(52, &t.numaPolicy) - stateSourceObject.Load(53, &t.numaNodeMask) - stateSourceObject.Load(54, &t.netns) - stateSourceObject.Load(55, &t.rseqCPU) - stateSourceObject.Load(56, &t.oldRSeqCPUAddr) - stateSourceObject.Load(57, &t.rseqAddr) - stateSourceObject.Load(58, &t.rseqSignature) - stateSourceObject.Load(59, &t.robustList) - stateSourceObject.Load(60, &t.startTime) - stateSourceObject.Load(61, &t.kcov) + stateSourceObject.Load(40, &t.ptraceYAMAExceptionAdded) + stateSourceObject.Load(41, &t.ioUsage) + stateSourceObject.Load(42, &t.creds) + stateSourceObject.Load(43, &t.utsns) + stateSourceObject.Load(44, &t.ipcns) + stateSourceObject.Load(45, &t.abstractSockets) + stateSourceObject.Load(46, &t.mountNamespaceVFS2) + stateSourceObject.Load(47, &t.parentDeathSignal) + stateSourceObject.Load(49, &t.cleartid) + stateSourceObject.Load(50, &t.allowedCPUMask) + stateSourceObject.Load(51, &t.cpu) + stateSourceObject.Load(52, &t.niceness) + stateSourceObject.Load(53, &t.numaPolicy) + stateSourceObject.Load(54, &t.numaNodeMask) + stateSourceObject.Load(55, &t.netns) + stateSourceObject.Load(56, &t.rseqCPU) + stateSourceObject.Load(57, &t.oldRSeqCPUAddr) + stateSourceObject.Load(58, &t.rseqAddr) + stateSourceObject.Load(59, &t.rseqSignature) + stateSourceObject.Load(60, &t.robustList) + stateSourceObject.Load(61, &t.startTime) + stateSourceObject.Load(62, &t.kcov) stateSourceObject.LoadValue(31, new(*Task), func(y interface{}) { t.loadPtraceTracer(y.(*Task)) }) - stateSourceObject.LoadValue(47, new([]bpf.Program), func(y interface{}) { t.loadSyscallFilters(y.([]bpf.Program)) }) + stateSourceObject.LoadValue(48, new([]bpf.Program), func(y interface{}) { t.loadSyscallFilters(y.([]bpf.Program)) }) stateSourceObject.AfterLoad(t.afterLoad) } 
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index cef58a590..c3980350a 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -16,6 +16,7 @@ package kernel import ( "fmt" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal/primitive" @@ -95,7 +96,11 @@ const ( // checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access // mode PTRACE_MODE_READ. // -// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a +// In Linux, ptrace access restrictions may be configured by LSMs. While we do +// not support LSMs, we do add additional restrictions based on the commoncap +// and YAMA LSMs. +// +// TODO(gvisor.dev/issue/212): The result of CanTrace is immediately stale (e.g., a // racing setuid(2) may change traceability). This may pose a risk when a task // changes from traceable to not traceable. This is only problematic across // execve, where privileges may increase. @@ -103,7 +108,7 @@ const ( // We currently do not implement privileged executables (set-user/group-ID bits // and file capabilities), so that case is not reachable. func (t *Task) CanTrace(target *Task, attach bool) bool { - // "1. If the calling thread and the target thread are in the same thread + // "If the calling thread and the target thread are in the same thread // group, access is always allowed." - ptrace(2) // // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() @@ -115,9 +120,57 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { return true } + if !t.canTraceStandard(target, attach) { + return false + } + + // YAMA only supported for vfs2. 
+ if !VFS2Enabled { + return true + } + + if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.canTraceYAMALocked(target) { + return false + } + } + return true +} + +// canTraceLocked is the same as CanTrace, except the caller must already hold +// the TaskSet mutex (for reading or writing). +func (t *Task) canTraceLocked(target *Task, attach bool) bool { + if t.tg == target.tg { + return true + } + + if !t.canTraceStandard(target, attach) { + return false + } + + // YAMA only supported for vfs2. + if !VFS2Enabled { + return true + } + + if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL { + if !t.canTraceYAMALocked(target) { + return false + } + } + return true +} + +// canTraceStandard performs standard ptrace access checks as defined by +// kernel/ptrace.c:__ptrace_may_access as well as the commoncap LSM +// implementation of the security_ptrace_access_check() interface, which is +// always invoked. +func (t *Task) canTraceStandard(target *Task, attach bool) bool { // """ - // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, - // doesn't exist until Linux 4.5). + // TODO(gvisor.dev/issue/260): 1. If the access mode specifies + // PTRACE_MODE_FSCREDS (ED: snipped, doesn't exist until Linux 4.5). // // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the // caller's real UID and GID for the checks in the next step. (Most APIs @@ -125,7 +178,7 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs // instead.) // - // 3. Deny access if neither of the following is true: + // 2. 
Deny access if neither of the following is true: // // - The real, effective, and saved-set user IDs of the target match the // caller's user ID, *and* the real, effective, and saved-set group IDs of @@ -134,15 +187,12 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { // - The caller has the CAP_SYS_PTRACE capability in the user namespace of // the target. // - // 4. Deny access if the target process "dumpable" attribute has a value + // 3. Deny access if the target process "dumpable" attribute has a value // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in // the user namespace of the target process. // - // 5. The kernel LSM security_ptrace_access_check() interface is invoked to - // see if ptrace access is permitted. The results depend on the LSM(s). The - // implementation of this interface in the commoncap LSM performs the - // following steps: + // 4. The commoncap LSM performs the following steps: // // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the // caller's effective capability set; otherwise (the access mode specifies @@ -188,6 +238,94 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { return true } +// canTraceYAMALocked performs ptrace access checks as defined by the YAMA LSM +// implementation of the security_ptrace_access_check() interface, with YAMA +// configured to mode 1. This is a common default among various Linux +// distributions. +// +// It only permits the tracer to proceed if one of the following conditions is +// met: +// +// a) The tracer is already attached to the tracee. +// +// b) The target is a descendant of the tracer. +// +// c) The target has explicitly given permission to the tracer through the +// PR_SET_PTRACER prctl. +// +// d) The tracer has CAP_SYS_PTRACE. +// +// See security/yama/yama_lsm.c:yama_ptrace_access_check. 
+// +// Precondition: the TaskSet mutex must be locked (for reading or writing). +func (t *Task) canTraceYAMALocked(target *Task) bool { + if tracer := target.Tracer(); tracer != nil { + if tracer.tg == t.tg { + return true + } + } + if target.isYAMADescendantOfLocked(t) { + return true + } + if target.hasYAMAExceptionForLocked(t) { + return true + } + if t.HasCapabilityIn(linux.CAP_SYS_PTRACE, target.UserNamespace()) { + return true + } + return false +} + +// Determines whether t is considered a descendant of ancestor for the purposes +// of YAMA permissions (specifically, whether t's thread group is descended from +// ancestor's). +// +// Precondition: the TaskSet mutex must be locked (for reading or writing). +func (t *Task) isYAMADescendantOfLocked(ancestor *Task) bool { + walker := t + for walker != nil { + if walker.tg.leader == ancestor.tg.leader { + return true + } + walker = walker.parent + } + return false +} + +// Precondition: the TaskSet mutex must be locked (for reading or writing). +func (t *Task) hasYAMAExceptionForLocked(tracer *Task) bool { + allowed, ok := t.k.ptraceExceptions[t] + if !ok { + return false + } + return allowed == nil || tracer.isYAMADescendantOfLocked(allowed) +} + +// ClearYAMAException removes any YAMA exception with t as the tracee. +func (t *Task) ClearYAMAException() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + tracee := t.tg.leader + delete(t.k.ptraceExceptions, tracee) +} + +// SetYAMAException creates a YAMA exception allowing all descendants of tracer +// to trace t. If tracer is nil, then any task is allowed to trace t. +// +// If there was an existing exception, it is overwritten with the new one. 
+func (t *Task) SetYAMAException(tracer *Task) { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + + tracee := t.tg.leader + tracee.ptraceYAMAExceptionAdded = true + if tracer != nil { + tracer.ptraceYAMAExceptionAdded = true + } + + t.k.ptraceExceptions[tracee] = tracer +} + // Tracer returns t's ptrace Tracer. func (t *Task) Tracer() *Task { return t.ptraceTracer.Load().(*Task) @@ -358,7 +496,7 @@ func (t *Task) ptraceTraceme() error { // returning nil here is correct. return nil } - if !t.parent.CanTrace(t, true) { + if !t.parent.canTraceLocked(t, true) { return syserror.EPERM } if t.parent.exitState != TaskExitNone { @@ -377,11 +515,11 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if t.tg == target.tg { return syserror.EPERM } - if !t.CanTrace(target, true) { - return syserror.EPERM - } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() + if !t.canTraceLocked(target, true) { + return syserror.EPERM + } if target.hasTracer() { return syserror.EPERM } @@ -459,6 +597,15 @@ func (t *Task) exitPtrace() { } // "nil maps cannot be saved" t.ptraceTracees = make(map[*Task]struct{}) + + if t.ptraceYAMAExceptionAdded { + delete(t.k.ptraceExceptions, t) + for tracee, tracer := range t.k.ptraceExceptions { + if tracer == t { + delete(t.k.ptraceExceptions, tracee) + } + } + } } // forgetTracerLocked detaches t's tracer and ensures that t is no longer diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c0ab53c94..36141dd09 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -403,6 +403,13 @@ type Task struct { // ptraceEventMsg is protected by the TaskSet mutex. ptraceEventMsg uint64 + // ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has + // been added before. This is used during task exit to decide whether we need + // to clean up YAMA exceptions. + // + // ptraceYAMAExceptionAdded is protected by the TaskSet mutex. 
+ ptraceYAMAExceptionAdded bool + // The struct that holds the IO-related usage. The ioUsage pointer is // immutable. ioUsage *usage.IO diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index f7765fa3a..ad59e4f60 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -694,7 +694,8 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { } if t.parent != nil { delete(t.parent.children, t) - t.parent = nil + // Do not clear t.parent. It may still be needed after the task has exited + // (for example, to perform ptrace access checks on /proc/[pid] files). } } } diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index e9da99067..09d070ec8 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -55,7 +55,7 @@ const InitTID ThreadID = 1 // // +stateify savable type TaskSet struct { - // mu protects all relationships betweens tasks and thread groups in the + // mu protects all relationships between tasks and thread groups in the // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) mu sync.RWMutex `state:"nosave"` |