1 files changed, 557 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
new file mode 100644
index 000000000..a51fa9d7e
--- /dev/null
+++ b/pkg/sentry/kernel/task_identity.go
@@ -0,0 +1,557 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Credentials returns a copy of t's credentials, made while t.mu is held.
+func (t *Task) Credentials() auth.Credentials {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return *t.creds // Dereferencing copies the struct before the unlock runs.
+}
+
+// UserNamespace returns the user namespace recorded in t's credentials.
+func (t *Task) UserNamespace() *auth.UserNamespace {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.creds.UserNamespace // Read under t.mu; the pointer is not copied deeply.
+}
+
+// HasCapabilityIn reports whether t's credentials hold cp in user namespace ns.
+func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.creds.HasCapabilityIn(cp, ns) // Evaluated with t.mu held.
+}
+
+// HasCapability reports whether t's credentials hold cp in t's user namespace.
+func (t *Task) HasCapability(cp linux.Capability) bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.creds.HasCapability(cp) // Evaluated with t.mu held.
+}
+
+// SetUID implements the semantics of setuid(2): it sets t's effective UID to
+// uid and, if t has CAP_SETUID, its real and saved UIDs as well.
+func (t *Task) SetUID(uid auth.UID) error {
+	// setuid(2) treats -1 as invalid rather than as "leave unchanged".
+	if !uid.Ok() {
+		return syserror.EINVAL
+	}
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	target := t.creds.UserNamespace.MapToKUID(uid)
+	switch {
+	case !target.Ok():
+		return syserror.EINVAL
+	case t.creds.HasCapability(linux.CAP_SETUID):
+		// "If the effective UID of the caller is root (more precisely: if the
+		// caller has the CAP_SETUID capability), the real UID and saved
+		// set-user-ID are also set." - setuid(2)
+		t.setKUIDsUncheckedLocked(target, target, target)
+	case target == t.creds.RealKUID || target == t.creds.SavedKUID:
+		// Without CAP_SETUID only the effective UID changes, and only to the
+		// current real UID or saved set-user-ID.
+		t.setKUIDsUncheckedLocked(t.creds.RealKUID, target, t.creds.SavedKUID)
+	default:
+		// "EPERM: The user is not privileged ... and uid does not match the
+		// real UID or saved set-user-ID of the calling process." - setuid(2)
+		return syserror.EPERM
+	}
+	return nil
+}
+
+// SetREUID implements the semantics of setreuid(2).
+func (t *Task) SetREUID(r, e auth.UID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	// "Supplying a value of -1 for either the real or effective user ID forces
+	// the system to leave that ID unchanged." - setreuid(2)
+	newR := t.creds.RealKUID
+	if r.Ok() {
+		if newR = t.creds.UserNamespace.MapToKUID(r); !newR.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	newE := t.creds.EffectiveKUID
+	if e.Ok() {
+		if newE = t.creds.UserNamespace.MapToKUID(e); !newE.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	if !t.creds.HasCapability(linux.CAP_SETUID) {
+		// "Unprivileged processes may only set the effective user ID to the
+		// real user ID, the effective user ID, or the saved set-user-ID."
+		if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID {
+			return syserror.EPERM
+		}
+		// "Unprivileged users may only set the real user ID to the real user
+		// ID or the effective user ID."
+		if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID {
+			return syserror.EPERM
+		}
+	}
+	// "If the real user ID is set (i.e., ruid is not -1) or the effective user
+	// ID is set to a value not equal to the previous real user ID, the saved
+	// set-user-ID will be set to the new effective user ID." - setreuid(2)
+	// t.creds is not updated until setKUIDsUncheckedLocked below, so
+	// t.creds.RealKUID here is still the previous real UID the rule refers to.
+	newS := t.creds.SavedKUID
+	if r.Ok() || (e.Ok() && newE != t.creds.RealKUID) {
+		newS = newE
+	}
+	t.setKUIDsUncheckedLocked(newR, newE, newS)
+	return nil
+}
+
+// SetRESUID implements the semantics of the setresuid(2) syscall.
+func (t *Task) SetRESUID(r, e, s auth.UID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	// "Unprivileged user processes may change the real UID, effective UID, and
+	// saved set-user-ID, each to one of: the current real UID, the current
+	// effective UID or the current saved set-user-ID. Privileged processes (on
+	// Linux, those having the CAP_SETUID capability) may set the real UID,
+	// effective UID, and saved set-user-ID to arbitrary values. If one of the
+	// arguments equals -1, the corresponding value is not changed." -
+	// setresuid(2)
+	var err error
+	newR := t.creds.RealKUID
+	if r.Ok() {
+		newR, err = t.creds.UseUID(r)
+		if err != nil {
+			return err
+		}
+	}
+	newE := t.creds.EffectiveKUID
+	if e.Ok() {
+		newE, err = t.creds.UseUID(e)
+		if err != nil {
+			return err
+		}
+	}
+	newS := t.creds.SavedKUID
+	if s.Ok() {
+		newS, err = t.creds.UseUID(s)
+		if err != nil {
+			return err
+		}
+	}
+	t.setKUIDsUncheckedLocked(newR, newE, newS)
+	return nil
+}
+
+// setKUIDsUncheckedLocked installs newR, newE and newS as t's real, effective
+// and saved set-user-IDs without permission checks, then applies the
+// capability adjustments that capabilities(7) attaches to UID changes.
+//
+// Preconditions: t.mu must be locked.
+func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
+	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
+	prevE := t.creds.EffectiveKUID
+	hadRoot := t.creds.RealKUID == root || prevE == root || t.creds.SavedKUID == root
+	hasRoot := newR == root || newE == root || newS == root
+	t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS
+
+	// "1. If one or more of the real, effective or saved set user IDs was
+	// previously 0, and as a result of the UID changes all of these IDs have a
+	// nonzero value, then all capabilities are cleared from the permitted and
+	// effective capability sets." - capabilities(7)
+	//
+	// prctl(2) PR_SET_KEEPCAPS: "By default, the permitted capability set is
+	// cleared when such a change is made; setting the "keep capabilities" flag
+	// prevents it from being cleared." As in Linux's cap_emulate_setxuid(), the
+	// flag guards both sets here; with it set, the effective set is only
+	// dropped by rule 2 below, when the effective UID itself leaves root.
+	if hadRoot && !hasRoot && !t.creds.KeepCaps {
+		t.creds.PermittedCaps = 0
+		t.creds.EffectiveCaps = 0
+	}
+	// """
+	// 2. If the effective user ID is changed from 0 to nonzero, then all
+	// capabilities are cleared from the effective set.
+	//
+	// 3. If the effective user ID is changed from nonzero to 0, then the
+	// permitted set is copied to the effective set.
+	// """
+	switch {
+	case prevE == root && newE != root:
+		t.creds.EffectiveCaps = 0
+	case prevE != root && newE == root:
+		t.creds.EffectiveCaps = t.creds.PermittedCaps
+	}
+	// "4. If the filesystem user ID is changed from 0 to nonzero (see
+	// setfsuid(2)), then the following capabilities are cleared from the
+	// effective set: ..."
+	// (filesystem UIDs aren't implemented, nor are any of the capabilities in
+	// question)
+
+	// Not documented, but compare Linux's kernel/cred.c:commit_creds().
+	if prevE != newE {
+		t.parentDeathSignal = 0
+	}
+}
+
+// SetGID implements the semantics of setgid(2).
+func (t *Task) SetGID(gid auth.GID) error {
+	if !gid.Ok() {
+		return syserror.EINVAL
+	}
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	target := t.creds.UserNamespace.MapToKGID(gid)
+	switch {
+	case !target.Ok():
+		return syserror.EINVAL
+	case t.creds.HasCapability(linux.CAP_SETGID):
+		// With CAP_SETGID the real, effective and saved GIDs all become gid.
+		t.setKGIDsUncheckedLocked(target, target, target)
+	case target == t.creds.RealKGID || target == t.creds.SavedKGID:
+		t.setKGIDsUncheckedLocked(t.creds.RealKGID, target, t.creds.SavedKGID)
+	default: // No CAP_SETGID, and gid is neither t's real nor its saved GID.
+		return syserror.EPERM
+	}
+	return nil
+}
+
+// SetREGID implements the semantics of setregid(2).
+func (t *Task) SetREGID(r, e auth.GID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	newR := t.creds.RealKGID
+	if r.Ok() { // A real GID that is not Ok (-1) is left unchanged.
+		if newR = t.creds.UserNamespace.MapToKGID(r); !newR.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	newE := t.creds.EffectiveKGID
+	if e.Ok() { // Likewise for the effective GID.
+		if newE = t.creds.UserNamespace.MapToKGID(e); !newE.Ok() {
+			return syserror.EINVAL
+		}
+	}
+	if !t.creds.HasCapability(linux.CAP_SETGID) { // Limited to t's current GIDs.
+		if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID {
+			return syserror.EPERM
+		}
+		if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID {
+			return syserror.EPERM
+		}
+	}
+	// As for setreuid(2): the saved ID follows the new effective ID if the real
+	// ID is set, or if the effective ID is set to a value not equal to the
+	// previous real ID. t.creds.RealKGID is still that previous value here.
+	newS := t.creds.SavedKGID
+	if r.Ok() || (e.Ok() && newE != t.creds.RealKGID) {
+		newS = newE
+	}
+	t.setKGIDsUncheckedLocked(newR, newE, newS)
+	return nil
+}
+
+// SetRESGID implements the semantics of the setresgid(2) syscall.
+func (t *Task) SetRESGID(r, e, s auth.GID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	var err error
+	newR := t.creds.RealKGID
+	if r.Ok() {
+		newR, err = t.creds.UseGID(r)
+		if err != nil {
+			return err
+		}
+	}
+	newE := t.creds.EffectiveKGID
+	if e.Ok() {
+		newE, err = t.creds.UseGID(e)
+		if err != nil {
+			return err
+		}
+	}
+	newS := t.creds.SavedKGID
+	if s.Ok() {
+		newS, err = t.creds.UseGID(s)
+		if err != nil {
+			return err
+		}
+	}
+	t.setKGIDsUncheckedLocked(newR, newE, newS)
+	return nil
+}
+
+// setKGIDsUncheckedLocked installs newR, newE and newS as t's real, effective
+// and saved set-group-IDs without permission checks. t.mu must be locked.
+func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
+	effectiveChanged := t.creds.EffectiveKGID != newE
+	t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS
+	if effectiveChanged { // Compare Linux's kernel/cred.c:commit_creds().
+		t.parentDeathSignal = 0
+	}
+}
+
+// SetExtraGIDs attempts to replace t's supplemental groups with gids, which
+// are interpreted in t's user namespace. On error t's groups are unchanged.
+func (t *Task) SetExtraGIDs(gids []auth.GID) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if !t.creds.HasCapability(linux.CAP_SETGID) {
+		return syserror.EPERM
+	}
+	mapped := make([]auth.KGID, 0, len(gids))
+	for _, gid := range gids {
+		kgid := t.creds.UserNamespace.MapToKGID(gid)
+		if !kgid.Ok() {
+			return syserror.EINVAL
+		}
+		mapped = append(mapped, kgid)
+	}
+	t.creds.ExtraKGIDs = mapped
+	return nil
+}
+
+// SetCapabilitySets attempts to replace t's permitted, inheritable, and
+// effective capability sets. It returns EPERM, leaving t unchanged, if the
+// requested sets violate any of the capabilities(7) rules quoted below.
+func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	creds := t.creds
+	switch {
+	case effective&^permitted != 0:
+		// "Permitted: This is a limiting superset for the effective capabilities
+		// that the thread may assume." - capabilities(7)
+		return syserror.EPERM
+	case !creds.HasCapability(linux.CAP_SETPCAP) && inheritable&^(creds.InheritableCaps|creds.PermittedCaps) != 0:
+		// "It is also a limiting superset for the capabilities that may be
+		// added to the inheritable set by a thread that does not have the
+		// CAP_SETPCAP capability in its effective set."
+		return syserror.EPERM
+	case permitted&^creds.PermittedCaps != 0:
+		// "If a thread drops a capability from its permitted set, it can never
+		// reacquire that capability (unless it execve(2)s ..."
+		return syserror.EPERM
+	case inheritable&^(creds.InheritableCaps|creds.BoundingCaps) != 0:
+		// "... if a capability is not in the bounding set, then a thread can't
+		// add this capability to its inheritable set, even if it was in its
+		// permitted capabilities ..."
+		return syserror.EPERM
+	}
+	creds.PermittedCaps = permitted
+	creds.InheritableCaps = inheritable
+	creds.EffectiveCaps = effective
+	return nil
+}
+
+// DropBoundingCapability drops cp from t's bounding set; needs CAP_SETPCAP.
+func (t *Task) DropBoundingCapability(cp linux.Capability) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if !t.creds.HasCapability(linux.CAP_SETPCAP) {
+		return syserror.EPERM
+	}
+	mask := auth.CapabilitySetOf(cp)
+	t.creds.BoundingCaps &= ^mask
+	return nil
+}
+
+// SetUserNamespace attempts to move t into ns. On success t holds a full set
+// of capabilities in ns and its keep-capabilities flag is cleared.
+func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	creds := t.creds
+
+	// "A process reassociating itself with a user namespace must have the
+	// CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
+	//
+	// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
+	// in ns (by rule 3 in auth.Credentials.HasCapability).
+	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
+		return syserror.EPERM
+	}
+
+	creds.UserNamespace = ns
+	// "The child process created by clone(2) with the CLONE_NEWUSER flag
+	// starts out with a complete set of capabilities in the new user
+	// namespace. Likewise, a process that creates a new user namespace using
+	// unshare(2) or joins an existing user namespace using setns(2) gains a
+	// full set of capabilities in that namespace."
+	creds.PermittedCaps, creds.EffectiveCaps = auth.AllCapabilities, auth.AllCapabilities
+	creds.BoundingCaps = auth.AllCapabilities
+	creds.InheritableCaps = 0
+	// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
+	// flag sets the "securebits" flags (see capabilities(7)) to their default
+	// values (all flags disabled) in the child (for clone(2)) or caller (for
+	// unshare(2), or setns(2)." - user_namespaces(7)
+	creds.KeepCaps = false
+	return nil
+}
+
+// SetKeepCaps sets t's "keep capabilities" flag (prctl(2) PR_SET_KEEPCAPS) to k.
+func (t *Task) SetKeepCaps(k bool) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.creds.KeepCaps = k
+}
+
+// updateCredsForExecLocked updates t.creds to reflect an execve().
+//
+// NOTE: We currently do not implement privileged executables
+// (set-user/group-ID bits and file capabilities). This allows us to make a lot
+// of simplifying assumptions:
+//
+// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which
+// disables the features we don't support anyway, is always set. This
+// drastically simplifies this function.
+//
+// - We don't implement AT_SECURE, because no_new_privs always being set means
+// that the conditions that require AT_SECURE never arise. (Compare Linux's
+// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().)
+//
+// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since
+// seccomp-bpf is also allowed if the task has no_new_privs set.
+//
+// - Task.ptraceAttach does not serialize with execve as it does in Linux,
+// since no_new_privs being set has the same effect as the presence of an
+// unprivileged tracer.
+//
+// Preconditions: t.mu must be locked.
+func (t *Task) updateCredsForExecLocked() {
+	// """
+	// During an execve(2), the kernel calculates the new capabilities of
+	// the process using the following algorithm:
+	//
+	//     P'(permitted) = (P(inheritable) & F(inheritable)) |
+	//                     (F(permitted) & cap_bset)
+	//
+	//     P'(effective) = F(effective) ? P'(permitted) : 0
+	//
+	//     P'(inheritable) = P(inheritable)    [i.e., unchanged]
+	//
+	// where:
+	//
+	//     P         denotes the value of a thread capability set before the
+	//               execve(2)
+	//
+	//     P'        denotes the value of a thread capability set after the
+	//               execve(2)
+	//
+	//     F         denotes a file capability set
+	//
+	//     cap_bset  is the value of the capability bounding set
+	//
+	// ...
+	//
+	// In order to provide an all-powerful root using capability sets, during
+	// an execve(2):
+	//
+	// 1. If a set-user-ID-root program is being executed, or the real user ID
+	// of the process is 0 (root) then the file inheritable and permitted sets
+	// are defined to be all ones (i.e. all capabilities enabled).
+	//
+	// 2. If a set-user-ID-root program is being executed, then the file
+	// effective bit is defined to be one (enabled).
+	//
+	// The upshot of the above rules, combined with the capabilities
+	// transformations described above, is that when a process execve(2)s a
+	// set-user-ID-root program, or when a process with an effective UID of 0
+	// execve(2)s a program, it gains all capabilities in its permitted and
+	// effective capability sets, except those masked out by the capability
+	// bounding set.
+	// """ - capabilities(7)
+	// (ambient capability sets omitted)
+	//
+	// As the last paragraph implies, the case of "a set-user-ID root program
+	// is being executed" also includes the case where (namespace) root is
+	// executing a non-set-user-ID program; rule 2 is checked against just the
+	// effective user ID, rule 1 against the effective or real user ID.
+	var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0
+	fileEffective := false
+	root := t.creds.UserNamespace.MapToKUID(auth.RootUID)
+	if t.creds.EffectiveKUID == root || t.creds.RealKUID == root {
+		newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps
+		if t.creds.EffectiveKUID == root {
+			fileEffective = true
+		}
+	}
+
+	// Now we enter poorly-documented, somewhat confusing territory. (The
+	// accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds
+	// is not very helpful.) My reading of it is:
+	//
+	// If at least one of the following is true:
+	//
+	// A1. The execing task is ptraced, and the tracer did not have
+	// CAP_SYS_PTRACE in the execing task's user namespace at the time of
+	// PTRACE_ATTACH.
+	//
+	// A2. The execing task shares its FS context with at least one task in
+	// another thread group.
+	//
+	// A3. The execing task has no_new_privs set.
+	//
+	// AND at least one of the following is true:
+	//
+	// B1. The new effective user ID (which may come from set-user-ID, or be the
+	// execing task's existing effective user ID) is not equal to the task's
+	// real UID.
+	//
+	// B2. The new effective group ID (which may come from set-group-ID, or be
+	// the execing task's existing effective group ID) is not equal to the
+	// task's real GID.
+	//
+	// B3. The new permitted capability set contains capabilities not in the
+	// task's permitted capability set.
+	//
+	// Then:
+	//
+	// C1. Limit the new permitted capability set to the task's permitted
+	// capability set.
+	//
+	// C2. If either the task does not have CAP_SETUID in its user namespace, or
+	// the task has no_new_privs set, force the new effective UID and GID to
+	// the task's real UID and GID.
+	//
+	// But since no_new_privs is always set (A3 is always true), this becomes
+	// much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1
+	// is a no-op. So we can just do C1 and C2 unconditionally.
+	if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID {
+		t.creds.EffectiveKUID = t.creds.RealKUID
+		t.creds.EffectiveKGID = t.creds.RealKGID
+		t.parentDeathSignal = 0
+	}
+	// (Saved set-user-ID and set-group-ID are always set to the new effective
+	// user and group IDs, regardless of the above. After the block above the
+	// effective IDs always equal the real IDs, so the real IDs are used here.)
+	t.creds.SavedKUID = t.creds.RealKUID
+	t.creds.SavedKGID = t.creds.RealKGID
+	t.creds.PermittedCaps &= newPermitted
+	if fileEffective {
+		t.creds.EffectiveCaps = t.creds.PermittedCaps
+	} else {
+		t.creds.EffectiveCaps = 0
+	}
+
+	// prctl(2): The "keep capabilities" value will be reset to 0 on subsequent
+	// calls to execve(2).
+	t.creds.KeepCaps = false
+
+	// "The bounding set is inherited at fork(2) from the thread's parent, and
+	// is preserved across an execve(2)". So we're done.
+}