summary refs log tree commit diff homepage
path: root/pkg/sentry/kernel/auth
diff options
context:
space:
mode:
author gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
committer gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
commit ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/kernel/auth
parent deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/kernel/auth')
-rw-r--r-- pkg/sentry/kernel/auth/auth.go 22
-rwxr-xr-x pkg/sentry/kernel/auth/auth_state_autogen.go 151
-rw-r--r-- pkg/sentry/kernel/auth/capability_set.go 61
-rw-r--r-- pkg/sentry/kernel/auth/context.go 36
-rw-r--r-- pkg/sentry/kernel/auth/credentials.go 234
-rw-r--r-- pkg/sentry/kernel/auth/id.go 121
-rw-r--r-- pkg/sentry/kernel/auth/id_map.go 285
-rw-r--r-- pkg/sentry/kernel/auth/id_map_functions.go 45
-rwxr-xr-x pkg/sentry/kernel/auth/id_map_range.go 62
-rwxr-xr-x pkg/sentry/kernel/auth/id_map_set.go 1270
-rw-r--r-- pkg/sentry/kernel/auth/user_namespace.go 129
11 files changed, 2416 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go
new file mode 100644
index 000000000..847d121aa
--- /dev/null
+++ b/pkg/sentry/kernel/auth/auth.go
@@ -0,0 +1,22 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package auth implements an access control model that is a subset of Linux's.
+//
+// The auth package supports two kinds of access controls: user/group IDs and
+// capabilities. Each resource in the security model is associated with a user
+// namespace; "privileged" operations check that the operator's credentials
+// have the required user/group IDs or capabilities within the user namespace
+// of accessed resources.
+package auth
diff --git a/pkg/sentry/kernel/auth/auth_state_autogen.go b/pkg/sentry/kernel/auth/auth_state_autogen.go
new file mode 100755
index 000000000..6f80381c6
--- /dev/null
+++ b/pkg/sentry/kernel/auth/auth_state_autogen.go
@@ -0,0 +1,151 @@
+// automatically generated by stateify.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *Credentials) beforeSave() {} // Empty hook; save calls it before writing any field.
+func (x *Credentials) save(m state.Map) { // Writes every Credentials field into m, keyed by the field's name.
+ x.beforeSave()
+ m.Save("RealKUID", &x.RealKUID)
+ m.Save("EffectiveKUID", &x.EffectiveKUID)
+ m.Save("SavedKUID", &x.SavedKUID)
+ m.Save("RealKGID", &x.RealKGID)
+ m.Save("EffectiveKGID", &x.EffectiveKGID)
+ m.Save("SavedKGID", &x.SavedKGID)
+ m.Save("ExtraKGIDs", &x.ExtraKGIDs)
+ m.Save("PermittedCaps", &x.PermittedCaps)
+ m.Save("InheritableCaps", &x.InheritableCaps)
+ m.Save("EffectiveCaps", &x.EffectiveCaps)
+ m.Save("BoundingCaps", &x.BoundingCaps)
+ m.Save("KeepCaps", &x.KeepCaps)
+ m.Save("UserNamespace", &x.UserNamespace)
+}
+
+func (x *Credentials) afterLoad() {} // Empty hook; load below does not call it.
+func (x *Credentials) load(m state.Map) { // Reads back the same keys, in the same order, that save writes.
+ m.Load("RealKUID", &x.RealKUID)
+ m.Load("EffectiveKUID", &x.EffectiveKUID)
+ m.Load("SavedKUID", &x.SavedKUID)
+ m.Load("RealKGID", &x.RealKGID)
+ m.Load("EffectiveKGID", &x.EffectiveKGID)
+ m.Load("SavedKGID", &x.SavedKGID)
+ m.Load("ExtraKGIDs", &x.ExtraKGIDs)
+ m.Load("PermittedCaps", &x.PermittedCaps)
+ m.Load("InheritableCaps", &x.InheritableCaps)
+ m.Load("EffectiveCaps", &x.EffectiveCaps)
+ m.Load("BoundingCaps", &x.BoundingCaps)
+ m.Load("KeepCaps", &x.KeepCaps)
+ m.Load("UserNamespace", &x.UserNamespace)
+}
+
+func (x *IDMapEntry) beforeSave() {} // Empty hook; save calls it before writing any field.
+func (x *IDMapEntry) save(m state.Map) { // Writes the three IDMapEntry fields into m, keyed by field name.
+ x.beforeSave()
+ m.Save("FirstID", &x.FirstID)
+ m.Save("FirstParentID", &x.FirstParentID)
+ m.Save("Length", &x.Length)
+}
+
+func (x *IDMapEntry) afterLoad() {} // Empty hook; load below does not call it.
+func (x *IDMapEntry) load(m state.Map) { // Reads back the keys that save writes.
+ m.Load("FirstID", &x.FirstID)
+ m.Load("FirstParentID", &x.FirstParentID)
+ m.Load("Length", &x.Length)
+}
+
+func (x *idMapRange) beforeSave() {} // Empty hook; save calls it before writing any field.
+func (x *idMapRange) save(m state.Map) { // Writes the range bounds into m under "Start" and "End".
+ x.beforeSave()
+ m.Save("Start", &x.Start)
+ m.Save("End", &x.End)
+}
+
+func (x *idMapRange) afterLoad() {} // Empty hook; load below does not call it.
+func (x *idMapRange) load(m state.Map) { // Reads back the keys that save writes.
+ m.Load("Start", &x.Start)
+ m.Load("End", &x.End)
+}
+
+func (x *idMapSet) beforeSave() {} // Empty hook; save calls it before writing anything.
+func (x *idMapSet) save(m state.Map) { // Saves the set as a single "root" value produced by saveRoot rather than field by field.
+ x.beforeSave()
+ var root *idMapSegmentDataSlices = x.saveRoot()
+ m.SaveValue("root", root)
+}
+
+func (x *idMapSet) afterLoad() {} // Empty hook; load below does not call it.
+func (x *idMapSet) load(m state.Map) { // Loads the "root" value and hands it to loadRoot through the callback.
+ m.LoadValue("root", new(*idMapSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*idMapSegmentDataSlices)) })
+}
+
+func (x *idMapnode) beforeSave() {} // Empty hook; save calls it before writing any field.
+func (x *idMapnode) save(m state.Map) { // Writes every idMapnode field into m, keyed by the field's name.
+ x.beforeSave()
+ m.Save("nrSegments", &x.nrSegments)
+ m.Save("parent", &x.parent)
+ m.Save("parentIndex", &x.parentIndex)
+ m.Save("hasChildren", &x.hasChildren)
+ m.Save("keys", &x.keys)
+ m.Save("values", &x.values)
+ m.Save("children", &x.children)
+}
+
+func (x *idMapnode) afterLoad() {} // Empty hook; load below does not call it.
+func (x *idMapnode) load(m state.Map) { // Reads back the same keys, in the same order, that save writes.
+ m.Load("nrSegments", &x.nrSegments)
+ m.Load("parent", &x.parent)
+ m.Load("parentIndex", &x.parentIndex)
+ m.Load("hasChildren", &x.hasChildren)
+ m.Load("keys", &x.keys)
+ m.Load("values", &x.values)
+ m.Load("children", &x.children)
+}
+
+func (x *idMapSegmentDataSlices) beforeSave() {} // Empty hook; save calls it before writing any field.
+func (x *idMapSegmentDataSlices) save(m state.Map) { // Writes the Start, End and Values fields into m, keyed by field name.
+ x.beforeSave()
+ m.Save("Start", &x.Start)
+ m.Save("End", &x.End)
+ m.Save("Values", &x.Values)
+}
+
+func (x *idMapSegmentDataSlices) afterLoad() {} // Empty hook; load below does not call it.
+func (x *idMapSegmentDataSlices) load(m state.Map) { // Reads back the keys that save writes.
+ m.Load("Start", &x.Start)
+ m.Load("End", &x.End)
+ m.Load("Values", &x.Values)
+}
+
+func (x *UserNamespace) beforeSave() {} // Empty hook; save calls it before writing any field.
+func (x *UserNamespace) save(m state.Map) { // Writes parent, owner and the four ID maps into m; no "mu" key is written.
+ x.beforeSave()
+ m.Save("parent", &x.parent)
+ m.Save("owner", &x.owner)
+ m.Save("uidMapFromParent", &x.uidMapFromParent)
+ m.Save("uidMapToParent", &x.uidMapToParent)
+ m.Save("gidMapFromParent", &x.gidMapFromParent)
+ m.Save("gidMapToParent", &x.gidMapToParent)
+}
+
+func (x *UserNamespace) afterLoad() {} // Empty hook; load below does not call it.
+func (x *UserNamespace) load(m state.Map) { // Reads back the same keys, in the same order, that save writes.
+ m.Load("parent", &x.parent)
+ m.Load("owner", &x.owner)
+ m.Load("uidMapFromParent", &x.uidMapFromParent)
+ m.Load("uidMapToParent", &x.uidMapToParent)
+ m.Load("gidMapFromParent", &x.gidMapFromParent)
+ m.Load("gidMapToParent", &x.gidMapToParent)
+}
+
+func init() { // Registers each savable type with the state package under its "auth."-prefixed name, together with its save and load functions.
+ state.Register("auth.Credentials", (*Credentials)(nil), state.Fns{Save: (*Credentials).save, Load: (*Credentials).load})
+ state.Register("auth.IDMapEntry", (*IDMapEntry)(nil), state.Fns{Save: (*IDMapEntry).save, Load: (*IDMapEntry).load})
+ state.Register("auth.idMapRange", (*idMapRange)(nil), state.Fns{Save: (*idMapRange).save, Load: (*idMapRange).load})
+ state.Register("auth.idMapSet", (*idMapSet)(nil), state.Fns{Save: (*idMapSet).save, Load: (*idMapSet).load})
+ state.Register("auth.idMapnode", (*idMapnode)(nil), state.Fns{Save: (*idMapnode).save, Load: (*idMapnode).load})
+ state.Register("auth.idMapSegmentDataSlices", (*idMapSegmentDataSlices)(nil), state.Fns{Save: (*idMapSegmentDataSlices).save, Load: (*idMapSegmentDataSlices).load})
+ state.Register("auth.UserNamespace", (*UserNamespace)(nil), state.Fns{Save: (*UserNamespace).save, Load: (*UserNamespace).load})
+}
diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go
new file mode 100644
index 000000000..7a0c967cd
--- /dev/null
+++ b/pkg/sentry/kernel/auth/capability_set.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/bits"
+)
+
+// A CapabilitySet is a set of capabilities stored as a bitset in a uint64. The
+// zero value of CapabilitySet is the empty set (no capabilities).
+type CapabilitySet uint64
+
+// AllCapabilities is a CapabilitySet containing all valid capabilities.
+var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1 // Presumably sets every bit up to linux.MaxCapability; relies on bits.MaskOf64 yielding a single-bit mask (confirm).
+
+// CapabilitySetOf builds the singleton CapabilitySet that holds cp and nothing
+// else.
+func CapabilitySetOf(cp linux.Capability) CapabilitySet {
+	mask := bits.MaskOf64(int(cp))
+	return CapabilitySet(mask)
+}
+
+// CapabilitySetOfMany returns the union of the masks of every capability in
+// cps. An empty or nil cps yields the empty set.
+func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet {
+	var set CapabilitySet
+	for i := range cps {
+		set |= CapabilitySet(bits.MaskOf64(int(cps[i])))
+	}
+	return set
+}
+
+// TaskCapabilities represents all the capability sets for a task. Each of these
+// sets is explained in greater detail in capabilities(7).
+type TaskCapabilities struct {
+ // PermittedCaps is a limiting superset for the effective capabilities that
+ // the thread may assume.
+ PermittedCaps CapabilitySet
+ // InheritableCaps is a set of capabilities preserved across an execve(2).
+ InheritableCaps CapabilitySet
+ // EffectiveCaps is the set of capabilities used by the kernel to perform
+ // permission checks for the thread.
+ EffectiveCaps CapabilitySet
+ // BoundingCaps is a limiting superset for the capabilities that a thread
+ // can add to its inheritable set using capset(2).
+ BoundingCaps CapabilitySet
+ // AmbientCaps is a set of capabilities that are preserved across an
+ // execve(2) of a program that is not privileged.
+ AmbientCaps CapabilitySet // Not copied into Credentials by NewUserCredentials (see its TODO).
+}
diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go
new file mode 100644
index 000000000..16d110610
--- /dev/null
+++ b/pkg/sentry/kernel/auth/context.go
@@ -0,0 +1,36 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the auth package's unexported type for context.Context.Value keys.
+type contextID int
+
+const (
+ // CtxCredentials is the Context.Value key under which *Credentials are looked up.
+ CtxCredentials contextID = iota
+)
+
+// CredentialsFromContext returns the *Credentials stored in ctx under
+// CtxCredentials. The stored pointer itself is returned, not a copy. If ctx
+// has no value for that key, a new set of anonymous Credentials with no
+// capabilities is returned instead.
+func CredentialsFromContext(ctx context.Context) *Credentials {
+	stored := ctx.Value(CtxCredentials)
+	if stored == nil {
+		return NewAnonymousCredentials()
+	}
+	return stored.(*Credentials)
+}
diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go
new file mode 100644
index 000000000..1511a0324
--- /dev/null
+++ b/pkg/sentry/kernel/auth/credentials.go
@@ -0,0 +1,234 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// Credentials contains information required to authorize privileged operations
+// in a user namespace.
+//
+// +stateify savable
+type Credentials struct {
+ // Real/effective/saved user/group IDs in the root user namespace. None of
+ // these should ever be NoID.
+ RealKUID KUID
+ EffectiveKUID KUID
+ SavedKUID KUID
+ RealKGID KGID
+ EffectiveKGID KGID
+ SavedKGID KGID
+
+ // Filesystem user/group IDs are not implemented. "... setfsuid() is
+ // nowadays unneeded and should be avoided in new applications (likewise
+ // for setfsgid(2))." - setfsuid(2)
+
+ // Supplementary groups used by set/getgroups.
+ //
+ // ExtraKGIDs slices are immutable, allowing multiple Credentials with the
+ // same ExtraKGIDs to share the same slice (Fork relies on this).
+ ExtraKGIDs []KGID
+
+ // The capability sets applicable to this set of credentials.
+ PermittedCaps CapabilitySet
+ InheritableCaps CapabilitySet
+ EffectiveCaps CapabilitySet
+ BoundingCaps CapabilitySet
+ // Ambient capabilities were not introduced until Linux 4.3 and have no field here.
+
+ // KeepCaps is the flag for PR_SET_KEEPCAPS, which allows capabilities to be
+ // maintained after a switch from root user to non-root user via setuid().
+ KeepCaps bool
+
+ // UserNamespace is the namespace that HasCapability and UseUID/UseGID resolve against.
+ UserNamespace *UserNamespace
+}
+
+// NewAnonymousCredentials returns credentials that hold no capabilities in any
+// user namespace. All six IDs are the "nobody" user/group.
+func NewAnonymousCredentials() *Credentials {
+	// The credentials live in a freshly created root user namespace. That
+	// namespace is owned by KUID 0 while these credentials use a non-zero
+	// KUID/KGID, so they have no capabilities in it. The namespace is also
+	// detached from every existing user namespace hierarchy, so they have no
+	// capabilities anywhere else either.
+	anon := new(Credentials)
+	anon.RealKUID, anon.EffectiveKUID, anon.SavedKUID = NobodyKUID, NobodyKUID, NobodyKUID
+	anon.RealKGID, anon.EffectiveKGID, anon.SavedKGID = NobodyKGID, NobodyKGID, NobodyKGID
+	anon.UserNamespace = NewRootUserNamespace()
+	return anon
+}
+
+// NewRootCredentials returns credentials with KUID and KGID 0 (i.e. global
+// root) in user namespace ns, holding all capabilities in the permitted,
+// effective and bounding sets.
+func NewRootCredentials(ns *UserNamespace) *Credentials {
+	root := new(Credentials)
+	root.RealKUID, root.EffectiveKUID, root.SavedKUID = RootKUID, RootKUID, RootKUID
+	root.RealKGID, root.EffectiveKGID, root.SavedKGID = RootKGID, RootKGID, RootKGID
+
+	// The inheritable set is deliberately left empty. This is undocumented
+	// upstream, but it is the correct initial state (the capabilities test
+	// checks for this property).
+	root.PermittedCaps = AllCapabilities
+	root.EffectiveCaps = AllCapabilities
+	root.BoundingCaps = AllCapabilities
+
+	root.UserNamespace = ns
+	return root
+}
+
+// NewUserCredentials returns credentials for the given KUID, KGID,
+// supplementary groups and capabilities in namespace ns.
+//
+// When capabilities is nil, a root kuid receives all permitted and effective
+// capabilities and any other kuid receives none; the bounding set is full in
+// both cases and the inheritable set stays empty. If all arguments are their
+// zero values, this returns the same credentials as NewRootCredentials.
+func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials {
+	// Start from root credentials and override what differs.
+	creds := NewRootCredentials(ns)
+
+	// Real, effective and saved IDs all take the requested value.
+	creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = kuid, kuid, kuid
+	creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = kgid, kgid, kgid
+
+	// Supplementary groups are appended onto the root credentials' group list.
+	creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...)
+
+	switch {
+	case capabilities != nil:
+		// Explicit capability sets win.
+		creds.PermittedCaps = capabilities.PermittedCaps
+		creds.EffectiveCaps = capabilities.EffectiveCaps
+		creds.BoundingCaps = capabilities.BoundingCaps
+		creds.InheritableCaps = capabilities.InheritableCaps
+		// TODO(nlacasse): Support ambient capabilities.
+	case kuid == RootKUID:
+		// No capabilities given: behave as if setresuid + setresgid had been
+		// applied to NewRootCredentials. Root keeps everything.
+		creds.PermittedCaps, creds.EffectiveCaps = AllCapabilities, AllCapabilities
+		creds.BoundingCaps = AllCapabilities
+	default:
+		// Non-root loses permitted and effective capabilities.
+		creds.PermittedCaps, creds.EffectiveCaps = 0, 0
+		creds.BoundingCaps = AllCapabilities
+	}
+
+	return creds
+}
+
+// Fork returns a new Credentials whose fields are all copied from c.
+func (c *Credentials) Fork() *Credentials {
+	// A shallow struct copy is legal for every field.
+	forked := *c
+	return &forked
+}
+
+// InGroup reports whether kgid is c's effective group or one of its
+// supplementary groups. Compare Linux's kernel/groups.c:in_group_p().
+func (c *Credentials) InGroup(kgid KGID) bool {
+	if kgid == c.EffectiveKGID {
+		return true
+	}
+	for i := range c.ExtraKGIDs {
+		if c.ExtraKGIDs[i] == kgid {
+			return true
+		}
+	}
+	return false
+}
+
+// HasCapabilityIn reports whether c has capability cp in ns. It walks from ns
+// up through its ancestors and applies the rules of user_namespaces(7) at each
+// level. ns must not be nil.
+func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool {
+	// "2. If a process has a capability in a user namespace, then it has that
+	// capability in all child (and further removed descendant) namespaces as
+	// well." Hence the walk towards the root.
+	for cur := ns; ; cur = cur.parent {
+		switch {
+		case c.UserNamespace == cur:
+			// "1. A process has a capability inside a user namespace if it is
+			// a member of that namespace and it has the capability in its
+			// effective capability set." - user_namespaces(7)
+			return CapabilitySetOf(cp)&c.EffectiveCaps != 0
+		case c.UserNamespace == cur.parent && c.EffectiveKUID == cur.owner:
+			// "3. ... A process that resides in the parent of the user
+			// namespace and whose effective user ID matches the owner of the
+			// namespace has all capabilities in the namespace."
+			return true
+		case cur.parent == nil:
+			// Reached the root without a match.
+			return false
+		}
+	}
+}
+
+// HasCapability reports whether c has capability cp in c's own user namespace.
+func (c *Credentials) HasCapability(cp linux.Capability) bool {
+	own := c.UserNamespace
+	return c.HasCapabilityIn(cp, own)
+}
+
+// UseUID checks that c can use uid in its user namespace, then translates it
+// to the root user namespace. It returns EINVAL if uid is unmapped and EPERM
+// if c is not allowed to use it.
+//
+// The checks UseUID does are common, but you should verify that it's doing
+// exactly what you want.
+func (c *Credentials) UseUID(uid UID) (KUID, error) {
+	kuid := c.UserNamespace.MapToKUID(uid)
+	switch {
+	case !kuid.Ok():
+		// uid must be mapped.
+		return NoID, syserror.EINVAL
+	case c.HasCapability(linux.CAP_SETUID):
+		// With CAP_SETUID, c can use any UID in its user namespace.
+		return kuid, nil
+	case kuid == c.RealKUID, kuid == c.EffectiveKUID, kuid == c.SavedKUID:
+		// Otherwise the UID must already be c's real, effective, or saved
+		// set-user-ID.
+		return kuid, nil
+	default:
+		return NoID, syserror.EPERM
+	}
+}
+
+// UseGID checks that c can use gid in its user namespace, then translates it
+// to the root user namespace. It returns EINVAL if gid is unmapped and EPERM
+// if c is not allowed to use it.
+func (c *Credentials) UseGID(gid GID) (KGID, error) {
+	kgid := c.UserNamespace.MapToKGID(gid)
+	switch {
+	case !kgid.Ok():
+		// gid must be mapped.
+		return NoID, syserror.EINVAL
+	case c.HasCapability(linux.CAP_SETGID):
+		// With CAP_SETGID, c can use any GID in its user namespace.
+		return kgid, nil
+	case kgid == c.RealKGID, kgid == c.EffectiveKGID, kgid == c.SavedKGID:
+		// Otherwise the GID must already be c's real, effective, or saved
+		// set-group-ID.
+		return kgid, nil
+	default:
+		return NoID, syserror.EPERM
+	}
+}
diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go
new file mode 100644
index 000000000..0a58ba17c
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id.go
@@ -0,0 +1,121 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "math"
+)
+
+// UID is a user ID in an unspecified (possibly non-root) user namespace.
+type UID uint32
+
+// GID is a group ID in an unspecified (possibly non-root) user namespace.
+type GID uint32
+
+// In the root user namespace, user/group IDs have a 1-to-1 relationship with
+// the users/groups they represent. In other user namespaces, this is not the
+// case; for example, two different unmapped users may both "have" the overflow
+// UID. This means that it is generally only valid to compare user and group
+// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such
+// IDs to emphasize this distinction. ("k" is for "key", as in "unique key".
+// Linux also uses the prefix "k", apparently meaning "kernel".)
+
+// KUID is a user ID in the root user namespace.
+type KUID uint32
+
+// KGID is a group ID in the root user namespace.
+type KGID uint32
+
+const (
+ // NoID is uint32(-1). -1 is consistently used as a special value, in Linux
+ // and by extension in the auth package, to mean "no ID":
+ //
+ // - ID mapping returns -1 if the ID is not mapped.
+ //
+ // - Most set*id() syscalls accept -1 to mean "do not change this ID".
+ NoID = math.MaxUint32 // Untyped, so it compares against UID, GID, KUID and KGID alike.
+
+ // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The
+ // "overflow UID" is usually [1] used when translating a user ID between
+ // namespaces fails because the ID is not mapped. (We don't implement this
+ // file, so the overflow UID is constant.)
+ //
+ // [1] "There is one notable case where unmapped user and group IDs are not
+ // converted to the corresponding overflow ID value. When viewing a uid_map
+ // or gid_map file in which there is no mapping for the second field, that
+ // field is displayed as 4294967295 (-1 as an unsigned integer);" -
+ // user_namespaces(7)
+ OverflowUID = UID(65534)
+ OverflowGID = GID(65534) // Group counterpart of OverflowUID.
+
+ // NobodyKUID is the user ID usually reserved for the least privileged user
+ // "nobody".
+ NobodyKUID = KUID(65534)
+ NobodyKGID = KGID(65534) // Group counterpart of NobodyKUID.
+
+ // RootKUID is the user ID usually used for the most privileged user "root".
+ RootKUID = KUID(0)
+ RootKGID = KGID(0) // Group counterpart of RootKUID.
+ RootUID = UID(0) // ID 0 as a namespace-relative UID.
+ RootGID = GID(0) // ID 0 as a namespace-relative GID.
+)
+
+// Ok returns true if uid is not -1.
+func (uid UID) Ok() bool {
+ return uid != NoID
+}
+
+// Ok returns true if gid is not -1.
+func (gid GID) Ok() bool {
+ return gid != NoID
+}
+
+// Ok returns true if kuid is not -1.
+func (kuid KUID) Ok() bool {
+ return kuid != NoID
+}
+
+// Ok returns true if kgid is not -1.
+func (kgid KGID) Ok() bool {
+ return kgid != NoID
+}
+
+// OrOverflow substitutes OverflowUID for an invalid (NoID) uid and returns any
+// other uid unchanged.
+func (uid UID) OrOverflow() UID {
+	if !uid.Ok() {
+		return OverflowUID
+	}
+	return uid
+}
+
+// OrOverflow substitutes OverflowGID for an invalid (NoID) gid and returns any
+// other gid unchanged.
+func (gid GID) OrOverflow() GID {
+	if !gid.Ok() {
+		return OverflowGID
+	}
+	return gid
+}
+
+// In translates kuid into user namespace ns by delegating to ns.MapFromKUID.
+// If kuid is not mapped in ns, In returns NoID.
+func (kuid KUID) In(ns *UserNamespace) UID {
+	mapped := ns.MapFromKUID(kuid)
+	return mapped
+}
+
+// In translates kgid into user namespace ns by delegating to ns.MapFromKGID.
+// If kgid is not mapped in ns, In returns NoID.
+func (kgid KGID) In(ns *UserNamespace) GID {
+	mapped := ns.MapFromKGID(kgid)
+	return mapped
+}
diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go
new file mode 100644
index 000000000..e5d6028d6
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map.go
@@ -0,0 +1,285 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns.
+// The root namespace (nil parent) maps IDs to themselves; any other namespace
+// first translates kuid into its parent, then applies its own map.
+func (ns *UserNamespace) MapFromKUID(kuid KUID) UID {
+	if ns.parent == nil {
+		return UID(kuid)
+	}
+	inParent := ns.parent.MapFromKUID(kuid)
+	return UID(ns.mapID(&ns.uidMapFromParent, uint32(inParent)))
+}
+
+// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns.
+// The root namespace (nil parent) maps IDs to themselves; any other namespace
+// first translates kgid into its parent, then applies its own map.
+func (ns *UserNamespace) MapFromKGID(kgid KGID) GID {
+	if ns.parent == nil {
+		return GID(kgid)
+	}
+	inParent := ns.parent.MapFromKGID(kgid)
+	return GID(ns.mapID(&ns.gidMapFromParent, uint32(inParent)))
+}
+
+// MapToKUID translates uid, a UID in ns, to a UID in the root namespace. The
+// root namespace (nil parent) maps IDs to themselves; any other namespace maps
+// uid into its parent and lets the parent continue the translation.
+func (ns *UserNamespace) MapToKUID(uid UID) KUID {
+	if ns.parent == nil {
+		return KUID(uid)
+	}
+	inParent := UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))
+	return ns.parent.MapToKUID(inParent)
+}
+
+// MapToKGID translates gid, a GID in ns, to a GID in the root namespace. The
+// root namespace (nil parent) maps IDs to themselves; any other namespace maps
+// gid into its parent and lets the parent continue the translation.
+func (ns *UserNamespace) MapToKGID(gid GID) KGID {
+	if ns.parent == nil {
+		return KGID(gid)
+	}
+	inParent := GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))
+	return ns.parent.MapToKGID(inParent)
+}
+
+// mapID translates id through m while holding ns.mu. It returns NoID if id is
+// NoID or if no segment of m contains id; otherwise it returns the segment's
+// value plus id's offset from the segment's start.
+func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 {
+	if id == NoID {
+		return NoID
+	}
+	ns.mu.Lock()
+	defer ns.mu.Unlock()
+	seg := m.FindSegment(id)
+	if !seg.Ok() {
+		return NoID
+	}
+	base := seg.Value()
+	return base + (id - seg.Start())
+}
+
+// allIDsMapped reports whether every ID in the range [start, end) is mapped in
+// m, by comparing the length m spans inside that range with the range's own
+// length. ns.mu is held for the lookup.
+//
+// Preconditions: end >= start.
+func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool {
+	ns.mu.Lock()
+	defer ns.mu.Unlock()
+	wanted := idMapRange{start, end}
+	covered := m.SpanRange(wanted)
+	return covered == end-start
+}
+
+// An IDMapEntry represents a mapping from a range of contiguous IDs in a user
+// namespace, [FirstID, FirstID+Length), to an equally-sized range of contiguous
+// IDs in the namespace's parent, [FirstParentID, FirstParentID+Length).
+//
+// +stateify savable
+type IDMapEntry struct {
+ // FirstID is the first ID in the range in the namespace.
+ FirstID uint32
+
+ // FirstParentID is the first ID in the range in the parent namespace.
+ FirstParentID uint32
+
+ // Length is the number of IDs in the range; trySetUIDMap/trySetGIDMap reject 0.
+ Length uint32
+}
+
+// SetUIDMap instructs ns to translate UIDs as specified by entries, acting on
+// behalf of the credentials in ctx; it returns EPERM or EINVAL if a check fails.
+// Note: SetUIDMap does not place an upper bound on the number of entries, but
+// Linux does. This restriction is implemented in SetUIDMap's caller, the
+// implementation of /proc/[pid]/uid_map.
+func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error {
+ c := CredentialsFromContext(ctx) // Credentials that the permission checks below are evaluated against.
+
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ // "After the creation of a new user namespace, the uid_map file of *one*
+ // of the processes in the namespace may be written to *once* to define the
+ // mapping of user IDs in the new user namespace. An attempt to write more
+ // than once to a uid_map file in a user namespace fails with the error
+ // EPERM. Similar rules apply for gid_map files." - user_namespaces(7)
+ if !ns.uidMapFromParent.IsEmpty() {
+ return syserror.EPERM
+ }
+ // "At least one line must be written to the file."
+ if len(entries) == 0 {
+ return syserror.EINVAL
+ }
+ // """
+ // In order for a process to write to the /proc/[pid]/uid_map
+ // (/proc/[pid]/gid_map) file, all of the following requirements must be
+ // met:
+ //
+ // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability
+ // in the user namespace of the process pid.
+ // """
+ if !c.HasCapabilityIn(linux.CAP_SETUID, ns) {
+ return syserror.EPERM
+ }
+ // "2. The writing process must either be in the user namespace of the process
+ // pid or be in the parent user namespace of the process pid."
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent {
+ return syserror.EPERM
+ }
+ // """
+ // 3. (see trySetUIDMap)
+ //
+ // 4. One of the following two cases applies:
+ //
+ // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability
+ // in the parent user namespace.
+ // """
+ if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) {
+ // """
+ // * Or otherwise all of the following restrictions apply:
+ //
+ // + The data written to uid_map (gid_map) must consist of a single line
+ // that maps the writing process' effective user ID (group ID) in the
+ // parent user namespace to a user ID (group ID) in the user namespace.
+ // """
+ if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 {
+ return syserror.EPERM
+ }
+ // """
+ // + The writing process must have the same effective user ID as the
+ // process that created the user namespace.
+ // """
+ if c.EffectiveKUID != ns.owner {
+ return syserror.EPERM
+ }
+ }
+ // trySetUIDMap may leave partial data in both maps if it fails, so clear them.
+ if err := ns.trySetUIDMap(entries); err != nil {
+ ns.uidMapFromParent.RemoveAll()
+ ns.uidMapToParent.RemoveAll()
+ return err
+ }
+ return nil
+}
+
+// trySetUIDMap validates each entry in order and records it in both UID maps
+// of ns. It stops at the first failing entry and leaves the entries already
+// added in place, so the caller is responsible for clearing the maps on error.
+// SetUIDMap calls it with ns.mu held.
+func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error {
+	for i := range entries {
+		entry := entries[i]
+
+		// Compute the exclusive upper bounds. A bound that is not above its
+		// start means Length is 0 or the addition wrapped around; this also
+		// implicitly rejects NoID.
+		endID := entry.FirstID + entry.Length
+		if endID <= entry.FirstID {
+			return syserror.EINVAL
+		}
+		endParentID := entry.FirstParentID + entry.Length
+		if endParentID <= entry.FirstParentID {
+			return syserror.EINVAL
+		}
+
+		// "3. The mapped user IDs (group IDs) must in turn have a mapping in
+		// the parent user namespace."
+		// Only the root namespace has a nil parent, and root is assigned
+		// mappings when it's created, so SetUIDMap would have returned EPERM
+		// without reaching this point if ns is root.
+		if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, entry.FirstParentID, endParentID) {
+			return syserror.EPERM
+		}
+
+		// A failed Add means the range overlaps one that is already present.
+		parentRange := idMapRange{entry.FirstParentID, endParentID}
+		if !ns.uidMapFromParent.Add(parentRange, entry.FirstID) {
+			return syserror.EINVAL
+		}
+		childRange := idMapRange{entry.FirstID, endID}
+		if !ns.uidMapToParent.Add(childRange, entry.FirstParentID) {
+			return syserror.EINVAL
+		}
+	}
+	return nil
+}
+
+// SetGIDMap instructs ns to translate GIDs as specified by entries, acting on behalf of the credentials in ctx; it returns EPERM or EINVAL if a check fails.
+func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error {
+ c := CredentialsFromContext(ctx) // Credentials that the permission checks below are evaluated against.
+
+ ns.mu.Lock()
+ defer ns.mu.Unlock()
+ if !ns.gidMapFromParent.IsEmpty() { // A GID map may only be set once.
+ return syserror.EPERM
+ }
+ if len(entries) == 0 { // At least one entry is required.
+ return syserror.EINVAL
+ }
+ if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { // The writer needs CAP_SETGID in ns.
+ return syserror.EPERM
+ }
+ if c.UserNamespace != ns && c.UserNamespace != ns.parent { // The writer must be in ns or in its parent.
+ return syserror.EPERM
+ }
+ if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { // Without CAP_SETGID in the parent, only a single one-ID entry for the writer's effective GID is allowed.
+ if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 {
+ return syserror.EPERM
+ }
+ // It's correct for this to still be UID.
+ if c.EffectiveKUID != ns.owner {
+ return syserror.EPERM
+ }
+ // "In the case of gid_map, use of the setgroups(2) system call must
+ // first be denied by writing "deny" to the /proc/[pid]/setgroups file
+ // (see below) before writing to gid_map." (This file isn't implemented
+ // in the version of Linux we're emulating; see comment in
+ // UserNamespace.)
+ }
+ if err := ns.trySetGIDMap(entries); err != nil { // On failure, discard any partially added ranges.
+ ns.gidMapFromParent.RemoveAll()
+ ns.gidMapToParent.RemoveAll()
+ return err
+ }
+ return nil
+}
+
+// trySetGIDMap validates each entry in order and records it in both GID maps
+// of ns. It stops at the first failing entry and leaves the entries already
+// added in place, so the caller is responsible for clearing the maps on error.
+// SetGIDMap calls it with ns.mu held.
+func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error {
+	for i := range entries {
+		entry := entries[i]
+
+		// Compute the exclusive upper bounds. A bound that is not above its
+		// start means Length is 0 or the addition wrapped around.
+		endID := entry.FirstID + entry.Length
+		if endID <= entry.FirstID {
+			return syserror.EINVAL
+		}
+		endParentID := entry.FirstParentID + entry.Length
+		if endParentID <= entry.FirstParentID {
+			return syserror.EINVAL
+		}
+
+		// Every mapped parent GID must itself be mapped in the parent.
+		if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, entry.FirstParentID, endParentID) {
+			return syserror.EPERM
+		}
+
+		// A failed Add means the range overlaps one that is already present.
+		parentRange := idMapRange{entry.FirstParentID, endParentID}
+		if !ns.gidMapFromParent.Add(parentRange, entry.FirstID) {
+			return syserror.EINVAL
+		}
+		childRange := idMapRange{entry.FirstID, endID}
+		if !ns.gidMapToParent.Add(childRange, entry.FirstParentID) {
+			return syserror.EINVAL
+		}
+	}
+	return nil
+}
+
+// UIDMap returns the user ID mappings configured for ns, read from its
+// to-parent UID map. If no mappings have been configured, UIDMap returns nil.
+func (ns *UserNamespace) UIDMap() []IDMapEntry {
+	toParent := &ns.uidMapToParent
+	return ns.getIDMap(toParent)
+}
+
+// GIDMap returns the group ID mappings configured for ns, read from its
+// to-parent GID map. If no mappings have been configured, GIDMap returns nil.
+func (ns *UserNamespace) GIDMap() []IDMapEntry {
+	toParent := &ns.gidMapToParent
+	return ns.getIDMap(toParent)
+}
+
+// getIDMap converts the segments of m, in iteration order, into IDMapEntry
+// values while holding ns.mu. The result is nil when m has no segments.
+func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry {
+	ns.mu.Lock()
+	defer ns.mu.Unlock()
+
+	var result []IDMapEntry
+	seg := m.FirstSegment()
+	for seg.Ok() {
+		var entry IDMapEntry
+		entry.FirstID = seg.Start()
+		entry.FirstParentID = seg.Value()
+		entry.Length = seg.Range().Length()
+		result = append(result, entry)
+		seg = seg.NextSegment()
+	}
+	return result
+}
diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go
new file mode 100644
index 000000000..432dbfb6d
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map_functions.go
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+ // idMapFunctions "implements" generic interface segment.Functions for
+ // idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one
+ // user namespace to non-overlapping ranges of contiguous IDs in another user
+ // namespace. Each such ID mapping is implemented as a range-to-value mapping
+ // in the set such that [range.Start, range.End) => [value, value +
+ // range.Length()).
+ //
+ // The type carries no state; its methods are invoked on the zero value.
+ type idMapFunctions struct{}
+
+func (idMapFunctions) MinKey() uint32 {
+ return 0
+}
+
+func (idMapFunctions) MaxKey() uint32 {
+ return NoID
+}
+
+ // ClearValue does nothing; it exists so that idMapFunctions provides the
+ // hook the set calls when a value slot is vacated.
+ func (idMapFunctions) ClearValue(_ *uint32) {
+ }
+
+ // Merge reports whether two segments may be fused into one. This is allowed
+ // only when the second segment's value continues exactly where the first
+ // segment's mapped range ends. On success the merged value is val1; on
+ // failure Merge returns (0, false).
+ func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) {
+ 	if contiguous := val1+r1.Length() == val2; contiguous {
+ 		return val1, true
+ 	}
+ 	return 0, false
+ }
+
+ // Split returns the values for the two halves produced by cutting a segment
+ // with range r and value val at key split. The lower half keeps val; the
+ // upper half's value is advanced by the distance from r.Start to split.
+ func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) {
+ 	offset := split - r.Start
+ 	return val, val + offset
+ }
diff --git a/pkg/sentry/kernel/auth/id_map_range.go b/pkg/sentry/kernel/auth/id_map_range.go
new file mode 100755
index 000000000..833fa3518
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map_range.go
@@ -0,0 +1,62 @@
+package auth
+
+ // An idMapRange represents a contiguous, half-open range [Start, End) of
+ // uint32 IDs.
+ //
+ // +stateify savable
+ type idMapRange struct {
+ // Start is the inclusive start of the range.
+ Start uint32
+
+ // End is the exclusive end of the range.
+ End uint32
+ }
+
+ // WellFormed reports whether r.End is not below r.Start. Every other method
+ // on idMapRange assumes a well-formed receiver.
+ func (r idMapRange) WellFormed() bool {
+ 	return !(r.End < r.Start)
+ }
+
+ // Length returns the number of keys covered by r, i.e. End minus Start.
+ func (r idMapRange) Length() uint32 {
+ 	start, end := r.Start, r.End
+ 	return end - start
+ }
+
+ // Contains reports whether x lies within [r.Start, r.End).
+ func (r idMapRange) Contains(x uint32) bool {
+ 	if x < r.Start {
+ 		return false
+ 	}
+ 	return x < r.End
+ }
+
+ // Overlaps reports whether r and r2 share at least one key.
+ func (r idMapRange) Overlaps(r2 idMapRange) bool {
+ 	return r2.Start < r.End && r.Start < r2.End
+ }
+
+ // IsSupersetOf reports whether r2 lies entirely within r.
+ func (r idMapRange) IsSupersetOf(r2 idMapRange) bool {
+ 	return r2.Start >= r.Start && r2.End <= r.End
+ }
+
+ // Intersect returns the range common to r and r2. When the two ranges do not
+ // overlap, the result has unspecified bounds but a Length() of 0.
+ func (r idMapRange) Intersect(r2 idMapRange) idMapRange {
+ 	start, end := r.Start, r.End
+ 	if r2.Start > start {
+ 		start = r2.Start
+ 	}
+ 	if r2.End < end {
+ 		end = r2.End
+ 	}
+ 	// Disjoint inputs leave end below start; clamp to an empty range.
+ 	if end < start {
+ 		end = start
+ 	}
+ 	return idMapRange{Start: start, End: end}
+ }
+
+ // CanSplitAt reports whether a segment spanning r may be split at x, which
+ // requires x to fall strictly inside r so that both halves are non-empty.
+ func (r idMapRange) CanSplitAt(x uint32) bool {
+ 	return r.Start < x && x < r.End
+ }
diff --git a/pkg/sentry/kernel/auth/id_map_set.go b/pkg/sentry/kernel/auth/id_map_set.go
new file mode 100755
index 000000000..f72c839c7
--- /dev/null
+++ b/pkg/sentry/kernel/auth/id_map_set.go
@@ -0,0 +1,1270 @@
+package auth
+
+import (
+ "bytes"
+ "fmt"
+)
+
+ const (
+ // idMapminDegree is the minimum degree of an internal node in an
+ // idMapSet B-tree.
+ //
+ // - Any non-root node has at least idMapminDegree-1 segments.
+ //
+ // - Any non-root internal (non-leaf) node has at least idMapminDegree
+ // children.
+ //
+ // - The root node may have fewer than idMapminDegree-1 segments, but it
+ // may only have 0 segments if the tree is empty.
+ //
+ // Our implementation requires idMapminDegree >= 3. Higher values of
+ // idMapminDegree usually improve performance, but increase memory usage
+ // for small sets.
+ idMapminDegree = 3
+
+ // idMapmaxDegree is the capacity of a node's children array; a node
+ // stores at most idMapmaxDegree-1 segments.
+ idMapmaxDegree = 2 * idMapminDegree
+ )
+
+ // An idMapSet is a mapping of segments with non-overlapping idMapRange keys.
+ // The zero value for an idMapSet is an empty set. idMapSet values are not
+ // safely movable nor copyable. idMapSet is thread-compatible.
+ //
+ // +stateify savable
+ type idMapSet struct {
+ // root is the root node of the B-tree, stored by value inside the set.
+ root idMapnode `state:".(*idMapSegmentDataSlices)"`
+ }
+
+ // IsEmpty reports whether the set holds no segments at all.
+ func (s *idMapSet) IsEmpty() bool {
+ 	count := s.root.nrSegments
+ 	return count == 0
+ }
+
+ // IsEmptyRange returns true iff no segments in the set overlap the given
+ // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be
+ // more efficient.
+ //
+ // IsEmptyRange panics if r is not well-formed (r.Start > r.End). Keys are
+ // unsigned, so an inverted range cannot be detected through a negative
+ // Length(); it is checked explicitly instead.
+ func (s *idMapSet) IsEmptyRange(r idMapRange) bool {
+ 	switch {
+ 	case !r.WellFormed():
+ 		panic(fmt.Sprintf("invalid range %v", r))
+ 	case r.Length() == 0:
+ 		return true
+ 	}
+ 	// r is empty of segments only if r.Start falls in a gap that extends
+ 	// at least to r.End.
+ 	_, gap := s.Find(r.Start)
+ 	if !gap.Ok() {
+ 		return false
+ 	}
+ 	return r.End <= gap.End()
+ }
+
+ // Span sums the lengths of every segment in the set.
+ func (s *idMapSet) Span() uint32 {
+ 	var total uint32
+ 	seg := s.FirstSegment()
+ 	for seg.Ok() {
+ 		total += seg.Range().Length()
+ 		seg = seg.NextSegment()
+ 	}
+ 	return total
+ }
+
+ // SpanRange returns the total size of the intersection of segments in the set
+ // with the given range.
+ //
+ // SpanRange panics if r is not well-formed (r.Start > r.End). Keys are
+ // unsigned, so an inverted range cannot be detected through a negative
+ // Length(); it is checked explicitly instead.
+ func (s *idMapSet) SpanRange(r idMapRange) uint32 {
+ 	switch {
+ 	case !r.WellFormed():
+ 		panic(fmt.Sprintf("invalid range %v", r))
+ 	case r.Length() == 0:
+ 		return 0
+ 	}
+ 	var sz uint32
+ 	// Visit every segment that starts before r.End, beginning with the
+ 	// first one that reaches r.Start, and count only the part inside r.
+ 	for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
+ 		sz += seg.Range().Intersect(r).Length()
+ 	}
+ 	return sz
+ }
+
+ // FirstSegment returns an iterator to the lowest segment in the set, or a
+ // terminal iterator when the set has no segments.
+ func (s *idMapSet) FirstSegment() idMapIterator {
+ 	if s.root.nrSegments != 0 {
+ 		return s.root.firstSegment()
+ 	}
+ 	return idMapIterator{}
+ }
+
+ // LastSegment returns an iterator to the highest segment in the set, or a
+ // terminal iterator when the set has no segments.
+ func (s *idMapSet) LastSegment() idMapIterator {
+ 	if s.root.nrSegments != 0 {
+ 		return s.root.lastSegment()
+ 	}
+ 	return idMapIterator{}
+ }
+
+ // FirstGap returns the gap that precedes every segment in the set, found by
+ // following the leftmost child pointers down to a leaf.
+ func (s *idMapSet) FirstGap() idMapGapIterator {
+ 	leaf := &s.root
+ 	for leaf.hasChildren {
+ 		leaf = leaf.children[0]
+ 	}
+ 	return idMapGapIterator{node: leaf, index: 0}
+ }
+
+ // LastGap returns the gap that follows every segment in the set, found by
+ // following the rightmost child pointers down to a leaf.
+ func (s *idMapSet) LastGap() idMapGapIterator {
+ 	leaf := &s.root
+ 	for leaf.hasChildren {
+ 		leaf = leaf.children[leaf.nrSegments]
+ 	}
+ 	return idMapGapIterator{node: leaf, index: leaf.nrSegments}
+ }
+
+ // Find locates key in the set. Exactly one of the results is non-terminal:
+ // the segment iterator if some segment's range contains key, otherwise the
+ // gap iterator for the gap that contains key.
+ func (s *idMapSet) Find(key uint32) (idMapIterator, idMapGapIterator) {
+ 	node := &s.root
+ 	for {
+ 		// Binary search over node's keys for the first segment whose End
+ 		// is above key.
+ 		lo, hi := 0, node.nrSegments
+ 		for lo < hi {
+ 			mid := lo + (hi-lo)/2
+ 			r := node.keys[mid]
+ 			switch {
+ 			case key >= r.End:
+ 				lo = mid + 1
+ 			case key >= r.Start:
+ 				return idMapIterator{node: node, index: mid}, idMapGapIterator{}
+ 			default:
+ 				hi = mid
+ 			}
+ 		}
+ 		// key falls between segments of this node: either that is the
+ 		// answer (leaf) or the search continues in the matching child.
+ 		if !node.hasChildren {
+ 			return idMapIterator{}, idMapGapIterator{node: node, index: lo}
+ 		}
+ 		node = node.children[lo]
+ 	}
+ }
+
+ // FindSegment returns an iterator to the segment containing key, or a
+ // terminal iterator if key lies in a gap.
+ func (s *idMapSet) FindSegment(key uint32) idMapIterator {
+ 	found, _ := s.Find(key)
+ 	return found
+ }
+
+ // LowerBoundSegment returns the lowest segment containing some key that is
+ // greater than or equal to min, or a terminal iterator if there is none.
+ func (s *idMapSet) LowerBoundSegment(min uint32) idMapIterator {
+ 	seg, gap := s.Find(min)
+ 	if !seg.Ok() {
+ 		// min is in a gap; the answer is whatever follows that gap.
+ 		return gap.NextSegment()
+ 	}
+ 	return seg
+ }
+
+ // UpperBoundSegment returns the highest segment containing some key that is
+ // less than or equal to max, or a terminal iterator if there is none.
+ func (s *idMapSet) UpperBoundSegment(max uint32) idMapIterator {
+ 	seg, gap := s.Find(max)
+ 	if !seg.Ok() {
+ 		// max is in a gap; the answer is whatever precedes that gap.
+ 		return gap.PrevSegment()
+ 	}
+ 	return seg
+ }
+
+ // FindGap returns an iterator to the gap containing key, or a terminal
+ // iterator if key is covered by a segment.
+ func (s *idMapSet) FindGap(key uint32) idMapGapIterator {
+ 	_, found := s.Find(key)
+ 	return found
+ }
+
+ // LowerBoundGap returns the lowest gap whose range is greater than or equal
+ // to min.
+ func (s *idMapSet) LowerBoundGap(min uint32) idMapGapIterator {
+ 	seg, gap := s.Find(min)
+ 	if !gap.Ok() {
+ 		// min is inside a segment; use the gap right after it.
+ 		return seg.NextGap()
+ 	}
+ 	return gap
+ }
+
+ // UpperBoundGap returns the highest gap whose range is less than or equal to
+ // max.
+ func (s *idMapSet) UpperBoundGap(max uint32) idMapGapIterator {
+ 	seg, gap := s.Find(max)
+ 	if !gap.Ok() {
+ 		// max is inside a segment; use the gap right before it.
+ 		return seg.PrevGap()
+ 	}
+ 	return gap
+ }
+
+ // Add inserts the given segment into the set and returns true. If the new
+ // segment can be merged with adjacent segments, Add will do so. If the new
+ // segment would overlap an existing segment, Add returns false. If Add
+ // succeeds, all existing iterators are invalidated.
+ //
+ // Add panics if r is empty or not well-formed (r.Start >= r.End). The bounds
+ // are compared directly because Length() is unsigned and wraps to a large
+ // value for an inverted range instead of going negative.
+ func (s *idMapSet) Add(r idMapRange, val uint32) bool {
+ 	if r.Start >= r.End {
+ 		panic(fmt.Sprintf("invalid segment range %v", r))
+ 	}
+ 	// r fits only if r.Start lies in a gap and that gap reaches r.End.
+ 	gap := s.FindGap(r.Start)
+ 	if !gap.Ok() {
+ 		return false
+ 	}
+ 	if r.End > gap.End() {
+ 		return false
+ 	}
+ 	s.Insert(gap, r, val)
+ 	return true
+ }
+
+ // AddWithoutMerging inserts the given segment into the set and returns true.
+ // If it would overlap an existing segment, AddWithoutMerging does nothing and
+ // returns false. If AddWithoutMerging succeeds, all existing iterators are
+ // invalidated.
+ //
+ // AddWithoutMerging panics if r is empty or not well-formed (r.Start >=
+ // r.End). The bounds are compared directly because Length() is unsigned and
+ // wraps to a large value for an inverted range instead of going negative.
+ func (s *idMapSet) AddWithoutMerging(r idMapRange, val uint32) bool {
+ 	if r.Start >= r.End {
+ 		panic(fmt.Sprintf("invalid segment range %v", r))
+ 	}
+ 	// r fits only if r.Start lies in a gap and that gap reaches r.End.
+ 	gap := s.FindGap(r.Start)
+ 	if !gap.Ok() {
+ 		return false
+ 	}
+ 	if r.End > gap.End() {
+ 		return false
+ 	}
+ 	s.InsertWithoutMergingUnchecked(gap, r, val)
+ 	return true
+ }
+
+ // Insert inserts the given segment into the given gap. If the new segment can
+ // be merged with adjacent segments, Insert will do so. Insert returns an
+ // iterator to the segment containing the inserted value (which may have been
+ // merged with other values). All existing iterators (including gap, but not
+ // including the returned iterator) are invalidated.
+ //
+ // If the gap cannot accommodate the segment, or if r is invalid (empty or
+ // with r.Start > r.End), Insert panics. The bounds are compared directly
+ // because Length() is unsigned and wraps to a large value for an inverted
+ // range instead of going negative.
+ //
+ // Insert is semantically equivalent to a InsertWithoutMerging followed by a
+ // Merge, but may be more efficient. Note that there is no unchecked variant of
+ // Insert since Insert must retrieve and inspect gap's predecessor and
+ // successor segments regardless.
+ func (s *idMapSet) Insert(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator {
+ 	if r.Start >= r.End {
+ 		panic(fmt.Sprintf("invalid segment range %v", r))
+ 	}
+ 	prev, next := gap.PrevSegment(), gap.NextSegment()
+ 	if prev.Ok() && prev.End() > r.Start {
+ 		panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range()))
+ 	}
+ 	if next.Ok() && next.Start() < r.End {
+ 		panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range()))
+ 	}
+ 	// Case 1: r abuts the predecessor and can be folded into it.
+ 	if prev.Ok() && prev.End() == r.Start {
+ 		if mval, ok := (idMapFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+ 			prev.SetEndUnchecked(r.End)
+ 			prev.SetValue(mval)
+ 			// The grown predecessor may now also abut the successor; if
+ 			// they merge too, the successor is absorbed and removed.
+ 			if next.Ok() && next.Start() == r.End {
+ 				val = mval
+ 				if mval, ok := (idMapFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
+ 					prev.SetEndUnchecked(next.End())
+ 					prev.SetValue(mval)
+ 					return s.Remove(next).PrevSegment()
+ 				}
+ 			}
+ 			return prev
+ 		}
+ 	}
+ 	// Case 2: r abuts only the successor and can be folded into it.
+ 	if next.Ok() && next.Start() == r.End {
+ 		if mval, ok := (idMapFunctions{}).Merge(r, val, next.Range(), next.Value()); ok {
+ 			next.SetStartUnchecked(r.Start)
+ 			next.SetValue(mval)
+ 			return next
+ 		}
+ 	}
+ 	// Case 3: no merge is possible; insert r as its own segment.
+ 	return s.InsertWithoutMergingUnchecked(gap, r, val)
+ }
+
+ // InsertWithoutMerging inserts the given segment into the given gap and
+ // returns an iterator to the inserted segment. All existing iterators
+ // (including gap, but not including the returned iterator) are invalidated.
+ //
+ // If the gap cannot accommodate the segment, or if r is invalid (empty or
+ // with r.Start > r.End), InsertWithoutMerging panics. The bounds are compared
+ // directly because Length() is unsigned and wraps to a large value for an
+ // inverted range instead of going negative.
+ func (s *idMapSet) InsertWithoutMerging(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator {
+ 	if r.Start >= r.End {
+ 		panic(fmt.Sprintf("invalid segment range %v", r))
+ 	}
+ 	if gr := gap.Range(); !gr.IsSupersetOf(r) {
+ 		panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr))
+ 	}
+ 	return s.InsertWithoutMergingUnchecked(gap, r, val)
+ }
+
+ // InsertWithoutMergingUnchecked places the segment (r, val) into gap without
+ // validating it and without attempting any merge, and returns an iterator to
+ // the new segment. All existing iterators (including gap, but not including
+ // the returned iterator) are invalidated.
+ //
+ // Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+ func (s *idMapSet) InsertWithoutMergingUnchecked(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator {
+ 	// Make room first; rebalancing may relocate the gap to another node.
+ 	gap = gap.node.rebalanceBeforeInsert(gap)
+ 	n, at := gap.node, gap.index
+ 	count := n.nrSegments
+ 	// Shift the tail one slot to the right, then drop the new segment in.
+ 	copy(n.keys[at+1:], n.keys[at:count])
+ 	copy(n.values[at+1:], n.values[at:count])
+ 	n.keys[at] = r
+ 	n.values[at] = val
+ 	n.nrSegments++
+ 	return idMapIterator{node: n, index: at}
+ }
+
+ // Remove deletes the segment at seg and returns an iterator to the gap left
+ // behind. All existing iterators (including seg, but not including the
+ // returned iterator) are invalidated.
+ func (s *idMapSet) Remove(seg idMapIterator) idMapGapIterator {
+ 	n, at := seg.node, seg.index
+ 	if !n.hasChildren {
+ 		// Leaf: close the hole by shifting the tail left, clear the
+ 		// vacated last slot, then repair any deficiency.
+ 		count := n.nrSegments
+ 		copy(n.keys[at:], n.keys[at+1:count])
+ 		copy(n.values[at:], n.values[at+1:count])
+ 		idMapFunctions{}.ClearValue(&n.values[count-1])
+ 		n.nrSegments--
+ 		return n.rebalanceAfterRemove(idMapGapIterator{node: n, index: at})
+ 	}
+ 	// Internal node: overwrite seg with its predecessor segment and remove
+ 	// that predecessor instead. The vacated gap is the one following the
+ 	// gap returned by that removal.
+ 	victim := seg.PrevSegment()
+ 	seg.SetRangeUnchecked(victim.Range())
+ 	seg.SetValue(victim.Value())
+ 	return s.Remove(victim).NextGap()
+ }
+
+// RemoveAll removes all segments from the set. All existing iterators are
+// invalidated.
+func (s *idMapSet) RemoveAll() {
+ s.root = idMapnode{}
+}
+
+ // RemoveRange deletes every part of every segment that lies inside r and
+ // returns an iterator to the resulting gap. All existing iterators are
+ // invalidated.
+ func (s *idMapSet) RemoveRange(r idMapRange) idMapGapIterator {
+ 	seg, gap := s.Find(r.Start)
+ 	if seg.Ok() {
+ 		// r.Start is inside a segment: trim it to r and remove the part
+ 		// that falls within r.
+ 		gap = s.Remove(s.Isolate(seg, r))
+ 	}
+ 	for {
+ 		seg = gap.NextSegment()
+ 		if !seg.Ok() || seg.Start() >= r.End {
+ 			break
+ 		}
+ 		gap = s.Remove(s.Isolate(seg, r))
+ 	}
+ 	return gap
+ }
+
+ // Merge tries to fuse two neighboring segments. On success it returns an
+ // iterator to the merged segment and invalidates all existing iterators; on
+ // failure it returns a terminal iterator.
+ //
+ // Merge panics unless second is the immediate successor of first.
+ func (s *idMapSet) Merge(first, second idMapIterator) idMapIterator {
+ 	if successor := first.NextSegment(); successor != second {
+ 		panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range()))
+ 	}
+ 	return s.MergeUnchecked(first, second)
+ }
+
+ // MergeUnchecked tries to fuse two neighboring segments without verifying
+ // that they are neighbors. On success it returns an iterator to the merged
+ // segment and invalidates all existing iterators; otherwise it returns a
+ // terminal iterator.
+ //
+ // Precondition: first is the predecessor of second: first.NextSegment() ==
+ // second, first == second.PrevSegment().
+ func (s *idMapSet) MergeUnchecked(first, second idMapIterator) idMapIterator {
+ 	// Segments separated by a non-empty gap can never merge.
+ 	if first.End() != second.Start() {
+ 		return idMapIterator{}
+ 	}
+ 	merged, ok := (idMapFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value())
+ 	if !ok {
+ 		return idMapIterator{}
+ 	}
+ 	// Grow first over second's range, then drop second.
+ 	first.SetEndUnchecked(second.End())
+ 	first.SetValue(merged)
+ 	return s.Remove(second).PrevSegment()
+ }
+
+ // MergeAll walks the set from the first segment and tries to fuse every
+ // pair of adjacent segments. All existing iterators are invalidated.
+ func (s *idMapSet) MergeAll() {
+ 	cur := s.FirstSegment()
+ 	if !cur.Ok() {
+ 		return
+ 	}
+ 	for next := cur.NextSegment(); next.Ok(); next = cur.NextSegment() {
+ 		// After a successful merge, continue from the merged segment so
+ 		// it can absorb further neighbors; otherwise step forward.
+ 		if merged := s.MergeUnchecked(cur, next); merged.Ok() {
+ 			cur = merged
+ 		} else {
+ 			cur = next
+ 		}
+ 	}
+ }
+
+ // MergeRange tries to fuse adjacent segments, starting at the first segment
+ // reaching r.Start and stopping once the next segment starts at or beyond
+ // r.End. All existing iterators are invalidated.
+ func (s *idMapSet) MergeRange(r idMapRange) {
+ 	cur := s.LowerBoundSegment(r.Start)
+ 	if !cur.Ok() {
+ 		return
+ 	}
+ 	for next := cur.NextSegment(); next.Ok() && next.Range().Start < r.End; next = cur.NextSegment() {
+ 		// After a successful merge, continue from the merged segment so
+ 		// it can absorb further neighbors; otherwise step forward.
+ 		if merged := s.MergeUnchecked(cur, next); merged.Ok() {
+ 			cur = merged
+ 		} else {
+ 			cur = next
+ 		}
+ 	}
+ }
+
+ // MergeAdjacent tries to fuse the segment containing r.Start with the
+ // segment before it, and the segment containing r.End-1 with the segment
+ // after it. Segments that do not exist are skipped.
+ func (s *idMapSet) MergeAdjacent(r idMapRange) {
+ 	if first := s.FindSegment(r.Start); first.Ok() {
+ 		if prev := first.PrevSegment(); prev.Ok() {
+ 			s.Merge(prev, first)
+ 		}
+ 	}
+ 	// Look the last segment up only now: the merge above may have
+ 	// invalidated earlier iterators.
+ 	if last := s.FindSegment(r.End - 1); last.Ok() {
+ 		if next := last.NextSegment(); next.Ok() {
+ 			s.Merge(last, next)
+ 		}
+ 	}
+ }
+
+ // Split cuts seg in two at key split and returns iterators to the lower and
+ // upper halves. All existing iterators (including seg, but not including the
+ // returned iterators) are invalidated.
+ //
+ // Split panics if split is not strictly inside seg's range, since one of the
+ // halves would then be empty or split would not belong to seg at all.
+ func (s *idMapSet) Split(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) {
+ 	if splittable := seg.Range().CanSplitAt(split); !splittable {
+ 		panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split))
+ 	}
+ 	return s.SplitUnchecked(seg, split)
+ }
+
+ // SplitUnchecked cuts seg in two at key split without validating split, and
+ // returns iterators to the lower and upper halves. All existing iterators
+ // (including seg, but not including the returned iterators) are invalidated.
+ //
+ // Preconditions: seg.Start() < key < seg.End().
+ func (s *idMapSet) SplitUnchecked(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) {
+ 	lowerVal, upperVal := (idMapFunctions{}).Split(seg.Range(), seg.Value(), split)
+ 	// Remember the old end before seg is shortened to the lower half.
+ 	upper := idMapRange{Start: split, End: seg.End()}
+ 	seg.SetEndUnchecked(split)
+ 	seg.SetValue(lowerVal)
+ 	upperSeg := s.InsertWithoutMergingUnchecked(seg.NextGap(), upper, upperVal)
+ 	// The insertion may have moved the lower half; find it again from the
+ 	// upper half.
+ 	return upperSeg.PrevSegment(), upperSeg
+ }
+
+ // SplitAt cuts the segment that straddles split, if there is one, and
+ // reports whether a cut was made. A successful cut invalidates all existing
+ // iterators.
+ func (s *idMapSet) SplitAt(split uint32) bool {
+ 	seg := s.FindSegment(split)
+ 	if !seg.Ok() || !seg.Range().CanSplitAt(split) {
+ 		return false
+ 	}
+ 	s.SplitUnchecked(seg, split)
+ 	return true
+ }
+
+ // Isolate trims seg so that it lies entirely within r, splitting it at
+ // r.Start and/or r.End where those keys fall strictly inside seg. It returns
+ // an iterator to the part of seg inside r. All existing iterators (including
+ // seg, but not including the returned iterator) are invalidated.
+ func (s *idMapSet) Isolate(seg idMapIterator, r idMapRange) idMapIterator {
+ 	if lo := r.Start; seg.Range().CanSplitAt(lo) {
+ 		// Keep the upper half, which begins at r.Start.
+ 		_, seg = s.SplitUnchecked(seg, lo)
+ 	}
+ 	if hi := r.End; seg.Range().CanSplitAt(hi) {
+ 		// Keep the lower half, which ends at r.End.
+ 		seg, _ = s.SplitUnchecked(seg, hi)
+ 	}
+ 	return seg
+ }
+
+ // ApplyContiguous calls fn on each segment of the unbroken run of segments
+ // that begins at r.Start, isolating each segment to r first. It stops at the
+ // first non-empty gap and returns that gap. If the run covers r up to r.End,
+ // or the segments run out, it returns a terminal gap iterator. All existing
+ // iterators are invalidated.
+ //
+ // N.B. The Iterator must not be invalidated by the function.
+ func (s *idMapSet) ApplyContiguous(r idMapRange, fn func(seg idMapIterator)) idMapGapIterator {
+ 	seg, gap := s.Find(r.Start)
+ 	if !seg.Ok() {
+ 		// r.Start is itself in a gap; nothing to apply.
+ 		return gap
+ 	}
+ 	for seg.Ok() {
+ 		seg = s.Isolate(seg, r)
+ 		fn(seg)
+ 		if seg.End() >= r.End {
+ 			break
+ 		}
+ 		gap = seg.NextGap()
+ 		if !gap.IsEmpty() {
+ 			return gap
+ 		}
+ 		seg = gap.NextSegment()
+ 	}
+ 	return idMapGapIterator{}
+ }
+
+ // An idMapnode is a single node of the B-tree that backs an idMapSet.
+ //
+ // +stateify savable
+ type idMapnode struct {
+ // An internal binary tree node looks like:
+ //
+ //       K
+ //      / \
+ //    Cl   Cr
+ //
+ // where all keys in the subtree rooted by Cl (the left subtree) are less
+ // than K (the key of the parent node), and all keys in the subtree rooted
+ // by Cr (the right subtree) are greater than K.
+ //
+ // An internal B-tree node's indexes work out to look like:
+ //
+ //   K0  K1  K2  ...   Kn-1
+ //  /  \/  \/  \ ...  /    \
+ // C0  C1  C2  C3 ... Cn-1  Cn
+ //
+ // where n is nrSegments.
+ nrSegments int
+
+ // parent is a pointer to this node's parent. If this node is root, parent
+ // is nil.
+ parent *idMapnode
+
+ // parentIndex is the index of this node in parent.children.
+ parentIndex int
+
+ // Flag for internal nodes that is technically redundant with "children[0]
+ // != nil", but is stored in the first cache line. "hasChildren" rather
+ // than "isLeaf" because false must be the correct value for an empty root.
+ hasChildren bool
+
+ // Nodes store keys and values in separate arrays to maximize locality in
+ // the common case (scanning keys for lookup).
+ keys [idMapmaxDegree - 1]idMapRange
+ values [idMapmaxDegree - 1]uint32
+ children [idMapmaxDegree]*idMapnode
+ }
+
+ // firstSegment returns an iterator to the lowest segment in the subtree
+ // rooted at n, reached by descending through the leftmost children.
+ //
+ // Preconditions: n.nrSegments != 0.
+ func (n *idMapnode) firstSegment() idMapIterator {
+ 	leaf := n
+ 	for leaf.hasChildren {
+ 		leaf = leaf.children[0]
+ 	}
+ 	return idMapIterator{node: leaf, index: 0}
+ }
+
+ // lastSegment returns an iterator to the highest segment in the subtree
+ // rooted at n, reached by descending through the rightmost children.
+ //
+ // Preconditions: n.nrSegments != 0.
+ func (n *idMapnode) lastSegment() idMapIterator {
+ 	leaf := n
+ 	for leaf.hasChildren {
+ 		leaf = leaf.children[leaf.nrSegments]
+ 	}
+ 	return idMapIterator{node: leaf, index: leaf.nrSegments - 1}
+ }
+
+ // prevSibling returns the child immediately to the left of n under the same
+ // parent, or nil if n is the root or its parent's first child.
+ func (n *idMapnode) prevSibling() *idMapnode {
+ 	p := n.parent
+ 	if p == nil || n.parentIndex == 0 {
+ 		return nil
+ 	}
+ 	return p.children[n.parentIndex-1]
+ }
+
+ // nextSibling returns the child immediately to the right of n under the same
+ // parent, or nil if n is the root or its parent's last child.
+ func (n *idMapnode) nextSibling() *idMapnode {
+ 	p := n.parent
+ 	if p == nil || n.parentIndex == p.nrSegments {
+ 		return nil
+ 	}
+ 	return p.children[n.parentIndex+1]
+ }
+
+ // rebalanceBeforeInsert splits n and its ancestors if they are full, as
+ // required for insertion, and returns an updated iterator to the position
+ // represented by gap.
+ //
+ // Ancestors are handled before n itself, so by the time n is split its
+ // parent has a free slot for the segment that n pushes up.
+ func (n *idMapnode) rebalanceBeforeInsert(gap idMapGapIterator) idMapGapIterator {
+ if n.parent != nil {
+ gap = n.parent.rebalanceBeforeInsert(gap)
+ }
+ // A node holding fewer than idMapmaxDegree-1 segments has room; no split.
+ if n.nrSegments < idMapmaxDegree-1 {
+ return gap
+ }
+ if n.parent == nil {
+ // n is a full root. n stays the root: its lower and upper segments move
+ // into two newly allocated children, and only the middle segment
+ // remains in n.
+ left := &idMapnode{
+ nrSegments: idMapminDegree - 1,
+ parent: n,
+ parentIndex: 0,
+ hasChildren: n.hasChildren,
+ }
+ right := &idMapnode{
+ nrSegments: idMapminDegree - 1,
+ parent: n,
+ parentIndex: 1,
+ hasChildren: n.hasChildren,
+ }
+ copy(left.keys[:idMapminDegree-1], n.keys[:idMapminDegree-1])
+ copy(left.values[:idMapminDegree-1], n.values[:idMapminDegree-1])
+ copy(right.keys[:idMapminDegree-1], n.keys[idMapminDegree:])
+ copy(right.values[:idMapminDegree-1], n.values[idMapminDegree:])
+ // The middle segment (index idMapminDegree-1) becomes n's only segment.
+ n.keys[0], n.values[0] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1]
+ idMapzeroValueSlice(n.values[1:])
+ if n.hasChildren {
+ // Hand n's children to left and right and re-point them at their
+ // new parents.
+ copy(left.children[:idMapminDegree], n.children[:idMapminDegree])
+ copy(right.children[:idMapminDegree], n.children[idMapminDegree:])
+ idMapzeroNodeSlice(n.children[2:])
+ for i := 0; i < idMapminDegree; i++ {
+ left.children[i].parent = left
+ left.children[i].parentIndex = i
+ right.children[i].parent = right
+ right.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments = 1
+ n.hasChildren = true
+ n.children[0] = left
+ n.children[1] = right
+ // Only a gap that pointed into n needs translating: indexes below
+ // idMapminDegree now live in left, the rest in right.
+ if gap.node != n {
+ return gap
+ }
+ if gap.index < idMapminDegree {
+ return idMapGapIterator{left, gap.index}
+ }
+ return idMapGapIterator{right, gap.index - idMapminDegree}
+ }
+
+ // n is a full non-root node. Push its middle segment up into the parent at
+ // n.parentIndex, shifting the parent's later segments and children right.
+ copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
+ copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
+ n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1]
+ copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
+ for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
+ n.parent.children[i].parentIndex = i
+ }
+ // The new right sibling takes the segments above the middle one.
+ sibling := &idMapnode{
+ nrSegments: idMapminDegree - 1,
+ parent: n.parent,
+ parentIndex: n.parentIndex + 1,
+ hasChildren: n.hasChildren,
+ }
+ n.parent.children[n.parentIndex+1] = sibling
+ n.parent.nrSegments++
+ copy(sibling.keys[:idMapminDegree-1], n.keys[idMapminDegree:])
+ copy(sibling.values[:idMapminDegree-1], n.values[idMapminDegree:])
+ idMapzeroValueSlice(n.values[idMapminDegree-1:])
+ if n.hasChildren {
+ copy(sibling.children[:idMapminDegree], n.children[idMapminDegree:])
+ idMapzeroNodeSlice(n.children[idMapminDegree:])
+ for i := 0; i < idMapminDegree; i++ {
+ sibling.children[i].parent = sibling
+ sibling.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments = idMapminDegree - 1
+
+ // Only a gap that pointed into n needs translating: indexes below
+ // idMapminDegree stay in n, the rest now live in sibling.
+ if gap.node != n {
+ return gap
+ }
+ if gap.index < idMapminDegree {
+ return gap
+ }
+ return idMapGapIterator{sibling, gap.index - idMapminDegree}
+ }
+
+ // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
+ // (contain fewer segments than required by B-tree invariants), as required for
+ // removal, and returns an updated iterator to the position represented by gap.
+ //
+ // Each loop iteration repairs one node. It either finishes (the node is not
+ // deficient, is the root, or borrows a segment from a sibling) or merges the
+ // node with a sibling and moves up to the parent, which has lost a segment.
+ //
+ // Precondition: n is the only node in the tree that may currently violate a
+ // B-tree invariant.
+ func (n *idMapnode) rebalanceAfterRemove(gap idMapGapIterator) idMapGapIterator {
+ for {
+ if n.nrSegments >= idMapminDegree-1 {
+ return gap
+ }
+ if n.parent == nil {
+ // n is the root, which is allowed to hold fewer segments.
+ return gap
+ }
+
+ // Borrow from the previous sibling if it can spare a segment: the
+ // separator in the parent moves down to the front of n, and the
+ // sibling's last segment moves up to take the separator's place.
+ if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree {
+ copy(n.keys[1:], n.keys[:n.nrSegments])
+ copy(n.values[1:], n.values[:n.nrSegments])
+ n.keys[0] = n.parent.keys[n.parentIndex-1]
+ n.values[0] = n.parent.values[n.parentIndex-1]
+ n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
+ n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
+ idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+ if n.hasChildren {
+ // The sibling's last child becomes n's first child.
+ copy(n.children[1:], n.children[:n.nrSegments+1])
+ n.children[0] = sibling.children[sibling.nrSegments]
+ sibling.children[sibling.nrSegments] = nil
+ n.children[0].parent = n
+ n.children[0].parentIndex = 0
+ for i := 1; i < n.nrSegments+2; i++ {
+ n.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments++
+ sibling.nrSegments--
+ // Translate gap if it referred to a position affected by the move.
+ if gap.node == sibling && gap.index == sibling.nrSegments {
+ return idMapGapIterator{n, 0}
+ }
+ if gap.node == n {
+ return idMapGapIterator{n, gap.index + 1}
+ }
+ return gap
+ }
+ // Otherwise borrow from the next sibling, mirroring the case above: the
+ // separator moves down to the end of n, and the sibling's first segment
+ // moves up to take the separator's place.
+ if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree {
+ n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
+ n.values[n.nrSegments] = n.parent.values[n.parentIndex]
+ n.parent.keys[n.parentIndex] = sibling.keys[0]
+ n.parent.values[n.parentIndex] = sibling.values[0]
+ copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
+ copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
+ idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+ if n.hasChildren {
+ // The sibling's first child becomes n's last child.
+ n.children[n.nrSegments+1] = sibling.children[0]
+ copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
+ sibling.children[sibling.nrSegments] = nil
+ n.children[n.nrSegments+1].parent = n
+ n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
+ for i := 0; i < sibling.nrSegments; i++ {
+ sibling.children[i].parentIndex = i
+ }
+ }
+ n.nrSegments++
+ sibling.nrSegments--
+ // Translate gap if it referred to a position in the sibling.
+ if gap.node == sibling {
+ if gap.index == 0 {
+ return idMapGapIterator{n, n.nrSegments}
+ }
+ return idMapGapIterator{sibling, gap.index - 1}
+ }
+ return gap
+ }
+
+ // Neither sibling can lend a segment, so n must be merged with one.
+ p := n.parent
+ if p.nrSegments == 1 {
+ // p has one segment and therefore two children. Non-root nodes hold
+ // at least idMapminDegree-1 segments, so p is presumably the root.
+ // Pull both children and the separator back into p itself, which
+ // reverses the root split in rebalanceBeforeInsert.
+ left, right := p.children[0], p.children[1]
+ p.nrSegments = left.nrSegments + right.nrSegments + 1
+ p.hasChildren = left.hasChildren
+ p.keys[left.nrSegments] = p.keys[0]
+ p.values[left.nrSegments] = p.values[0]
+ copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
+ copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
+ copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+ copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
+ if left.hasChildren {
+ copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
+ copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+ for i := 0; i < p.nrSegments+1; i++ {
+ p.children[i].parent = p
+ p.children[i].parentIndex = i
+ }
+ } else {
+ p.children[0] = nil
+ p.children[1] = nil
+ }
+ // Positions in left keep their index in p; positions in right are
+ // offset by left's segments plus the separator.
+ if gap.node == left {
+ return idMapGapIterator{p, gap.index}
+ }
+ if gap.node == right {
+ return idMapGapIterator{p, gap.index + left.nrSegments + 1}
+ }
+ return gap
+ }
+ // Merge n and either sibling, along with the segment separating the
+ // two, into whichever of the two nodes comes first. This is the
+ // reverse of the non-root splitting case in
+ // node.rebalanceBeforeInsert.
+ var left, right *idMapnode
+ if n.parentIndex > 0 {
+ left = n.prevSibling()
+ right = n
+ } else {
+ left = n
+ right = n.nextSibling()
+ }
+
+ // Translate gap before left.nrSegments changes: positions in right land
+ // after left's segments and the separator.
+ if gap.node == right {
+ gap = idMapGapIterator{left, gap.index + left.nrSegments + 1}
+ }
+ left.keys[left.nrSegments] = p.keys[left.parentIndex]
+ left.values[left.nrSegments] = p.values[left.parentIndex]
+ copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+ copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
+ if left.hasChildren {
+ copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+ for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
+ left.children[i].parent = left
+ left.children[i].parentIndex = i
+ }
+ }
+ left.nrSegments += right.nrSegments + 1
+ // Remove the separator and the pointer to right from p.
+ copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
+ copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
+ idMapFunctions{}.ClearValue(&p.values[p.nrSegments-1])
+ copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
+ for i := 0; i < p.nrSegments; i++ {
+ p.children[i].parentIndex = i
+ }
+ p.children[p.nrSegments] = nil
+ p.nrSegments--
+
+ // p lost a segment and may now be deficient itself; repair it next.
+ n = p
+ }
+ }
+
+ // An idMapIterator is conceptually one of:
+ //
+ // - A pointer to a segment in a set; or
+ //
+ // - A terminal iterator, which is a sentinel indicating that the end of
+ // iteration has been reached.
+ //
+ // Iterators are copyable values and are meaningfully equality-comparable. The
+ // zero value of idMapIterator is a terminal iterator.
+ //
+ // Unless otherwise specified, any mutation of a set invalidates all existing
+ // iterators into the set.
+ type idMapIterator struct {
+ // node is the node containing the iterated segment. If the iterator is
+ // terminal, node is nil.
+ node *idMapnode
+
+ // index is the index of the segment in node.keys/values.
+ index int
+ }
+
+ // Ok reports whether seg refers to a segment rather than being terminal. No
+ // other method may be called on a terminal iterator.
+ func (seg idMapIterator) Ok() bool {
+ 	terminal := seg.node == nil
+ 	return !terminal
+ }
+
+// Range returns a copy of the range key of the segment seg points at.
+func (seg idMapIterator) Range() idMapRange {
+	n, i := seg.node, seg.index
+	return n.keys[i]
+}
+
+// Start returns the same value as Range().Start. Prefer it when only the
+// start of the range is needed.
+func (seg idMapIterator) Start() uint32 {
+	key := &seg.node.keys[seg.index]
+	return key.Start
+}
+
+// End returns the same value as Range().End. Prefer it when only the end of
+// the range is needed.
+func (seg idMapIterator) End() uint32 {
+	key := &seg.node.keys[seg.index]
+	return key.End
+}
+
+// SetRangeUnchecked overwrites the iterated segment's range key with r without
+// validating it. This operation does not invalidate any iterators.
+//
+// Preconditions:
+//
+// - r.Length() > 0.
+//
+// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
+// then r.End <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
+// r.Start >= seg.PrevSegment().End().
+func (seg idMapIterator) SetRangeUnchecked(r idMapRange) {
+	n, i := seg.node, seg.index
+	n.keys[i] = r
+}
+
+// SetRange mutates the iterated segment's range key. If the new range would
+// cause the iterated segment to overlap another segment, or if the new range
+// is invalid (empty or inverted, i.e. r.Start >= r.End), SetRange panics. This
+// operation does not invalidate any iterators.
+func (seg idMapIterator) SetRange(r idMapRange) {
+	// Compare the bounds directly instead of testing r.Length() <= 0. The
+	// bounds are unsigned, so if Length is computed as End - Start (not visible
+	// here; presumably it is), an inverted range wraps around to a large
+	// positive length and would slip past a length-based check.
+	if r.Start >= r.End {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	// The new range may touch, but must not reach into, either neighbor.
+	if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range()))
+	}
+	if next := seg.NextSegment(); next.Ok() && r.End > next.Start() {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range()))
+	}
+	seg.SetRangeUnchecked(r)
+}
+
+// SetStartUnchecked overwrites the start of the iterated segment's range
+// without validating it. This operation does not invalidate any iterators.
+//
+// Preconditions: The new start must be valid: start < seg.End(); if
+// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
+func (seg idMapIterator) SetStartUnchecked(start uint32) {
+	key := &seg.node.keys[seg.index]
+	key.Start = start
+}
+
+// SetStart moves the start of the iterated segment to start. It panics if the
+// resulting range would be invalid (start at or past the segment's end) or
+// would overlap the preceding segment. This operation does not invalidate any
+// iterators.
+func (seg idMapIterator) SetStart(start uint32) {
+	if !(start < seg.End()) {
+		panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range()))
+	}
+	prev := seg.PrevSegment()
+	if prev.Ok() && prev.End() > start {
+		panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range()))
+	}
+	seg.SetStartUnchecked(start)
+}
+
+// SetEndUnchecked overwrites the end of the iterated segment's range without
+// validating it. This operation does not invalidate any iterators.
+//
+// Preconditions: The new end must be valid: end > seg.Start(); if
+// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
+func (seg idMapIterator) SetEndUnchecked(end uint32) {
+	key := &seg.node.keys[seg.index]
+	key.End = end
+}
+
+// SetEnd moves the end of the iterated segment to end. It panics if the
+// resulting range would be invalid (end at or before the segment's start) or
+// would overlap the following segment. This operation does not invalidate any
+// iterators.
+func (seg idMapIterator) SetEnd(end uint32) {
+	if !(end > seg.Start()) {
+		panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range()))
+	}
+	next := seg.NextSegment()
+	if next.Ok() && next.Start() < end {
+		panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range()))
+	}
+	seg.SetEndUnchecked(end)
+}
+
+// Value returns the value stored for the iterated segment, by copy.
+func (seg idMapIterator) Value() uint32 {
+	n, i := seg.node, seg.index
+	return n.values[i]
+}
+
+// ValuePtr returns a pointer into the node's value storage for the iterated
+// segment. The pointer is invalidated if the iterator is invalidated. This
+// operation does not invalidate any iterators.
+func (seg idMapIterator) ValuePtr() *uint32 {
+	n := seg.node
+	return &n.values[seg.index]
+}
+
+// SetValue replaces the value stored for the iterated segment with val. This
+// operation does not invalidate any iterators.
+func (seg idMapIterator) SetValue(val uint32) {
+	n, i := seg.node, seg.index
+	n.values[i] = val
+}
+
+// PrevSegment returns an iterator to the segment preceding the iterated one,
+// or a terminal iterator if the iterated segment is the first in the set.
+func (seg idMapIterator) PrevSegment() idMapIterator {
+	n := seg.node
+	switch {
+	case n.hasChildren:
+		// The predecessor is the last segment of the child subtree to the left.
+		return n.children[seg.index].lastSegment()
+	case seg.index > 0:
+		// Leaf node with an earlier segment in the same node.
+		return idMapIterator{n, seg.index - 1}
+	case n.parent == nil:
+		// First segment of a root leaf: nothing precedes it.
+		return idMapIterator{}
+	}
+	// First segment of a non-root leaf: continue the search in the ancestors.
+	return idMapsegmentBeforePosition(n.parent, n.parentIndex)
+}
+
+// NextSegment returns an iterator to the segment following the iterated one,
+// or a terminal iterator if the iterated segment is the last in the set.
+func (seg idMapIterator) NextSegment() idMapIterator {
+	n := seg.node
+	switch {
+	case n.hasChildren:
+		// The successor is the first segment of the child subtree to the right.
+		return n.children[seg.index+1].firstSegment()
+	case seg.index < n.nrSegments-1:
+		// Leaf node with a later segment in the same node.
+		return idMapIterator{n, seg.index + 1}
+	case n.parent == nil:
+		// Last segment of a root leaf: nothing follows it.
+		return idMapIterator{}
+	}
+	// Last segment of a non-root leaf: continue the search in the ancestors.
+	return idMapsegmentAfterPosition(n.parent, n.parentIndex)
+}
+
+// PrevGap returns an iterator to the gap directly preceding the iterated
+// segment.
+func (seg idMapIterator) PrevGap() idMapGapIterator {
+	if !seg.node.hasChildren {
+		// In a node without children the gap shares the segment's index.
+		return idMapGapIterator{seg.node, seg.index}
+	}
+	// Otherwise the gap follows the last segment of the left child subtree.
+	return seg.node.children[seg.index].lastSegment().NextGap()
+}
+
+// NextGap returns an iterator to the gap directly following the iterated
+// segment.
+func (seg idMapIterator) NextGap() idMapGapIterator {
+	if !seg.node.hasChildren {
+		// In a node without children the gap sits one position after the segment.
+		return idMapGapIterator{seg.node, seg.index + 1}
+	}
+	// Otherwise the gap precedes the first segment of the right child subtree.
+	return seg.node.children[seg.index+1].firstSegment().PrevGap()
+}
+
+// PrevNonEmpty returns the gap before the iterated segment if that gap has
+// non-zero length; otherwise it returns the segment before that gap. If
+// seg.Start() == Functions.MinKey(), PrevNonEmpty will return two terminal
+// iterators. Otherwise, exactly one of the iterators returned by PrevNonEmpty
+// will be non-terminal.
+func (seg idMapIterator) PrevNonEmpty() (idMapIterator, idMapGapIterator) {
+	gap := seg.PrevGap()
+	if gap.Range().Length() == 0 {
+		// Zero-length gap: report the segment on its other side (if any).
+		return gap.PrevSegment(), idMapGapIterator{}
+	}
+	return idMapIterator{}, gap
+}
+
+// NextNonEmpty returns the gap after the iterated segment if that gap has
+// non-zero length; otherwise it returns the segment after that gap. If
+// seg.End() == Functions.MaxKey(), NextNonEmpty will return two terminal
+// iterators. Otherwise, exactly one of the iterators returned by NextNonEmpty
+// will be non-terminal.
+func (seg idMapIterator) NextNonEmpty() (idMapIterator, idMapGapIterator) {
+	gap := seg.NextGap()
+	if gap.Range().Length() == 0 {
+		// Zero-length gap: report the segment on its other side (if any).
+		return gap.NextSegment(), idMapGapIterator{}
+	}
+	return idMapIterator{}, gap
+}
+
+// A GapIterator is conceptually one of:
+//
+// - A pointer to a position between two segments, before the first segment, or
+// after the last segment in a set, called a *gap*; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Note that the gap between two adjacent segments exists (iterators to it are
+// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true
+// for such gaps. An empty set contains a single gap, spanning the entire range
+// of the set's keys.
+//
+// GapIterators are copyable values and are meaningfully equality-comparable.
+// The zero value of GapIterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type idMapGapIterator struct {
+ // The representation of a GapIterator is identical to that of an Iterator,
+ // except that index corresponds to positions between segments in the same
+ // way as for node.children (see comment for node.nrSegments).
+ //
+ // node is nil if and only if the iterator is terminal (see Ok).
+ node *idMapnode
+ index int
+}
+
+// Ok reports whether gap refers to a gap rather than being the terminal
+// iterator. Every other method may only be called when Ok returns true.
+func (gap idMapGapIterator) Ok() bool {
+	terminal := gap.node == nil
+	return !terminal
+}
+
+// Range returns the key range covered by the iterated gap, i.e.
+// [gap.Start(), gap.End()).
+func (gap idMapGapIterator) Range() idMapRange {
+	return idMapRange{
+		Start: gap.Start(),
+		End:   gap.End(),
+	}
+}
+
+// Start returns the same value as Range().Start. Prefer it when only the
+// start of the range is needed.
+func (gap idMapGapIterator) Start() uint32 {
+	prev := gap.PrevSegment()
+	if !prev.Ok() {
+		// No segment precedes the gap, so it begins at the minimum key.
+		return idMapFunctions{}.MinKey()
+	}
+	return prev.End()
+}
+
+// End returns the same value as Range().End. Prefer it when only the end of
+// the range is needed.
+func (gap idMapGapIterator) End() uint32 {
+	next := gap.NextSegment()
+	if !next.Ok() {
+		// No segment follows the gap, so it extends to the maximum key.
+		return idMapFunctions{}.MaxKey()
+	}
+	return next.Start()
+}
+
+// IsEmpty reports whether the iterated gap has zero length (that is, the
+// "gap" is between two adjacent segments.)
+func (gap idMapGapIterator) IsEmpty() bool {
+	r := gap.Range()
+	return r.Length() == 0
+}
+
+// PrevSegment returns an iterator to the segment directly before the iterated
+// gap, or a terminal iterator if there is none.
+func (gap idMapGapIterator) PrevSegment() idMapIterator {
+	n, i := gap.node, gap.index
+	return idMapsegmentBeforePosition(n, i)
+}
+
+// NextSegment returns an iterator to the segment directly after the iterated
+// gap, or a terminal iterator if there is none.
+func (gap idMapGapIterator) NextSegment() idMapIterator {
+	n, i := gap.node, gap.index
+	return idMapsegmentAfterPosition(n, i)
+}
+
+// PrevGap returns an iterator to the gap preceding the iterated one, reached
+// by stepping over the segment in between. If no segment precedes the gap,
+// PrevGap returns a terminal iterator.
+func (gap idMapGapIterator) PrevGap() idMapGapIterator {
+	if seg := gap.PrevSegment(); seg.Ok() {
+		return seg.PrevGap()
+	}
+	return idMapGapIterator{}
+}
+
+// NextGap returns an iterator to the gap following the iterated one, reached
+// by stepping over the segment in between. If no segment follows the gap,
+// NextGap returns a terminal iterator.
+func (gap idMapGapIterator) NextGap() idMapGapIterator {
+	if seg := gap.NextSegment(); seg.Ok() {
+		return seg.NextGap()
+	}
+	return idMapGapIterator{}
+}
+
+// segmentBeforePosition returns the predecessor segment of the position given
+// by n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentBeforePosition returns a terminal iterator.
+func idMapsegmentBeforePosition(n *idMapnode, i int) idMapIterator {
+	// While the position is the leftmost one in its node, climb to the parent
+	// and retry from the position this node occupies there.
+	for ; i == 0; n, i = n.parent, n.parentIndex {
+		if n.parent == nil {
+			// Leftmost position of the root: nothing precedes it.
+			return idMapIterator{}
+		}
+	}
+	return idMapIterator{n, i - 1}
+}
+
+// segmentAfterPosition returns the successor segment of the position given by
+// n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentAfterPosition returns a terminal iterator.
+func idMapsegmentAfterPosition(n *idMapnode, i int) idMapIterator {
+	// While the position is the rightmost one in its node, climb to the parent
+	// and retry from the position this node occupies there.
+	for ; i == n.nrSegments; n, i = n.parent, n.parentIndex {
+		if n.parent == nil {
+			// Rightmost position of the root: nothing follows it.
+			return idMapIterator{}
+		}
+	}
+	return idMapIterator{n, i}
+}
+
+func idMapzeroValueSlice(slice []uint32) {
+
+ for i := range slice {
+ idMapFunctions{}.ClearValue(&slice[i])
+ }
+}
+
+// zeroNodeSlice sets every element of slice to nil.
+func idMapzeroNodeSlice(slice []*idMapnode) {
+	for i := 0; i < len(slice); i++ {
+		slice[i] = nil
+	}
+}
+
+// String returns the debug representation of the set, which is that of its
+// root node.
+func (s *idMapSet) String() string {
+	str := s.root.String()
+	return str
+}
+
+// String stringifes a node (and all of its children) for debugging.
+func (n *idMapnode) String() string {
+ var buf bytes.Buffer
+ n.writeDebugString(&buf, "")
+ return buf.String()
+}
+
+// writeDebugString appends a textual dump of n and, recursively, of all of
+// its children to buf. Every emitted line starts with prefix; lines belonging
+// to the child at index i additionally carry a "- <i> " component so that
+// nesting is visible. Inconsistencies in hasChildren or in a child's parent
+// linkage are reported inline as WARNING lines rather than by panicking.
+func (n *idMapnode) writeDebugString(buf *bytes.Buffer, prefix string) {
+	if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
+		buf.WriteString(prefix)
+		// The two booleans disagree, so the expected value is the negation.
+		fmt.Fprintf(buf, "WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)
+	}
+	for i := 0; i < n.nrSegments; i++ {
+		if child := n.children[i]; child != nil {
+			// Format the child prefix once and reuse it for both the warning
+			// and the recursive dump.
+			cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
+			if child.parent != n || child.parentIndex != i {
+				buf.WriteString(cprefix)
+				fmt.Fprintf(buf, "WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)
+			}
+			child.writeDebugString(buf, cprefix)
+		}
+		buf.WriteString(prefix)
+		fmt.Fprintf(buf, "- % 3d: %v => %v\n", i, n.keys[i], n.values[i])
+	}
+	// The child after the last segment has no segment of its own to print.
+	if child := n.children[n.nrSegments]; child != nil {
+		child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
+	}
+}
+
+// SegmentDataSlices represents segments from a set as slices of start, end, and
+// values. SegmentDataSlices is primarily used as an intermediate representation
+// for save/restore and the layout here is optimized for that.
+//
+// The three slices are parallel: the i-th segment is
+// [Start[i], End[i]) => Values[i].
+//
+// +stateify savable
+type idMapSegmentDataSlices struct {
+ Start []uint32
+ End []uint32
+ Values []uint32
+}
+
+// ExportSortedSlice returns a copy of all segments in the given set, in ascending
+// key order.
+func (s *idMapSet) ExportSortedSlices() *idMapSegmentDataSlices {
+ var sds idMapSegmentDataSlices
+ for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ sds.Start = append(sds.Start, seg.Start())
+ sds.End = append(sds.End, seg.End())
+ sds.Values = append(sds.Values, seg.Value())
+ }
+ sds.Start = sds.Start[:len(sds.Start):len(sds.Start)]
+ sds.End = sds.End[:len(sds.End):len(sds.End)]
+ sds.Values = sds.Values[:len(sds.Values):len(sds.Values)]
+ return &sds
+}
+
+// ImportSortedSlices initializes the given set from the given slices.
+//
+// It returns an error, leaving any segments inserted so far in place, if s is
+// not empty, if sds.End or sds.Values has fewer entries than sds.Start, or if
+// a segment does not fit in the gap following the previously inserted segment
+// (it overlaps a preceding segment or is out of order).
+//
+// Preconditions: s must be empty. sds must represent a valid set (the segments
+// in sds must have valid lengths that do not overlap). The segments in sds
+// must be sorted in ascending key order.
+func (s *idMapSet) ImportSortedSlices(sds *idMapSegmentDataSlices) error {
+	if !s.IsEmpty() {
+		return fmt.Errorf("cannot import into non-empty set %v", s)
+	}
+	// The loop below indexes End and Values by positions of Start; reject
+	// short slices up front instead of panicking with an index out of range
+	// part-way through the import. Surplus End/Values entries are ignored.
+	if n := len(sds.Start); len(sds.End) < n || len(sds.Values) < n {
+		return fmt.Errorf("mismatched segment data: %d starts, %d ends, %d values", n, len(sds.End), len(sds.Values))
+	}
+	gap := s.FirstGap()
+	for i := range sds.Start {
+		r := idMapRange{sds.Start[i], sds.End[i]}
+		if !gap.Range().IsSupersetOf(r) {
+			return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
+		}
+		// Continue from the gap after the new segment so later segments can
+		// only land after it.
+		gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap()
+	}
+	return nil
+}
+// saveRoot returns the set's segments in the sorted-slices form produced by
+// ExportSortedSlices.
+func (s *idMapSet) saveRoot() *idMapSegmentDataSlices {
+	sds := s.ExportSortedSlices()
+	return sds
+}
+
+// loadRoot populates s from sds via ImportSortedSlices and panics with the
+// returned error if the import fails.
+func (s *idMapSet) loadRoot(sds *idMapSegmentDataSlices) {
+	err := s.ImportSortedSlices(sds)
+	if err == nil {
+		return
+	}
+	panic(err)
+}
diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go
new file mode 100644
index 000000000..a40dd668f
--- /dev/null
+++ b/pkg/sentry/kernel/auth/user_namespace.go
@@ -0,0 +1,129 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package auth
+
+import (
+ "math"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// A UserNamespace represents a user namespace. See user_namespaces(7) for
+// details.
+//
+// Namespaces form a tree linked through their parent pointers.
+//
+// +stateify savable
+type UserNamespace struct {
+ // parent is this namespace's parent. If this is the root namespace, parent
+ // is nil. The parent pointer is immutable.
+ parent *UserNamespace
+
+ // owner is the effective UID of the namespace's creator in the root
+ // namespace. owner is immutable.
+ owner KUID
+
+ // mu protects the following fields.
+ //
+ // If mu will be locked in multiple UserNamespaces, it must be locked in
+ // descendant namespaces before ancestors.
+ mu sync.Mutex `state:"nosave"`
+
+ // Mappings of user/group IDs between this namespace and its parent.
+ //
+ // All ID maps, once set, cannot be changed. This means that successful
+ // UID/GID translations cannot be racy.
+ //
+ // In a namespace returned by NewRootUserNamespace, all four maps hold a
+ // single entry for the range [0, math.MaxUint32) with value 0.
+ uidMapFromParent idMapSet
+ uidMapToParent idMapSet
+ gidMapFromParent idMapSet
+ gidMapToParent idMapSet
+
+ // TODO(b/27454212): Support disabling setgroups(2).
+}
+
+// NewRootUserNamespace returns a UserNamespace that is appropriate for a
+// system's root user namespace.
+func NewRootUserNamespace() *UserNamespace {
+ var ns UserNamespace
+ // """
+ // The initial user namespace has no parent namespace, but, for
+ // consistency, the kernel provides dummy user and group ID mapping files
+ // for this namespace. Looking at the uid_map file (gid_map is the same)
+ // from a shell in the initial namespace shows:
+ //
+ // $ cat /proc/$$/uid_map
+ // 0 0 4294967295
+ // """ - user_namespaces(7)
+ for _, m := range []*idMapSet{
+ &ns.uidMapFromParent,
+ &ns.uidMapToParent,
+ &ns.gidMapFromParent,
+ &ns.gidMapToParent,
+ } {
+ if !m.Add(idMapRange{0, math.MaxUint32}, 0) {
+ panic("Failed to insert into empty ID map")
+ }
+ }
+ return &ns
+}
+
+// Root follows parent pointers from ns until it reaches the namespace that
+// has no parent, and returns that namespace.
+func (ns *UserNamespace) Root() *UserNamespace {
+	root := ns
+	for root.parent != nil {
+		root = root.parent
+	}
+	return root
+}
+
+// maxUserNamespaceDepth is the maximum nesting depth of user namespaces;
+// NewChildUserNamespace fails with EUSERS once the caller's namespace is
+// already at this depth.
+//
+// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user
+// namespaces." - user_namespaces(7)
+const maxUserNamespaceDepth = 32
+
+func (ns *UserNamespace) depth() int {
+ var i int
+ for ns != nil {
+ i++
+ ns = ns.parent
+ }
+ return i
+}
+
+// NewChildUserNamespace creates a user namespace whose parent is the caller's
+// current user namespace and whose owner is the caller's effective KUID, for
+// a caller with credentials c. It returns EUSERS if the nesting limit would
+// be exceeded and EPERM if the caller's effective KUID or KGID has no mapping
+// in its current namespace.
+func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) {
+	parent := c.UserNamespace
+	if parent.depth() >= maxUserNamespaceDepth {
+		// "... Calls to unshare(2) or clone(2) that would cause this limit to
+		// be exceeded fail with the error EUSERS." - user_namespaces(7)
+		return nil, syserror.EUSERS
+	}
+	// "EPERM: CLONE_NEWUSER was specified in flags, but either the effective
+	// user ID or the effective group ID of the caller does not have a mapping
+	// in the parent namespace (see user_namespaces(7))." - clone(2)
+	// "CLONE_NEWUSER requires that the user ID and group ID of the calling
+	// process are mapped to user IDs and group IDs in the user namespace of
+	// the calling process at the time of the call." - unshare(2)
+	if !c.EffectiveKUID.In(parent).Ok() || !c.EffectiveKGID.In(parent).Ok() {
+		return nil, syserror.EPERM
+	}
+	// "When a user namespace is created, it starts without a mapping of
+	// user IDs (group IDs) to the parent user namespace." -
+	// user_namespaces(7)
+	child := &UserNamespace{
+		parent: parent,
+		owner:  c.EffectiveKUID,
+	}
+	return child, nil
+}