diff options
Diffstat (limited to 'pkg/sentry/kernel')
99 files changed, 24657 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go new file mode 100644 index 000000000..5ce52e66c --- /dev/null +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" +) + +// +stateify savable +type abstractEndpoint struct { + ep transport.BoundEndpoint + wr *refs.WeakRef + name string + ns *AbstractSocketNamespace +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (e *abstractEndpoint) WeakRefGone() { + e.ns.mu.Lock() + if e.ns.endpoints[e.name].ep == e.ep { + delete(e.ns.endpoints, e.name) + } + e.ns.mu.Unlock() +} + +// AbstractSocketNamespace is used to implement the Linux abstract socket functionality. +// +// +stateify savable +type AbstractSocketNamespace struct { + mu sync.Mutex `state:"nosave"` + + // Keeps mapping from name to endpoint. + endpoints map[string]abstractEndpoint +} + +// NewAbstractSocketNamespace returns a new AbstractSocketNamespace. +func NewAbstractSocketNamespace() *AbstractSocketNamespace { + return &AbstractSocketNamespace{ + endpoints: make(map[string]abstractEndpoint), + } +} + +// A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on +// its backing object. +type boundEndpoint struct { + transport.BoundEndpoint + rc refs.RefCounter +} + +// Release implements transport.BoundEndpoint.Release. +func (e *boundEndpoint) Release() { + e.rc.DecRef() + e.BoundEndpoint.Release() +} + +// BoundEndpoint retrieves the endpoint bound to the given name. The return +// value is nil if no endpoint was bound. +func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndpoint { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + return nil + } + + rc := ep.wr.Get() + if rc == nil { + delete(a.endpoints, name) + return nil + } + + return &boundEndpoint{ep.ep, rc} +} + +// Bind binds the given socket. +// +// When the last reference managed by rc is dropped, ep may be removed from the +// namespace. +func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { + a.mu.Lock() + defer a.mu.Unlock() + + if ep, ok := a.endpoints[name]; ok { + if rc := ep.wr.Get(); rc != nil { + rc.DecRef() + return syscall.EADDRINUSE + } + } + + ae := abstractEndpoint{ep: ep, name: name, ns: a} + ae.wr = refs.NewWeakRef(rc, &ae) + a.endpoints[name] = ae + return nil +} diff --git a/pkg/sentry/kernel/auth/auth.go b/pkg/sentry/kernel/auth/auth.go new file mode 100644 index 000000000..847d121aa --- /dev/null +++ b/pkg/sentry/kernel/auth/auth.go @@ -0,0 +1,22 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package auth implements an access control model that is a subset of Linux's. +// +// The auth package supports two kinds of access controls: user/group IDs and +// capabilities. Each resource in the security model is associated with a user +// namespace; "privileged" operations check that the operator's credentials +// have the required user/group IDs or capabilities within the user namespace +// of accessed resources. +package auth diff --git a/pkg/sentry/kernel/auth/auth_state_autogen.go b/pkg/sentry/kernel/auth/auth_state_autogen.go new file mode 100755 index 000000000..6f80381c6 --- /dev/null +++ b/pkg/sentry/kernel/auth/auth_state_autogen.go @@ -0,0 +1,151 @@ +// automatically generated by stateify. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *Credentials) beforeSave() {} +func (x *Credentials) save(m state.Map) { + x.beforeSave() + m.Save("RealKUID", &x.RealKUID) + m.Save("EffectiveKUID", &x.EffectiveKUID) + m.Save("SavedKUID", &x.SavedKUID) + m.Save("RealKGID", &x.RealKGID) + m.Save("EffectiveKGID", &x.EffectiveKGID) + m.Save("SavedKGID", &x.SavedKGID) + m.Save("ExtraKGIDs", &x.ExtraKGIDs) + m.Save("PermittedCaps", &x.PermittedCaps) + m.Save("InheritableCaps", &x.InheritableCaps) + m.Save("EffectiveCaps", &x.EffectiveCaps) + m.Save("BoundingCaps", &x.BoundingCaps) + m.Save("KeepCaps", &x.KeepCaps) + m.Save("UserNamespace", &x.UserNamespace) +} + +func (x *Credentials) afterLoad() {} +func (x *Credentials) load(m state.Map) { + m.Load("RealKUID", &x.RealKUID) + m.Load("EffectiveKUID", &x.EffectiveKUID) + m.Load("SavedKUID", &x.SavedKUID) + m.Load("RealKGID", &x.RealKGID) + m.Load("EffectiveKGID", &x.EffectiveKGID) + m.Load("SavedKGID", &x.SavedKGID) + m.Load("ExtraKGIDs", &x.ExtraKGIDs) + m.Load("PermittedCaps", &x.PermittedCaps) + m.Load("InheritableCaps", &x.InheritableCaps) + m.Load("EffectiveCaps", &x.EffectiveCaps) + m.Load("BoundingCaps", &x.BoundingCaps) + m.Load("KeepCaps", &x.KeepCaps) + m.Load("UserNamespace", &x.UserNamespace) +} + +func (x *IDMapEntry) beforeSave() {} +func (x *IDMapEntry) save(m state.Map) { + x.beforeSave() + m.Save("FirstID", &x.FirstID) + m.Save("FirstParentID", &x.FirstParentID) + m.Save("Length", &x.Length) +} + +func (x *IDMapEntry) afterLoad() {} +func (x *IDMapEntry) load(m state.Map) { + m.Load("FirstID", &x.FirstID) + m.Load("FirstParentID", &x.FirstParentID) + m.Load("Length", &x.Length) +} + +func (x *idMapRange) beforeSave() {} +func (x *idMapRange) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) +} + +func (x *idMapRange) afterLoad() {} +func (x *idMapRange) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) +} + +func (x *idMapSet) beforeSave() {} +func (x *idMapSet) save(m state.Map) { + x.beforeSave() + var root *idMapSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *idMapSet) afterLoad() {} +func (x *idMapSet) load(m state.Map) { + m.LoadValue("root", new(*idMapSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*idMapSegmentDataSlices)) }) +} + +func (x *idMapnode) beforeSave() {} +func (x *idMapnode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *idMapnode) afterLoad() {} +func (x *idMapnode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *idMapSegmentDataSlices) beforeSave() {} +func (x *idMapSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *idMapSegmentDataSlices) afterLoad() {} +func (x *idMapSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func (x *UserNamespace) beforeSave() {} +func (x *UserNamespace) save(m state.Map) { + x.beforeSave() + m.Save("parent", &x.parent) + m.Save("owner", &x.owner) + m.Save("uidMapFromParent", &x.uidMapFromParent) + m.Save("uidMapToParent", &x.uidMapToParent) + m.Save("gidMapFromParent", &x.gidMapFromParent) + m.Save("gidMapToParent", &x.gidMapToParent) +} + +func (x *UserNamespace) afterLoad() {} +func (x *UserNamespace) load(m state.Map) { + m.Load("parent", &x.parent) + m.Load("owner", &x.owner) + m.Load("uidMapFromParent", &x.uidMapFromParent) + m.Load("uidMapToParent", &x.uidMapToParent) + m.Load("gidMapFromParent", &x.gidMapFromParent) + m.Load("gidMapToParent", &x.gidMapToParent) +} + +func init() { + state.Register("auth.Credentials", (*Credentials)(nil), state.Fns{Save: (*Credentials).save, Load: (*Credentials).load}) + state.Register("auth.IDMapEntry", (*IDMapEntry)(nil), state.Fns{Save: (*IDMapEntry).save, Load: (*IDMapEntry).load}) + state.Register("auth.idMapRange", (*idMapRange)(nil), state.Fns{Save: (*idMapRange).save, Load: (*idMapRange).load}) + state.Register("auth.idMapSet", (*idMapSet)(nil), state.Fns{Save: (*idMapSet).save, Load: (*idMapSet).load}) + state.Register("auth.idMapnode", (*idMapnode)(nil), state.Fns{Save: (*idMapnode).save, Load: (*idMapnode).load}) + state.Register("auth.idMapSegmentDataSlices", (*idMapSegmentDataSlices)(nil), state.Fns{Save: (*idMapSegmentDataSlices).save, Load: (*idMapSegmentDataSlices).load}) + state.Register("auth.UserNamespace", (*UserNamespace)(nil), state.Fns{Save: (*UserNamespace).save, Load: (*UserNamespace).load}) +} diff --git a/pkg/sentry/kernel/auth/capability_set.go b/pkg/sentry/kernel/auth/capability_set.go new file mode 100644 index 000000000..7a0c967cd --- /dev/null +++ b/pkg/sentry/kernel/auth/capability_set.go @@ -0,0 +1,61 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" +) + +// A CapabilitySet is a set of capabilities implemented as a bitset. The zero +// value of CapabilitySet is a set containing no capabilities. +type CapabilitySet uint64 + +// AllCapabilities is a CapabilitySet containing all valid capabilities. +var AllCapabilities = CapabilitySetOf(linux.MaxCapability+1) - 1 + +// CapabilitySetOf returns a CapabilitySet containing only the given +// capability. +func CapabilitySetOf(cp linux.Capability) CapabilitySet { + return CapabilitySet(bits.MaskOf64(int(cp))) +} + +// CapabilitySetOfMany returns a CapabilitySet containing the given capabilities. +func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet { + var cs uint64 + for _, cp := range cps { + cs |= bits.MaskOf64(int(cp)) + } + return CapabilitySet(cs) +} + +// TaskCapabilities represents all the capability sets for a task. Each of these +// sets is explained in greater detail in capabilities(7). +type TaskCapabilities struct { + // Permitted is a limiting superset for the effective capabilities that + // the thread may assume. + PermittedCaps CapabilitySet + // Inheritable is a set of capabilities preserved across an execve(2). + InheritableCaps CapabilitySet + // Effective is the set of capabilities used by the kernel to perform + // permission checks for the thread. + EffectiveCaps CapabilitySet + // Bounding is a limiting superset for the capabilities that a thread + // can add to its inheritable set using capset(2). + BoundingCaps CapabilitySet + // Ambient is a set of capabilities that are preserved across an + // execve(2) of a program that is not privileged. + AmbientCaps CapabilitySet +} diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go new file mode 100644 index 000000000..16d110610 --- /dev/null +++ b/pkg/sentry/kernel/auth/context.go @@ -0,0 +1,36 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the auth package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCredentials is a Context.Value key for Credentials. + CtxCredentials contextID = iota +) + +// CredentialsFromContext returns a copy of the Credentials used by ctx, or a +// set of Credentials with no capabilities if ctx does not have Credentials. +func CredentialsFromContext(ctx context.Context) *Credentials { + if v := ctx.Value(CtxCredentials); v != nil { + return v.(*Credentials) + } + return NewAnonymousCredentials() +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go new file mode 100644 index 000000000..1511a0324 --- /dev/null +++ b/pkg/sentry/kernel/auth/credentials.go @@ -0,0 +1,234 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials contains information required to authorize privileged operations +// in a user namespace. +// +// +stateify savable +type Credentials struct { + // Real/effective/saved user/group IDs in the root user namespace. None of + // these should ever be NoID. + RealKUID KUID + EffectiveKUID KUID + SavedKUID KUID + RealKGID KGID + EffectiveKGID KGID + SavedKGID KGID + + // Filesystem user/group IDs are not implemented. "... setfsuid() is + // nowadays unneeded and should be avoided in new applications (likewise + // for setfsgid(2))." - setfsuid(2) + + // Supplementary groups used by set/getgroups. + // + // ExtraKGIDs slices are immutable, allowing multiple Credentials with the + // same ExtraKGIDs to share the same slice. + ExtraKGIDs []KGID + + // The capability sets applicable to this set of credentials. + PermittedCaps CapabilitySet + InheritableCaps CapabilitySet + EffectiveCaps CapabilitySet + BoundingCaps CapabilitySet + // Ambient capabilities are not introduced until Linux 4.3. + + // KeepCaps is the flag for PR_SET_KEEPCAPS which allow capabilities to be + // maintained after a switch from root user to non-root user via setuid(). + KeepCaps bool + + // The user namespace associated with the owner of the credentials. + UserNamespace *UserNamespace +} + +// NewAnonymousCredentials returns a set of credentials with no capabilities in +// any user namespace. +func NewAnonymousCredentials() *Credentials { + // Create a new root user namespace. Since the new namespace's owner is + // KUID 0 and the returned credentials have non-zero KUID/KGID, the + // returned credentials do not have any capabilities in the new namespace. + // Since the new namespace is not part of any existing user namespace + // hierarchy, the returned credentials do not have any capabilities in any + // other namespace. + return &Credentials{ + RealKUID: NobodyKUID, + EffectiveKUID: NobodyKUID, + SavedKUID: NobodyKUID, + RealKGID: NobodyKGID, + EffectiveKGID: NobodyKGID, + SavedKGID: NobodyKGID, + UserNamespace: NewRootUserNamespace(), + } +} + +// NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. +// global root) in user namespace ns. +func NewRootCredentials(ns *UserNamespace) *Credentials { + // I can't find documentation for this anywhere, but it's correct for the + // inheritable capability set to be initially empty (the capabilities test + // checks for this property). + return &Credentials{ + RealKUID: RootKUID, + EffectiveKUID: RootKUID, + SavedKUID: RootKUID, + RealKGID: RootKGID, + EffectiveKGID: RootKGID, + SavedKGID: RootKGID, + PermittedCaps: AllCapabilities, + EffectiveCaps: AllCapabilities, + BoundingCaps: AllCapabilities, + UserNamespace: ns, + } +} + +// NewUserCredentials returns a set of credentials based on the given UID, GIDs, +// and capabilities in a given namespace. If all arguments are their zero +// values, this returns the same credentials as NewRootCredentials. +func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { + creds := NewRootCredentials(ns) + + // Set the UID. + uid := kuid + creds.RealKUID = uid + creds.EffectiveKUID = uid + creds.SavedKUID = uid + + // Set GID. + gid := kgid + creds.RealKGID = gid + creds.EffectiveKGID = gid + creds.SavedKGID = gid + + // Set additional GIDs. + creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) + + // Set capabilities. + if capabilities != nil { + creds.PermittedCaps = capabilities.PermittedCaps + creds.EffectiveCaps = capabilities.EffectiveCaps + creds.BoundingCaps = capabilities.BoundingCaps + creds.InheritableCaps = capabilities.InheritableCaps + // TODO(nlacasse): Support ambient capabilities. + } else { + // If no capabilities are specified, grant capabilities consistent with + // setresuid + setresgid from NewRootCredentials to the given uid and + // gid. + if kuid == RootKUID { + creds.PermittedCaps = AllCapabilities + creds.EffectiveCaps = AllCapabilities + } else { + creds.PermittedCaps = 0 + creds.EffectiveCaps = 0 + } + creds.BoundingCaps = AllCapabilities + } + + return creds +} + +// Fork generates an identical copy of a set of credentials. +func (c *Credentials) Fork() *Credentials { + nc := new(Credentials) + *nc = *c // Copy-by-value; this is legal for all fields. + return nc +} + +// InGroup returns true if c is in group kgid. Compare Linux's +// kernel/groups.c:in_group_p(). +func (c *Credentials) InGroup(kgid KGID) bool { + if c.EffectiveKGID == kgid { + return true + } + for _, extraKGID := range c.ExtraKGIDs { + if extraKGID == kgid { + return true + } + } + return false +} + +// HasCapabilityIn returns true if c has capability cp in ns. +func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { + for { + // "1. A process has a capability inside a user namespace if it is a member + // of that namespace and it has the capability in its effective capability + // set." - user_namespaces(7) + if c.UserNamespace == ns { + return CapabilitySetOf(cp)&c.EffectiveCaps != 0 + } + // "3. ... A process that resides in the parent of the user namespace and + // whose effective user ID matches the owner of the namespace has all + // capabilities in the namespace." + if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { + return true + } + // "2. If a process has a capability in a user namespace, then it has that + // capability in all child (and further removed descendant) namespaces as + // well." + if ns.parent == nil { + return false + } + ns = ns.parent + } +} + +// HasCapability returns true if c has capability cp in its user namespace. +func (c *Credentials) HasCapability(cp linux.Capability) bool { + return c.HasCapabilityIn(cp, c.UserNamespace) +} + +// UseUID checks that c can use uid in its user namespace, then translates it +// to the root user namespace. +// +// The checks UseUID does are common, but you should verify that it's doing +// exactly what you want. +func (c *Credentials) UseUID(uid UID) (KUID, error) { + // uid must be mapped. + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return NoID, syserror.EINVAL + } + // If c has CAP_SETUID, then it can use any UID in its user namespace. + if c.HasCapability(linux.CAP_SETUID) { + return kuid, nil + } + // Otherwise, c must already have the UID as its real, effective, or saved + // set-user-ID. + if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { + return kuid, nil + } + return NoID, syserror.EPERM +} + +// UseGID checks that c can use gid in its user namespace, then translates it +// to the root user namespace. +func (c *Credentials) UseGID(gid GID) (KGID, error) { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return NoID, syserror.EINVAL + } + if c.HasCapability(linux.CAP_SETGID) { + return kgid, nil + } + if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { + return kgid, nil + } + return NoID, syserror.EPERM +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go new file mode 100644 index 000000000..0a58ba17c --- /dev/null +++ b/pkg/sentry/kernel/auth/id.go @@ -0,0 +1,121 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" +) + +// UID is a user ID in an unspecified user namespace. +type UID uint32 + +// GID is a group ID in an unspecified user namespace. +type GID uint32 + +// In the root user namespace, user/group IDs have a 1-to-1 relationship with +// the users/groups they represent. In other user namespaces, this is not the +// case; for example, two different unmapped users may both "have" the overflow +// UID. This means that it is generally only valid to compare user and group +// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such +// IDs to emphasize this distinction. ("k" is for "key", as in "unique key". +// Linux also uses the prefix "k", but I think they mean "kernel".) + +// KUID is a user ID in the root user namespace. +type KUID uint32 + +// KGID is a group ID in the root user namespace. +type KGID uint32 + +const ( + // NoID is uint32(-1). -1 is consistently used as a special value, in Linux + // and by extension in the auth package, to mean "no ID": + // + // - ID mapping returns -1 if the ID is not mapped. + // + // - Most set*id() syscalls accept -1 to mean "do not change this ID". + NoID = math.MaxUint32 + + // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The + // "overflow UID" is usually [1] used when translating a user ID between + // namespaces fails because the ID is not mapped. (We don't implement this + // file, so the overflow UID is constant.) + // + // [1] "There is one notable case where unmapped user and group IDs are not + // converted to the corresponding overflow ID value. When viewing a uid_map + // or gid_map file in which there is no mapping for the second field, that + // field is displayed as 4294967295 (-1 as an unsigned integer);" - + // user_namespaces(7) + OverflowUID = UID(65534) + OverflowGID = GID(65534) + + // NobodyKUID is the user ID usually reserved for the least privileged user + // "nobody". + NobodyKUID = KUID(65534) + NobodyKGID = KGID(65534) + + // RootKUID is the user ID usually used for the most privileged user "root". + RootKUID = KUID(0) + RootKGID = KGID(0) + RootUID = UID(0) + RootGID = GID(0) +) + +// Ok returns true if uid is not -1. +func (uid UID) Ok() bool { + return uid != NoID +} + +// Ok returns true if gid is not -1. +func (gid GID) Ok() bool { + return gid != NoID +} + +// Ok returns true if kuid is not -1. +func (kuid KUID) Ok() bool { + return kuid != NoID +} + +// Ok returns true if kgid is not -1. +func (kgid KGID) Ok() bool { + return kgid != NoID +} + +// OrOverflow returns uid if it is valid and the overflow UID otherwise. +func (uid UID) OrOverflow() UID { + if uid.Ok() { + return uid + } + return OverflowUID +} + +// OrOverflow returns gid if it is valid and the overflow GID otherwise. +func (gid GID) OrOverflow() GID { + if gid.Ok() { + return gid + } + return OverflowGID +} + +// In translates kuid into user namespace ns. If kuid is not mapped in ns, In +// returns NoID. +func (kuid KUID) In(ns *UserNamespace) UID { + return ns.MapFromKUID(kuid) +} + +// In translates kgid into user namespace ns. If kgid is not mapped in ns, In +// returns NoID. +func (kgid KGID) In(ns *UserNamespace) GID { + return ns.MapFromKGID(kgid) +} diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go new file mode 100644 index 000000000..e5d6028d6 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map.go @@ -0,0 +1,285 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. +func (ns *UserNamespace) MapFromKUID(kuid KUID) UID { + if ns.parent == nil { + return UID(kuid) + } + return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid)))) +} + +// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns. +func (ns *UserNamespace) MapFromKGID(kgid KGID) GID { + if ns.parent == nil { + return GID(kgid) + } + return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid)))) +} + +// MapToKUID translates uid, a UID in ns, to a UID in the root namespace. +func (ns *UserNamespace) MapToKUID(uid UID) KUID { + if ns.parent == nil { + return KUID(uid) + } + return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))) +} + +// MapToKGID translates gid, a GID in ns, to a GID in the root namespace. +func (ns *UserNamespace) MapToKGID(gid GID) KGID { + if ns.parent == nil { + return KGID(gid) + } + return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))) +} + +func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 { + if id == NoID { + return NoID + } + ns.mu.Lock() + defer ns.mu.Unlock() + if it := m.FindSegment(id); it.Ok() { + return it.Value() + (id - it.Start()) + } + return NoID +} + +// allIDsMapped returns true if all IDs in the range [start, end) are mapped in +// m. +// +// Preconditions: end >= start. +func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { + ns.mu.Lock() + defer ns.mu.Unlock() + return m.SpanRange(idMapRange{start, end}) == end-start +} + +// An IDMapEntry represents a mapping from a range of contiguous IDs in a user +// namespace to an equally-sized range of contiguous IDs in the namespace's +// parent. +// +// +stateify savable +type IDMapEntry struct { + // FirstID is the first ID in the range in the namespace. + FirstID uint32 + + // FirstParentID is the first ID in the range in the parent namespace. + FirstParentID uint32 + + // Length is the number of IDs in the range. + Length uint32 +} + +// SetUIDMap instructs ns to translate UIDs as specified by entries. +// +// Note: SetUIDMap does not place an upper bound on the number of entries, but +// Linux does. This restriction is implemented in SetUIDMap's caller, the +// implementation of /proc/[pid]/uid_map. +func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + // "After the creation of a new user namespace, the uid_map file of *one* + // of the processes in the namespace may be written to *once* to define the + // mapping of user IDs in the new user namespace. An attempt to write more + // than once to a uid_map file in a user namespace fails with the error + // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) + if !ns.uidMapFromParent.IsEmpty() { + return syserror.EPERM + } + // "At least one line must be written to the file." + if len(entries) == 0 { + return syserror.EINVAL + } + // """ + // In order for a process to write to the /proc/[pid]/uid_map + // (/proc/[pid]/gid_map) file, all of the following requirements must be + // met: + // + // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability + // in the user namespace of the process pid. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { + return syserror.EPERM + } + // "2. The writing process must either be in the user namespace of the process + // pid or be in the parent user namespace of the process pid." + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + // """ + // 3. (see trySetUIDMap) + // + // 4. One of the following two cases applies: + // + // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability + // in the parent user namespace. + // """ + if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { + // """ + // * Or otherwise all of the following restrictions apply: + // + // + The data written to uid_map (gid_map) must consist of a single line + // that maps the writing process' effective user ID (group ID) in the + // parent user namespace to a user ID (group ID) in the user namespace. + // """ + if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { + return syserror.EPERM + } + // """ + // + The writing process must have the same effective user ID as the + // process that created the user namespace. + // """ + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + } + // trySetUIDMap leaves data in maps if it fails. + if err := ns.trySetUIDMap(entries); err != nil { + ns.uidMapFromParent.RemoveAll() + ns.uidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { + for _, e := range entries { + // Determine upper bounds and check for overflow. This implicitly + // checks for NoID. + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + // "3. The mapped user IDs (group IDs) must in turn have a mapping in + // the parent user namespace." + // Only the root namespace has a nil parent, and root is assigned + // mappings when it's created, so SetUIDMap would have returned EPERM + // without reaching this point if ns is root. + if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + // If either of these Adds fail, we have an overlapping range. + if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// SetGIDMap instructs ns to translate GIDs as specified by entries. +func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { + c := CredentialsFromContext(ctx) + + ns.mu.Lock() + defer ns.mu.Unlock() + if !ns.gidMapFromParent.IsEmpty() { + return syserror.EPERM + } + if len(entries) == 0 { + return syserror.EINVAL + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { + return syserror.EPERM + } + if c.UserNamespace != ns && c.UserNamespace != ns.parent { + return syserror.EPERM + } + if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { + if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { + return syserror.EPERM + } + // It's correct for this to still be UID. + if c.EffectiveKUID != ns.owner { + return syserror.EPERM + } + // "In the case of gid_map, use of the setgroups(2) system call must + // first be denied by writing "deny" to the /proc/[pid]/setgroups file + // (see below) before writing to gid_map." (This file isn't implemented + // in the version of Linux we're emulating; see comment in + // UserNamespace.) + } + if err := ns.trySetGIDMap(entries); err != nil { + ns.gidMapFromParent.RemoveAll() + ns.gidMapToParent.RemoveAll() + return err + } + return nil +} + +func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { + for _, e := range entries { + lastID := e.FirstID + e.Length + if lastID <= e.FirstID { + return syserror.EINVAL + } + lastParentID := e.FirstParentID + e.Length + if lastParentID <= e.FirstParentID { + return syserror.EINVAL + } + if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { + return syserror.EPERM + } + if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { + return syserror.EINVAL + } + if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { + return syserror.EINVAL + } + } + return nil +} + +// UIDMap returns the user ID mappings configured for ns. If no mappings +// have been configured, UIDMap returns nil. +func (ns *UserNamespace) UIDMap() []IDMapEntry { + return ns.getIDMap(&ns.uidMapToParent) +} + +// GIDMap returns the group ID mappings configured for ns. If no mappings +// have been configured, GIDMap returns nil. +func (ns *UserNamespace) GIDMap() []IDMapEntry { + return ns.getIDMap(&ns.gidMapToParent) +} + +func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { + ns.mu.Lock() + defer ns.mu.Unlock() + var entries []IDMapEntry + for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { + entries = append(entries, IDMapEntry{ + FirstID: it.Start(), + FirstParentID: it.Value(), + Length: it.Range().Length(), + }) + } + return entries +} diff --git a/pkg/sentry/kernel/auth/id_map_functions.go b/pkg/sentry/kernel/auth/id_map_functions.go new file mode 100644 index 000000000..432dbfb6d --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_functions.go @@ -0,0 +1,45 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +// idMapFunctions "implements" generic interface segment.Functions for +// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one +// user namespace to non-overlapping ranges of contiguous IDs in another user +// namespace. Each such ID mapping is implemented as a range-to-value mapping +// in the set such that [range.Start(), range.End()) => [value, value + +// range.Length()). +type idMapFunctions struct{} + +func (idMapFunctions) MinKey() uint32 { + return 0 +} + +func (idMapFunctions) MaxKey() uint32 { + return NoID +} + +func (idMapFunctions) ClearValue(*uint32) {} + +func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) { + // Mapped ranges have to be contiguous. + if val1+r1.Length() != val2 { + return 0, false + } + return val1, true +} + +func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) { + return val, val + (split - r.Start) +} diff --git a/pkg/sentry/kernel/auth/id_map_range.go b/pkg/sentry/kernel/auth/id_map_range.go new file mode 100755 index 000000000..833fa3518 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_range.go @@ -0,0 +1,62 @@ +package auth + +// A Range represents a contiguous range of T. +// +// +stateify savable +type idMapRange struct { + // Start is the inclusive start of the range. + Start uint32 + + // End is the exclusive end of the range. + End uint32 +} + +// WellFormed returns true if r.Start <= r.End. All other methods on a Range +// require that the Range is well-formed. +func (r idMapRange) WellFormed() bool { + return r.Start <= r.End +} + +// Length returns the length of the range. +func (r idMapRange) Length() uint32 { + return r.End - r.Start +} + +// Contains returns true if r contains x. +func (r idMapRange) Contains(x uint32) bool { + return r.Start <= x && x < r.End +} + +// Overlaps returns true if r and r2 overlap. +func (r idMapRange) Overlaps(r2 idMapRange) bool { + return r.Start < r2.End && r2.Start < r.End +} + +// IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is +// contained within r. +func (r idMapRange) IsSupersetOf(r2 idMapRange) bool { + return r.Start <= r2.Start && r.End >= r2.End +} + +// Intersect returns a range consisting of the intersection between r and r2. +// If r and r2 do not overlap, Intersect returns a range with unspecified +// bounds, but for which Length() == 0. +func (r idMapRange) Intersect(r2 idMapRange) idMapRange { + if r.Start < r2.Start { + r.Start = r2.Start + } + if r.End > r2.End { + r.End = r2.End + } + if r.End < r.Start { + r.End = r.Start + } + return r +} + +// CanSplitAt returns true if it is legal to split a segment spanning the range +// r at x; that is, splitting at x would produce two ranges, both of which have +// non-zero length. +func (r idMapRange) CanSplitAt(x uint32) bool { + return r.Contains(x) && r.Start < x +} diff --git a/pkg/sentry/kernel/auth/id_map_set.go b/pkg/sentry/kernel/auth/id_map_set.go new file mode 100755 index 000000000..f72c839c7 --- /dev/null +++ b/pkg/sentry/kernel/auth/id_map_set.go @@ -0,0 +1,1270 @@ +package auth + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + idMapminDegree = 3 + + idMapmaxDegree = 2 * idMapminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type idMapSet struct { + root idMapnode `state:".(*idMapSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *idMapSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *idMapSet) IsEmptyRange(r idMapRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *idMapSet) Span() uint32 { + var sz uint32 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *idMapSet) SpanRange(r idMapRange) uint32 { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz uint32 + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *idMapSet) FirstSegment() idMapIterator { + if s.root.nrSegments == 0 { + return idMapIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *idMapSet) LastSegment() idMapIterator { + if s.root.nrSegments == 0 { + return idMapIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *idMapSet) FirstGap() idMapGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return idMapGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *idMapSet) LastGap() idMapGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return idMapGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. +func (s *idMapSet) Find(key uint32) (idMapIterator, idMapGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return idMapIterator{n, i}, idMapGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return idMapIterator{}, idMapGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. +func (s *idMapSet) FindSegment(key uint32) idMapIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *idMapSet) LowerBoundSegment(min uint32) idMapIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. +func (s *idMapSet) UpperBoundSegment(max uint32) idMapIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *idMapSet) FindGap(key uint32) idMapGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. +func (s *idMapSet) LowerBoundGap(min uint32) idMapGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *idMapSet) UpperBoundGap(max uint32) idMapGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. +func (s *idMapSet) Add(r idMapRange, val uint32) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. +func (s *idMapSet) AddWithoutMerging(r idMapRange, val uint32) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. If the new segment can +// be merged with adjacent segments, Insert will do so. Insert returns an +// iterator to the segment containing the inserted value (which may have been +// merged with other values). All existing iterators (including gap, but not +// including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, Insert panics. +// +// Insert is semantically equivalent to a InsertWithoutMerging followed by a +// Merge, but may be more efficient. Note that there is no unchecked variant of +// Insert since Insert must retrieve and inspect gap's predecessor and +// successor segments regardless. +func (s *idMapSet) Insert(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + prev, next := gap.PrevSegment(), gap.NextSegment() + if prev.Ok() && prev.End() > r.Start { + panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) + } + if next.Ok() && next.Start() < r.End { + panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) + } + if prev.Ok() && prev.End() == r.Start { + if mval, ok := (idMapFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + prev.SetEndUnchecked(r.End) + prev.SetValue(mval) + if next.Ok() && next.Start() == r.End { + val = mval + if mval, ok := (idMapFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { + prev.SetEndUnchecked(next.End()) + prev.SetValue(mval) + return s.Remove(next).PrevSegment() + } + } + return prev + } + } + if next.Ok() && next.Start() == r.End { + if mval, ok := (idMapFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { + next.SetStartUnchecked(r.Start) + next.SetValue(mval) + return next + } + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMerging inserts the given segment into the given gap and +// returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, +// InsertWithoutMerging panics. +func (s *idMapSet) InsertWithoutMerging(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if gr := gap.Range(); !gr.IsSupersetOf(r) { + panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMergingUnchecked inserts the given segment into the given gap +// and returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +func (s *idMapSet) InsertWithoutMergingUnchecked(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator { + gap = gap.node.rebalanceBeforeInsert(gap) + copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) + copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) + gap.node.keys[gap.index] = r + gap.node.values[gap.index] = val + gap.node.nrSegments++ + return idMapIterator{gap.node, gap.index} +} + +// Remove removes the given segment and returns an iterator to the vacated gap. +// All existing iterators (including seg, but not including the returned +// iterator) are invalidated. +func (s *idMapSet) Remove(seg idMapIterator) idMapGapIterator { + + if seg.node.hasChildren { + + victim := seg.PrevSegment() + + seg.SetRangeUnchecked(victim.Range()) + seg.SetValue(victim.Value()) + return s.Remove(victim).NextGap() + } + copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) + copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) + idMapFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) + seg.node.nrSegments-- + return seg.node.rebalanceAfterRemove(idMapGapIterator{seg.node, seg.index}) +} + +// RemoveAll removes all segments from the set. All existing iterators are +// invalidated. +func (s *idMapSet) RemoveAll() { + s.root = idMapnode{} +} + +// RemoveRange removes all segments in the given range. An iterator to the +// newly formed gap is returned, and all existing iterators are invalidated. +func (s *idMapSet) RemoveRange(r idMapRange) idMapGapIterator { + seg, gap := s.Find(r.Start) + if seg.Ok() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + return gap +} + +// Merge attempts to merge two neighboring segments. If successful, Merge +// returns an iterator to the merged segment, and all existing iterators are +// invalidated. Otherwise, Merge returns a terminal iterator. +// +// If first is not the predecessor of second, Merge panics. +func (s *idMapSet) Merge(first, second idMapIterator) idMapIterator { + if first.NextSegment() != second { + panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) + } + return s.MergeUnchecked(first, second) +} + +// MergeUnchecked attempts to merge two neighboring segments. If successful, +// MergeUnchecked returns an iterator to the merged segment, and all existing +// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal +// iterator. +// +// Precondition: first is the predecessor of second: first.NextSegment() == +// second, first == second.PrevSegment(). +func (s *idMapSet) MergeUnchecked(first, second idMapIterator) idMapIterator { + if first.End() == second.Start() { + if mval, ok := (idMapFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { + + first.SetEndUnchecked(second.End()) + first.SetValue(mval) + return s.Remove(second).PrevSegment() + } + } + return idMapIterator{} +} + +// MergeAll attempts to merge all adjacent segments in the set. All existing +// iterators are invalidated. +func (s *idMapSet) MergeAll() { + seg := s.FirstSegment() + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeRange attempts to merge all adjacent segments that contain a key in the +// specific range. All existing iterators are invalidated. +func (s *idMapSet) MergeRange(r idMapRange) { + seg := s.LowerBoundSegment(r.Start) + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() && next.Range().Start < r.End { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeAdjacent attempts to merge the segment containing r.Start with its +// predecessor, and the segment containing r.End-1 with its successor. +func (s *idMapSet) MergeAdjacent(r idMapRange) { + first := s.FindSegment(r.Start) + if first.Ok() { + if prev := first.PrevSegment(); prev.Ok() { + s.Merge(prev, first) + } + } + last := s.FindSegment(r.End - 1) + if last.Ok() { + if next := last.NextSegment(); next.Ok() { + s.Merge(last, next) + } + } +} + +// Split splits the given segment at the given key and returns iterators to the +// two resulting segments. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +// +// If the segment cannot be split at split (because split is at the start or +// end of the segment's range, so splitting would produce a segment with zero +// length, or because split falls outside the segment's range altogether), +// Split panics. +func (s *idMapSet) Split(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) { + if !seg.Range().CanSplitAt(split) { + panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) + } + return s.SplitUnchecked(seg, split) +} + +// SplitUnchecked splits the given segment at the given key and returns +// iterators to the two resulting segments. All existing iterators (including +// seg, but not including the returned iterators) are invalidated. +// +// Preconditions: seg.Start() < key < seg.End(). +func (s *idMapSet) SplitUnchecked(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) { + val1, val2 := (idMapFunctions{}).Split(seg.Range(), seg.Value(), split) + end2 := seg.End() + seg.SetEndUnchecked(split) + seg.SetValue(val1) + seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), idMapRange{split, end2}, val2) + + return seg2.PrevSegment(), seg2 +} + +// SplitAt splits the segment straddling split, if one exists. SplitAt returns +// true if a segment was split and false otherwise. If SplitAt splits a +// segment, all existing iterators are invalidated. +func (s *idMapSet) SplitAt(split uint32) bool { + if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) { + s.SplitUnchecked(seg, split) + return true + } + return false +} + +// Isolate ensures that the given segment's range does not escape r by +// splitting at r.Start and r.End if necessary, and returns an updated iterator +// to the bounded segment. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +func (s *idMapSet) Isolate(seg idMapIterator, r idMapRange) idMapIterator { + if seg.Range().CanSplitAt(r.Start) { + _, seg = s.SplitUnchecked(seg, r.Start) + } + if seg.Range().CanSplitAt(r.End) { + seg, _ = s.SplitUnchecked(seg, r.End) + } + return seg +} + +// ApplyContiguous applies a function to a contiguous range of segments, +// splitting if necessary. The function is applied until the first gap is +// encountered, at which point the gap is returned. If the function is applied +// across the entire range, a terminal gap is returned. All existing iterators +// are invalidated. +// +// N.B. The Iterator must not be invalidated by the function. +func (s *idMapSet) ApplyContiguous(r idMapRange, fn func(seg idMapIterator)) idMapGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return idMapGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return idMapGapIterator{} + } + } +} + +// +stateify savable +type idMapnode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. + // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *idMapnode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [idMapmaxDegree - 1]idMapRange + values [idMapmaxDegree - 1]uint32 + children [idMapmaxDegree]*idMapnode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *idMapnode) firstSegment() idMapIterator { + for n.hasChildren { + n = n.children[0] + } + return idMapIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *idMapnode) lastSegment() idMapIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return idMapIterator{n, n.nrSegments - 1} +} + +func (n *idMapnode) prevSibling() *idMapnode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *idMapnode) nextSibling() *idMapnode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. +func (n *idMapnode) rebalanceBeforeInsert(gap idMapGapIterator) idMapGapIterator { + if n.parent != nil { + gap = n.parent.rebalanceBeforeInsert(gap) + } + if n.nrSegments < idMapmaxDegree-1 { + return gap + } + if n.parent == nil { + + left := &idMapnode{ + nrSegments: idMapminDegree - 1, + parent: n, + parentIndex: 0, + hasChildren: n.hasChildren, + } + right := &idMapnode{ + nrSegments: idMapminDegree - 1, + parent: n, + parentIndex: 1, + hasChildren: n.hasChildren, + } + copy(left.keys[:idMapminDegree-1], n.keys[:idMapminDegree-1]) + copy(left.values[:idMapminDegree-1], n.values[:idMapminDegree-1]) + copy(right.keys[:idMapminDegree-1], n.keys[idMapminDegree:]) + copy(right.values[:idMapminDegree-1], n.values[idMapminDegree:]) + n.keys[0], n.values[0] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1] + idMapzeroValueSlice(n.values[1:]) + if n.hasChildren { + copy(left.children[:idMapminDegree], n.children[:idMapminDegree]) + copy(right.children[:idMapminDegree], n.children[idMapminDegree:]) + idMapzeroNodeSlice(n.children[2:]) + for i := 0; i < idMapminDegree; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + right.children[i].parent = right + right.children[i].parentIndex = i + } + } + n.nrSegments = 1 + n.hasChildren = true + n.children[0] = left + n.children[1] = right + if gap.node != n { + return gap + } + if gap.index < idMapminDegree { + return idMapGapIterator{left, gap.index} + } + return idMapGapIterator{right, gap.index - idMapminDegree} + } + + copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) + copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) + n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1] + copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) + for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { + n.parent.children[i].parentIndex = i + } + sibling := &idMapnode{ + nrSegments: idMapminDegree - 1, + parent: n.parent, + parentIndex: n.parentIndex + 1, + hasChildren: n.hasChildren, + } + n.parent.children[n.parentIndex+1] = sibling + n.parent.nrSegments++ + copy(sibling.keys[:idMapminDegree-1], n.keys[idMapminDegree:]) + copy(sibling.values[:idMapminDegree-1], n.values[idMapminDegree:]) + idMapzeroValueSlice(n.values[idMapminDegree-1:]) + if n.hasChildren { + copy(sibling.children[:idMapminDegree], n.children[idMapminDegree:]) + idMapzeroNodeSlice(n.children[idMapminDegree:]) + for i := 0; i < idMapminDegree; i++ { + sibling.children[i].parent = sibling + sibling.children[i].parentIndex = i + } + } + n.nrSegments = idMapminDegree - 1 + + if gap.node != n { + return gap + } + if gap.index < idMapminDegree { + return gap + } + return idMapGapIterator{sibling, gap.index - idMapminDegree} +} + +// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient +// (contain fewer segments than required by B-tree invariants), as required for +// removal, and returns an updated iterator to the position represented by gap. +// +// Precondition: n is the only node in the tree that may currently violate a +// B-tree invariant. +func (n *idMapnode) rebalanceAfterRemove(gap idMapGapIterator) idMapGapIterator { + for { + if n.nrSegments >= idMapminDegree-1 { + return gap + } + if n.parent == nil { + + return gap + } + + if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree { + copy(n.keys[1:], n.keys[:n.nrSegments]) + copy(n.values[1:], n.values[:n.nrSegments]) + n.keys[0] = n.parent.keys[n.parentIndex-1] + n.values[0] = n.parent.values[n.parentIndex-1] + n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] + n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] + idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + copy(n.children[1:], n.children[:n.nrSegments+1]) + n.children[0] = sibling.children[sibling.nrSegments] + sibling.children[sibling.nrSegments] = nil + n.children[0].parent = n + n.children[0].parentIndex = 0 + for i := 1; i < n.nrSegments+2; i++ { + n.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling && gap.index == sibling.nrSegments { + return idMapGapIterator{n, 0} + } + if gap.node == n { + return idMapGapIterator{n, gap.index + 1} + } + return gap + } + if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree { + n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] + n.values[n.nrSegments] = n.parent.values[n.parentIndex] + n.parent.keys[n.parentIndex] = sibling.keys[0] + n.parent.values[n.parentIndex] = sibling.values[0] + copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) + copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) + idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + n.children[n.nrSegments+1] = sibling.children[0] + copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) + sibling.children[sibling.nrSegments] = nil + n.children[n.nrSegments+1].parent = n + n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 + for i := 0; i < sibling.nrSegments; i++ { + sibling.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling { + if gap.index == 0 { + return idMapGapIterator{n, n.nrSegments} + } + return idMapGapIterator{sibling, gap.index - 1} + } + return gap + } + + p := n.parent + if p.nrSegments == 1 { + + left, right := p.children[0], p.children[1] + p.nrSegments = left.nrSegments + right.nrSegments + 1 + p.hasChildren = left.hasChildren + p.keys[left.nrSegments] = p.keys[0] + p.values[left.nrSegments] = p.values[0] + copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) + copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) + copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) + copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := 0; i < p.nrSegments+1; i++ { + p.children[i].parent = p + p.children[i].parentIndex = i + } + } else { + p.children[0] = nil + p.children[1] = nil + } + if gap.node == left { + return idMapGapIterator{p, gap.index} + } + if gap.node == right { + return idMapGapIterator{p, gap.index + left.nrSegments + 1} + } + return gap + } + // Merge n and either sibling, along with the segment separating the + // two, into whichever of the two nodes comes first. This is the + // reverse of the non-root splitting case in + // node.rebalanceBeforeInsert. + var left, right *idMapnode + if n.parentIndex > 0 { + left = n.prevSibling() + right = n + } else { + left = n + right = n.nextSibling() + } + + if gap.node == right { + gap = idMapGapIterator{left, gap.index + left.nrSegments + 1} + } + left.keys[left.nrSegments] = p.keys[left.parentIndex] + left.values[left.nrSegments] = p.values[left.parentIndex] + copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + } + } + left.nrSegments += right.nrSegments + 1 + copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) + copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) + idMapFunctions{}.ClearValue(&p.values[p.nrSegments-1]) + copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) + for i := 0; i < p.nrSegments; i++ { + p.children[i].parentIndex = i + } + p.children[p.nrSegments] = nil + p.nrSegments-- + + n = p + } +} + +// A Iterator is conceptually one of: +// +// - A pointer to a segment in a set; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Iterators are copyable values and are meaningfully equality-comparable. The +// zero value of Iterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type idMapIterator struct { + // node is the node containing the iterated segment. If the iterator is + // terminal, node is nil. + node *idMapnode + + // index is the index of the segment in node.keys/values. + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (seg idMapIterator) Ok() bool { + return seg.node != nil +} + +// Range returns the iterated segment's range key. +func (seg idMapIterator) Range() idMapRange { + return seg.node.keys[seg.index] +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (seg idMapIterator) Start() uint32 { + return seg.node.keys[seg.index].Start +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (seg idMapIterator) End() uint32 { + return seg.node.keys[seg.index].End +} + +// SetRangeUnchecked mutates the iterated segment's range key. This operation +// does not invalidate any iterators. +// +// Preconditions: +// +// - r.Length() > 0. +// +// - The new range must not overlap an existing one: If seg.NextSegment().Ok(), +// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then +// r.start >= seg.PrevSegment().End(). +func (seg idMapIterator) SetRangeUnchecked(r idMapRange) { + seg.node.keys[seg.index] = r +} + +// SetRange mutates the iterated segment's range key. If the new range would +// cause the iterated segment to overlap another segment, or if the new range +// is invalid, SetRange panics. This operation does not invalidate any +// iterators. +func (seg idMapIterator) SetRange(r idMapRange) { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) + } + if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) + } + seg.SetRangeUnchecked(r) +} + +// SetStartUnchecked mutates the iterated segment's start. This operation does +// not invalidate any iterators. +// +// Preconditions: The new start must be valid: start < seg.End(); if +// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +func (seg idMapIterator) SetStartUnchecked(start uint32) { + seg.node.keys[seg.index].Start = start +} + +// SetStart mutates the iterated segment's start. If the new start value would +// cause the iterated segment to overlap another segment, or would result in an +// invalid range, SetStart panics. This operation does not invalidate any +// iterators. +func (seg idMapIterator) SetStart(start uint32) { + if start >= seg.End() { + panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) + } + if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { + panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) + } + seg.SetStartUnchecked(start) +} + +// SetEndUnchecked mutates the iterated segment's end. This operation does not +// invalidate any iterators. +// +// Preconditions: The new end must be valid: end > seg.Start(); if +// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). +func (seg idMapIterator) SetEndUnchecked(end uint32) { + seg.node.keys[seg.index].End = end +} + +// SetEnd mutates the iterated segment's end. If the new end value would cause +// the iterated segment to overlap another segment, or would result in an +// invalid range, SetEnd panics. This operation does not invalidate any +// iterators. +func (seg idMapIterator) SetEnd(end uint32) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. +func (seg idMapIterator) Value() uint32 { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg idMapIterator) ValuePtr() *uint32 { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg idMapIterator) SetValue(val uint32) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. +func (seg idMapIterator) PrevSegment() idMapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return idMapIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return idMapIterator{} + } + return idMapsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. +func (seg idMapIterator) NextSegment() idMapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return idMapIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return idMapIterator{} + } + return idMapsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg idMapIterator) PrevGap() idMapGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return idMapGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. +func (seg idMapIterator) NextGap() idMapGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return idMapGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg idMapIterator) PrevNonEmpty() (idMapIterator, idMapGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return idMapIterator{}, gap + } + return gap.PrevSegment(), idMapGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. +func (seg idMapIterator) NextNonEmpty() (idMapIterator, idMapGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return idMapIterator{}, gap + } + return gap.NextSegment(), idMapGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type idMapGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). + node *idMapnode + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (gap idMapGapIterator) Ok() bool { + return gap.node != nil +} + +// Range returns the range spanned by the iterated gap. +func (gap idMapGapIterator) Range() idMapRange { + return idMapRange{gap.Start(), gap.End()} +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (gap idMapGapIterator) Start() uint32 { + if ps := gap.PrevSegment(); ps.Ok() { + return ps.End() + } + return idMapFunctions{}.MinKey() +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (gap idMapGapIterator) End() uint32 { + if ns := gap.NextSegment(); ns.Ok() { + return ns.Start() + } + return idMapFunctions{}.MaxKey() +} + +// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is +// between two adjacent segments.) +func (gap idMapGapIterator) IsEmpty() bool { + return gap.Range().Length() == 0 +} + +// PrevSegment returns the segment immediately before the iterated gap. If no +// such segment exists, PrevSegment returns a terminal iterator. +func (gap idMapGapIterator) PrevSegment() idMapIterator { + return idMapsegmentBeforePosition(gap.node, gap.index) +} + +// NextSegment returns the segment immediately after the iterated gap. If no +// such segment exists, NextSegment returns a terminal iterator. +func (gap idMapGapIterator) NextSegment() idMapIterator { + return idMapsegmentAfterPosition(gap.node, gap.index) +} + +// PrevGap returns the iterated gap's predecessor. If no such gap exists, +// PrevGap returns a terminal iterator. +func (gap idMapGapIterator) PrevGap() idMapGapIterator { + seg := gap.PrevSegment() + if !seg.Ok() { + return idMapGapIterator{} + } + return seg.PrevGap() +} + +// NextGap returns the iterated gap's successor. If no such gap exists, NextGap +// returns a terminal iterator. +func (gap idMapGapIterator) NextGap() idMapGapIterator { + seg := gap.NextSegment() + if !seg.Ok() { + return idMapGapIterator{} + } + return seg.NextGap() +} + +// segmentBeforePosition returns the predecessor segment of the position given +// by n.children[i], which may or may not contain a child. If no such segment +// exists, segmentBeforePosition returns a terminal iterator. +func idMapsegmentBeforePosition(n *idMapnode, i int) idMapIterator { + for i == 0 { + if n.parent == nil { + return idMapIterator{} + } + n, i = n.parent, n.parentIndex + } + return idMapIterator{n, i - 1} +} + +// segmentAfterPosition returns the successor segment of the position given by +// n.children[i], which may or may not contain a child. If no such segment +// exists, segmentAfterPosition returns a terminal iterator. +func idMapsegmentAfterPosition(n *idMapnode, i int) idMapIterator { + for i == n.nrSegments { + if n.parent == nil { + return idMapIterator{} + } + n, i = n.parent, n.parentIndex + } + return idMapIterator{n, i} +} + +func idMapzeroValueSlice(slice []uint32) { + + for i := range slice { + idMapFunctions{}.ClearValue(&slice[i]) + } +} + +func idMapzeroNodeSlice(slice []*idMapnode) { + for i := range slice { + slice[i] = nil + } +} + +// String stringifies a Set for debugging. +func (s *idMapSet) String() string { + return s.root.String() +} + +// String stringifes a node (and all of its children) for debugging. +func (n *idMapnode) String() string { + var buf bytes.Buffer + n.writeDebugString(&buf, "") + return buf.String() +} + +func (n *idMapnode) writeDebugString(buf *bytes.Buffer, prefix string) { + if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) + } + for i := 0; i < n.nrSegments; i++ { + if child := n.children[i]; child != nil { + cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) + if child.parent != n || child.parentIndex != i { + buf.WriteString(cprefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) + } + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) + } + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } + if child := n.children[n.nrSegments]; child != nil { + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) + } +} + +// SegmentDataSlices represents segments from a set as slices of start, end, and +// values. SegmentDataSlices is primarily used as an intermediate representation +// for save/restore and the layout here is optimized for that. +// +// +stateify savable +type idMapSegmentDataSlices struct { + Start []uint32 + End []uint32 + Values []uint32 +} + +// ExportSortedSlice returns a copy of all segments in the given set, in ascending +// key order. +func (s *idMapSet) ExportSortedSlices() *idMapSegmentDataSlices { + var sds idMapSegmentDataSlices + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sds.Start = append(sds.Start, seg.Start()) + sds.End = append(sds.End, seg.End()) + sds.Values = append(sds.Values, seg.Value()) + } + sds.Start = sds.Start[:len(sds.Start):len(sds.Start)] + sds.End = sds.End[:len(sds.End):len(sds.End)] + sds.Values = sds.Values[:len(sds.Values):len(sds.Values)] + return &sds +} + +// ImportSortedSlice initializes the given set from the given slice. +// +// Preconditions: s must be empty. sds must represent a valid set (the segments +// in sds must have valid lengths that do not overlap). The segments in sds +// must be sorted in ascending key order. +func (s *idMapSet) ImportSortedSlices(sds *idMapSegmentDataSlices) error { + if !s.IsEmpty() { + return fmt.Errorf("cannot import into non-empty set %v", s) + } + gap := s.FirstGap() + for i := range sds.Start { + r := idMapRange{sds.Start[i], sds.End[i]} + if !gap.Range().IsSupersetOf(r) { + return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i]) + } + gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap() + } + return nil +} +func (s *idMapSet) saveRoot() *idMapSegmentDataSlices { + return s.ExportSortedSlices() +} + +func (s *idMapSet) loadRoot(sds *idMapSegmentDataSlices) { + if err := s.ImportSortedSlices(sds); err != nil { + panic(err) + } +} diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go new file mode 100644 index 000000000..a40dd668f --- /dev/null +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -0,0 +1,129 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package auth + +import ( + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// A UserNamespace represents a user namespace. See user_namespaces(7) for +// details. +// +// +stateify savable +type UserNamespace struct { + // parent is this namespace's parent. If this is the root namespace, parent + // is nil. The parent pointer is immutable. + parent *UserNamespace + + // owner is the effective UID of the namespace's creator in the root + // namespace. owner is immutable. + owner KUID + + // mu protects the following fields. + // + // If mu will be locked in multiple UserNamespaces, it must be locked in + // descendant namespaces before ancestors. + mu sync.Mutex `state:"nosave"` + + // Mappings of user/group IDs between this namespace and its parent. + // + // All ID maps, once set, cannot be changed. This means that successful + // UID/GID translations cannot be racy. + uidMapFromParent idMapSet + uidMapToParent idMapSet + gidMapFromParent idMapSet + gidMapToParent idMapSet + + // TODO(b/27454212): Support disabling setgroups(2). +} + +// NewRootUserNamespace returns a UserNamespace that is appropriate for a +// system's root user namespace. +func NewRootUserNamespace() *UserNamespace { + var ns UserNamespace + // """ + // The initial user namespace has no parent namespace, but, for + // consistency, the kernel provides dummy user and group ID mapping files + // for this namespace. Looking at the uid_map file (gid_map is the same) + // from a shell in the initial namespace shows: + // + // $ cat /proc/$$/uid_map + // 0 0 4294967295 + // """ - user_namespaces(7) + for _, m := range []*idMapSet{ + &ns.uidMapFromParent, + &ns.uidMapToParent, + &ns.gidMapFromParent, + &ns.gidMapToParent, + } { + if !m.Add(idMapRange{0, math.MaxUint32}, 0) { + panic("Failed to insert into empty ID map") + } + } + return &ns +} + +// Root returns the root of the user namespace tree containing ns. +func (ns *UserNamespace) Root() *UserNamespace { + for ns.parent != nil { + ns = ns.parent + } + return ns +} + +// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user +// namespaces." - user_namespaces(7) +const maxUserNamespaceDepth = 32 + +func (ns *UserNamespace) depth() int { + var i int + for ns != nil { + i++ + ns = ns.parent + } + return i +} + +// NewChildUserNamespace returns a new user namespace created by a caller with +// credentials c. +func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { + if c.UserNamespace.depth() >= maxUserNamespaceDepth { + // "... Calls to unshare(2) or clone(2) that would cause this limit to + // be exceeded fail with the error EUSERS." - user_namespaces(7) + return nil, syserror.EUSERS + } + // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective + // user ID or the effective group ID of the caller does not have a mapping + // in the parent namespace (see user_namespaces(7))." - clone(2) + // "CLONE_NEWUSER requires that the user ID and group ID of the calling + // process are mapped to user IDs and group IDs in the user namespace of + // the calling process at the time of the call." - unshare(2) + if !c.EffectiveKUID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + if !c.EffectiveKGID.In(c.UserNamespace).Ok() { + return nil, syserror.EPERM + } + return &UserNamespace{ + parent: c.UserNamespace, + owner: c.EffectiveKUID, + // "When a user namespace is created, it starts without a mapping of + // user IDs (group IDs) to the parent user namespace." - + // user_namespaces(7) + }, nil +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go new file mode 100644 index 000000000..a1a084eab --- /dev/null +++ b/pkg/sentry/kernel/context.go @@ -0,0 +1,135 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the kernel package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxCanTrace is a Context.Value key for a function with the same + // signature and semantics as kernel.Task.CanTrace. + CtxCanTrace contextID = iota + + // CtxKernel is a Context.Value key for a Kernel. + CtxKernel + + // CtxPIDNamespace is a Context.Value key for a PIDNamespace. + CtxPIDNamespace + + // CtxTask is a Context.Value key for a Task. + CtxTask + + // CtxUTSNamespace is a Context.Value key for a UTSNamespace. + CtxUTSNamespace + + // CtxIPCNamespace is a Context.Value key for a IPCNamespace. + CtxIPCNamespace +) + +// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense +// as kernel.Task.CanTrace. +func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool { + if v := ctx.Value(CtxCanTrace); v != nil { + return v.(func(*Task, bool) bool)(t, attach) + } + return false +} + +// KernelFromContext returns the Kernel in which ctx is executing, or nil if +// there is no such Kernel. +func KernelFromContext(ctx context.Context) *Kernel { + if v := ctx.Value(CtxKernel); v != nil { + return v.(*Kernel) + } + return nil +} + +// PIDNamespaceFromContext returns the PID namespace in which ctx is executing, +// or nil if there is no such PID namespace. +func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace { + if v := ctx.Value(CtxPIDNamespace); v != nil { + return v.(*PIDNamespace) + } + return nil +} + +// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, +// or nil if there is no such UTS namespace. +func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { + if v := ctx.Value(CtxUTSNamespace); v != nil { + return v.(*UTSNamespace) + } + return nil +} + +// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, +// or nil if there is no such IPC namespace. +func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { + if v := ctx.Value(CtxIPCNamespace); v != nil { + return v.(*IPCNamespace) + } + return nil +} + +// TaskFromContext returns the Task associated with ctx, or nil if there is no +// such Task. +func TaskFromContext(ctx context.Context) *Task { + if v := ctx.Value(CtxTask); v != nil { + return v.(*Task) + } + return nil +} + +// AsyncContext returns a context.Context that may be used by goroutines that +// do work on behalf of t and therefore share its contextual values, but are +// not t's task goroutine (e.g. asynchronous I/O). +func (t *Task) AsyncContext() context.Context { + return taskAsyncContext{t: t} +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Value implements context.Context.Value. +func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go new file mode 100644 index 000000000..bbacba1f4 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -0,0 +1,473 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package epoll provides an implementation of Linux's IO event notification +// facility. See epoll(7) for more details. +package epoll + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Event describes the event mask that was observed and the user data to be +// returned when one of the events occurs. It has this format to match the linux +// format to avoid extra copying/allocation when writing events to userspace. +type Event struct { + // Events is the event mask containing the set of events that have been + // observed on an entry. + Events uint32 + + // Data is an opaque 64-bit value provided by the caller when adding the + // entry, and returned to the caller when the entry reports an event. + Data [2]int32 +} + +// EntryFlags is a bitmask that holds an entry's flags. +type EntryFlags int + +// Valid entry flags. +const ( + OneShot EntryFlags = 1 << iota + EdgeTriggered +) + +// FileIdentifier identifies a file. We cannot use just the FD because it could +// potentially be reassigned. We also cannot use just the file pointer because +// it is possible to have multiple entries for the same file object as long as +// they are created with different FDs (i.e., the FDs point to the same file). +// +// +stateify savable +type FileIdentifier struct { + File *fs.File `state:"wait"` + Fd kdefs.FD +} + +// pollEntry holds all the state associated with an event poll entry, that is, +// a file being observed by an event poll object. +// +// +stateify savable +type pollEntry struct { + pollEntryEntry + file *refs.WeakRef `state:"manual"` + id FileIdentifier `state:"wait"` + userData [2]int32 + waiter waiter.Entry `state:"manual"` + mask waiter.EventMask + flags EntryFlags + + epoll *EventPoll + + // We cannot save the current list pointer as it points into EventPoll + // struct, while state framework currently does not support such + // in-struct pointers. Instead, EventPoll will properly set this field + // in its loading logic. + curList *pollEntryList `state:"nosave"` +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +// weakReferenceGone is called when the file in the weak reference is destroyed. +// The poll entry is removed in response to this. +func (p *pollEntry) WeakRefGone() { + p.epoll.RemoveEntry(p.id) +} + +// EventPoll holds all the state associated with an event poll object, that is, +// collection of files to observe and their current state. +// +// +stateify savable +type EventPoll struct { + fsutil.FilePipeSeek `state:"zerovalue"` + fsutil.FileNotDirReaddir `state:"zerovalue"` + fsutil.FileNoFsync `state:"zerovalue"` + fsutil.FileNoopFlush `state:"zerovalue"` + fsutil.FileNoIoctl `state:"zerovalue"` + fsutil.FileNoMMap `state:"zerovalue"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // Wait queue is used to notify interested parties when the event poll + // object itself becomes readable or writable. + waiter.Queue `state:"zerovalue"` + + // files is the map of all the files currently being observed, it is + // protected by mu. + mu sync.Mutex `state:"nosave"` + files map[FileIdentifier]*pollEntry + + // listsMu protects manipulation of the lists below. It needs to be a + // different lock to avoid circular lock acquisition order involving + // the wait queue mutexes and mu. The full order is mu, observed file + // wait queue mutex, then listsMu; this allows listsMu to be acquired + // when readyCallback is called. + // + // An entry is always in one of the following lists: + // readyList -- when there's a chance that it's ready to have + // events delivered to epoll waiters. Given that being + // ready is a transient state, the Readiness() and + // readEvents() functions always call the entry's file + // Readiness() function to confirm it's ready. + // waitingList -- when there's no chance that the entry is ready, + // so it's waiting for the readyCallback to be called + // on it before it gets moved to the readyList. + // disabledList -- when the entry is disabled. This happens when + // a one-shot entry gets delivered via readEvents(). + listsMu sync.Mutex `state:"nosave"` + readyList pollEntryList + waitingList pollEntryList + disabledList pollEntryList +} + +// cycleMu is used to serialize all the cycle checks. This is only used when +// an event poll file is added as an entry to another event poll. Such checks +// are serialized to avoid lock acquisition order inversion: if a thread is +// adding A to B, and another thread is adding B to A, each would acquire A's +// and B's mutexes in reverse order, and could cause deadlocks. Having this +// lock prevents this by allowing only one check at a time to happen. +// +// We do the cycle check to prevent callers from introducing potentially +// infinite recursions. If a caller were to add A to B and then B to A, for +// event poll A to know if it's readable, it would need to check event poll B, +// which in turn would need event poll A and so on indefinitely. +var cycleMu sync.Mutex + +// NewEventPoll allocates and initializes a new event poll object. +func NewEventPoll(ctx context.Context) *fs.File { + // name matches fs/eventpoll.c:epoll_create1. + dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ + files: make(map[FileIdentifier]*pollEntry), + }) +} + +// Release implements fs.FileOperations.Release. +func (e *EventPoll) Release() { + // We need to take the lock now because files may be attempting to + // remove entries in parallel if they get destroyed. + e.mu.Lock() + defer e.mu.Unlock() + + // Go through all entries and clean up. + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + entry.file.Drop() + } +} + +// Read implements fs.FileOperations.Read. +func (*EventPoll) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// Write implements fs.FileOperations.Write. +func (*EventPoll) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syscall.ENOSYS +} + +// eventsAvailable determines if 'e' has events available for delivery. +func (e *EventPoll) eventsAvailable() bool { + e.listsMu.Lock() + + for it := e.readyList.Front(); it != nil; { + entry := it + it = it.Next() + + // If the entry is ready, we know 'e' has at least one entry + // ready for delivery. + ready := entry.id.File.Readiness(entry.mask) + if ready != 0 { + e.listsMu.Unlock() + return true + } + + // Entry is not ready, so move it to waiting list. + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } + + e.listsMu.Unlock() + + return false +} + +// Readiness determines if the event poll object is currently readable (i.e., +// if there are pending events for delivery). +func (e *EventPoll) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + if (mask&waiter.EventIn) != 0 && e.eventsAvailable() { + ready |= waiter.EventIn + } + + return ready +} + +// ReadEvents returns up to max available events. +func (e *EventPoll) ReadEvents(max int) []Event { + var local pollEntryList + var ret []Event + + e.listsMu.Lock() + + // Go through all entries we believe may be ready. + for it := e.readyList.Front(); it != nil && len(ret) < max; { + entry := it + it = it.Next() + + // Check the entry's readiness. It it's not really ready, we + // just put it back in the waiting list and move on to the next + // entry. + ready := entry.id.File.Readiness(entry.mask) & entry.mask + if ready == 0 { + e.readyList.Remove(entry) + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + + continue + } + + // Add event to the array that will be returned to caller. + ret = append(ret, Event{ + Events: uint32(ready), + Data: entry.userData, + }) + + // The entry is consumed, so we must move it to the disabled + // list in case it's one-shot, or back to the wait list if it's + // edge-triggered. If it's neither, we leave it in the ready + // list so that its readiness can be checked the next time + // around; however, we must move it to the end of the list so + // that other events can be delivered as well. + e.readyList.Remove(entry) + if entry.flags&OneShot != 0 { + e.disabledList.PushBack(entry) + entry.curList = &e.disabledList + } else if entry.flags&EdgeTriggered != 0 { + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + } else { + local.PushBack(entry) + } + } + + e.readyList.PushBackList(&local) + + e.listsMu.Unlock() + + return ret +} + +// readyCallback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +type readyCallback struct{} + +// Callback implements waiter.EntryCallback.Callback. +func (*readyCallback) Callback(w *waiter.Entry) { + entry := w.Context.(*pollEntry) + e := entry.epoll + + e.listsMu.Lock() + + if entry.curList == &e.waitingList { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList + + e.Notify(waiter.EventIn) + } + + e.listsMu.Unlock() +} + +// initEntryReadiness initializes the entry's state with regards to its +// readiness by placing it in the appropriate list and registering for +// notifications. +func (e *EventPoll) initEntryReadiness(entry *pollEntry) { + // A new entry starts off in the waiting list. + e.listsMu.Lock() + e.waitingList.PushBack(entry) + entry.curList = &e.waitingList + e.listsMu.Unlock() + + // Register for event notifications. + f := entry.id.File + f.EventRegister(&entry.waiter, entry.mask) + + // Check if the file happens to already be in a ready state. + ready := f.Readiness(entry.mask) & entry.mask + if ready != 0 { + (*readyCallback).Callback(nil, &entry.waiter) + } +} + +// observes checks if event poll object e is directly or indirectly observing +// event poll object ep. It uses a bounded recursive depth-first search. +func (e *EventPoll) observes(ep *EventPoll, depthLeft int) bool { + // If we reached the maximum depth, we'll consider that we found it + // because we don't want to allow chains that are too long. + if depthLeft <= 0 { + return true + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Go through each observed file and check if it is or observes ep. + for id := range e.files { + f, ok := id.File.FileOperations.(*EventPoll) + if !ok { + continue + } + + if f == ep || f.observes(ep, depthLeft-1) { + return true + } + } + + return false +} + +// AddEntry adds a new file to the collection of files observed by e. +func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + // Acquire cycle check lock if another event poll is being added. + ep, ok := id.File.FileOperations.(*EventPoll) + if ok { + cycleMu.Lock() + defer cycleMu.Unlock() + } + + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file already has an entry. + if _, ok := e.files[id]; ok { + return syscall.EEXIST + } + + // Check if a cycle would be created. We use 4 as the limit because + // that's the value used by linux and we want to emulate it. + if ep != nil { + if e == ep { + return syscall.EINVAL + } + + if ep.observes(e, 4) { + return syscall.ELOOP + } + } + + // Create new entry and add it to map. + // + // N.B. Even though we are creating a weak reference here, we know it + // won't trigger a callback because we hold a reference to the file + // throughout the execution of this function. + entry := &pollEntry{ + id: id, + userData: data, + epoll: e, + flags: flags, + waiter: waiter.Entry{Callback: &readyCallback{}}, + mask: mask, + } + entry.waiter.Context = entry + e.files[id] = entry + entry.file = refs.NewWeakRef(id.File, entry) + + // Initialize the readiness state of the new entry. + e.initEntryReadiness(entry) + + return nil +} + +// UpdateEntry updates the flags, mask and user data associated with a file that +// is already part of the collection of observed files. +func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter.EventMask, data [2]int32) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister the old mask and remove entry from the list it's in, so + // readyCallback is guaranteed to not be called on this entry anymore. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove entry from whatever list it's in. This ensure that no other + // threads have access to this entry as the only way left to find it + // is via e.files, but we hold e.mu, which prevents that. + e.listsMu.Lock() + entry.curList.Remove(entry) + e.listsMu.Unlock() + + // Initialize new readiness state. + entry.flags = flags + entry.mask = mask + entry.userData = data + e.initEntryReadiness(entry) + + return nil +} + +// RemoveEntry a files from the collection of observed files. +func (e *EventPoll) RemoveEntry(id FileIdentifier) error { + e.mu.Lock() + defer e.mu.Unlock() + + // Fail if the file doesn't have an entry. + entry, ok := e.files[id] + if !ok { + return syscall.ENOENT + } + + // Unregister from file first so that no concurrent attempts will be + // made to manipulate the file. + entry.id.File.EventUnregister(&entry.waiter) + + // Remove from the current list. + e.listsMu.Lock() + entry.curList.Remove(entry) + entry.curList = nil + e.listsMu.Unlock() + + // Remove file from map, and drop weak reference. + delete(e.files, id) + entry.file.Drop() + + return nil +} + +// UnregisterEpollWaiters removes the epoll waiter objects from the waiting +// queues. This is different from Release() as the file is not dereferenced. +func (e *EventPoll) UnregisterEpollWaiters() { + e.mu.Lock() + defer e.mu.Unlock() + + for _, entry := range e.files { + entry.id.File.EventUnregister(&entry.waiter) + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_list.go b/pkg/sentry/kernel/epoll/epoll_list.go new file mode 100755 index 000000000..94d5c9e57 --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_list.go @@ -0,0 +1,173 @@ +package epoll + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type pollEntryElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (pollEntryElementMapper) linkerFor(elem *pollEntry) *pollEntry { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type pollEntryList struct { + head *pollEntry + tail *pollEntry +} + +// Reset resets list l to the empty state. +func (l *pollEntryList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *pollEntryList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *pollEntryList) Front() *pollEntry { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *pollEntryList) Back() *pollEntry { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *pollEntryList) PushFront(e *pollEntry) { + pollEntryElementMapper{}.linkerFor(e).SetNext(l.head) + pollEntryElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + pollEntryElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *pollEntryList) PushBack(e *pollEntry) { + pollEntryElementMapper{}.linkerFor(e).SetNext(nil) + pollEntryElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + pollEntryElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *pollEntryList) PushBackList(m *pollEntryList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + pollEntryElementMapper{}.linkerFor(l.tail).SetNext(m.head) + pollEntryElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *pollEntryList) InsertAfter(b, e *pollEntry) { + a := pollEntryElementMapper{}.linkerFor(b).Next() + pollEntryElementMapper{}.linkerFor(e).SetNext(a) + pollEntryElementMapper{}.linkerFor(e).SetPrev(b) + pollEntryElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + pollEntryElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *pollEntryList) InsertBefore(a, e *pollEntry) { + b := pollEntryElementMapper{}.linkerFor(a).Prev() + pollEntryElementMapper{}.linkerFor(e).SetNext(a) + pollEntryElementMapper{}.linkerFor(e).SetPrev(b) + pollEntryElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + pollEntryElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *pollEntryList) Remove(e *pollEntry) { + prev := pollEntryElementMapper{}.linkerFor(e).Prev() + next := pollEntryElementMapper{}.linkerFor(e).Next() + + if prev != nil { + pollEntryElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + pollEntryElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type pollEntryEntry struct { + next *pollEntry + prev *pollEntry +} + +// Next returns the entry that follows e in the list. +func (e *pollEntryEntry) Next() *pollEntry { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *pollEntryEntry) Prev() *pollEntry { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *pollEntryEntry) SetNext(elem *pollEntry) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *pollEntryEntry) SetPrev(elem *pollEntry) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go new file mode 100644 index 000000000..4c3c38f9e --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -0,0 +1,49 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package epoll + +import ( + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// afterLoad is invoked by stateify. +func (p *pollEntry) afterLoad() { + p.waiter = waiter.Entry{Callback: &readyCallback{}} + p.waiter.Context = p + p.file = refs.NewWeakRef(p.id.File, p) + p.id.File.EventRegister(&p.waiter, p.mask) +} + +// afterLoad is invoked by stateify. +func (e *EventPoll) afterLoad() { + e.listsMu.Lock() + defer e.listsMu.Unlock() + + for _, ls := range []*pollEntryList{&e.waitingList, &e.readyList, &e.disabledList} { + for it := ls.Front(); it != nil; it = it.Next() { + it.curList = ls + } + } + + for it := e.waitingList.Front(); it != nil; it = it.Next() { + if it.id.File.Readiness(it.mask) != 0 { + e.waitingList.Remove(it) + e.readyList.PushBack(it) + it.curList = &e.readyList + e.Notify(waiter.EventIn) + } + } +} diff --git a/pkg/sentry/kernel/epoll/epoll_state_autogen.go b/pkg/sentry/kernel/epoll/epoll_state_autogen.go new file mode 100755 index 000000000..a361ff37b --- /dev/null +++ b/pkg/sentry/kernel/epoll/epoll_state_autogen.go @@ -0,0 +1,99 @@ +// automatically generated by stateify. + +package epoll + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *FileIdentifier) beforeSave() {} +func (x *FileIdentifier) save(m state.Map) { + x.beforeSave() + m.Save("File", &x.File) + m.Save("Fd", &x.Fd) +} + +func (x *FileIdentifier) afterLoad() {} +func (x *FileIdentifier) load(m state.Map) { + m.LoadWait("File", &x.File) + m.Load("Fd", &x.Fd) +} + +func (x *pollEntry) beforeSave() {} +func (x *pollEntry) save(m state.Map) { + x.beforeSave() + m.Save("pollEntryEntry", &x.pollEntryEntry) + m.Save("id", &x.id) + m.Save("userData", &x.userData) + m.Save("mask", &x.mask) + m.Save("flags", &x.flags) + m.Save("epoll", &x.epoll) +} + +func (x *pollEntry) load(m state.Map) { + m.Load("pollEntryEntry", &x.pollEntryEntry) + m.LoadWait("id", &x.id) + m.Load("userData", &x.userData) + m.Load("mask", &x.mask) + m.Load("flags", &x.flags) + m.Load("epoll", &x.epoll) + m.AfterLoad(x.afterLoad) +} + +func (x *EventPoll) beforeSave() {} +func (x *EventPoll) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.FilePipeSeek) { m.Failf("FilePipeSeek is %v, expected zero", x.FilePipeSeek) } + if !state.IsZeroValue(x.FileNotDirReaddir) { m.Failf("FileNotDirReaddir is %v, expected zero", x.FileNotDirReaddir) } + if !state.IsZeroValue(x.FileNoFsync) { m.Failf("FileNoFsync is %v, expected zero", x.FileNoFsync) } + if !state.IsZeroValue(x.FileNoopFlush) { m.Failf("FileNoopFlush is %v, expected zero", x.FileNoopFlush) } + if !state.IsZeroValue(x.FileNoIoctl) { m.Failf("FileNoIoctl is %v, expected zero", x.FileNoIoctl) } + if !state.IsZeroValue(x.FileNoMMap) { m.Failf("FileNoMMap is %v, expected zero", x.FileNoMMap) } + if !state.IsZeroValue(x.Queue) { m.Failf("Queue is %v, expected zero", x.Queue) } + m.Save("files", &x.files) + m.Save("readyList", &x.readyList) + m.Save("waitingList", &x.waitingList) + m.Save("disabledList", &x.disabledList) +} + +func (x *EventPoll) load(m state.Map) { + m.Load("files", &x.files) + m.Load("readyList", &x.readyList) + m.Load("waitingList", &x.waitingList) + m.Load("disabledList", &x.disabledList) + m.AfterLoad(x.afterLoad) +} + +func (x *pollEntryList) beforeSave() {} +func (x *pollEntryList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *pollEntryList) afterLoad() {} +func (x *pollEntryList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *pollEntryEntry) beforeSave() {} +func (x *pollEntryEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *pollEntryEntry) afterLoad() {} +func (x *pollEntryEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func init() { + state.Register("epoll.FileIdentifier", (*FileIdentifier)(nil), state.Fns{Save: (*FileIdentifier).save, Load: (*FileIdentifier).load}) + state.Register("epoll.pollEntry", (*pollEntry)(nil), state.Fns{Save: (*pollEntry).save, Load: (*pollEntry).load}) + state.Register("epoll.EventPoll", (*EventPoll)(nil), state.Fns{Save: (*EventPoll).save, Load: (*EventPoll).load}) + state.Register("epoll.pollEntryList", (*pollEntryList)(nil), state.Fns{Save: (*pollEntryList).save, Load: (*pollEntryList).load}) + state.Register("epoll.pollEntryEntry", (*pollEntryEntry)(nil), state.Fns{Save: (*pollEntryEntry).save, Load: (*pollEntryEntry).load}) +} diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go new file mode 100644 index 000000000..2f900be38 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -0,0 +1,283 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd provides an implementation of Linux's file-based event +// notification. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fdnotifier" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// EventOperations represents an event with the semantics of Linux's file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in certain +// situations they may be converted into a host-backed eventfd. +// +// +stateify savable +type EventOperations struct { + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + // Mutex that protects accesses to the fields of this event. + mu sync.Mutex `state:"nosave"` + + // Queue is used to notify interested parties when the event object + // becomes readable or writable. + wq waiter.Queue `state:"zerovalue"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int +} + +// New creates a new event object with the supplied initial value and mode. +func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { + // name matches fs/eventfd.c:eventfd_file_create. + dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]") + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ + val: initVal, + semMode: semMode, + hostfd: -1, + }) +} + +// HostFD returns the host eventfd associated with this event. +func (e *EventOperations) HostFD() (int, error) { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + return e.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if e.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, err := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(e.val), uintptr(flags), 0) + if err != 0 { + return -1, err + } + + if err := fdnotifier.AddFD(int32(fd), &e.wq); err != nil { + syscall.Close(int(fd)) + return -1, err + } + + e.hostfd = int(fd) + return e.hostfd, nil +} + +// Release implements fs.FileOperations.Release. +func (e *EventOperations) Release() { + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.RemoveFD(int32(e.hostfd)) + syscall.Close(e.hostfd) + e.hostfd = -1 + } +} + +// Read implements fs.FileOperations.Read. +func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements fs.FileOperations.Write. +func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := e.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Must be called with e.mu locked. +func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + + if _, err := syscall.Read(e.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error { + e.mu.Lock() + + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostRead(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if e.val == 0 { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if e.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + e.val-- + } else { + val = e.val + e.val = 0 + } + + e.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + e.wq.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Must be called with e.mu locked. +func (e *EventOperations) hostWrite(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(e.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return e.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (e *EventOperations) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + e.mu.Lock() + + if e.hostfd >= 0 { + defer e.mu.Unlock() + return e.hostWrite(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-e.val { + e.mu.Unlock() + return syserror.ErrWouldBlock + } + + e.val += val + e.mu.Unlock() + + // Always trigger a notification. + e.wq.Notify(waiter.EventIn) + + return nil +} + +// Readiness returns the ready events for the event fd. +func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + e.mu.Lock() + if e.hostfd >= 0 { + defer e.mu.Unlock() + return fdnotifier.NonBlockingPoll(int32(e.hostfd), mask) + } + + ready := waiter.EventMask(0) + if e.val > 0 { + ready |= waiter.EventIn + } + + if e.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + e.mu.Unlock() + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (e *EventOperations) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + e.wq.EventRegister(entry, mask) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *EventOperations) EventUnregister(entry *waiter.Entry) { + e.wq.EventUnregister(entry) + + e.mu.Lock() + defer e.mu.Unlock() + if e.hostfd >= 0 { + fdnotifier.UpdateFD(int32(e.hostfd)) + } +} diff --git a/pkg/sentry/kernel/eventfd/eventfd_state_autogen.go b/pkg/sentry/kernel/eventfd/eventfd_state_autogen.go new file mode 100755 index 000000000..922ff1b73 --- /dev/null +++ b/pkg/sentry/kernel/eventfd/eventfd_state_autogen.go @@ -0,0 +1,27 @@ +// automatically generated by stateify. + +package eventfd + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *EventOperations) beforeSave() {} +func (x *EventOperations) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.wq) { m.Failf("wq is %v, expected zero", x.wq) } + m.Save("val", &x.val) + m.Save("semMode", &x.semMode) + m.Save("hostfd", &x.hostfd) +} + +func (x *EventOperations) afterLoad() {} +func (x *EventOperations) load(m state.Map) { + m.Load("val", &x.val) + m.Load("semMode", &x.semMode) + m.Load("hostfd", &x.hostfd) +} + +func init() { + state.Register("eventfd.EventOperations", (*EventOperations)(nil), state.Fns{Save: (*EventOperations).save, Load: (*EventOperations).load}) +} diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go new file mode 100644 index 000000000..84cd08501 --- /dev/null +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -0,0 +1,148 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fasync provides FIOASYNC related functionality. +package fasync + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// New creates a new FileAsync. +func New() fs.FileAsync { + return &FileAsync{} +} + +// FileAsync sends signals when the registered file is ready for IO. +// +// +stateify savable +type FileAsync struct { + mu sync.Mutex `state:"nosave"` + e waiter.Entry + requester *auth.Credentials + + // Only one of the following is allowed to be non-nil. + recipientPG *kernel.ProcessGroup + recipientTG *kernel.ThreadGroup + recipientT *kernel.Task +} + +// Callback sends a signal. +func (a *FileAsync) Callback(e *waiter.Entry) { + a.mu.Lock() + if a.e.Callback == nil { + a.mu.Unlock() + return + } + t := a.recipientT + tg := a.recipientTG + if a.recipientPG != nil { + tg = a.recipientPG.Originator() + } + if tg != nil { + t = tg.Leader() + } + if t == nil { + // No recipient has been registered. + a.mu.Unlock() + return + } + c := t.Credentials() + // Logic from sigio_perm in fs/fcntl.c. + if a.requester.EffectiveKUID == 0 || + a.requester.EffectiveKUID == c.SavedKUID || + a.requester.EffectiveKUID == c.RealKUID || + a.requester.RealKUID == c.SavedKUID || + a.requester.RealKUID == c.RealKUID { + t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO)) + } + a.mu.Unlock() +} + +// Register sets the file which will be monitored for IO events. +// +// The file must not be currently registered. +func (a *FileAsync) Register(w waiter.Waitable) { + a.mu.Lock() + defer a.mu.Unlock() + + if a.e.Callback != nil { + panic("registering already registered file") + } + + a.e.Callback = a + w.EventRegister(&a.e, waiter.EventIn|waiter.EventOut|waiter.EventErr|waiter.EventHUp) +} + +// Unregister stops monitoring a file. +// +// The file must be currently registered. +func (a *FileAsync) Unregister(w waiter.Waitable) { + a.mu.Lock() + defer a.mu.Unlock() + + if a.e.Callback == nil { + panic("unregistering unregistered file") + } + + w.EventUnregister(&a.e) + a.e.Callback = nil +} + +// Owner returns who is currently getting signals. All return values will be +// nil if no one is set to receive signals. +func (a *FileAsync) Owner() (*kernel.Task, *kernel.ThreadGroup, *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + return a.recipientT, a.recipientTG, a.recipientPG +} + +// SetOwnerTask sets the owner (who will receive signals) to a specified task. +// Only this owner will receive signals. +func (a *FileAsync) SetOwnerTask(requester *kernel.Task, recipient *kernel.Task) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = recipient + a.recipientTG = nil + a.recipientPG = nil +} + +// SetOwnerThreadGroup sets the owner (who will receive signals) to a specified +// thread group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerThreadGroup(requester *kernel.Task, recipient *kernel.ThreadGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = recipient + a.recipientPG = nil +} + +// SetOwnerProcessGroup sets the owner (who will receive signals) to a +// specified process group. Only this owner will receive signals. +func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kernel.ProcessGroup) { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = requester.Credentials() + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = recipient +} diff --git a/pkg/sentry/kernel/fasync/fasync_state_autogen.go b/pkg/sentry/kernel/fasync/fasync_state_autogen.go new file mode 100755 index 000000000..e162e0033 --- /dev/null +++ b/pkg/sentry/kernel/fasync/fasync_state_autogen.go @@ -0,0 +1,30 @@ +// automatically generated by stateify. + +package fasync + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *FileAsync) beforeSave() {} +func (x *FileAsync) save(m state.Map) { + x.beforeSave() + m.Save("e", &x.e) + m.Save("requester", &x.requester) + m.Save("recipientPG", &x.recipientPG) + m.Save("recipientTG", &x.recipientTG) + m.Save("recipientT", &x.recipientT) +} + +func (x *FileAsync) afterLoad() {} +func (x *FileAsync) load(m state.Map) { + m.Load("e", &x.e) + m.Load("requester", &x.requester) + m.Load("recipientPG", &x.recipientPG) + m.Load("recipientTG", &x.recipientTG) + m.Load("recipientT", &x.recipientT) +} + +func init() { + state.Register("fasync.FileAsync", (*FileAsync)(nil), state.Fns{Save: (*FileAsync).save, Load: (*FileAsync).load}) +} diff --git a/pkg/sentry/kernel/fd_map.go b/pkg/sentry/kernel/fd_map.go new file mode 100644 index 000000000..c5636d233 --- /dev/null +++ b/pkg/sentry/kernel/fd_map.go @@ -0,0 +1,364 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "fmt" + "sort" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// FDs is an ordering of FD's that can be made stable. +type FDs []kdefs.FD + +func (f FDs) Len() int { + return len(f) +} + +func (f FDs) Swap(i, j int) { + f[i], f[j] = f[j], f[i] +} + +func (f FDs) Less(i, j int) bool { + return f[i] < f[j] +} + +// FDFlags define flags for an individual descriptor. +// +// +stateify savable +type FDFlags struct { + // CloseOnExec indicates the descriptor should be closed on exec. + CloseOnExec bool +} + +// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags +// representation. +func (f FDFlags) ToLinuxFileFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.O_CLOEXEC + } + return +} + +// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags +// representation. +func (f FDFlags) ToLinuxFDFlags() (mask uint) { + if f.CloseOnExec { + mask |= linux.FD_CLOEXEC + } + return +} + +// descriptor holds the details about a file descriptor, namely a pointer the +// file itself and the descriptor flags. +// +// +stateify savable +type descriptor struct { + file *fs.File + flags FDFlags +} + +// FDMap is used to manage File references and flags. +// +// +stateify savable +type FDMap struct { + refs.AtomicRefCount + k *Kernel + files map[kdefs.FD]descriptor + mu sync.RWMutex `state:"nosave"` + uid uint64 +} + +// ID returns a unique identifier for this FDMap. +func (f *FDMap) ID() uint64 { + return f.uid +} + +// NewFDMap allocates a new FDMap that may be used by tasks in k. +func (k *Kernel) NewFDMap() *FDMap { + return &FDMap{ + k: k, + files: make(map[kdefs.FD]descriptor), + uid: atomic.AddUint64(&k.fdMapUids, 1), + } +} + +// destroy removes all of the file descriptors from the map. +func (f *FDMap) destroy() { + f.RemoveIf(func(*fs.File, FDFlags) bool { + return true + }) +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FDMap) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Size returns the number of file descriptor slots currently allocated. +func (f *FDMap) Size() int { + f.mu.RLock() + defer f.mu.RUnlock() + + return len(f.files) +} + +// String is a stringer for FDMap. +func (f *FDMap) String() string { + f.mu.RLock() + defer f.mu.RUnlock() + + var b bytes.Buffer + for k, v := range f.files { + n, _ := v.file.Dirent.FullName(nil /* root */) + b.WriteString(fmt.Sprintf("\tfd:%d => name %s\n", k, n)) + } + return b.String() +} + +// NewFDFrom allocates a new FD guaranteed to be the lowest number available +// greater than or equal to from. This property is important as Unix programs +// tend to count on this allocation order. +func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error) { + if fd < 0 { + // Don't accept negative FDs. + return 0, syscall.EINVAL + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Finds the lowest fd not in the handles map. + lim := limitSet.Get(limits.NumberOfFiles) + for i := fd; lim.Cur == limits.Infinity || i < kdefs.FD(lim.Cur); i++ { + if _, ok := f.files[i]; !ok { + file.IncRef() + f.files[i] = descriptor{file, flags} + return i, nil + } + } + + return -1, syscall.EMFILE +} + +// NewFDAt sets the file reference for the given FD. If there is an +// active reference for that FD, the ref count for that existing reference +// is decremented. +func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + // In this one case we do not do a defer of the Unlock. The + // reason is that we must have done all the work needed for + // discarding any old open file before we return to the + // caller. In other words, the DecRef(), below, must have + // completed by the time we return to the caller to ensure + // side effects are, in fact, effected. A classic example is + // dup2(fd1, fd2); if fd2 was already open, it must be closed, + // and we don't want to resume the caller until it is; we have + // to block on the DecRef(). Hence we can not just do a 'go + // oldfile.DecRef()', since there would be no guarantee that + // it would be done before we the caller resumed. Since we + // must wait for the DecRef() to finish, and that could take + // time, it's best to first call f.muUnlock beore so we are + // not blocking other uses of this FDMap on the DecRef() call. + f.mu.Lock() + oldDesc, oldExists := f.files[fd] + lim := limitSet.Get(limits.NumberOfFiles).Cur + // if we're closing one then the effective limit is one + // more than the actual limit. + if oldExists && lim != limits.Infinity { + lim++ + } + if lim != limits.Infinity && fd >= kdefs.FD(lim) { + f.mu.Unlock() + return syscall.EMFILE + } + + file.IncRef() + f.files[fd] = descriptor{file, flags} + f.mu.Unlock() + + if oldExists { + oldDesc.file.DecRef() + } + return nil +} + +// SetFlags sets the flags for the given file descriptor, if it is valid. +func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags) { + f.mu.Lock() + defer f.mu.Unlock() + + desc, ok := f.files[fd] + if !ok { + return + } + + f.files[fd] = descriptor{desc.file, flags} +} + +// GetDescriptor returns a reference to the file and the flags for the FD. It +// bumps its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. +func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags) { + f.mu.RLock() + defer f.mu.RUnlock() + + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + return desc.file, desc.flags + } + return nil, FDFlags{} +} + +// GetFile returns a reference to the File for the FD and bumps +// its reference count as well. It returns nil if there is no File +// for the FD, i.e. if the FD is invalid. The caller must use DecRef +// when they are done. +func (f *FDMap) GetFile(fd kdefs.FD) *fs.File { + f.mu.RLock() + if desc, ok := f.files[fd]; ok { + desc.file.IncRef() + f.mu.RUnlock() + return desc.file + } + f.mu.RUnlock() + return nil +} + +// fds returns an ordering of FDs. +func (f *FDMap) fds() FDs { + fds := make(FDs, 0, len(f.files)) + for fd := range f.files { + fds = append(fds, fd) + } + sort.Sort(fds) + return fds +} + +// GetFDs returns a list of valid fds. +func (f *FDMap) GetFDs() FDs { + f.mu.RLock() + defer f.mu.RUnlock() + return f.fds() +} + +// GetRefs returns a stable slice of references to all files and bumps the +// reference count on each. The caller must use DecRef on each reference when +// they're done using the slice. +func (f *FDMap) GetRefs() []*fs.File { + f.mu.RLock() + defer f.mu.RUnlock() + + fds := f.fds() + fs := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + desc := f.files[fd] + desc.file.IncRef() + fs = append(fs, desc.file) + } + return fs +} + +// Fork returns an independent FDMap pointing to the same descriptors. +func (f *FDMap) Fork() *FDMap { + f.mu.RLock() + defer f.mu.RUnlock() + + clone := f.k.NewFDMap() + + // Grab a extra reference for every file. + for fd, desc := range f.files { + desc.file.IncRef() + clone.files[fd] = desc + } + + // That's it! + return clone +} + +// unlock releases all file locks held by this FDMap's uid. Must only be +// called on a non-nil *fs.File. +func (f *FDMap) unlock(file *fs.File) { + id := lock.UniqueID(f.ID()) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(id, lock.LockRange{0, lock.LockEOF}) +} + +// inotifyFileClose generates the appropriate inotify events for f being closed. +func inotifyFileClose(f *fs.File) { + var ev uint32 + d := f.Dirent + + if fs.IsDir(d.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + + if f.Flags().Write { + ev |= linux.IN_CLOSE_WRITE + } else { + ev |= linux.IN_CLOSE_NOWRITE + } + + d.InotifyEvent(ev, 0) +} + +// Remove removes an FD from the FDMap, and returns (File, true) if a File +// one was found. Callers are expected to decrement the reference count on +// the File. Otherwise returns (nil, false). +func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool) { + f.mu.Lock() + desc := f.files[fd] + delete(f.files, fd) + f.mu.Unlock() + if desc.file != nil { + f.unlock(desc.file) + inotifyFileClose(desc.file) + return desc.file, true + } + return nil, false +} + +// RemoveIf removes all FDs where cond is true. +func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool) { + var removed []*fs.File + f.mu.Lock() + for fd, desc := range f.files { + if desc.file != nil && cond(desc.file, desc.flags) { + delete(f.files, fd) + removed = append(removed, desc.file) + } + } + f.mu.Unlock() + + for _, file := range removed { + f.unlock(file) + inotifyFileClose(file) + file.DecRef() + } +} diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go new file mode 100644 index 000000000..d8115f59a --- /dev/null +++ b/pkg/sentry/kernel/fs_context.go @@ -0,0 +1,187 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FSContext contains filesystem context. +// +// This includes umask and working directory. +// +// +stateify savable +type FSContext struct { + refs.AtomicRefCount + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // root is the filesystem root. Will be nil iff the FSContext has been + // destroyed. + root *fs.Dirent + + // cwd is the current working directory. Will be nil iff the FSContext + // has been destroyed. + cwd *fs.Dirent + + // umask is the current file mode creation mask. When a thread using this + // context invokes a syscall that creates a file, bits set in umask are + // removed from the permissions that the file is created with. + umask uint +} + +// newFSContext returns a new filesystem context. +func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + return &FSContext{ + root: root, + cwd: cwd, + umask: umask, + } +} + +// destroy is the destructor for an FSContext. +// +// This will call DecRef on both root and cwd Dirents. If either call to +// DecRef returns an error, then it will be propigated. If both calls to +// DecRef return an error, then the one from root.DecRef will be propigated. +// +// Note that there may still be calls to WorkingDirectory() or RootDirectory() +// (that return nil). This is because valid references may still be held via +// proc files or other mechanisms. +func (f *FSContext) destroy() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + f.root.DecRef() + f.root = nil + + f.cwd.DecRef() + f.cwd = nil +} + +// DecRef implements RefCounter.DecRef with destructor f.destroy. +func (f *FSContext) DecRef() { + f.DecRefWithDestructor(f.destroy) +} + +// Fork forks this FSContext. +// +// This is not a valid call after destroy. +func (f *FSContext) Fork() *FSContext { + f.mu.Lock() + defer f.mu.Unlock() + f.cwd.IncRef() + f.root.IncRef() + return &FSContext{ + cwd: f.cwd, + root: f.root, + umask: f.umask, + } +} + +// WorkingDirectory returns the current working directory. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) WorkingDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + if f.cwd != nil { + f.cwd.IncRef() + } + return f.cwd +} + +// SetWorkingDirectory sets the current working directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetWorkingDirectory called with nil dirent") + } + + f.mu.Lock() + defer f.mu.Unlock() + + if f.cwd == nil { + panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) + } + + old := f.cwd + f.cwd = d + d.IncRef() + old.DecRef() +} + +// RootDirectory returns the current filesystem root. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) RootDirectory() *fs.Dirent { + f.mu.Lock() + defer f.mu.Unlock() + if f.root != nil { + f.root.IncRef() + } + return f.root +} + +// SetRootDirectory sets the root directory. +// This will take an extra reference on the Dirent. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectory(d *fs.Dirent) { + if d == nil { + panic("FSContext.SetRootDirectory called with nil dirent") + } + + f.mu.Lock() + defer f.mu.Unlock() + + if f.root == nil { + panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", d)) + } + + old := f.root + f.root = d + d.IncRef() + old.DecRef() +} + +// Umask returns the current umask. +func (f *FSContext) Umask() uint { + f.mu.Lock() + defer f.mu.Unlock() + return f.umask +} + +// SwapUmask atomically sets the current umask and returns the old umask. +func (f *FSContext) SwapUmask(mask uint) uint { + f.mu.Lock() + defer f.mu.Unlock() + old := f.umask + f.umask = mask + return old +} diff --git a/pkg/sentry/kernel/futex/atomicptr_bucket.go b/pkg/sentry/kernel/futex/atomicptr_bucket.go new file mode 100755 index 000000000..2251a6e72 --- /dev/null +++ b/pkg/sentry/kernel/futex/atomicptr_bucket.go @@ -0,0 +1,27 @@ +package futex + +import ( + "sync/atomic" + "unsafe" +) + +// An AtomicPtr is a pointer to a value of type Value that can be atomically +// loaded and stored. The zero value of an AtomicPtr represents nil. +// +// Note that copying AtomicPtr by value performs a non-atomic read of the +// stored pointer, which is unsafe if Store() can be called concurrently; in +// this case, do `dst.Store(src.Load())` instead. +type AtomicPtrBucket struct { + ptr unsafe.Pointer +} + +// Load returns the value set by the most recent Store. It returns nil if there +// has been no previous call to Store. +func (p *AtomicPtrBucket) Load() *bucket { + return (*bucket)(atomic.LoadPointer(&p.ptr)) +} + +// Store sets the value returned by Load to x. +func (p *AtomicPtrBucket) Store(x *bucket) { + atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) +} diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go new file mode 100644 index 000000000..bb38eb81e --- /dev/null +++ b/pkg/sentry/kernel/futex/futex.go @@ -0,0 +1,783 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package futex provides an implementation of the futex interface as found in +// the Linux kernel. It allows one to easily transform Wait() calls into waits +// on a channel, which is useful in a Go-based kernel, for example. +package futex + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// KeyKind indicates the type of a Key. +type KeyKind int + +const ( + // KindPrivate indicates a private futex (a futex syscall with the + // FUTEX_PRIVATE_FLAG set). + KindPrivate KeyKind = iota + + // KindSharedPrivate indicates a shared futex on a private memory mapping. + // Although KindPrivate and KindSharedPrivate futexes both use memory + // addresses to identify futexes, they do not interoperate (in Linux, the + // two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key + // comparison). + KindSharedPrivate + + // KindSharedMappable indicates a shared futex on a memory mapping other + // than a private anonymous memory mapping. + KindSharedMappable +) + +// Key represents something that a futex waiter may wait on. +type Key struct { + // Kind is the type of the Key. + Kind KeyKind + + // Mappable is the memory-mapped object that is represented by the Key. + // Mappable is always nil if Kind is not KindSharedMappable, and may be nil + // even if it is. + Mappable memmap.Mappable + + // MappingIdentity is the MappingIdentity associated with Mappable. + // MappingIdentity is always nil is Mappable is nil, and may be nil even if + // it isn't. + MappingIdentity memmap.MappingIdentity + + // If Kind is KindPrivate or KindSharedPrivate, Offset is the represented + // memory address. Otherwise, Offset is the represented offset into + // Mappable. + Offset uint64 +} + +func (k *Key) release() { + if k.MappingIdentity != nil { + k.MappingIdentity.DecRef() + } + k.Mappable = nil + k.MappingIdentity = nil +} + +func (k *Key) clone() Key { + if k.MappingIdentity != nil { + k.MappingIdentity.IncRef() + } + return *k +} + +// Preconditions: k.Kind == KindPrivate or KindSharedPrivate. +func (k *Key) addr() usermem.Addr { + return usermem.Addr(k.Offset) +} + +// matches returns true if a wakeup on k2 should wake a waiter waiting on k. +func (k *Key) matches(k2 *Key) bool { + // k.MappingIdentity is ignored; it's only used for reference counting. + return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset +} + +// Target abstracts memory accesses and keys. +type Target interface { + // SwapUint32 gives access to usermem.IO.SwapUint32. + SwapUint32(addr usermem.Addr, new uint32) (uint32, error) + + // CompareAndSwap gives access to usermem.IO.CompareAndSwapUint32. + CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) + + // LoadUint32 gives access to usermem.IO.LoadUint32. + LoadUint32(addr usermem.Addr) (uint32, error) + + // GetSharedKey returns a Key with kind KindSharedPrivate or + // KindSharedMappable corresponding to the memory mapped at address addr. + // + // If GetSharedKey returns a Key with a non-nil MappingIdentity, a + // reference is held on the MappingIdentity, which must be dropped by the + // caller when the Key is no longer in use. + GetSharedKey(addr usermem.Addr) (Key, error) +} + +// check performs a basic equality check on the given address. +func check(t Target, addr usermem.Addr, val uint32) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + if cur != val { + return syserror.EAGAIN + } + return nil +} + +// atomicOp performs a complex operation on the given address. +func atomicOp(t Target, addr usermem.Addr, opIn uint32) (bool, error) { + opType := (opIn >> 28) & 0xf + cmp := (opIn >> 24) & 0xf + opArg := (opIn >> 12) & 0xfff + cmpArg := opIn & 0xfff + + if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 { + opArg = 1 << opArg + opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag. + } + + var ( + oldVal uint32 + err error + ) + if opType == linux.FUTEX_OP_SET { + oldVal, err = t.SwapUint32(addr, opArg) + if err != nil { + return false, err + } + } else { + for { + oldVal, err = t.LoadUint32(addr) + if err != nil { + return false, err + } + var newVal uint32 + switch opType { + case linux.FUTEX_OP_ADD: + newVal = oldVal + opArg + case linux.FUTEX_OP_OR: + newVal = oldVal | opArg + case linux.FUTEX_OP_ANDN: + newVal = oldVal &^ opArg + case linux.FUTEX_OP_XOR: + newVal = oldVal ^ opArg + default: + return false, syserror.ENOSYS + } + prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) + if err != nil { + return false, err + } + if prev == oldVal { + break // Success. + } + } + } + + switch cmp { + case linux.FUTEX_OP_CMP_EQ: + return oldVal == cmpArg, nil + case linux.FUTEX_OP_CMP_NE: + return oldVal != cmpArg, nil + case linux.FUTEX_OP_CMP_LT: + return oldVal < cmpArg, nil + case linux.FUTEX_OP_CMP_LE: + return oldVal <= cmpArg, nil + case linux.FUTEX_OP_CMP_GT: + return oldVal > cmpArg, nil + case linux.FUTEX_OP_CMP_GE: + return oldVal >= cmpArg, nil + default: + return false, syserror.ENOSYS + } +} + +// Waiter is the struct which gets enqueued into buckets for wake up routines +// and requeue routines to scan and notify. Once a Waiter has been enqueued by +// WaitPrepare(), callers may listen on C for wake up events. +type Waiter struct { + // Synchronization: + // + // - A Waiter that is not enqueued in a bucket is exclusively owned (no + // synchronization applies). + // + // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, + // waiterEntry, bucket, and key are protected by the bucket.mu ("bucket + // lock") of the containing bucket, and bitmask is immutable. Note that + // since bucket is mutated using atomic memory operations, bucket.Load() + // may be called without holding the bucket lock, although it may change + // racily. See WaitComplete(). + // + // - A Waiter is only guaranteed to be no longer queued after calling + // WaitComplete(). + + // waiterEntry links Waiter into bucket.waiters. + waiterEntry + + // bucket is the bucket this waiter is queued in. If bucket is nil, the + // waiter is not waiting and is not in any bucket. + bucket AtomicPtrBucket + + // C is sent to when the Waiter is woken. + C chan struct{} + + // key is what this waiter is waiting on. + key Key + + // The bitmask we're waiting on. + // This is used the case of a FUTEX_WAKE_BITSET. + bitmask uint32 + + // tid is the thread ID for the waiter in case this is a PI mutex. + tid uint32 +} + +// NewWaiter returns a new unqueued Waiter. +func NewWaiter() *Waiter { + return &Waiter{ + C: make(chan struct{}, 1), + } +} + +// woken returns true if w has been woken since the last call to WaitPrepare. +func (w *Waiter) woken() bool { + return len(w.C) != 0 +} + +// bucket holds a list of waiters for a given address hash. +// +// +stateify savable +type bucket struct { + // mu protects waiters and contained Waiter state. See comment in Waiter. + mu sync.Mutex `state:"nosave"` + + waiters waiterList `state:"zerovalue"` +} + +// wakeLocked wakes up to n waiters matching the bitmask at the addr for this +// bucket and returns the number of waiters woken. +// +// Preconditions: b.mu must be locked. +func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if !w.key.matches(key) || w.bitmask&bitmask == 0 { + // Not matching. + w = w.Next() + continue + } + + // Remove from the bucket and wake the waiter. + woke := w + w = w.Next() // Next iteration. + b.wakeWaiterLocked(woke) + done++ + } + return done +} + +func (b *bucket) wakeWaiterLocked(w *Waiter) { + // Remove from the bucket and wake the waiter. + b.waiters.Remove(w) + w.C <- struct{}{} + + // NOTE: The above channel write establishes a write barrier according + // to the memory model, so nothing may be ordered around it. Since + // we've dequeued w and will never touch it again, we can safely + // store nil to w.bucket here and allow the WaitComplete() to + // short-circuit grabbing the bucket lock. If they somehow miss the + // store, we are still holding the lock, so we can know that they won't + // dequeue w, assume it's free and have the below operation + // afterwards. + w.bucket.Store(nil) +} + +// requeueLocked takes n waiters from the bucket and moves them to naddr on the +// bucket "to". +// +// Preconditions: b and to must be locked. +func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { + done := 0 + for w := b.waiters.Front(); done < n && w != nil; { + if !w.key.matches(key) { + // Not matching. + w = w.Next() + continue + } + + requeued := w + w = w.Next() // Next iteration. + b.waiters.Remove(requeued) + requeued.key.release() + requeued.key = nkey.clone() + to.waiters.PushBack(requeued) + requeued.bucket.Store(to) + done++ + } + return done +} + +const ( + // bucketCount is the number of buckets per Manager. By having many of + // these we reduce contention when concurrent yet unrelated calls are made. + bucketCount = 1 << bucketCountBits + bucketCountBits = 10 +) + +// getKey returns a Key representing address addr in c. +func getKey(t Target, addr usermem.Addr, private bool) (Key, error) { + // Ensure the address is aligned. + // It must be a DWORD boundary. + if addr&0x3 != 0 { + return Key{}, syserror.EINVAL + } + if private { + return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil + } + return t.GetSharedKey(addr) +} + +// bucketIndexForAddr returns the index into Manager.buckets for addr. +func bucketIndexForAddr(addr usermem.Addr) uintptr { + // - The bottom 2 bits of addr must be 0, per getKey. + // + // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 + // for a canonical address, and (on all existing platforms) bit 47 must be + // 0 for an application address. + // + // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" + // bits. We choose one of the simplest possible hash functions that at + // least uses all 45 useful bits in the output, given that bucketCountBits + // == 10. This hash function also has the property that it will usually map + // adjacent addresses to adjacent buckets, slightly improving memory + // locality when an application synchronization structure uses multiple + // nearby futexes. + // + // Note that despite the large number of arithmetic operations in the + // function, many components can be computed in parallel, such that the + // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This + // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + + // (addr >> 42)" without any additional grouping, the compiler puts all 4 + // additions in the critical path. + h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22) + h2 := uintptr(addr>>32) + uintptr(addr>>42) + return (h1 + h2) % bucketCount +} + +// Manager holds futex state for a single virtual address space. +// +// +stateify savable +type Manager struct { + // privateBuckets holds buckets for KindPrivate and KindSharedPrivate + // futexes. + privateBuckets [bucketCount]bucket `state:"zerovalue"` + + // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket + // may be shared by multiple Managers. The sharedBucket pointer is + // immutable. + sharedBucket *bucket +} + +// NewManager returns an initialized futex manager. +func NewManager() *Manager { + return &Manager{ + sharedBucket: &bucket{}, + } +} + +// Fork returns a new Manager. Shared futex clients using the returned Manager +// may interoperate with those using m. +func (m *Manager) Fork() *Manager { + return &Manager{ + sharedBucket: m.sharedBucket, + } +} + +// lockBucket returns a locked bucket for the given key. +func (m *Manager) lockBucket(k *Key) *bucket { + var b *bucket + if k.Kind == KindSharedMappable { + b = m.sharedBucket + } else { + b = &m.privateBuckets[bucketIndexForAddr(k.addr())] + } + b.mu.Lock() + return b +} + +// lockBuckets returns locked buckets for the given keys. +func (m *Manager) lockBuckets(k1, k2 *Key) (*bucket, *bucket) { + // Buckets must be consistently ordered to avoid circular lock + // dependencies. We order buckets in m.privateBuckets by index (lowest + // index first), and all buckets in m.privateBuckets precede + // m.sharedBucket. + + // Handle the common case first: + if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { + i1 := bucketIndexForAddr(k1.addr()) + i2 := bucketIndexForAddr(k2.addr()) + b1 := &m.privateBuckets[i1] + b2 := &m.privateBuckets[i2] + switch { + case i1 < i2: + b1.mu.Lock() + b2.mu.Lock() + case i2 < i1: + b2.mu.Lock() + b1.mu.Lock() + default: + b1.mu.Lock() + } + return b1, b2 + } + + // At least one of b1 or b2 should be m.sharedBucket. + b1 := m.sharedBucket + b2 := m.sharedBucket + if k1.Kind != KindSharedMappable { + b1 = m.lockBucket(k1) + } else if k2.Kind != KindSharedMappable { + b2 = m.lockBucket(k2) + } + m.sharedBucket.mu.Lock() + return b1, b2 +} + +// Wake wakes up to n waiters matching the bitmask on the given addr. +// The number of waiters woken is returned. +func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32, n int) (int, error) { + // This function is very hot; avoid defer. + k, err := getKey(t, addr, private) + if err != nil { + return 0, err + } + + b := m.lockBucket(&k) + r := b.wakeLocked(&k, bitmask, n) + + b.mu.Unlock() + k.release() + return r, nil +} + +func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { + k1, err := getKey(t, addr, private) + if err != nil { + return 0, err + } + defer k1.release() + k2, err := getKey(t, naddr, private) + if err != nil { + return 0, err + } + defer k2.release() + + b1, b2 := m.lockBuckets(&k1, &k2) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + if checkval { + if err := check(t, addr, val); err != nil { + return 0, err + } + } + + // Wake the number required. + done := b1.wakeLocked(&k1, ^uint32(0), nwake) + + // Requeue the number required. + b1.requeueLocked(b2, &k1, &k2, nreq) + + return done, nil +} + +// Requeue wakes up to nwake waiters on the given addr, and unconditionally +// requeues up to nreq waiters on naddr. +func (m *Manager) Requeue(t Target, addr, naddr usermem.Addr, private bool, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq) +} + +// RequeueCmp atomically checks that the addr contains val (via the Target), +// wakes up to nwake waiters on addr and then unconditionally requeues nreq +// waiters on naddr. +func (m *Manager) RequeueCmp(t Target, addr, naddr usermem.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { + return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq) +} + +// WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 +// waiters unconditionally from addr1, and, based on the original value at addr2 +// and a comparison encoded in op, wakes up to nwake2 waiters from addr2. +// It returns the total number of waiters woken. +func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { + k1, err := getKey(t, addr1, private) + if err != nil { + return 0, err + } + defer k1.release() + k2, err := getKey(t, addr2, private) + if err != nil { + return 0, err + } + defer k2.release() + + b1, b2 := m.lockBuckets(&k1, &k2) + defer b1.mu.Unlock() + if b2 != b1 { + defer b2.mu.Unlock() + } + + done := 0 + cond, err := atomicOp(t, addr2, op) + if err != nil { + return 0, err + } + + // Wake up up to nwake1 entries from the first bucket. + done = b1.wakeLocked(&k1, ^uint32(0), nwake1) + + // Wake up up to nwake2 entries from the second bucket if the + // operation yielded true. + if cond { + done += b2.wakeLocked(&k2, ^uint32(0), nwake2) + } + + return done, nil +} + +// WaitPrepare atomically checks that addr contains val (via the Checker), then +// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the +// Waiter must be subsequently removed by calling WaitComplete, whether or not +// a wakeup is received on w.C. +func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bool, val uint32, bitmask uint32) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. + select { + case <-w.C: + default: + } + w.key = k + w.bitmask = bitmask + + b := m.lockBucket(&k) + // This function is very hot; avoid defer. + + // Perform our atomic check. + if err := check(t, addr, val); err != nil { + b.mu.Unlock() + w.key.release() + return err + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + + b.mu.Unlock() + return nil +} + +// WaitComplete must be called when a Waiter previously added by WaitPrepare is +// no longer eligible to be woken. +func (m *Manager) WaitComplete(w *Waiter) { + // Remove w from the bucket it's in. + for { + b := w.bucket.Load() + + // If b is nil, the waiter isn't in any bucket anymore. This can't be + // racy because the waiter can't be concurrently re-queued in another + // bucket. + if b == nil { + break + } + + // Take the bucket lock. Note that without holding the bucket lock, the + // waiter is not guaranteed to stay in that bucket, so after we take + // the bucket lock, we must ensure that the bucket hasn't changed: if + // it happens to have changed, we release the old bucket lock and try + // again with the new bucket; if it hasn't changed, we know it won't + // change now because we hold the lock. + b.mu.Lock() + if b != w.bucket.Load() { + b.mu.Unlock() + continue + } + + // Remove waiter from bucket. + b.waiters.Remove(w) + w.bucket.Store(nil) + b.mu.Unlock() + break + } + + // Release references held by the waiter. + w.key.release() +} + +// LockPI attempts to lock the futex following the Priority-inheritance futex +// rules. The lock is acquired only when 'addr' points to 0. The TID of the +// calling task is set to 'addr' to indicate the futex is owned. It returns true +// if the futex was successfully acquired. +// +// FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see +// exit_robust_list()). Given we don't support robust lists, although handled +// below, it's never set. +func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, private, try bool) (bool, error) { + k, err := getKey(t, addr, private) + if err != nil { + return false, err + } + // Ownership of k is transferred to w below. + + // Prepare the Waiter before taking the bucket lock. + select { + case <-w.C: + default: + } + w.key = k + w.tid = tid + + b := m.lockBucket(&k) + // Hot function: avoid defers. + + success, err := m.lockPILocked(w, t, addr, tid, b, try) + if err != nil { + w.key.release() + b.mu.Unlock() + return false, err + } + if success || try { + // Release waiter if it's not going to be a wait. + w.key.release() + } + b.mu.Unlock() + return success, nil +} + +func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint32, b *bucket, try bool) (bool, error) { + for { + cur, err := t.LoadUint32(addr) + if err != nil { + return false, err + } + if (cur & linux.FUTEX_TID_MASK) == tid { + return false, syserror.EDEADLK + } + + if (cur & linux.FUTEX_TID_MASK) == 0 { + // No owner and no waiters, try to acquire the futex. + + // Set TID and preserve owner died status. + val := tid + val |= cur & linux.FUTEX_OWNER_DIED + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + // Linux reacquires the bucket lock on retries, which will re-lookup the + // mapping at the futex address. However, retrying while holding the + // lock is more efficient and reduces the chance of another conflict. + continue + } + // Futex acquired. + return true, nil + } + + // Futex is already owned, prepare to wait. + + if try { + // Caller doesn't want to wait. + return false, nil + } + + // Set waiters bit if not set yet. + if cur&linux.FUTEX_WAITERS == 0 { + prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS) + if err != nil { + return false, err + } + if prev != cur { + // CAS failed, retry... + continue + } + } + + // Add the waiter to the bucket. + b.waiters.PushBack(w) + w.bucket.Store(b) + return false, nil + } +} + +// UnlockPI unlock the futex following the Priority-inheritance futex +// rules. The address provided must contain the caller's TID. If there are +// waiters, TID of the next waiter (FIFO) is set to the given address, and the +// waiter woken up. If there are no waiters, 0 is set to the address. +func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { + k, err := getKey(t, addr, private) + if err != nil { + return err + } + b := m.lockBucket(&k) + + err = m.unlockPILocked(t, addr, tid, b) + + k.release() + b.mu.Unlock() + return err +} + +func (m *Manager) unlockPILocked(t Target, addr usermem.Addr, tid uint32, b *bucket) error { + cur, err := t.LoadUint32(addr) + if err != nil { + return err + } + + if (cur & linux.FUTEX_TID_MASK) != tid { + return syserror.EPERM + } + + if b.waiters.Empty() { + // It's safe to set 0 because there are no waiters, no new owner, and the + // executing task is the current owner (no owner died bit). + prev, err := t.CompareAndSwapUint32(addr, cur, 0) + if err != nil { + return err + } + if prev != cur { + // Let user mode handle CAS races. This is different than lock, which + // retries when CAS fails. + return syserror.EAGAIN + } + return nil + } + + next := b.waiters.Front() + + // Set next owner's TID, waiters if there are any. Resets owner died bit, if + // set, because the executing task takes over as the owner. + val := next.tid + if next.Next() != nil { + val |= linux.FUTEX_WAITERS + } + + prev, err := t.CompareAndSwapUint32(addr, cur, val) + if err != nil { + return err + } + if prev != cur { + return syserror.EINVAL + } + + b.wakeWaiterLocked(next) + return nil +} diff --git a/pkg/sentry/kernel/futex/futex_state_autogen.go b/pkg/sentry/kernel/futex/futex_state_autogen.go new file mode 100755 index 000000000..b58e22b78 --- /dev/null +++ b/pkg/sentry/kernel/futex/futex_state_autogen.go @@ -0,0 +1,62 @@ +// automatically generated by stateify. + +package futex + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *bucket) beforeSave() {} +func (x *bucket) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.waiters) { m.Failf("waiters is %v, expected zero", x.waiters) } +} + +func (x *bucket) afterLoad() {} +func (x *bucket) load(m state.Map) { +} + +func (x *Manager) beforeSave() {} +func (x *Manager) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.privateBuckets) { m.Failf("privateBuckets is %v, expected zero", x.privateBuckets) } + m.Save("sharedBucket", &x.sharedBucket) +} + +func (x *Manager) afterLoad() {} +func (x *Manager) load(m state.Map) { + m.Load("sharedBucket", &x.sharedBucket) +} + +func (x *waiterList) beforeSave() {} +func (x *waiterList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *waiterList) afterLoad() {} +func (x *waiterList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *waiterEntry) beforeSave() {} +func (x *waiterEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *waiterEntry) afterLoad() {} +func (x *waiterEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func init() { + state.Register("futex.bucket", (*bucket)(nil), state.Fns{Save: (*bucket).save, Load: (*bucket).load}) + state.Register("futex.Manager", (*Manager)(nil), state.Fns{Save: (*Manager).save, Load: (*Manager).load}) + state.Register("futex.waiterList", (*waiterList)(nil), state.Fns{Save: (*waiterList).save, Load: (*waiterList).load}) + state.Register("futex.waiterEntry", (*waiterEntry)(nil), state.Fns{Save: (*waiterEntry).save, Load: (*waiterEntry).load}) +} diff --git a/pkg/sentry/kernel/futex/waiter_list.go b/pkg/sentry/kernel/futex/waiter_list.go new file mode 100755 index 000000000..cca5c4721 --- /dev/null +++ b/pkg/sentry/kernel/futex/waiter_list.go @@ -0,0 +1,173 @@ +package futex + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type waiterElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (waiterElementMapper) linkerFor(elem *Waiter) *Waiter { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type waiterList struct { + head *Waiter + tail *Waiter +} + +// Reset resets list l to the empty state. +func (l *waiterList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *waiterList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *waiterList) Front() *Waiter { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *waiterList) Back() *Waiter { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *waiterList) PushFront(e *Waiter) { + waiterElementMapper{}.linkerFor(e).SetNext(l.head) + waiterElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + waiterElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *waiterList) PushBack(e *Waiter) { + waiterElementMapper{}.linkerFor(e).SetNext(nil) + waiterElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + waiterElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *waiterList) PushBackList(m *waiterList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + waiterElementMapper{}.linkerFor(l.tail).SetNext(m.head) + waiterElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *waiterList) InsertAfter(b, e *Waiter) { + a := waiterElementMapper{}.linkerFor(b).Next() + waiterElementMapper{}.linkerFor(e).SetNext(a) + waiterElementMapper{}.linkerFor(e).SetPrev(b) + waiterElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + waiterElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *waiterList) InsertBefore(a, e *Waiter) { + b := waiterElementMapper{}.linkerFor(a).Prev() + waiterElementMapper{}.linkerFor(e).SetNext(a) + waiterElementMapper{}.linkerFor(e).SetPrev(b) + waiterElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + waiterElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *waiterList) Remove(e *Waiter) { + prev := waiterElementMapper{}.linkerFor(e).Prev() + next := waiterElementMapper{}.linkerFor(e).Next() + + if prev != nil { + waiterElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + waiterElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type waiterEntry struct { + next *Waiter + prev *Waiter +} + +// Next returns the entry that follows e in the list. +func (e *waiterEntry) Next() *Waiter { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *waiterEntry) Prev() *Waiter { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *waiterEntry) SetNext(elem *Waiter) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *waiterEntry) SetPrev(elem *Waiter) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go new file mode 100644 index 000000000..ebe12812c --- /dev/null +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -0,0 +1,58 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/semaphore" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" +) + +// IPCNamespace represents an IPC namespace. +// +// +stateify savable +type IPCNamespace struct { + // User namespace which owns this IPC namespace. Immutable. + userNS *auth.UserNamespace + + semaphores *semaphore.Registry + shms *shm.Registry +} + +// NewIPCNamespace creates a new IPC namespace. +func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { + return &IPCNamespace{ + userNS: userNS, + semaphores: semaphore.NewRegistry(userNS), + shms: shm.NewRegistry(userNS), + } +} + +// SemaphoreRegistry returns the semanphore set registry for this namespace. +func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { + return i.semaphores +} + +// ShmRegistry returns the shm segment registry for this namespace. +func (i *IPCNamespace) ShmRegistry() *shm.Registry { + return i.shms +} + +// IPCNamespace returns the task's IPC namespace. +func (t *Task) IPCNamespace() *IPCNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.ipcns +} diff --git a/pkg/sentry/kernel/kdefs/kdefs.go b/pkg/sentry/kernel/kdefs/kdefs.go new file mode 100644 index 000000000..304da2032 --- /dev/null +++ b/pkg/sentry/kernel/kdefs/kdefs.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kdefs defines common kernel definitions. +// +package kdefs + +// FD is a File Descriptor. +type FD int32 diff --git a/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go b/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go new file mode 100755 index 000000000..cef77125b --- /dev/null +++ b/pkg/sentry/kernel/kdefs/kdefs_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package kdefs + diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go new file mode 100644 index 000000000..85d73ace2 --- /dev/null +++ b/pkg/sentry/kernel/kernel.go @@ -0,0 +1,1241 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernel provides an emulation of the Linux kernel. +// +// See README.md for a detailed overview. +// +// Lock order (outermost locks must be taken first): +// +// Kernel.extMu +// ThreadGroup.timerMu +// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) +// TaskSet.mu +// SignalHandlers.mu +// Task.mu +// +// Locking SignalHandlers.mu in multiple SignalHandlers requires locking +// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same +// time requires locking all of their signal mutexes first. +package kernel + +import ( + "errors" + "fmt" + "io" + "path/filepath" + "sync" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/epoll" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/port" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" + uspb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/state" + "gvisor.googlesource.com/gvisor/pkg/tcpip" +) + +// Kernel represents an emulated Linux kernel. It must be initialized by calling +// Init() or LoadFrom(). +// +// +stateify savable +type Kernel struct { + // extMu serializes external changes to the Kernel with calls to + // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel + // remains frozen for the duration of the call; it requires that the Kernel + // is paused as a precondition, which ensures that none of the tasks + // running within the Kernel can affect its state, but extMu is required to + // ensure that concurrent users of the Kernel *outside* the Kernel's + // control cannot affect its state by calling e.g. + // Kernel.SendExternalSignal.) + extMu sync.Mutex `state:"nosave"` + + // started is true if Start has been called. Unless otherwise specified, + // all Kernel fields become immutable once started becomes true. + started bool `state:"nosave"` + + // All of the following fields are immutable unless otherwise specified. + + // Platform is the platform that is used to execute tasks in the created + // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is + // embedded anonymously (the same issue applies). + platform.Platform `state:"nosave"` + + // mf provides application memory. + mf *pgalloc.MemoryFile `state:"nosave"` + + // See InitKernelArgs for the meaning of these fields. + featureSet *cpuid.FeatureSet + timekeeper *Timekeeper + tasks *TaskSet + rootUserNamespace *auth.UserNamespace + networkStack inet.Stack `state:"nosave"` + applicationCores uint + useHostCores bool + extraAuxv []arch.AuxEntry + vdso *loader.VDSO + rootUTSNamespace *UTSNamespace + rootIPCNamespace *IPCNamespace + rootAbstractSocketNamespace *AbstractSocketNamespace + + // mounts holds the state of the virtual filesystem. mounts is initially + // nil, and must be set by calling Kernel.SetRootMountNamespace before + // Kernel.CreateProcess can succeed. + mounts *fs.MountNamespace + + // futexes is the "root" futex.Manager, from which all others are forked. + // This is necessary to ensure that shared futexes are coherent across all + // tasks, including those created by CreateProcess. + futexes *futex.Manager + + // globalInit is the thread group whose leader has ID 1 in the root PID + // namespace. globalInit is stored separately so that it is accessible even + // after all tasks in the thread group have exited, such that ID 1 is no + // longer mapped. + // + // globalInit is mutable until it is assigned by the first successful call + // to CreateProcess, and is protected by extMu. + globalInit *ThreadGroup + + // realtimeClock is a ktime.Clock based on timekeeper's Realtime. + realtimeClock *timekeeperClock + + // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. + monotonicClock *timekeeperClock + + // syslog is the kernel log. + syslog syslog + + // cpuClock is incremented every linux.ClockTick. cpuClock is used to + // measure task CPU usage, since sampling monotonicClock twice on every + // syscall turns out to be unreasonably expensive. This is similar to how + // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), + // although Linux also uses scheduler timing information to improve + // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do + // since "preeemptive" scheduling is managed by the Go runtime, which + // doesn't provide this information. + // + // cpuClock is mutable, and is accessed using atomic memory operations. + cpuClock uint64 + + // cpuClockTicker increments cpuClock. + cpuClockTicker *ktime.Timer `state:"nosave"` + + // fdMapUids is an ever-increasing counter for generating FDMap uids. + // + // fdMapUids is mutable, and is accessed using atomic memory operations. + fdMapUids uint64 + + // uniqueID is used to generate unique identifiers. + // + // uniqueID is mutable, and is accessed using atomic memory operations. + uniqueID uint64 + + // nextInotifyCookie is a monotonically increasing counter used for + // generating unique inotify event cookies. + // + // nextInotifyCookie is mutable, and is accessed using atomic memory + // operations. + nextInotifyCookie uint32 + + // netlinkPorts manages allocation of netlink socket port IDs. + netlinkPorts *port.Manager + + // saveErr is the error causing the sandbox to exit during save, if + // any. It is protected by extMu. + saveErr error `state:"nosave"` + + // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. + danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` + + // socketTable is used to track all sockets on the system. Protected by + // extMu. + socketTable map[int]map[*refs.WeakRef]struct{} + + // deviceRegistry is used to save/restore device.SimpleDevices. + deviceRegistry struct{} `state:".(*device.Registry)"` + + // DirentCacheLimiter controls the number of total dirent entries can be in + // caches. Not all caches use it, only the caches that use host resources use + // the limiter. It may be nil if disabled. + DirentCacheLimiter *fs.DirentCacheLimiter +} + +// InitKernelArgs holds arguments to Init. +type InitKernelArgs struct { + // FeatureSet is the emulated CPU feature set. + FeatureSet *cpuid.FeatureSet + + // Timekeeper manages time for all tasks in the system. + Timekeeper *Timekeeper + + // RootUserNamespace is the root user namespace. + RootUserNamespace *auth.UserNamespace + + // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. + NetworkStack inet.Stack + + // ApplicationCores is the number of logical CPUs visible to sandboxed + // applications. The set of logical CPU IDs is [0, ApplicationCores); thus + // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the + // most significant bit in cpu_possible_mask + 1. + ApplicationCores uint + + // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU + // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a + // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it + // will be overridden. + UseHostCores bool + + // ExtraAuxv contains additional auxiliary vector entries that are added to + // each process by the ELF loader. + ExtraAuxv []arch.AuxEntry + + // Vdso holds the VDSO and its parameter page. + Vdso *loader.VDSO + + // RootUTSNamespace is the root UTS namespace. + RootUTSNamespace *UTSNamespace + + // RootIPCNamespace is the root IPC namespace. + RootIPCNamespace *IPCNamespace + + // RootAbstractSocketNamespace is the root Abstract Socket namespace. + RootAbstractSocketNamespace *AbstractSocketNamespace +} + +// Init initialize the Kernel with no tasks. +// +// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile +// before calling Init. +func (k *Kernel) Init(args InitKernelArgs) error { + if args.FeatureSet == nil { + return fmt.Errorf("FeatureSet is nil") + } + if args.Timekeeper == nil { + return fmt.Errorf("Timekeeper is nil") + } + if args.RootUserNamespace == nil { + return fmt.Errorf("RootUserNamespace is nil") + } + if args.ApplicationCores == 0 { + return fmt.Errorf("ApplicationCores is 0") + } + + k.featureSet = args.FeatureSet + k.timekeeper = args.Timekeeper + k.tasks = newTaskSet() + k.rootUserNamespace = args.RootUserNamespace + k.rootUTSNamespace = args.RootUTSNamespace + k.rootIPCNamespace = args.RootIPCNamespace + k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace + k.networkStack = args.NetworkStack + k.applicationCores = args.ApplicationCores + if args.UseHostCores { + k.useHostCores = true + maxCPU, err := hostcpu.MaxPossibleCPU() + if err != nil { + return fmt.Errorf("Failed to get maximum CPU number: %v", err) + } + minAppCores := uint(maxCPU) + 1 + if k.applicationCores < minAppCores { + log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) + k.applicationCores = minAppCores + } + } + k.extraAuxv = args.ExtraAuxv + k.vdso = args.Vdso + k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.futexes = futex.NewManager() + k.netlinkPorts = port.New() + k.socketTable = make(map[int]map[*refs.WeakRef]struct{}) + + return nil +} + +// SaveTo saves the state of k to w. +// +// Preconditions: The kernel must be paused throughout the call to SaveTo. +func (k *Kernel) SaveTo(w io.Writer) error { + saveStart := time.Now() + ctx := k.SupervisorContext() + + // Do not allow other Kernel methods to affect it while it's being saved. + k.extMu.Lock() + defer k.extMu.Unlock() + + // Stop time. + k.pauseTimeLocked() + defer k.resumeTimeLocked() + + // Evict all evictable MemoryFile allocations. + k.mf.StartEvictions() + k.mf.WaitForEvictions() + + // Flush write operations on open files so data reaches backing storage. + // This must come after MemoryFile eviction since eviction may cause file + // writes. + if err := k.tasks.flushWritesToFiles(ctx); err != nil { + return err + } + + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters() + + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + if err := k.flushMountSourceRefs(); err != nil { + return err + } + + // Ensure that all pending asynchronous work is complete: + // - inode and mount release + // - asynchronuous IO + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Save the CPUID FeatureSet before the rest of the kernel so we can + // verify its compatibility on restore before attempting to restore the + // entire kernel, which may fail on an incompatible machine. + // + // N.B. This will also be saved along with the full kernel save below. + cpuidStart := time.Now() + if err := state.Save(w, k.FeatureSet(), nil); err != nil { + return err + } + log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) + + // Save the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Save(w, k, &stats); err != nil { + return err + } + log.Infof("Kernel save stats: %s", &stats) + log.Infof("Kernel save took [%s].", time.Since(kernelStart)) + + // Save the memory file's state. + memoryStart := time.Now() + if err := k.mf.SaveTo(w); err != nil { + return err + } + log.Infof("Memory save took [%s].", time.Since(memoryStart)) + + log.Infof("Overall save took [%s].", time.Since(saveStart)) + + return nil +} + +// flushMountSourceRefs flushes the MountSources for all mounted filesystems +// and open FDs. +func (k *Kernel) flushMountSourceRefs() error { + // Flush all mount sources for currently mounted filesystems. + k.mounts.FlushMountSourceRefs() + + // There may be some open FDs whose filesystems have been unmounted. We + // must flush those as well. + return k.tasks.forEachFDPaused(func(desc descriptor) error { + desc.file.Dirent.Inode.MountSource.FlushDirentRefs() + return nil + }) +} + +// forEachFDPaused applies the given function to each open file descriptor in each +// task. +// +// Precondition: Must be called with the kernel paused. +func (ts *TaskSet) forEachFDPaused(f func(descriptor) error) error { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fds == nil { + continue + } + for _, desc := range t.fds.files { + if err := f(desc); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { + return ts.forEachFDPaused(func(desc descriptor) error { + if flags := desc.file.Flags(); !flags.Write { + return nil + } + if sattr := desc.file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { + return nil + } + // Here we need all metadata synced. + syncErr := desc.file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + if err := fs.SaveFileFsyncError(syncErr); err != nil { + name, _ := desc.file.Dirent.FullName(nil /* root */) + // Wrap this error in ErrSaveRejection + // so that it will trigger a save + // error, rather than a panic. This + // also allows us to distinguish Fsync + // errors from state file errors in + // state.Save. + return fs.ErrSaveRejection{ + Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err), + } + } + return nil + }) +} + +// Preconditions: The kernel must be paused. +func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { + invalidated := make(map[*mm.MemoryManager]struct{}) + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t := range k.tasks.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if mm := t.tc.MemoryManager; mm != nil { + if _, ok := invalidated[mm]; !ok { + if err := mm.InvalidateUnsavable(ctx); err != nil { + return err + } + invalidated[mm] = struct{}{} + } + } + // I really wish we just had a sync.Map of all MMs... + if r, ok := t.runState.(*runSyscallAfterExecStop); ok { + if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +func (ts *TaskSet) unregisterEpollWaiters() { + ts.mu.RLock() + defer ts.mu.RUnlock() + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if fdmap := t.fds; fdmap != nil { + for _, desc := range fdmap.files { + if desc.file != nil { + if e, ok := desc.file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + } + } + } + } +} + +// LoadFrom returns a new Kernel loaded from args. +func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { + loadStart := time.Now() + + k.networkStack = net + + initAppCores := k.applicationCores + + // Load the pre-saved CPUID FeatureSet. + // + // N.B. This was also saved along with the full kernel below, so we + // don't need to explicitly install it in the Kernel. + cpuidStart := time.Now() + var features cpuid.FeatureSet + if err := state.Load(r, &features, nil); err != nil { + return err + } + log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) + + // Verify that the FeatureSet is usable on this host. We do this before + // Kernel load so that the explicit CPUID mismatch error has priority + // over floating point state restore errors that may occur on load on + // an incompatible machine. + if err := features.CheckHostCompatible(); err != nil { + return err + } + + // Load the kernel state. + kernelStart := time.Now() + var stats state.Stats + if err := state.Load(r, k, &stats); err != nil { + return err + } + log.Infof("Kernel load stats: %s", &stats) + log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + + // Load the memory file's state. + memoryStart := time.Now() + if err := k.mf.LoadFrom(r); err != nil { + return err + } + log.Infof("Memory load took [%s].", time.Since(memoryStart)) + + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } + + tcpip.AsyncLoading.Wait() + + log.Infof("Overall load took [%s]", time.Since(loadStart)) + + // Applications may size per-cpu structures based on k.applicationCores, so + // it can't change across save/restore. When we are virtualizing CPU + // numbers, this isn't a problem. However, when we are exposing host CPU + // assignments, we can't tolerate an increase in the number of host CPUs, + // which could result in getcpu(2) returning CPUs that applications expect + // not to exist. + if k.useHostCores && initAppCores > k.applicationCores { + return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) + } + + return nil +} + +// Destroy releases resources owned by k. +// +// Preconditions: There must be no task goroutines running in k. +func (k *Kernel) Destroy() { + if k.mounts != nil { + k.mounts.DecRef() + k.mounts = nil + } +} + +// UniqueID returns a unique identifier. +func (k *Kernel) UniqueID() uint64 { + id := atomic.AddUint64(&k.uniqueID, 1) + if id == 0 { + panic("unique identifier generator wrapped around") + } + return id +} + +// CreateProcessArgs holds arguments to kernel.CreateProcess. +type CreateProcessArgs struct { + // Filename is the filename to load. + // + // If this is provided as "", then the file will be guessed via Argv[0]. + Filename string + + // Argvv is a list of arguments. + Argv []string + + // Envv is a list of environment variables. + Envv []string + + // WorkingDirectory is the initial working directory. + // + // This defaults to the root if empty. + WorkingDirectory string + + // Credentials is the initial credentials. + Credentials *auth.Credentials + + // FDMap is the initial set of file descriptors. If CreateProcess succeeds, + // it takes a reference on FDMap. + FDMap *FDMap + + // Umask is the initial umask. + Umask uint + + // Limits is the initial resource limits. + Limits *limits.LimitSet + + // MaxSymlinkTraversals is the maximum number of symlinks to follow + // during resolution. + MaxSymlinkTraversals uint + + // UTSNamespace is the initial UTS namespace. + UTSNamespace *UTSNamespace + + // IPCNamespace is the initial IPC namespace. + IPCNamespace *IPCNamespace + + // AbstractSocketNamespace is the initial Abstract Socket namespace. + AbstractSocketNamespace *AbstractSocketNamespace + + // Root optionally contains the dirent that serves as the root for the + // process. If nil, the mount namespace's root is used as the process' + // root. + // + // Anyone setting Root must donate a reference (i.e. increment it) to + // keep it alive until it is decremented by CreateProcess. + Root *fs.Dirent + + // ContainerID is the container that the process belongs to. + ContainerID string +} + +// NewContext returns a context.Context that represents the task that will be +// created by args.NewContext(k). +func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { + return &createProcessContext{ + Logger: log.Log(), + k: k, + args: args, + } +} + +// createProcessContext is a context.Context that represents the context +// associated with a task that is being created. +type createProcessContext struct { + context.NoopSleeper + log.Logger + k *Kernel + args *CreateProcessArgs +} + +// Value implements context.Context.Value. +func (ctx *createProcessContext) Value(key interface{}) interface{} { + switch key { + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + // "The new task ... is in the root PID namespace." - + // Kernel.CreateProcess + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.args.UTSNamespace + case CtxIPCNamespace: + return ctx.args.IPCNamespace + case auth.CtxCredentials: + return ctx.args.Credentials + case fs.CtxRoot: + if ctx.args.Root != nil { + // Take a refernce on the root dirent that will be + // given to the caller. + ctx.args.Root.IncRef() + return ctx.args.Root + } + if ctx.k.mounts != nil { + // MountNamespace.Root() will take a reference on the + // root dirent for us. + return ctx.k.mounts.Root() + } + return nil + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + return ctx.args.Limits + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} + +// CreateProcess creates a new task in a new thread group with the given +// options. The new task has no parent and is in the root PID namespace. +// +// If k.Start() has already been called, then the created process must be +// started by calling kernel.StartProcess(tg). +// +// If k.Start() has not yet been called, then the created task will begin +// running when k.Start() is called. +// +// CreateProcess has no analogue in Linux; it is used to create the initial +// application task, as well as processes started by the control server. +func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { + k.extMu.Lock() + defer k.extMu.Unlock() + log.Infof("EXEC: %v", args.Argv) + + if k.mounts == nil { + return nil, 0, fmt.Errorf("no kernel MountNamespace") + } + + tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + ctx := args.NewContext(k) + + // Grab the root directory. + root := args.Root + if root == nil { + root = fs.RootFromContext(ctx) + // Is the root STILL nil? + if root == nil { + return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context") + } + } + defer root.DecRef() + args.Root = nil + + // Grab the working directory. + remainingTraversals := uint(args.MaxSymlinkTraversals) + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + + if args.Filename == "" { + // Was anything provided? + if len(args.Argv) == 0 { + return nil, 0, fmt.Errorf("no filename or command provided") + } + if !filepath.IsAbs(args.Argv[0]) { + return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) + } + args.Filename = args.Argv[0] + } + + // Create a fresh task context. + remainingTraversals = uint(args.MaxSymlinkTraversals) + tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + if se != nil { + return nil, 0, errors.New(se.String()) + } + + // Take a reference on the FDMap, which will be transferred to + // TaskSet.NewTask(). + args.FDMap.IncRef() + + // Create the task. + config := &TaskConfig{ + Kernel: k, + ThreadGroup: tg, + TaskContext: tc, + FSContext: newFSContext(root, wd, args.Umask), + FDMap: args.FDMap, + Credentials: args.Credentials, + AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), + UTSNamespace: args.UTSNamespace, + IPCNamespace: args.IPCNamespace, + AbstractSocketNamespace: args.AbstractSocketNamespace, + ContainerID: args.ContainerID, + } + if _, err := k.tasks.NewTask(config); err != nil { + return nil, 0, err + } + + // Success. + tgid := k.tasks.Root.IDOfThreadGroup(tg) + if k.globalInit == nil { + k.globalInit = tg + } + return tg, tgid, nil +} + +// StartProcess starts running a process that was created with CreateProcess. +func (k *Kernel) StartProcess(tg *ThreadGroup) { + t := tg.Leader() + tid := k.tasks.Root.IDOfTask(t) + t.Start(tid) +} + +// Start starts execution of all tasks in k. +// +// Preconditions: Start may be called exactly once. +func (k *Kernel) Start() error { + k.extMu.Lock() + defer k.extMu.Unlock() + + if k.globalInit == nil { + return fmt.Errorf("kernel contains no tasks") + } + if k.started { + return fmt.Errorf("kernel already started") + } + + k.started = true + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) + k.cpuClockTicker.Swap(ktime.Setting{ + Enabled: true, + Period: linux.ClockTick, + }) + // If k was created by LoadKernelFrom, timers were stopped during + // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, + // this is a no-op. + k.resumeTimeLocked() + // Start task goroutines. + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + for t, tid := range k.tasks.Root.tids { + t.Start(tid) + } + return nil +} + +// pauseTimeLocked pauses all Timers and Timekeeper updates. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) pauseTimeLocked() { + // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before + // Kernel.Start(). + if k.cpuClockTicker != nil { + k.cpuClockTicker.Pause() + } + + // By precondition, nothing else can be interacting with PIDNamespace.tids + // or FDMap.files, so we can iterate them without synchronization. (We + // can't hold the TaskSet mutex when pausing thread group timers because + // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet + // mutex, while holding the Timer mutex.) + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Pause() + for _, it := range t.tg.timers { + it.PauseTimer() + } + } + // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. + if fdm := t.fds; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + } + } + } + k.timekeeper.PauseUpdates() +} + +// resumeTimeLocked resumes all Timers and Timekeeper updates. If +// pauseTimeLocked has not been previously called, resumeTimeLocked has no +// effect. +// +// Preconditions: Any task goroutines running in k must be stopped. k.extMu +// must be locked. +func (k *Kernel) resumeTimeLocked() { + if k.cpuClockTicker != nil { + k.cpuClockTicker.Resume() + } + + k.timekeeper.ResumeUpdates() + for t := range k.tasks.Root.tids { + if t == t.tg.leader { + t.tg.itimerRealTimer.Resume() + for _, it := range t.tg.timers { + it.ResumeTimer() + } + } + if fdm := t.fds; fdm != nil { + for _, desc := range fdm.files { + if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + } + } + } +} + +// WaitExited blocks until all tasks in k have exited. +func (k *Kernel) WaitExited() { + k.tasks.liveGoroutines.Wait() +} + +// Kill requests that all tasks in k immediately exit as if group exiting with +// status es. Kill does not wait for tasks to exit. +func (k *Kernel) Kill(es ExitStatus) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.Kill(es) +} + +// Pause requests that all tasks in k temporarily stop executing, and blocks +// until all tasks in k have stopped. Multiple calls to Pause nest and require +// an equal number of calls to Unpause to resume execution. +func (k *Kernel) Pause() { + k.extMu.Lock() + k.tasks.BeginExternalStop() + k.extMu.Unlock() + k.tasks.runningGoroutines.Wait() +} + +// Unpause ends the effect of a previous call to Pause. If Unpause is called +// without a matching preceding call to Pause, Unpause may panic. +func (k *Kernel) Unpause() { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.EndExternalStop() +} + +// SendExternalSignal injects a signal into the kernel. +// +// context is used only for debugging to describe how the signal was received. +// +// Preconditions: Kernel must have an init process. +func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.sendExternalSignal(info, context) +} + +// SendContainerSignal sends the given signal to all processes inside the +// namespace that match the given container ID. +func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + k.tasks.mu.RLock() + defer k.tasks.mu.RUnlock() + + var lastErr error + for tg := range k.tasks.Root.tgids { + if tg.leader.ContainerID() == cid { + tg.signalHandlers.mu.Lock() + infoCopy := *info + if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + tg.signalHandlers.mu.Unlock() + } + } + return lastErr +} + +// FeatureSet returns the FeatureSet. +func (k *Kernel) FeatureSet() *cpuid.FeatureSet { + return k.featureSet +} + +// Timekeeper returns the Timekeeper. +func (k *Kernel) Timekeeper() *Timekeeper { + return k.timekeeper +} + +// TaskSet returns the TaskSet. +func (k *Kernel) TaskSet() *TaskSet { + return k.tasks +} + +// RootUserNamespace returns the root UserNamespace. +func (k *Kernel) RootUserNamespace() *auth.UserNamespace { + return k.rootUserNamespace +} + +// RootUTSNamespace returns the root UTSNamespace. +func (k *Kernel) RootUTSNamespace() *UTSNamespace { + return k.rootUTSNamespace +} + +// RootIPCNamespace returns the root IPCNamespace. +func (k *Kernel) RootIPCNamespace() *IPCNamespace { + return k.rootIPCNamespace +} + +// RootAbstractSocketNamespace returns the root AbstractSocketNamespace. +func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { + return k.rootAbstractSocketNamespace +} + +// RootMountNamespace returns the MountNamespace. +func (k *Kernel) RootMountNamespace() *fs.MountNamespace { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.mounts +} + +// SetRootMountNamespace sets the MountNamespace. +func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) { + k.extMu.Lock() + defer k.extMu.Unlock() + k.mounts = mounts +} + +// NetworkStack returns the network stack. NetworkStack may return nil if no +// network stack is available. +func (k *Kernel) NetworkStack() inet.Stack { + return k.networkStack +} + +// GlobalInit returns the thread group with ID 1 in the root PID namespace, or +// nil if no such thread group exists. GlobalInit may return a thread group +// containing no tasks if the thread group has already exited. +func (k *Kernel) GlobalInit() *ThreadGroup { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.globalInit +} + +// ApplicationCores returns the number of CPUs visible to sandboxed +// applications. +func (k *Kernel) ApplicationCores() uint { + return k.applicationCores +} + +// RealtimeClock returns the application CLOCK_REALTIME clock. +func (k *Kernel) RealtimeClock() ktime.Clock { + return k.realtimeClock +} + +// MonotonicClock returns the application CLOCK_MONOTONIC clock. +func (k *Kernel) MonotonicClock() ktime.Clock { + return k.monotonicClock +} + +// CPUClockNow returns the current value of k.cpuClock. +func (k *Kernel) CPUClockNow() uint64 { + return atomic.LoadUint64(&k.cpuClock) +} + +// Syslog returns the syslog. +func (k *Kernel) Syslog() *syslog { + return &k.syslog +} + +// GenerateInotifyCookie generates a unique inotify event cookie. +// +// Returned values may overlap with previously returned values if the value +// space is exhausted. 0 is not a valid cookie value, all other values +// representable in a uint32 are allowed. +func (k *Kernel) GenerateInotifyCookie() uint32 { + id := atomic.AddUint32(&k.nextInotifyCookie, 1) + // Wrap-around is explicitly allowed for inotify event cookies. + if id == 0 { + id = atomic.AddUint32(&k.nextInotifyCookie, 1) + } + return id +} + +// NetlinkPorts returns the netlink port manager. +func (k *Kernel) NetlinkPorts() *port.Manager { + return k.netlinkPorts +} + +// SaveError returns the sandbox error that caused the kernel to exit during +// save. +func (k *Kernel) SaveError() error { + k.extMu.Lock() + defer k.extMu.Unlock() + return k.saveErr +} + +// SetSaveError sets the sandbox error that caused the kernel to exit during +// save, if one is not already set. +func (k *Kernel) SetSaveError(err error) { + k.extMu.Lock() + defer k.extMu.Unlock() + if k.saveErr == nil { + k.saveErr = err + } +} + +var _ tcpip.Clock = (*Kernel)(nil) + +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (k *Kernel) NowNanoseconds() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Realtime) + if err != nil { + panic("Kernel.NowNanoseconds: " + err.Error()) + } + return now +} + +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (k *Kernel) NowMonotonic() int64 { + now, err := k.timekeeper.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Kernel.NowMonotonic: " + err.Error()) + } + return now +} + +// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or +// LoadFrom. +func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { + k.mf = mf +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { + return k.mf +} + +// SupervisorContext returns a Context with maximum privileges in k. It should +// only be used by goroutines outside the control of the emulated kernel +// defined by e. +// +// Callers are responsible for ensuring that the returned Context is not used +// concurrently with changes to the Kernel. +func (k *Kernel) SupervisorContext() context.Context { + return supervisorContext{ + Logger: log.Log(), + k: k, + } +} + +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. +func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + t := TaskFromContext(ctx) + eventchannel.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} + +// socketEntry represents a socket recorded in Kernel.socketTable. It implements +// refs.WeakRefUser for sockets stored in the socket table. +// +// +stateify savable +type socketEntry struct { + k *Kernel + sock *refs.WeakRef + family int +} + +// WeakRefGone implements refs.WeakRefUser.WeakRefGone. +func (s *socketEntry) WeakRefGone() { + s.k.extMu.Lock() + // k.socketTable is guaranteed to point to a valid socket table for s.family + // at this point, since we made sure of the fact when we created this + // socketEntry, and we never delete socket tables. + delete(s.k.socketTable[s.family], s.sock) + s.k.extMu.Unlock() +} + +// RecordSocket adds a socket to the system-wide socket table for tracking. +// +// Precondition: Caller must hold a reference to sock. +func (k *Kernel) RecordSocket(sock *fs.File, family int) { + k.extMu.Lock() + table, ok := k.socketTable[family] + if !ok { + table = make(map[*refs.WeakRef]struct{}) + k.socketTable[family] = table + } + se := socketEntry{k: k, family: family} + se.sock = refs.NewWeakRef(sock, &se) + table[se.sock] = struct{}{} + k.extMu.Unlock() +} + +// ListSockets returns a snapshot of all sockets of a given family. +func (k *Kernel) ListSockets(family int) []*refs.WeakRef { + k.extMu.Lock() + socks := []*refs.WeakRef{} + if table, ok := k.socketTable[family]; ok { + socks = make([]*refs.WeakRef, 0, len(table)) + for s := range table { + socks = append(socks, s) + } + } + k.extMu.Unlock() + return socks +} + +type supervisorContext struct { + context.NoopSleeper + log.Logger + k *Kernel +} + +// Value implements context.Context. +func (ctx supervisorContext) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + // The supervisor context can trace anything. (None of + // supervisorContext's users are expected to invoke ptrace, but ptrace + // permissions are required for certain file accesses.) + return func(*Task, bool) bool { return true } + case CtxKernel: + return ctx.k + case CtxPIDNamespace: + return ctx.k.tasks.Root + case CtxUTSNamespace: + return ctx.k.rootUTSNamespace + case CtxIPCNamespace: + return ctx.k.rootIPCNamespace + case auth.CtxCredentials: + // The supervisor context is global root. + return auth.NewRootCredentials(ctx.k.rootUserNamespace) + case fs.CtxRoot: + return ctx.k.mounts.Root() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter + case ktime.CtxRealtimeClock: + return ctx.k.RealtimeClock() + case limits.CtxLimits: + // No limits apply. + return limits.NewLimitSet() + case pgalloc.CtxMemoryFile: + return ctx.k.mf + case pgalloc.CtxMemoryFileProvider: + return ctx.k + case platform.CtxPlatform: + return ctx.k + case uniqueid.CtxGlobalUniqueID: + return ctx.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return ctx.k + case uniqueid.CtxInotifyCookie: + return ctx.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return ctx.k + default: + return nil + } +} diff --git a/pkg/sentry/kernel/kernel_state.go b/pkg/sentry/kernel/kernel_state.go new file mode 100644 index 000000000..48c3ff5a9 --- /dev/null +++ b/pkg/sentry/kernel/kernel_state.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/tcpip" +) + +// saveDanglingEndpoints is invoked by stateify. +func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint { + return tcpip.GetDanglingEndpoints() +} + +// loadDanglingEndpoints is invoked by stateify. +func (k *Kernel) loadDanglingEndpoints(es []tcpip.Endpoint) { + for _, e := range es { + tcpip.AddDanglingEndpoint(e) + } +} + +// saveDeviceRegistry is invoked by stateify. +func (k *Kernel) saveDeviceRegistry() *device.Registry { + return device.SimpleDevices +} + +// loadDeviceRegistry is invoked by stateify. +func (k *Kernel) loadDeviceRegistry(r *device.Registry) { + device.SimpleDevices.LoadFrom(r) +} diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go new file mode 100755 index 000000000..82fd0abfd --- /dev/null +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -0,0 +1,1147 @@ +// automatically generated by stateify. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/tcpip" +) + +func (x *abstractEndpoint) beforeSave() {} +func (x *abstractEndpoint) save(m state.Map) { + x.beforeSave() + m.Save("ep", &x.ep) + m.Save("wr", &x.wr) + m.Save("name", &x.name) + m.Save("ns", &x.ns) +} + +func (x *abstractEndpoint) afterLoad() {} +func (x *abstractEndpoint) load(m state.Map) { + m.Load("ep", &x.ep) + m.Load("wr", &x.wr) + m.Load("name", &x.name) + m.Load("ns", &x.ns) +} + +func (x *AbstractSocketNamespace) beforeSave() {} +func (x *AbstractSocketNamespace) save(m state.Map) { + x.beforeSave() + m.Save("endpoints", &x.endpoints) +} + +func (x *AbstractSocketNamespace) afterLoad() {} +func (x *AbstractSocketNamespace) load(m state.Map) { + m.Load("endpoints", &x.endpoints) +} + +func (x *FDFlags) beforeSave() {} +func (x *FDFlags) save(m state.Map) { + x.beforeSave() + m.Save("CloseOnExec", &x.CloseOnExec) +} + +func (x *FDFlags) afterLoad() {} +func (x *FDFlags) load(m state.Map) { + m.Load("CloseOnExec", &x.CloseOnExec) +} + +func (x *descriptor) beforeSave() {} +func (x *descriptor) save(m state.Map) { + x.beforeSave() + m.Save("file", &x.file) + m.Save("flags", &x.flags) +} + +func (x *descriptor) afterLoad() {} +func (x *descriptor) load(m state.Map) { + m.Load("file", &x.file) + m.Load("flags", &x.flags) +} + +func (x *FDMap) beforeSave() {} +func (x *FDMap) save(m state.Map) { + x.beforeSave() + m.Save("AtomicRefCount", &x.AtomicRefCount) + m.Save("k", &x.k) + m.Save("files", &x.files) + m.Save("uid", &x.uid) +} + +func (x *FDMap) afterLoad() {} +func (x *FDMap) load(m state.Map) { + m.Load("AtomicRefCount", &x.AtomicRefCount) + m.Load("k", &x.k) + m.Load("files", &x.files) + m.Load("uid", &x.uid) +} + +func (x *FSContext) beforeSave() {} +func (x *FSContext) save(m state.Map) { + x.beforeSave() + m.Save("AtomicRefCount", &x.AtomicRefCount) + m.Save("root", &x.root) + m.Save("cwd", &x.cwd) + m.Save("umask", &x.umask) +} + +func (x *FSContext) afterLoad() {} +func (x *FSContext) load(m state.Map) { + m.Load("AtomicRefCount", &x.AtomicRefCount) + m.Load("root", &x.root) + m.Load("cwd", &x.cwd) + m.Load("umask", &x.umask) +} + +func (x *IPCNamespace) beforeSave() {} +func (x *IPCNamespace) save(m state.Map) { + x.beforeSave() + m.Save("userNS", &x.userNS) + m.Save("semaphores", &x.semaphores) + m.Save("shms", &x.shms) +} + +func (x *IPCNamespace) afterLoad() {} +func (x *IPCNamespace) load(m state.Map) { + m.Load("userNS", &x.userNS) + m.Load("semaphores", &x.semaphores) + m.Load("shms", &x.shms) +} + +func (x *Kernel) beforeSave() {} +func (x *Kernel) save(m state.Map) { + x.beforeSave() + var danglingEndpoints []tcpip.Endpoint = x.saveDanglingEndpoints() + m.SaveValue("danglingEndpoints", danglingEndpoints) + var deviceRegistry *device.Registry = x.saveDeviceRegistry() + m.SaveValue("deviceRegistry", deviceRegistry) + m.Save("featureSet", &x.featureSet) + m.Save("timekeeper", &x.timekeeper) + m.Save("tasks", &x.tasks) + m.Save("rootUserNamespace", &x.rootUserNamespace) + m.Save("applicationCores", &x.applicationCores) + m.Save("useHostCores", &x.useHostCores) + m.Save("extraAuxv", &x.extraAuxv) + m.Save("vdso", &x.vdso) + m.Save("rootUTSNamespace", &x.rootUTSNamespace) + m.Save("rootIPCNamespace", &x.rootIPCNamespace) + m.Save("rootAbstractSocketNamespace", &x.rootAbstractSocketNamespace) + m.Save("mounts", &x.mounts) + m.Save("futexes", &x.futexes) + m.Save("globalInit", &x.globalInit) + m.Save("realtimeClock", &x.realtimeClock) + m.Save("monotonicClock", &x.monotonicClock) + m.Save("syslog", &x.syslog) + m.Save("cpuClock", &x.cpuClock) + m.Save("fdMapUids", &x.fdMapUids) + m.Save("uniqueID", &x.uniqueID) + m.Save("nextInotifyCookie", &x.nextInotifyCookie) + m.Save("netlinkPorts", &x.netlinkPorts) + m.Save("socketTable", &x.socketTable) + m.Save("DirentCacheLimiter", &x.DirentCacheLimiter) +} + +func (x *Kernel) afterLoad() {} +func (x *Kernel) load(m state.Map) { + m.Load("featureSet", &x.featureSet) + m.Load("timekeeper", &x.timekeeper) + m.Load("tasks", &x.tasks) + m.Load("rootUserNamespace", &x.rootUserNamespace) + m.Load("applicationCores", &x.applicationCores) + m.Load("useHostCores", &x.useHostCores) + m.Load("extraAuxv", &x.extraAuxv) + m.Load("vdso", &x.vdso) + m.Load("rootUTSNamespace", &x.rootUTSNamespace) + m.Load("rootIPCNamespace", &x.rootIPCNamespace) + m.Load("rootAbstractSocketNamespace", &x.rootAbstractSocketNamespace) + m.Load("mounts", &x.mounts) + m.Load("futexes", &x.futexes) + m.Load("globalInit", &x.globalInit) + m.Load("realtimeClock", &x.realtimeClock) + m.Load("monotonicClock", &x.monotonicClock) + m.Load("syslog", &x.syslog) + m.Load("cpuClock", &x.cpuClock) + m.Load("fdMapUids", &x.fdMapUids) + m.Load("uniqueID", &x.uniqueID) + m.Load("nextInotifyCookie", &x.nextInotifyCookie) + m.Load("netlinkPorts", &x.netlinkPorts) + m.Load("socketTable", &x.socketTable) + m.Load("DirentCacheLimiter", &x.DirentCacheLimiter) + m.LoadValue("danglingEndpoints", new([]tcpip.Endpoint), func(y interface{}) { x.loadDanglingEndpoints(y.([]tcpip.Endpoint)) }) + m.LoadValue("deviceRegistry", new(*device.Registry), func(y interface{}) { x.loadDeviceRegistry(y.(*device.Registry)) }) +} + +func (x *socketEntry) beforeSave() {} +func (x *socketEntry) save(m state.Map) { + x.beforeSave() + m.Save("k", &x.k) + m.Save("sock", &x.sock) + m.Save("family", &x.family) +} + +func (x *socketEntry) afterLoad() {} +func (x *socketEntry) load(m state.Map) { + m.Load("k", &x.k) + m.Load("sock", &x.sock) + m.Load("family", &x.family) +} + +func (x *pendingSignals) beforeSave() {} +func (x *pendingSignals) save(m state.Map) { + x.beforeSave() + var signals []savedPendingSignal = x.saveSignals() + m.SaveValue("signals", signals) +} + +func (x *pendingSignals) afterLoad() {} +func (x *pendingSignals) load(m state.Map) { + m.LoadValue("signals", new([]savedPendingSignal), func(y interface{}) { x.loadSignals(y.([]savedPendingSignal)) }) +} + +func (x *pendingSignalQueue) beforeSave() {} +func (x *pendingSignalQueue) save(m state.Map) { + x.beforeSave() + m.Save("pendingSignalList", &x.pendingSignalList) + m.Save("length", &x.length) +} + +func (x *pendingSignalQueue) afterLoad() {} +func (x *pendingSignalQueue) load(m state.Map) { + m.Load("pendingSignalList", &x.pendingSignalList) + m.Load("length", &x.length) +} + +func (x *pendingSignal) beforeSave() {} +func (x *pendingSignal) save(m state.Map) { + x.beforeSave() + m.Save("pendingSignalEntry", &x.pendingSignalEntry) + m.Save("SignalInfo", &x.SignalInfo) + m.Save("timer", &x.timer) +} + +func (x *pendingSignal) afterLoad() {} +func (x *pendingSignal) load(m state.Map) { + m.Load("pendingSignalEntry", &x.pendingSignalEntry) + m.Load("SignalInfo", &x.SignalInfo) + m.Load("timer", &x.timer) +} + +func (x *pendingSignalList) beforeSave() {} +func (x *pendingSignalList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *pendingSignalList) afterLoad() {} +func (x *pendingSignalList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *pendingSignalEntry) beforeSave() {} +func (x *pendingSignalEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *pendingSignalEntry) afterLoad() {} +func (x *pendingSignalEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func (x *savedPendingSignal) beforeSave() {} +func (x *savedPendingSignal) save(m state.Map) { + x.beforeSave() + m.Save("si", &x.si) + m.Save("timer", &x.timer) +} + +func (x *savedPendingSignal) afterLoad() {} +func (x *savedPendingSignal) load(m state.Map) { + m.Load("si", &x.si) + m.Load("timer", &x.timer) +} + +func (x *IntervalTimer) beforeSave() {} +func (x *IntervalTimer) save(m state.Map) { + x.beforeSave() + m.Save("timer", &x.timer) + m.Save("target", &x.target) + m.Save("signo", &x.signo) + m.Save("id", &x.id) + m.Save("sigval", &x.sigval) + m.Save("group", &x.group) + m.Save("sigpending", &x.sigpending) + m.Save("sigorphan", &x.sigorphan) + m.Save("overrunCur", &x.overrunCur) + m.Save("overrunLast", &x.overrunLast) +} + +func (x *IntervalTimer) afterLoad() {} +func (x *IntervalTimer) load(m state.Map) { + m.Load("timer", &x.timer) + m.Load("target", &x.target) + m.Load("signo", &x.signo) + m.Load("id", &x.id) + m.Load("sigval", &x.sigval) + m.Load("group", &x.group) + m.Load("sigpending", &x.sigpending) + m.Load("sigorphan", &x.sigorphan) + m.Load("overrunCur", &x.overrunCur) + m.Load("overrunLast", &x.overrunLast) +} + +func (x *processGroupList) beforeSave() {} +func (x *processGroupList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *processGroupList) afterLoad() {} +func (x *processGroupList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *processGroupEntry) beforeSave() {} +func (x *processGroupEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *processGroupEntry) afterLoad() {} +func (x *processGroupEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func (x *ptraceOptions) beforeSave() {} +func (x *ptraceOptions) save(m state.Map) { + x.beforeSave() + m.Save("ExitKill", &x.ExitKill) + m.Save("SysGood", &x.SysGood) + m.Save("TraceClone", &x.TraceClone) + m.Save("TraceExec", &x.TraceExec) + m.Save("TraceExit", &x.TraceExit) + m.Save("TraceFork", &x.TraceFork) + m.Save("TraceSeccomp", &x.TraceSeccomp) + m.Save("TraceVfork", &x.TraceVfork) + m.Save("TraceVforkDone", &x.TraceVforkDone) +} + +func (x *ptraceOptions) afterLoad() {} +func (x *ptraceOptions) load(m state.Map) { + m.Load("ExitKill", &x.ExitKill) + m.Load("SysGood", &x.SysGood) + m.Load("TraceClone", &x.TraceClone) + m.Load("TraceExec", &x.TraceExec) + m.Load("TraceExit", &x.TraceExit) + m.Load("TraceFork", &x.TraceFork) + m.Load("TraceSeccomp", &x.TraceSeccomp) + m.Load("TraceVfork", &x.TraceVfork) + m.Load("TraceVforkDone", &x.TraceVforkDone) +} + +func (x *ptraceStop) beforeSave() {} +func (x *ptraceStop) save(m state.Map) { + x.beforeSave() + m.Save("frozen", &x.frozen) + m.Save("listen", &x.listen) +} + +func (x *ptraceStop) afterLoad() {} +func (x *ptraceStop) load(m state.Map) { + m.Load("frozen", &x.frozen) + m.Load("listen", &x.listen) +} + +func (x *RSEQCriticalRegion) beforeSave() {} +func (x *RSEQCriticalRegion) save(m state.Map) { + x.beforeSave() + m.Save("CriticalSection", &x.CriticalSection) + m.Save("Restart", &x.Restart) +} + +func (x *RSEQCriticalRegion) afterLoad() {} +func (x *RSEQCriticalRegion) load(m state.Map) { + m.Load("CriticalSection", &x.CriticalSection) + m.Load("Restart", &x.Restart) +} + +func (x *sessionList) beforeSave() {} +func (x *sessionList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *sessionList) afterLoad() {} +func (x *sessionList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *sessionEntry) beforeSave() {} +func (x *sessionEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *sessionEntry) afterLoad() {} +func (x *sessionEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func (x *Session) beforeSave() {} +func (x *Session) save(m state.Map) { + x.beforeSave() + m.Save("refs", &x.refs) + m.Save("leader", &x.leader) + m.Save("id", &x.id) + m.Save("processGroups", &x.processGroups) + m.Save("sessionEntry", &x.sessionEntry) +} + +func (x *Session) afterLoad() {} +func (x *Session) load(m state.Map) { + m.Load("refs", &x.refs) + m.Load("leader", &x.leader) + m.Load("id", &x.id) + m.Load("processGroups", &x.processGroups) + m.Load("sessionEntry", &x.sessionEntry) +} + +func (x *ProcessGroup) beforeSave() {} +func (x *ProcessGroup) save(m state.Map) { + x.beforeSave() + m.Save("refs", &x.refs) + m.Save("originator", &x.originator) + m.Save("id", &x.id) + m.Save("session", &x.session) + m.Save("ancestors", &x.ancestors) + m.Save("processGroupEntry", &x.processGroupEntry) +} + +func (x *ProcessGroup) afterLoad() {} +func (x *ProcessGroup) load(m state.Map) { + m.Load("refs", &x.refs) + m.Load("originator", &x.originator) + m.Load("id", &x.id) + m.Load("session", &x.session) + m.Load("ancestors", &x.ancestors) + m.Load("processGroupEntry", &x.processGroupEntry) +} + +func (x *SignalHandlers) beforeSave() {} +func (x *SignalHandlers) save(m state.Map) { + x.beforeSave() + m.Save("actions", &x.actions) +} + +func (x *SignalHandlers) afterLoad() {} +func (x *SignalHandlers) load(m state.Map) { + m.Load("actions", &x.actions) +} + +func (x *SyscallTable) beforeSave() {} +func (x *SyscallTable) save(m state.Map) { + x.beforeSave() + m.Save("OS", &x.OS) + m.Save("Arch", &x.Arch) +} + +func (x *SyscallTable) load(m state.Map) { + m.LoadWait("OS", &x.OS) + m.LoadWait("Arch", &x.Arch) + m.AfterLoad(x.afterLoad) +} + +func (x *syslog) beforeSave() {} +func (x *syslog) save(m state.Map) { + x.beforeSave() + m.Save("msg", &x.msg) +} + +func (x *syslog) afterLoad() {} +func (x *syslog) load(m state.Map) { + m.Load("msg", &x.msg) +} + +func (x *Task) beforeSave() {} +func (x *Task) save(m state.Map) { + x.beforeSave() + var ptraceTracer *Task = x.savePtraceTracer() + m.SaveValue("ptraceTracer", ptraceTracer) + var logPrefix string = x.saveLogPrefix() + m.SaveValue("logPrefix", logPrefix) + var syscallFilters []bpf.Program = x.saveSyscallFilters() + m.SaveValue("syscallFilters", syscallFilters) + m.Save("taskNode", &x.taskNode) + m.Save("runState", &x.runState) + m.Save("haveSyscallReturn", &x.haveSyscallReturn) + m.Save("gosched", &x.gosched) + m.Save("yieldCount", &x.yieldCount) + m.Save("pendingSignals", &x.pendingSignals) + m.Save("signalMask", &x.signalMask) + m.Save("realSignalMask", &x.realSignalMask) + m.Save("haveSavedSignalMask", &x.haveSavedSignalMask) + m.Save("savedSignalMask", &x.savedSignalMask) + m.Save("signalStack", &x.signalStack) + m.Save("groupStopPending", &x.groupStopPending) + m.Save("groupStopAcknowledged", &x.groupStopAcknowledged) + m.Save("trapStopPending", &x.trapStopPending) + m.Save("trapNotifyPending", &x.trapNotifyPending) + m.Save("stop", &x.stop) + m.Save("exitStatus", &x.exitStatus) + m.Save("syscallRestartBlock", &x.syscallRestartBlock) + m.Save("k", &x.k) + m.Save("containerID", &x.containerID) + m.Save("tc", &x.tc) + m.Save("fsc", &x.fsc) + m.Save("fds", &x.fds) + m.Save("vforkParent", &x.vforkParent) + m.Save("exitState", &x.exitState) + m.Save("exitTracerNotified", &x.exitTracerNotified) + m.Save("exitTracerAcked", &x.exitTracerAcked) + m.Save("exitParentNotified", &x.exitParentNotified) + m.Save("exitParentAcked", &x.exitParentAcked) + m.Save("ptraceTracees", &x.ptraceTracees) + m.Save("ptraceSeized", &x.ptraceSeized) + m.Save("ptraceOpts", &x.ptraceOpts) + m.Save("ptraceSyscallMode", &x.ptraceSyscallMode) + m.Save("ptraceSinglestep", &x.ptraceSinglestep) + m.Save("ptraceCode", &x.ptraceCode) + m.Save("ptraceSiginfo", &x.ptraceSiginfo) + m.Save("ptraceEventMsg", &x.ptraceEventMsg) + m.Save("ioUsage", &x.ioUsage) + m.Save("creds", &x.creds) + m.Save("utsns", &x.utsns) + m.Save("ipcns", &x.ipcns) + m.Save("abstractSockets", &x.abstractSockets) + m.Save("parentDeathSignal", &x.parentDeathSignal) + m.Save("cleartid", &x.cleartid) + m.Save("allowedCPUMask", &x.allowedCPUMask) + m.Save("cpu", &x.cpu) + m.Save("niceness", &x.niceness) + m.Save("numaPolicy", &x.numaPolicy) + m.Save("numaNodeMask", &x.numaNodeMask) + m.Save("netns", &x.netns) + m.Save("rseqCPUAddr", &x.rseqCPUAddr) + m.Save("rseqCPU", &x.rseqCPU) + m.Save("startTime", &x.startTime) +} + +func (x *Task) load(m state.Map) { + m.Load("taskNode", &x.taskNode) + m.Load("runState", &x.runState) + m.Load("haveSyscallReturn", &x.haveSyscallReturn) + m.Load("gosched", &x.gosched) + m.Load("yieldCount", &x.yieldCount) + m.Load("pendingSignals", &x.pendingSignals) + m.Load("signalMask", &x.signalMask) + m.Load("realSignalMask", &x.realSignalMask) + m.Load("haveSavedSignalMask", &x.haveSavedSignalMask) + m.Load("savedSignalMask", &x.savedSignalMask) + m.Load("signalStack", &x.signalStack) + m.Load("groupStopPending", &x.groupStopPending) + m.Load("groupStopAcknowledged", &x.groupStopAcknowledged) + m.Load("trapStopPending", &x.trapStopPending) + m.Load("trapNotifyPending", &x.trapNotifyPending) + m.Load("stop", &x.stop) + m.Load("exitStatus", &x.exitStatus) + m.Load("syscallRestartBlock", &x.syscallRestartBlock) + m.Load("k", &x.k) + m.Load("containerID", &x.containerID) + m.Load("tc", &x.tc) + m.Load("fsc", &x.fsc) + m.Load("fds", &x.fds) + m.Load("vforkParent", &x.vforkParent) + m.Load("exitState", &x.exitState) + m.Load("exitTracerNotified", &x.exitTracerNotified) + m.Load("exitTracerAcked", &x.exitTracerAcked) + m.Load("exitParentNotified", &x.exitParentNotified) + m.Load("exitParentAcked", &x.exitParentAcked) + m.Load("ptraceTracees", &x.ptraceTracees) + m.Load("ptraceSeized", &x.ptraceSeized) + m.Load("ptraceOpts", &x.ptraceOpts) + m.Load("ptraceSyscallMode", &x.ptraceSyscallMode) + m.Load("ptraceSinglestep", &x.ptraceSinglestep) + m.Load("ptraceCode", &x.ptraceCode) + m.Load("ptraceSiginfo", &x.ptraceSiginfo) + m.Load("ptraceEventMsg", &x.ptraceEventMsg) + m.Load("ioUsage", &x.ioUsage) + m.Load("creds", &x.creds) + m.Load("utsns", &x.utsns) + m.Load("ipcns", &x.ipcns) + m.Load("abstractSockets", &x.abstractSockets) + m.Load("parentDeathSignal", &x.parentDeathSignal) + m.Load("cleartid", &x.cleartid) + m.Load("allowedCPUMask", &x.allowedCPUMask) + m.Load("cpu", &x.cpu) + m.Load("niceness", &x.niceness) + m.Load("numaPolicy", &x.numaPolicy) + m.Load("numaNodeMask", &x.numaNodeMask) + m.Load("netns", &x.netns) + m.Load("rseqCPUAddr", &x.rseqCPUAddr) + m.Load("rseqCPU", &x.rseqCPU) + m.Load("startTime", &x.startTime) + m.LoadValue("ptraceTracer", new(*Task), func(y interface{}) { x.loadPtraceTracer(y.(*Task)) }) + m.LoadValue("logPrefix", new(string), func(y interface{}) { x.loadLogPrefix(y.(string)) }) + m.LoadValue("syscallFilters", new([]bpf.Program), func(y interface{}) { x.loadSyscallFilters(y.([]bpf.Program)) }) + m.AfterLoad(x.afterLoad) +} + +func (x *runSyscallAfterPtraceEventClone) beforeSave() {} +func (x *runSyscallAfterPtraceEventClone) save(m state.Map) { + x.beforeSave() + m.Save("vforkChild", &x.vforkChild) + m.Save("vforkChildTID", &x.vforkChildTID) +} + +func (x *runSyscallAfterPtraceEventClone) afterLoad() {} +func (x *runSyscallAfterPtraceEventClone) load(m state.Map) { + m.Load("vforkChild", &x.vforkChild) + m.Load("vforkChildTID", &x.vforkChildTID) +} + +func (x *runSyscallAfterVforkStop) beforeSave() {} +func (x *runSyscallAfterVforkStop) save(m state.Map) { + x.beforeSave() + m.Save("childTID", &x.childTID) +} + +func (x *runSyscallAfterVforkStop) afterLoad() {} +func (x *runSyscallAfterVforkStop) load(m state.Map) { + m.Load("childTID", &x.childTID) +} + +func (x *vforkStop) beforeSave() {} +func (x *vforkStop) save(m state.Map) { + x.beforeSave() +} + +func (x *vforkStop) afterLoad() {} +func (x *vforkStop) load(m state.Map) { +} + +func (x *TaskContext) beforeSave() {} +func (x *TaskContext) save(m state.Map) { + x.beforeSave() + m.Save("Name", &x.Name) + m.Save("Arch", &x.Arch) + m.Save("MemoryManager", &x.MemoryManager) + m.Save("fu", &x.fu) + m.Save("st", &x.st) +} + +func (x *TaskContext) afterLoad() {} +func (x *TaskContext) load(m state.Map) { + m.Load("Name", &x.Name) + m.Load("Arch", &x.Arch) + m.Load("MemoryManager", &x.MemoryManager) + m.Load("fu", &x.fu) + m.Load("st", &x.st) +} + +func (x *execStop) beforeSave() {} +func (x *execStop) save(m state.Map) { + x.beforeSave() +} + +func (x *execStop) afterLoad() {} +func (x *execStop) load(m state.Map) { +} + +func (x *runSyscallAfterExecStop) beforeSave() {} +func (x *runSyscallAfterExecStop) save(m state.Map) { + x.beforeSave() + m.Save("tc", &x.tc) +} + +func (x *runSyscallAfterExecStop) afterLoad() {} +func (x *runSyscallAfterExecStop) load(m state.Map) { + m.Load("tc", &x.tc) +} + +func (x *ExitStatus) beforeSave() {} +func (x *ExitStatus) save(m state.Map) { + x.beforeSave() + m.Save("Code", &x.Code) + m.Save("Signo", &x.Signo) +} + +func (x *ExitStatus) afterLoad() {} +func (x *ExitStatus) load(m state.Map) { + m.Load("Code", &x.Code) + m.Load("Signo", &x.Signo) +} + +func (x *runExit) beforeSave() {} +func (x *runExit) save(m state.Map) { + x.beforeSave() +} + +func (x *runExit) afterLoad() {} +func (x *runExit) load(m state.Map) { +} + +func (x *runExitMain) beforeSave() {} +func (x *runExitMain) save(m state.Map) { + x.beforeSave() +} + +func (x *runExitMain) afterLoad() {} +func (x *runExitMain) load(m state.Map) { +} + +func (x *runExitNotify) beforeSave() {} +func (x *runExitNotify) save(m state.Map) { + x.beforeSave() +} + +func (x *runExitNotify) afterLoad() {} +func (x *runExitNotify) load(m state.Map) { +} + +func (x *taskList) beforeSave() {} +func (x *taskList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *taskList) afterLoad() {} +func (x *taskList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *taskEntry) beforeSave() {} +func (x *taskEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *taskEntry) afterLoad() {} +func (x *taskEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func (x *runApp) beforeSave() {} +func (x *runApp) save(m state.Map) { + x.beforeSave() +} + +func (x *runApp) afterLoad() {} +func (x *runApp) load(m state.Map) { +} + +func (x *TaskGoroutineSchedInfo) beforeSave() {} +func (x *TaskGoroutineSchedInfo) save(m state.Map) { + x.beforeSave() + m.Save("Timestamp", &x.Timestamp) + m.Save("State", &x.State) + m.Save("UserTicks", &x.UserTicks) + m.Save("SysTicks", &x.SysTicks) +} + +func (x *TaskGoroutineSchedInfo) afterLoad() {} +func (x *TaskGoroutineSchedInfo) load(m state.Map) { + m.Load("Timestamp", &x.Timestamp) + m.Load("State", &x.State) + m.Load("UserTicks", &x.UserTicks) + m.Load("SysTicks", &x.SysTicks) +} + +func (x *taskClock) beforeSave() {} +func (x *taskClock) save(m state.Map) { + x.beforeSave() + m.Save("t", &x.t) + m.Save("includeSys", &x.includeSys) +} + +func (x *taskClock) afterLoad() {} +func (x *taskClock) load(m state.Map) { + m.Load("t", &x.t) + m.Load("includeSys", &x.includeSys) +} + +func (x *tgClock) beforeSave() {} +func (x *tgClock) save(m state.Map) { + x.beforeSave() + m.Save("tg", &x.tg) + m.Save("includeSys", &x.includeSys) +} + +func (x *tgClock) afterLoad() {} +func (x *tgClock) load(m state.Map) { + m.Load("tg", &x.tg) + m.Load("includeSys", &x.includeSys) +} + +func (x *groupStop) beforeSave() {} +func (x *groupStop) save(m state.Map) { + x.beforeSave() +} + +func (x *groupStop) afterLoad() {} +func (x *groupStop) load(m state.Map) { +} + +func (x *runInterrupt) beforeSave() {} +func (x *runInterrupt) save(m state.Map) { + x.beforeSave() +} + +func (x *runInterrupt) afterLoad() {} +func (x *runInterrupt) load(m state.Map) { +} + +func (x *runInterruptAfterSignalDeliveryStop) beforeSave() {} +func (x *runInterruptAfterSignalDeliveryStop) save(m state.Map) { + x.beforeSave() +} + +func (x *runInterruptAfterSignalDeliveryStop) afterLoad() {} +func (x *runInterruptAfterSignalDeliveryStop) load(m state.Map) { +} + +func (x *runSyscallAfterSyscallEnterStop) beforeSave() {} +func (x *runSyscallAfterSyscallEnterStop) save(m state.Map) { + x.beforeSave() +} + +func (x *runSyscallAfterSyscallEnterStop) afterLoad() {} +func (x *runSyscallAfterSyscallEnterStop) load(m state.Map) { +} + +func (x *runSyscallAfterSysemuStop) beforeSave() {} +func (x *runSyscallAfterSysemuStop) save(m state.Map) { + x.beforeSave() +} + +func (x *runSyscallAfterSysemuStop) afterLoad() {} +func (x *runSyscallAfterSysemuStop) load(m state.Map) { +} + +func (x *runSyscallReinvoke) beforeSave() {} +func (x *runSyscallReinvoke) save(m state.Map) { + x.beforeSave() +} + +func (x *runSyscallReinvoke) afterLoad() {} +func (x *runSyscallReinvoke) load(m state.Map) { +} + +func (x *runSyscallExit) beforeSave() {} +func (x *runSyscallExit) save(m state.Map) { + x.beforeSave() +} + +func (x *runSyscallExit) afterLoad() {} +func (x *runSyscallExit) load(m state.Map) { +} + +func (x *ThreadGroup) beforeSave() {} +func (x *ThreadGroup) save(m state.Map) { + x.beforeSave() + var rscr *RSEQCriticalRegion = x.saveRscr() + m.SaveValue("rscr", rscr) + m.Save("threadGroupNode", &x.threadGroupNode) + m.Save("signalHandlers", &x.signalHandlers) + m.Save("pendingSignals", &x.pendingSignals) + m.Save("groupStopDequeued", &x.groupStopDequeued) + m.Save("groupStopSignal", &x.groupStopSignal) + m.Save("groupStopPendingCount", &x.groupStopPendingCount) + m.Save("groupStopComplete", &x.groupStopComplete) + m.Save("groupStopWaitable", &x.groupStopWaitable) + m.Save("groupContNotify", &x.groupContNotify) + m.Save("groupContInterrupted", &x.groupContInterrupted) + m.Save("groupContWaitable", &x.groupContWaitable) + m.Save("exiting", &x.exiting) + m.Save("exitStatus", &x.exitStatus) + m.Save("terminationSignal", &x.terminationSignal) + m.Save("itimerRealTimer", &x.itimerRealTimer) + m.Save("itimerVirtSetting", &x.itimerVirtSetting) + m.Save("itimerProfSetting", &x.itimerProfSetting) + m.Save("rlimitCPUSoftSetting", &x.rlimitCPUSoftSetting) + m.Save("cpuTimersEnabled", &x.cpuTimersEnabled) + m.Save("timers", &x.timers) + m.Save("nextTimerID", &x.nextTimerID) + m.Save("exitedCPUStats", &x.exitedCPUStats) + m.Save("childCPUStats", &x.childCPUStats) + m.Save("ioUsage", &x.ioUsage) + m.Save("maxRSS", &x.maxRSS) + m.Save("childMaxRSS", &x.childMaxRSS) + m.Save("limits", &x.limits) + m.Save("processGroup", &x.processGroup) + m.Save("execed", &x.execed) +} + +func (x *ThreadGroup) afterLoad() {} +func (x *ThreadGroup) load(m state.Map) { + m.Load("threadGroupNode", &x.threadGroupNode) + m.Load("signalHandlers", &x.signalHandlers) + m.Load("pendingSignals", &x.pendingSignals) + m.Load("groupStopDequeued", &x.groupStopDequeued) + m.Load("groupStopSignal", &x.groupStopSignal) + m.Load("groupStopPendingCount", &x.groupStopPendingCount) + m.Load("groupStopComplete", &x.groupStopComplete) + m.Load("groupStopWaitable", &x.groupStopWaitable) + m.Load("groupContNotify", &x.groupContNotify) + m.Load("groupContInterrupted", &x.groupContInterrupted) + m.Load("groupContWaitable", &x.groupContWaitable) + m.Load("exiting", &x.exiting) + m.Load("exitStatus", &x.exitStatus) + m.Load("terminationSignal", &x.terminationSignal) + m.Load("itimerRealTimer", &x.itimerRealTimer) + m.Load("itimerVirtSetting", &x.itimerVirtSetting) + m.Load("itimerProfSetting", &x.itimerProfSetting) + m.Load("rlimitCPUSoftSetting", &x.rlimitCPUSoftSetting) + m.Load("cpuTimersEnabled", &x.cpuTimersEnabled) + m.Load("timers", &x.timers) + m.Load("nextTimerID", &x.nextTimerID) + m.Load("exitedCPUStats", &x.exitedCPUStats) + m.Load("childCPUStats", &x.childCPUStats) + m.Load("ioUsage", &x.ioUsage) + m.Load("maxRSS", &x.maxRSS) + m.Load("childMaxRSS", &x.childMaxRSS) + m.Load("limits", &x.limits) + m.Load("processGroup", &x.processGroup) + m.Load("execed", &x.execed) + m.LoadValue("rscr", new(*RSEQCriticalRegion), func(y interface{}) { x.loadRscr(y.(*RSEQCriticalRegion)) }) +} + +func (x *itimerRealListener) beforeSave() {} +func (x *itimerRealListener) save(m state.Map) { + x.beforeSave() + m.Save("tg", &x.tg) +} + +func (x *itimerRealListener) afterLoad() {} +func (x *itimerRealListener) load(m state.Map) { + m.Load("tg", &x.tg) +} + +func (x *TaskSet) beforeSave() {} +func (x *TaskSet) save(m state.Map) { + x.beforeSave() + m.Save("Root", &x.Root) + m.Save("sessions", &x.sessions) +} + +func (x *TaskSet) afterLoad() {} +func (x *TaskSet) load(m state.Map) { + m.Load("Root", &x.Root) + m.Load("sessions", &x.sessions) +} + +func (x *PIDNamespace) beforeSave() {} +func (x *PIDNamespace) save(m state.Map) { + x.beforeSave() + m.Save("owner", &x.owner) + m.Save("parent", &x.parent) + m.Save("userns", &x.userns) + m.Save("last", &x.last) + m.Save("tasks", &x.tasks) + m.Save("tids", &x.tids) + m.Save("tgids", &x.tgids) + m.Save("sessions", &x.sessions) + m.Save("sids", &x.sids) + m.Save("processGroups", &x.processGroups) + m.Save("pgids", &x.pgids) + m.Save("exiting", &x.exiting) +} + +func (x *PIDNamespace) afterLoad() {} +func (x *PIDNamespace) load(m state.Map) { + m.Load("owner", &x.owner) + m.Load("parent", &x.parent) + m.Load("userns", &x.userns) + m.Load("last", &x.last) + m.Load("tasks", &x.tasks) + m.Load("tids", &x.tids) + m.Load("tgids", &x.tgids) + m.Load("sessions", &x.sessions) + m.Load("sids", &x.sids) + m.Load("processGroups", &x.processGroups) + m.Load("pgids", &x.pgids) + m.Load("exiting", &x.exiting) +} + +func (x *threadGroupNode) beforeSave() {} +func (x *threadGroupNode) save(m state.Map) { + x.beforeSave() + m.Save("pidns", &x.pidns) + m.Save("leader", &x.leader) + m.Save("execing", &x.execing) + m.Save("tasks", &x.tasks) + m.Save("tasksCount", &x.tasksCount) + m.Save("liveTasks", &x.liveTasks) + m.Save("activeTasks", &x.activeTasks) +} + +func (x *threadGroupNode) afterLoad() {} +func (x *threadGroupNode) load(m state.Map) { + m.Load("pidns", &x.pidns) + m.Load("leader", &x.leader) + m.Load("execing", &x.execing) + m.Load("tasks", &x.tasks) + m.Load("tasksCount", &x.tasksCount) + m.Load("liveTasks", &x.liveTasks) + m.Load("activeTasks", &x.activeTasks) +} + +func (x *taskNode) beforeSave() {} +func (x *taskNode) save(m state.Map) { + x.beforeSave() + m.Save("tg", &x.tg) + m.Save("taskEntry", &x.taskEntry) + m.Save("parent", &x.parent) + m.Save("children", &x.children) + m.Save("childPIDNamespace", &x.childPIDNamespace) +} + +func (x *taskNode) afterLoad() {} +func (x *taskNode) load(m state.Map) { + m.LoadWait("tg", &x.tg) + m.Load("taskEntry", &x.taskEntry) + m.Load("parent", &x.parent) + m.Load("children", &x.children) + m.Load("childPIDNamespace", &x.childPIDNamespace) +} + +func (x *Timekeeper) save(m state.Map) { + x.beforeSave() + m.Save("bootTime", &x.bootTime) + m.Save("saveMonotonic", &x.saveMonotonic) + m.Save("saveRealtime", &x.saveRealtime) + m.Save("params", &x.params) +} + +func (x *Timekeeper) load(m state.Map) { + m.Load("bootTime", &x.bootTime) + m.Load("saveMonotonic", &x.saveMonotonic) + m.Load("saveRealtime", &x.saveRealtime) + m.Load("params", &x.params) + m.AfterLoad(x.afterLoad) +} + +func (x *timekeeperClock) beforeSave() {} +func (x *timekeeperClock) save(m state.Map) { + x.beforeSave() + m.Save("tk", &x.tk) + m.Save("c", &x.c) +} + +func (x *timekeeperClock) afterLoad() {} +func (x *timekeeperClock) load(m state.Map) { + m.Load("tk", &x.tk) + m.Load("c", &x.c) +} + +func (x *UTSNamespace) beforeSave() {} +func (x *UTSNamespace) save(m state.Map) { + x.beforeSave() + m.Save("hostName", &x.hostName) + m.Save("domainName", &x.domainName) + m.Save("userns", &x.userns) +} + +func (x *UTSNamespace) afterLoad() {} +func (x *UTSNamespace) load(m state.Map) { + m.Load("hostName", &x.hostName) + m.Load("domainName", &x.domainName) + m.Load("userns", &x.userns) +} + +func (x *VDSOParamPage) beforeSave() {} +func (x *VDSOParamPage) save(m state.Map) { + x.beforeSave() + m.Save("mfp", &x.mfp) + m.Save("fr", &x.fr) + m.Save("seq", &x.seq) +} + +func (x *VDSOParamPage) afterLoad() {} +func (x *VDSOParamPage) load(m state.Map) { + m.Load("mfp", &x.mfp) + m.Load("fr", &x.fr) + m.Load("seq", &x.seq) +} + +func init() { + state.Register("kernel.abstractEndpoint", (*abstractEndpoint)(nil), state.Fns{Save: (*abstractEndpoint).save, Load: (*abstractEndpoint).load}) + state.Register("kernel.AbstractSocketNamespace", (*AbstractSocketNamespace)(nil), state.Fns{Save: (*AbstractSocketNamespace).save, Load: (*AbstractSocketNamespace).load}) + state.Register("kernel.FDFlags", (*FDFlags)(nil), state.Fns{Save: (*FDFlags).save, Load: (*FDFlags).load}) + state.Register("kernel.descriptor", (*descriptor)(nil), state.Fns{Save: (*descriptor).save, Load: (*descriptor).load}) + state.Register("kernel.FDMap", (*FDMap)(nil), state.Fns{Save: (*FDMap).save, Load: (*FDMap).load}) + state.Register("kernel.FSContext", (*FSContext)(nil), state.Fns{Save: (*FSContext).save, Load: (*FSContext).load}) + state.Register("kernel.IPCNamespace", (*IPCNamespace)(nil), state.Fns{Save: (*IPCNamespace).save, Load: (*IPCNamespace).load}) + state.Register("kernel.Kernel", (*Kernel)(nil), state.Fns{Save: (*Kernel).save, Load: (*Kernel).load}) + state.Register("kernel.socketEntry", (*socketEntry)(nil), state.Fns{Save: (*socketEntry).save, Load: (*socketEntry).load}) + state.Register("kernel.pendingSignals", (*pendingSignals)(nil), state.Fns{Save: (*pendingSignals).save, Load: (*pendingSignals).load}) + state.Register("kernel.pendingSignalQueue", (*pendingSignalQueue)(nil), state.Fns{Save: (*pendingSignalQueue).save, Load: (*pendingSignalQueue).load}) + state.Register("kernel.pendingSignal", (*pendingSignal)(nil), state.Fns{Save: (*pendingSignal).save, Load: (*pendingSignal).load}) + state.Register("kernel.pendingSignalList", (*pendingSignalList)(nil), state.Fns{Save: (*pendingSignalList).save, Load: (*pendingSignalList).load}) + state.Register("kernel.pendingSignalEntry", (*pendingSignalEntry)(nil), state.Fns{Save: (*pendingSignalEntry).save, Load: (*pendingSignalEntry).load}) + state.Register("kernel.savedPendingSignal", (*savedPendingSignal)(nil), state.Fns{Save: (*savedPendingSignal).save, Load: (*savedPendingSignal).load}) + state.Register("kernel.IntervalTimer", (*IntervalTimer)(nil), state.Fns{Save: (*IntervalTimer).save, Load: (*IntervalTimer).load}) + state.Register("kernel.processGroupList", (*processGroupList)(nil), state.Fns{Save: (*processGroupList).save, Load: (*processGroupList).load}) + state.Register("kernel.processGroupEntry", (*processGroupEntry)(nil), state.Fns{Save: (*processGroupEntry).save, Load: (*processGroupEntry).load}) + state.Register("kernel.ptraceOptions", (*ptraceOptions)(nil), state.Fns{Save: (*ptraceOptions).save, Load: (*ptraceOptions).load}) + state.Register("kernel.ptraceStop", (*ptraceStop)(nil), state.Fns{Save: (*ptraceStop).save, Load: (*ptraceStop).load}) + state.Register("kernel.RSEQCriticalRegion", (*RSEQCriticalRegion)(nil), state.Fns{Save: (*RSEQCriticalRegion).save, Load: (*RSEQCriticalRegion).load}) + state.Register("kernel.sessionList", (*sessionList)(nil), state.Fns{Save: (*sessionList).save, Load: (*sessionList).load}) + state.Register("kernel.sessionEntry", (*sessionEntry)(nil), state.Fns{Save: (*sessionEntry).save, Load: (*sessionEntry).load}) + state.Register("kernel.Session", (*Session)(nil), state.Fns{Save: (*Session).save, Load: (*Session).load}) + state.Register("kernel.ProcessGroup", (*ProcessGroup)(nil), state.Fns{Save: (*ProcessGroup).save, Load: (*ProcessGroup).load}) + state.Register("kernel.SignalHandlers", (*SignalHandlers)(nil), state.Fns{Save: (*SignalHandlers).save, Load: (*SignalHandlers).load}) + state.Register("kernel.SyscallTable", (*SyscallTable)(nil), state.Fns{Save: (*SyscallTable).save, Load: (*SyscallTable).load}) + state.Register("kernel.syslog", (*syslog)(nil), state.Fns{Save: (*syslog).save, Load: (*syslog).load}) + state.Register("kernel.Task", (*Task)(nil), state.Fns{Save: (*Task).save, Load: (*Task).load}) + state.Register("kernel.runSyscallAfterPtraceEventClone", (*runSyscallAfterPtraceEventClone)(nil), state.Fns{Save: (*runSyscallAfterPtraceEventClone).save, Load: (*runSyscallAfterPtraceEventClone).load}) + state.Register("kernel.runSyscallAfterVforkStop", (*runSyscallAfterVforkStop)(nil), state.Fns{Save: (*runSyscallAfterVforkStop).save, Load: (*runSyscallAfterVforkStop).load}) + state.Register("kernel.vforkStop", (*vforkStop)(nil), state.Fns{Save: (*vforkStop).save, Load: (*vforkStop).load}) + state.Register("kernel.TaskContext", (*TaskContext)(nil), state.Fns{Save: (*TaskContext).save, Load: (*TaskContext).load}) + state.Register("kernel.execStop", (*execStop)(nil), state.Fns{Save: (*execStop).save, Load: (*execStop).load}) + state.Register("kernel.runSyscallAfterExecStop", (*runSyscallAfterExecStop)(nil), state.Fns{Save: (*runSyscallAfterExecStop).save, Load: (*runSyscallAfterExecStop).load}) + state.Register("kernel.ExitStatus", (*ExitStatus)(nil), state.Fns{Save: (*ExitStatus).save, Load: (*ExitStatus).load}) + state.Register("kernel.runExit", (*runExit)(nil), state.Fns{Save: (*runExit).save, Load: (*runExit).load}) + state.Register("kernel.runExitMain", (*runExitMain)(nil), state.Fns{Save: (*runExitMain).save, Load: (*runExitMain).load}) + state.Register("kernel.runExitNotify", (*runExitNotify)(nil), state.Fns{Save: (*runExitNotify).save, Load: (*runExitNotify).load}) + state.Register("kernel.taskList", (*taskList)(nil), state.Fns{Save: (*taskList).save, Load: (*taskList).load}) + state.Register("kernel.taskEntry", (*taskEntry)(nil), state.Fns{Save: (*taskEntry).save, Load: (*taskEntry).load}) + state.Register("kernel.runApp", (*runApp)(nil), state.Fns{Save: (*runApp).save, Load: (*runApp).load}) + state.Register("kernel.TaskGoroutineSchedInfo", (*TaskGoroutineSchedInfo)(nil), state.Fns{Save: (*TaskGoroutineSchedInfo).save, Load: (*TaskGoroutineSchedInfo).load}) + state.Register("kernel.taskClock", (*taskClock)(nil), state.Fns{Save: (*taskClock).save, Load: (*taskClock).load}) + state.Register("kernel.tgClock", (*tgClock)(nil), state.Fns{Save: (*tgClock).save, Load: (*tgClock).load}) + state.Register("kernel.groupStop", (*groupStop)(nil), state.Fns{Save: (*groupStop).save, Load: (*groupStop).load}) + state.Register("kernel.runInterrupt", (*runInterrupt)(nil), state.Fns{Save: (*runInterrupt).save, Load: (*runInterrupt).load}) + state.Register("kernel.runInterruptAfterSignalDeliveryStop", (*runInterruptAfterSignalDeliveryStop)(nil), state.Fns{Save: (*runInterruptAfterSignalDeliveryStop).save, Load: (*runInterruptAfterSignalDeliveryStop).load}) + state.Register("kernel.runSyscallAfterSyscallEnterStop", (*runSyscallAfterSyscallEnterStop)(nil), state.Fns{Save: (*runSyscallAfterSyscallEnterStop).save, Load: (*runSyscallAfterSyscallEnterStop).load}) + state.Register("kernel.runSyscallAfterSysemuStop", (*runSyscallAfterSysemuStop)(nil), state.Fns{Save: (*runSyscallAfterSysemuStop).save, Load: (*runSyscallAfterSysemuStop).load}) + state.Register("kernel.runSyscallReinvoke", (*runSyscallReinvoke)(nil), state.Fns{Save: (*runSyscallReinvoke).save, Load: (*runSyscallReinvoke).load}) + state.Register("kernel.runSyscallExit", (*runSyscallExit)(nil), state.Fns{Save: (*runSyscallExit).save, Load: (*runSyscallExit).load}) + state.Register("kernel.ThreadGroup", (*ThreadGroup)(nil), state.Fns{Save: (*ThreadGroup).save, Load: (*ThreadGroup).load}) + state.Register("kernel.itimerRealListener", (*itimerRealListener)(nil), state.Fns{Save: (*itimerRealListener).save, Load: (*itimerRealListener).load}) + state.Register("kernel.TaskSet", (*TaskSet)(nil), state.Fns{Save: (*TaskSet).save, Load: (*TaskSet).load}) + state.Register("kernel.PIDNamespace", (*PIDNamespace)(nil), state.Fns{Save: (*PIDNamespace).save, Load: (*PIDNamespace).load}) + state.Register("kernel.threadGroupNode", (*threadGroupNode)(nil), state.Fns{Save: (*threadGroupNode).save, Load: (*threadGroupNode).load}) + state.Register("kernel.taskNode", (*taskNode)(nil), state.Fns{Save: (*taskNode).save, Load: (*taskNode).load}) + state.Register("kernel.Timekeeper", (*Timekeeper)(nil), state.Fns{Save: (*Timekeeper).save, Load: (*Timekeeper).load}) + state.Register("kernel.timekeeperClock", (*timekeeperClock)(nil), state.Fns{Save: (*timekeeperClock).save, Load: (*timekeeperClock).load}) + state.Register("kernel.UTSNamespace", (*UTSNamespace)(nil), state.Fns{Save: (*UTSNamespace).save, Load: (*UTSNamespace).load}) + state.Register("kernel.VDSOParamPage", (*VDSOParamPage)(nil), state.Fns{Save: (*VDSOParamPage).save, Load: (*VDSOParamPage).load}) +} diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go new file mode 100644 index 000000000..c93f6598a --- /dev/null +++ b/pkg/sentry/kernel/pending_signals.go @@ -0,0 +1,142 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +const ( + // stdSignalCap is the maximum number of instances of a given standard + // signal that may be pending. ("[If] multiple instances of a standard + // signal are delivered while that signal is currently blocked, then only + // one instance is queued.") - signal(7) + stdSignalCap = 1 + + // rtSignalCap is the maximum number of instances of a given realtime + // signal that may be pending. + // + // TODO(igudger): In Linux, the minimum signal queue size is + // RLIMIT_SIGPENDING, which is by default max_threads/2. + rtSignalCap = 32 +) + +// pendingSignals holds a collection of pending signals. The zero value of +// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; +// users must provide synchronization. +// +// +stateify savable +type pendingSignals struct { + // signals contains all pending signals. + // + // Note that signals is zero-indexed, but signal 1 is the first valid + // signal, so signals[0] contains signals with signo 1 etc. This offset is + // usually handled by using Signal.index(). + signals [linux.SignalMaximum]pendingSignalQueue `state:".([]savedPendingSignal)"` + + // Bit i of pendingSet is set iff there is at least one signal with signo + // i+1 pending. + pendingSet linux.SignalSet `state:"manual"` +} + +// pendingSignalQueue holds a pendingSignalList for a single signal number. +// +// +stateify savable +type pendingSignalQueue struct { + pendingSignalList + length int +} + +// +stateify savable +type pendingSignal struct { + // pendingSignalEntry links into a pendingSignalList. + pendingSignalEntry + *arch.SignalInfo + + // If timer is not nil, it is the IntervalTimer which sent this signal. + timer *IntervalTimer +} + +// enqueue enqueues the given signal. enqueue returns true on success and false +// on failure (if the given signal's queue is full). +// +// Preconditions: info represents a valid signal. +func (p *pendingSignals) enqueue(info *arch.SignalInfo, timer *IntervalTimer) bool { + sig := linux.Signal(info.Signo) + q := &p.signals[sig.Index()] + if sig.IsStandard() { + if q.length >= stdSignalCap { + return false + } + } else if q.length >= rtSignalCap { + return false + } + q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info, timer: timer}) + q.length++ + p.pendingSet |= linux.SignalSetOf(sig) + return true +} + +// dequeue dequeues and returns any pending signal not masked by mask. If no +// unmasked signals are pending, dequeue returns nil. +func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { + // "Real-time signals are delivered in a guaranteed order. Multiple + // real-time signals of the same type are delivered in the order they were + // sent. If different real-time signals are sent to a process, they are + // delivered starting with the lowest-numbered signal. (I.e., low-numbered + // signals have highest priority.) By contrast, if multiple standard + // signals are pending for a process, the order in which they are delivered + // is unspecified. If both standard and real-time signals are pending for a + // process, POSIX leaves it unspecified which is delivered first. Linux, + // like many other implementations, gives priority to standard signals in + // this case." - signal(7) + lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask)) + if lowestPendingUnblockedBit >= linux.SignalMaximum { + return nil + } + return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) +} + +func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { + q := &p.signals[sig.Index()] + ps := q.pendingSignalList.Front() + if ps == nil { + return nil + } + q.pendingSignalList.Remove(ps) + q.length-- + if q.length == 0 { + p.pendingSet &^= linux.SignalSetOf(sig) + } + if ps.timer != nil { + ps.timer.updateDequeuedSignalLocked(ps.SignalInfo) + } + return ps.SignalInfo +} + +// discardSpecific causes all pending signals with number sig to be discarded. +func (p *pendingSignals) discardSpecific(sig linux.Signal) { + q := &p.signals[sig.Index()] + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + if ps.timer != nil { + ps.timer.signalRejectedLocked() + } + } + q.pendingSignalList.Reset() + q.length = 0 + p.pendingSet &^= linux.SignalSetOf(sig) +} diff --git a/pkg/sentry/kernel/pending_signals_list.go b/pkg/sentry/kernel/pending_signals_list.go new file mode 100755 index 000000000..a3499371a --- /dev/null +++ b/pkg/sentry/kernel/pending_signals_list.go @@ -0,0 +1,173 @@ +package kernel + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type pendingSignalElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (pendingSignalElementMapper) linkerFor(elem *pendingSignal) *pendingSignal { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type pendingSignalList struct { + head *pendingSignal + tail *pendingSignal +} + +// Reset resets list l to the empty state. +func (l *pendingSignalList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *pendingSignalList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *pendingSignalList) Front() *pendingSignal { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *pendingSignalList) Back() *pendingSignal { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *pendingSignalList) PushFront(e *pendingSignal) { + pendingSignalElementMapper{}.linkerFor(e).SetNext(l.head) + pendingSignalElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + pendingSignalElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *pendingSignalList) PushBack(e *pendingSignal) { + pendingSignalElementMapper{}.linkerFor(e).SetNext(nil) + pendingSignalElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + pendingSignalElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *pendingSignalList) PushBackList(m *pendingSignalList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + pendingSignalElementMapper{}.linkerFor(l.tail).SetNext(m.head) + pendingSignalElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *pendingSignalList) InsertAfter(b, e *pendingSignal) { + a := pendingSignalElementMapper{}.linkerFor(b).Next() + pendingSignalElementMapper{}.linkerFor(e).SetNext(a) + pendingSignalElementMapper{}.linkerFor(e).SetPrev(b) + pendingSignalElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + pendingSignalElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *pendingSignalList) InsertBefore(a, e *pendingSignal) { + b := pendingSignalElementMapper{}.linkerFor(a).Prev() + pendingSignalElementMapper{}.linkerFor(e).SetNext(a) + pendingSignalElementMapper{}.linkerFor(e).SetPrev(b) + pendingSignalElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + pendingSignalElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *pendingSignalList) Remove(e *pendingSignal) { + prev := pendingSignalElementMapper{}.linkerFor(e).Prev() + next := pendingSignalElementMapper{}.linkerFor(e).Next() + + if prev != nil { + pendingSignalElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + pendingSignalElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type pendingSignalEntry struct { + next *pendingSignal + prev *pendingSignal +} + +// Next returns the entry that follows e in the list. +func (e *pendingSignalEntry) Next() *pendingSignal { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *pendingSignalEntry) Prev() *pendingSignal { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *pendingSignalEntry) SetNext(elem *pendingSignal) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *pendingSignalEntry) SetPrev(elem *pendingSignal) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go new file mode 100644 index 000000000..2c902c7e3 --- /dev/null +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -0,0 +1,46 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// +stateify savable +type savedPendingSignal struct { + si *arch.SignalInfo + timer *IntervalTimer +} + +// saveSignals is invoked by stateify. +func (p *pendingSignals) saveSignals() []savedPendingSignal { + var pending []savedPendingSignal + for _, q := range p.signals { + for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { + pending = append(pending, savedPendingSignal{ + si: ps.SignalInfo, + timer: ps.timer, + }) + } + } + return pending +} + +// loadSignals is invoked by stateify. +func (p *pendingSignals) loadSignals(pending []savedPendingSignal) { + for _, sps := range pending { + p.enqueue(sps.si, sps.timer) + } +} diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go new file mode 100644 index 000000000..4360dc44f --- /dev/null +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -0,0 +1,90 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +// buffer encapsulates a queueable byte buffer. +// +// Note that the total size is slightly less than two pages. This +// is done intentionally to ensure that the buffer object aligns +// with runtime internals. We have no hard size or alignment +// requirements. This two page size will effectively minimize +// internal fragmentation, but still have a large enough chunk +// to limit excessive segmentation. +// +// +stateify savable +type buffer struct { + data [8144]byte + read int + write int + bufferEntry +} + +// Reset resets internal data. +// +// This must be called before use. +func (b *buffer) Reset() { + b.read = 0 + b.write = 0 +} + +// Empty indicates the buffer is empty. +// +// This indicates there is no data left to read. +func (b *buffer) Empty() bool { + return b.read == b.write +} + +// Full indicates the buffer is full. +// +// This indicates there is no capacity left to write. +func (b *buffer) Full() bool { + return b.write == len(b.data) +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:])) + n, err := safemem.CopySeq(dst, srcs) + b.write += int(n) + return n, err +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write])) + n, err := safemem.CopySeq(dsts, src) + b.read += int(n) + return n, err +} + +// bufferPool is a pool for buffers. +var bufferPool = sync.Pool{ + New: func() interface{} { + return new(buffer) + }, +} + +// newBuffer grabs a new buffer from the pool. +func newBuffer() *buffer { + b := bufferPool.Get().(*buffer) + b.Reset() + return b +} diff --git a/pkg/sentry/kernel/pipe/buffer_list.go b/pkg/sentry/kernel/pipe/buffer_list.go new file mode 100755 index 000000000..42ec78788 --- /dev/null +++ b/pkg/sentry/kernel/pipe/buffer_list.go @@ -0,0 +1,173 @@ +package pipe + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type bufferElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (bufferElementMapper) linkerFor(elem *buffer) *buffer { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type bufferList struct { + head *buffer + tail *buffer +} + +// Reset resets list l to the empty state. +func (l *bufferList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *bufferList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *bufferList) Front() *buffer { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *bufferList) Back() *buffer { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *bufferList) PushFront(e *buffer) { + bufferElementMapper{}.linkerFor(e).SetNext(l.head) + bufferElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + bufferElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *bufferList) PushBack(e *buffer) { + bufferElementMapper{}.linkerFor(e).SetNext(nil) + bufferElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + bufferElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *bufferList) PushBackList(m *bufferList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + bufferElementMapper{}.linkerFor(l.tail).SetNext(m.head) + bufferElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *bufferList) InsertAfter(b, e *buffer) { + a := bufferElementMapper{}.linkerFor(b).Next() + bufferElementMapper{}.linkerFor(e).SetNext(a) + bufferElementMapper{}.linkerFor(e).SetPrev(b) + bufferElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + bufferElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *bufferList) InsertBefore(a, e *buffer) { + b := bufferElementMapper{}.linkerFor(a).Prev() + bufferElementMapper{}.linkerFor(e).SetNext(a) + bufferElementMapper{}.linkerFor(e).SetPrev(b) + bufferElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + bufferElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *bufferList) Remove(e *buffer) { + prev := bufferElementMapper{}.linkerFor(e).Prev() + next := bufferElementMapper{}.linkerFor(e).Next() + + if prev != nil { + bufferElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + bufferElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type bufferEntry struct { + next *buffer + prev *buffer +} + +// Next returns the entry that follows e in the list. +func (e *bufferEntry) Next() *buffer { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *bufferEntry) Prev() *buffer { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *bufferEntry) SetNext(elem *buffer) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *bufferEntry) SetPrev(elem *buffer) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/pipe/device.go b/pkg/sentry/kernel/pipe/device.go new file mode 100644 index 000000000..eb59e15a1 --- /dev/null +++ b/pkg/sentry/kernel/pipe/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// pipeDevice is used for all pipe files. +var pipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go new file mode 100644 index 000000000..926c4c623 --- /dev/null +++ b/pkg/sentry/kernel/pipe/node.go @@ -0,0 +1,196 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/amutex" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations implements fs.InodeOperations for pipes. +// +// +stateify savable +type inodeOperations struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeNotVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // p is the underlying Pipe object representing this fifo. + p *Pipe + + // Channels for synchronizing the creation of new readers and writers of + // this fifo. See waitFor and newHandleLocked. + // + // These are not saved/restored because all waiters are unblocked on save, + // and either automatically restart (via ERESTARTSYS) or return EINTR on + // resume. On restarts via ERESTARTSYS, the appropriate channel will be + // recreated. + rWakeup chan struct{} `state:"nosave"` + wWakeup chan struct{} `state:"nosave"` +} + +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// NewInodeOperations returns a new fs.InodeOperations for a given pipe. +func NewInodeOperations(ctx context.Context, perms fs.FilePermissions, p *Pipe) *inodeOperations { + return &inodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), perms, linux.PIPEFS_MAGIC), + p: p, + } +} + +// GetFile implements fs.InodeOperations.GetFile. Named pipes have special blocking +// semantics during open: +// +// "Normally, opening the FIFO blocks until the other end is opened also. A +// process can open a FIFO in nonblocking mode. In this case, opening for +// read-only will succeed even if no-one has opened on the write side yet, +// opening for write-only will fail with ENXIO (no such device or address) +// unless the other end has already been opened. Under Linux, opening a FIFO +// for read and write will succeed both in blocking and nonblocking mode. POSIX +// leaves this behavior undefined. This can be used to open a FIFO for writing +// while there are no readers available." - fifo(7) +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + i.mu.Lock() + defer i.mu.Unlock() + + switch { + case flags.Read && !flags.Write: // O_RDONLY. + r := i.p.Open(ctx, flags) + i.newHandleLocked(&i.rWakeup) + + if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { + if !i.waitFor(&i.wWakeup, ctx) { + r.DecRef() + return nil, syserror.ErrInterrupted + } + } + + // By now, either we're doing a nonblocking open or we have a writer. On + // a nonblocking read-only open, the open succeeds even if no-one has + // opened the write side yet. + return r, nil + + case flags.Write && !flags.Read: // O_WRONLY. + w := i.p.Open(ctx, flags) + i.newHandleLocked(&i.wWakeup) + + if i.p.isNamed && !i.p.HasReaders() { + // On a nonblocking, write-only open, the open fails with ENXIO if the + // read side isn't open yet. + if flags.NonBlocking { + w.DecRef() + return nil, syserror.ENXIO + } + + if !i.waitFor(&i.rWakeup, ctx) { + w.DecRef() + return nil, syserror.ErrInterrupted + } + } + return w, nil + + case flags.Read && flags.Write: // O_RDWR. + // Pipes opened for read-write always succeeds without blocking. + rw := i.p.Open(ctx, flags) + i.newHandleLocked(&i.rWakeup) + i.newHandleLocked(&i.wWakeup) + return rw, nil + + default: + return nil, syserror.EINVAL + } +} + +// waitFor blocks until the underlying pipe has at least one reader/writer is +// announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this +// function will block for either readers or writers, depending on where +// 'wakeupChan' points. +// +// f.mu must be held by the caller. waitFor returns with f.mu held, but it will +// drop f.mu before blocking for any reader/writers. +func (i *inodeOperations) waitFor(wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { + // Ideally this function would simply use a condition variable. However, the + // wait needs to be interruptible via 'sleeper', so we must sychronize via a + // channel. The synchronization below relies on the fact that closing a + // channel unblocks all receives on the channel. + + // Does an appropriate wakeup channel already exist? If not, create a new + // one. This is all done under f.mu to avoid races. + if *wakeupChan == nil { + *wakeupChan = make(chan struct{}) + } + + // Grab a local reference to the wakeup channel since it may disappear as + // soon as we drop f.mu. + wakeup := *wakeupChan + + // Drop the lock and prepare to sleep. + i.mu.Unlock() + cancel := sleeper.SleepStart() + + // Wait for either a new reader/write to be signalled via 'wakeup', or + // for the sleep to be cancelled. + select { + case <-wakeup: + sleeper.SleepFinish(true) + case <-cancel: + sleeper.SleepFinish(false) + } + + // Take the lock and check if we were woken. If we were woken and + // interrupted, the former takes priority. + i.mu.Lock() + select { + case <-wakeup: + return true + default: + return false + } +} + +// newHandleLocked signals a new pipe reader or writer depending on where +// 'wakeupChan' points. This unblocks any corresponding reader or writer +// waiting for the other end of the channel to be opened, see Fifo.waitFor. +// +// i.mu must be held. +func (*inodeOperations) newHandleLocked(wakeupChan *chan struct{}) { + if *wakeupChan != nil { + close(*wakeupChan) + *wakeupChan = nil + } +} + +func (*inodeOperations) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EPIPE +} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go new file mode 100644 index 000000000..b65204492 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -0,0 +1,429 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipe provides a pipe implementation. +package pipe + +import ( + "fmt" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + // MinimumPipeSize is a hard limit of the minimum size of a pipe. + MinimumPipeSize = 64 << 10 + + // DefaultPipeSize is the system-wide default size of a pipe in bytes. + DefaultPipeSize = MinimumPipeSize + + // MaximumPipeSize is a hard limit on the maximum size of a pipe. + MaximumPipeSize = 8 << 20 +) + +// Sizer is an interface for setting and getting the size of a pipe. +// +// It is implemented by Pipe and, through embedding, all other types. +type Sizer interface { + // PipeSize returns the pipe capacity in bytes. + PipeSize() int64 + + // SetPipeSize sets the new pipe capacity in bytes. + // + // The new size is returned (which may be capped). + SetPipeSize(int64) (int64, error) +} + +// Pipe is an encapsulation of a platform-independent pipe. +// It manages a buffered byte queue shared between a reader/writer +// pair. +// +// +stateify savable +type Pipe struct { + waiter.Queue `state:"nosave"` + + // isNamed indicates whether this is a named pipe. + // + // This value is immutable. + isNamed bool + + // atomicIOBytes is the maximum number of bytes that the pipe will + // guarantee atomic reads or writes atomically. + // + // This value is immutable. + atomicIOBytes int64 + + // The dirent backing this pipe. Shared by all readers and writers. + // + // This value is immutable. + Dirent *fs.Dirent + + // The number of active readers for this pipe. + // + // Access atomically. + readers int32 + + // The number of active writes for this pipe. + // + // Access atomically. + writers int32 + + // mu protects all pipe internal state below. + mu sync.Mutex `state:"nosave"` + + // data is the buffer queue of pipe contents. + // + // This is protected by mu. + data bufferList + + // max is the maximum size of the pipe in bytes. When this max has been + // reached, writers will get EWOULDBLOCK. + // + // This is protected by mu. + max int64 + + // size is the current size of the pipe in bytes. + // + // This is protected by mu. + size int64 + + // hadWriter indicates if this pipe ever had a writer. Note that this + // does not necessarily indicate there is *currently* a writer, just + // that there has been a writer at some point since the pipe was + // created. + // + // This is protected by mu. + hadWriter bool +} + +// NewPipe initializes and returns a pipe. +// +// N.B. The size and atomicIOBytes will be bounded. +func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe { + if sizeBytes < MinimumPipeSize { + sizeBytes = MinimumPipeSize + } + if sizeBytes > MaximumPipeSize { + sizeBytes = MaximumPipeSize + } + if atomicIOBytes <= 0 { + atomicIOBytes = 1 + } + if atomicIOBytes > sizeBytes { + atomicIOBytes = sizeBytes + } + p := &Pipe{ + isNamed: isNamed, + max: sizeBytes, + atomicIOBytes: atomicIOBytes, + } + + // Build the fs.Dirent of this pipe, shared by all fs.Files associated + // with this pipe. + perms := fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + } + iops := NewInodeOperations(ctx, perms, p) + ino := pipeDevice.NextIno() + sattr := fs.StableAttr{ + Type: fs.Pipe, + DeviceID: pipeDevice.DeviceID(), + InodeID: ino, + BlockSize: int64(atomicIOBytes), + } + ms := fs.NewPseudoMountSource() + p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) + return p +} + +// NewConnectedPipe initializes a pipe and returns a pair of objects +// representing the read and write ends of the pipe. +func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { + p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) + return p.Open(ctx, fs.FileFlags{Read: true}), p.Open(ctx, fs.FileFlags{Write: true}) +} + +// Open opens the pipe and returns a new file. +// +// Precondition: at least one of flags.Read or flags.Write must be set. +func (p *Pipe) Open(ctx context.Context, flags fs.FileFlags) *fs.File { + switch { + case flags.Read && flags.Write: + p.rOpen() + p.wOpen() + return fs.NewFile(ctx, p.Dirent, flags, &ReaderWriter{ + Pipe: p, + }) + case flags.Read: + p.rOpen() + return fs.NewFile(ctx, p.Dirent, flags, &Reader{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + case flags.Write: + p.wOpen() + return fs.NewFile(ctx, p.Dirent, flags, &Writer{ + ReaderWriter: ReaderWriter{Pipe: p}, + }) + default: + // Precondition violated. + panic("invalid pipe flags") + } +} + +// read reads data from the pipe into dst and returns the number of bytes +// read, or returns ErrWouldBlock if the pipe is empty. +// +// Precondition: this pipe must have readers. +func (p *Pipe) read(ctx context.Context, dst usermem.IOSequence) (int64, error) { + // Don't block for a zero-length read even if the pipe is empty. + if dst.NumBytes() == 0 { + return 0, nil + } + + p.mu.Lock() + defer p.mu.Unlock() + + // Is the pipe empty? + if p.size == 0 { + if !p.HasWriters() { + // There are no writers, return EOF. + return 0, nil + } + return 0, syserror.ErrWouldBlock + } + + // Limit how much we consume. + if dst.NumBytes() > p.size { + dst = dst.TakeFirst64(p.size) + } + + done := int64(0) + for dst.NumBytes() > 0 { + // Pop the first buffer. + first := p.data.Front() + if first == nil { + break + } + + // Copy user data. + n, err := dst.CopyOutFrom(ctx, first) + done += int64(n) + p.size -= n + dst = dst.DropFirst64(n) + + // Empty buffer? + if first.Empty() { + // Push to the free list. + p.data.Remove(first) + bufferPool.Put(first) + } + + // Handle errors. + if err != nil { + return done, err + } + } + + return done, nil +} + +// write writes data from sv into the pipe and returns the number of bytes +// written. If no bytes are written because the pipe is full (or has less than +// atomicIOBytes free capacity), write returns ErrWouldBlock. +// +// Precondition: this pipe must have writers. +func (p *Pipe) write(ctx context.Context, src usermem.IOSequence) (int64, error) { + p.mu.Lock() + defer p.mu.Unlock() + + // Can't write to a pipe with no readers. + if !p.HasReaders() { + return 0, syscall.EPIPE + } + + // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be + // atomic, but requires no atomicity for writes larger than this. + wanted := src.NumBytes() + if avail := p.max - p.size; wanted > avail { + if wanted <= p.atomicIOBytes { + return 0, syserror.ErrWouldBlock + } + // Limit to the available capacity. + src = src.TakeFirst64(avail) + } + + done := int64(0) + for src.NumBytes() > 0 { + // Need a new buffer? + last := p.data.Back() + if last == nil || last.Full() { + // Add a new buffer to the data list. + last = newBuffer() + p.data.PushBack(last) + } + + // Copy user data. + n, err := src.CopyInTo(ctx, last) + done += int64(n) + p.size += n + src = src.DropFirst64(n) + + // Handle errors. + if err != nil { + return done, err + } + } + if wanted > done { + // Partial write due to full pipe. + return done, syserror.ErrWouldBlock + } + + return done, nil +} + +// rOpen signals a new reader of the pipe. +func (p *Pipe) rOpen() { + atomic.AddInt32(&p.readers, 1) +} + +// wOpen signals a new writer of the pipe. +func (p *Pipe) wOpen() { + p.mu.Lock() + defer p.mu.Unlock() + p.hadWriter = true + atomic.AddInt32(&p.writers, 1) +} + +// rClose signals that a reader has closed their end of the pipe. +func (p *Pipe) rClose() { + newReaders := atomic.AddInt32(&p.readers, -1) + if newReaders < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders)) + } +} + +// wClose signals that a writer has closed their end of the pipe. +func (p *Pipe) wClose() { + newWriters := atomic.AddInt32(&p.writers, -1) + if newWriters < 0 { + panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters)) + } +} + +// HasReaders returns whether the pipe has any active readers. +func (p *Pipe) HasReaders() bool { + return atomic.LoadInt32(&p.readers) > 0 +} + +// HasWriters returns whether the pipe has any active writers. +func (p *Pipe) HasWriters() bool { + return atomic.LoadInt32(&p.writers) > 0 +} + +// rReadinessLocked calculates the read readiness. +// +// Precondition: mu must be held. +func (p *Pipe) rReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasReaders() && p.data.Front() != nil { + ready |= waiter.EventIn + } + if !p.HasWriters() && p.hadWriter { + // POLLHUP must be suppressed until the pipe has had at least one writer + // at some point. Otherwise a reader thread may poll and immediately get + // a POLLHUP before the writer ever opens the pipe, which the reader may + // interpret as the writer opening then closing the pipe. + ready |= waiter.EventHUp + } + return ready +} + +// rReadiness returns a mask that states whether the read end of the pipe is +// ready for reading. +func (p *Pipe) rReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() +} + +// wReadinessLocked calculates the write readiness. +// +// Precondition: mu must be held. +func (p *Pipe) wReadinessLocked() waiter.EventMask { + ready := waiter.EventMask(0) + if p.HasWriters() && p.size < p.max { + ready |= waiter.EventOut + } + if !p.HasReaders() { + ready |= waiter.EventErr + } + return ready +} + +// wReadiness returns a mask that states whether the write end of the pipe +// is ready for writing. +func (p *Pipe) wReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.wReadinessLocked() +} + +// rwReadiness returns a mask that states whether a read-write handle to the +// pipe is ready for IO. +func (p *Pipe) rwReadiness() waiter.EventMask { + p.mu.Lock() + defer p.mu.Unlock() + return p.rReadinessLocked() | p.wReadinessLocked() +} + +// queued returns the amount of queued data. +func (p *Pipe) queued() int64 { + p.mu.Lock() + defer p.mu.Unlock() + return p.size +} + +// PipeSize implements PipeSizer.PipeSize. +func (p *Pipe) PipeSize() int64 { + p.mu.Lock() + defer p.mu.Unlock() + return p.max +} + +// SetPipeSize implements PipeSize.SetPipeSize. +func (p *Pipe) SetPipeSize(size int64) (int64, error) { + if size < 0 { + return 0, syserror.EINVAL + } + if size < MinimumPipeSize { + size = MinimumPipeSize // Per spec. + } + if size > MaximumPipeSize { + return 0, syserror.EPERM + } + p.mu.Lock() + defer p.mu.Unlock() + if size < p.size { + return 0, syserror.EBUSY + } + p.max = size + return size, nil +} diff --git a/pkg/sentry/kernel/pipe/pipe_state_autogen.go b/pkg/sentry/kernel/pipe/pipe_state_autogen.go new file mode 100755 index 000000000..5d3686109 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_state_autogen.go @@ -0,0 +1,134 @@ +// automatically generated by stateify. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *buffer) beforeSave() {} +func (x *buffer) save(m state.Map) { + x.beforeSave() + m.Save("data", &x.data) + m.Save("read", &x.read) + m.Save("write", &x.write) + m.Save("bufferEntry", &x.bufferEntry) +} + +func (x *buffer) afterLoad() {} +func (x *buffer) load(m state.Map) { + m.Load("data", &x.data) + m.Load("read", &x.read) + m.Load("write", &x.write) + m.Load("bufferEntry", &x.bufferEntry) +} + +func (x *bufferList) beforeSave() {} +func (x *bufferList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *bufferList) afterLoad() {} +func (x *bufferList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *bufferEntry) beforeSave() {} +func (x *bufferEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *bufferEntry) afterLoad() {} +func (x *bufferEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func (x *inodeOperations) beforeSave() {} +func (x *inodeOperations) save(m state.Map) { + x.beforeSave() + m.Save("InodeSimpleAttributes", &x.InodeSimpleAttributes) + m.Save("p", &x.p) +} + +func (x *inodeOperations) afterLoad() {} +func (x *inodeOperations) load(m state.Map) { + m.Load("InodeSimpleAttributes", &x.InodeSimpleAttributes) + m.Load("p", &x.p) +} + +func (x *Pipe) beforeSave() {} +func (x *Pipe) save(m state.Map) { + x.beforeSave() + m.Save("isNamed", &x.isNamed) + m.Save("atomicIOBytes", &x.atomicIOBytes) + m.Save("Dirent", &x.Dirent) + m.Save("readers", &x.readers) + m.Save("writers", &x.writers) + m.Save("data", &x.data) + m.Save("max", &x.max) + m.Save("size", &x.size) + m.Save("hadWriter", &x.hadWriter) +} + +func (x *Pipe) afterLoad() {} +func (x *Pipe) load(m state.Map) { + m.Load("isNamed", &x.isNamed) + m.Load("atomicIOBytes", &x.atomicIOBytes) + m.Load("Dirent", &x.Dirent) + m.Load("readers", &x.readers) + m.Load("writers", &x.writers) + m.Load("data", &x.data) + m.Load("max", &x.max) + m.Load("size", &x.size) + m.Load("hadWriter", &x.hadWriter) +} + +func (x *Reader) beforeSave() {} +func (x *Reader) save(m state.Map) { + x.beforeSave() + m.Save("ReaderWriter", &x.ReaderWriter) +} + +func (x *Reader) afterLoad() {} +func (x *Reader) load(m state.Map) { + m.Load("ReaderWriter", &x.ReaderWriter) +} + +func (x *ReaderWriter) beforeSave() {} +func (x *ReaderWriter) save(m state.Map) { + x.beforeSave() + m.Save("Pipe", &x.Pipe) +} + +func (x *ReaderWriter) afterLoad() {} +func (x *ReaderWriter) load(m state.Map) { + m.Load("Pipe", &x.Pipe) +} + +func (x *Writer) beforeSave() {} +func (x *Writer) save(m state.Map) { + x.beforeSave() + m.Save("ReaderWriter", &x.ReaderWriter) +} + +func (x *Writer) afterLoad() {} +func (x *Writer) load(m state.Map) { + m.Load("ReaderWriter", &x.ReaderWriter) +} + +func init() { + state.Register("pipe.buffer", (*buffer)(nil), state.Fns{Save: (*buffer).save, Load: (*buffer).load}) + state.Register("pipe.bufferList", (*bufferList)(nil), state.Fns{Save: (*bufferList).save, Load: (*bufferList).load}) + state.Register("pipe.bufferEntry", (*bufferEntry)(nil), state.Fns{Save: (*bufferEntry).save, Load: (*bufferEntry).load}) + state.Register("pipe.inodeOperations", (*inodeOperations)(nil), state.Fns{Save: (*inodeOperations).save, Load: (*inodeOperations).load}) + state.Register("pipe.Pipe", (*Pipe)(nil), state.Fns{Save: (*Pipe).save, Load: (*Pipe).load}) + state.Register("pipe.Reader", (*Reader)(nil), state.Fns{Save: (*Reader).save, Load: (*Reader).load}) + state.Register("pipe.ReaderWriter", (*ReaderWriter)(nil), state.Fns{Save: (*ReaderWriter).save, Load: (*ReaderWriter).load}) + state.Register("pipe.Writer", (*Writer)(nil), state.Fns{Save: (*Writer).save, Load: (*Writer).load}) +} diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go new file mode 100644 index 000000000..656be824d --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Reader satisfies the fs.FileOperations interface for read-only pipes. +// Reader should be used with !fs.FileFlags.Write to reject writes. +// +// +stateify savable +type Reader struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. +func (r *Reader) Release() { + r.Pipe.rClose() + + // Wake up writers. + r.Pipe.Notify(waiter.EventOut) +} + +// Readiness returns the ready events in the underlying pipe. +func (r *Reader) Readiness(mask waiter.EventMask) waiter.EventMask { + return r.Pipe.rReadiness() & mask +} diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go new file mode 100644 index 000000000..e560b9be9 --- /dev/null +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -0,0 +1,96 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "math" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// ReaderWriter satisfies the FileOperations interface and services both +// read and write requests. This should only be used directly for named pipes. +// pipe(2) and pipe2(2) only support unidirectional pipes and should use +// either pipe.Reader or pipe.Writer. +// +// +stateify savable +type ReaderWriter struct { + fsutil.FilePipeSeek `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileNoFsync `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + *Pipe +} + +// Release implements fs.FileOperations.Release. +func (rw *ReaderWriter) Release() { + rw.Pipe.rClose() + rw.Pipe.wClose() + + // Wake up readers and writers. + rw.Pipe.Notify(waiter.EventIn | waiter.EventOut) +} + +// Read implements fs.FileOperations.Read. +func (rw *ReaderWriter) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.read(ctx, dst) + if n > 0 { + rw.Pipe.Notify(waiter.EventOut) + } + return n, err +} + +// Write implements fs.FileOperations.Write. +func (rw *ReaderWriter) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := rw.Pipe.write(ctx, src) + if n > 0 { + rw.Pipe.Notify(waiter.EventIn) + } + return n, err +} + +// Readiness returns the ready events in the underlying pipe. +func (rw *ReaderWriter) Readiness(mask waiter.EventMask) waiter.EventMask { + return rw.Pipe.rwReadiness() & mask +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (rw *ReaderWriter) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch int(args[1].Int()) { + case linux.FIONREAD: + v := rw.queued() + if v > math.MaxInt32 { + v = math.MaxInt32 // Silently truncate. + } + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syscall.ENOTTY + } +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go new file mode 100644 index 000000000..8d5b68541 --- /dev/null +++ b/pkg/sentry/kernel/pipe/writer.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Writer satisfies the fs.FileOperations interface for write-only pipes. +// Writer should be used with !fs.FileFlags.Read to reject reads. +// +// +stateify savable +type Writer struct { + ReaderWriter +} + +// Release implements fs.FileOperations.Release. +// +// This overrides ReaderWriter.Release. +func (w *Writer) Release() { + w.Pipe.wClose() + + // Wake up readers. + w.Pipe.Notify(waiter.EventHUp) +} + +// Readiness returns the ready events in the underlying pipe. +func (w *Writer) Readiness(mask waiter.EventMask) waiter.EventMask { + return w.Pipe.wReadiness() & mask +} diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go new file mode 100644 index 000000000..a016b4087 --- /dev/null +++ b/pkg/sentry/kernel/posixtimer.go @@ -0,0 +1,306 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// IntervalTimer represents a POSIX interval timer as described by +// timer_create(2). +// +// +stateify savable +type IntervalTimer struct { + timer *ktime.Timer + + // If target is not nil, it receives signo from timer expirations. If group + // is true, these signals are thread-group-directed. These fields are + // immutable. + target *Task + signo linux.Signal + id linux.TimerID + sigval uint64 + group bool + + // If sigpending is true, a signal to target is already queued, and timer + // expirations should increment overrunCur instead of sending another + // signal. sigpending is protected by target's signal mutex. (If target is + // nil, the timer will never send signals, so sigpending will be unused.) + sigpending bool + + // If sigorphan is true, timer's setting has been changed since sigpending + // last became true, such that overruns should no longer be counted in the + // pending signals si_overrun. sigorphan is protected by target's signal + // mutex. + sigorphan bool + + // overrunCur is the number of overruns that have occurred since the last + // time a signal was sent. overrunCur is protected by target's signal + // mutex. + overrunCur uint64 + + // Consider the last signal sent by this timer that has been dequeued. + // overrunLast is the number of overruns that occurred between when this + // signal was sent and when it was dequeued. Equivalently, overrunLast was + // the value of overrunCur when this signal was dequeued. overrunLast is + // protected by target's signal mutex. + overrunLast uint64 +} + +// DestroyTimer releases it's resources. +func (it *IntervalTimer) DestroyTimer() { + it.timer.Destroy() + it.timerSettingChanged() + // A destroyed IntervalTimer is still potentially reachable via a + // pendingSignal; nil out timer so that it won't be saved. + it.timer = nil +} + +func (it *IntervalTimer) timerSettingChanged() { + if it.target == nil { + return + } + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + it.sigorphan = true + it.overrunCur = 0 + it.overrunLast = 0 +} + +// PauseTimer pauses the associated Timer. +func (it *IntervalTimer) PauseTimer() { + it.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (it *IntervalTimer) ResumeTimer() { + it.timer.Resume() +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) updateDequeuedSignalLocked(si *arch.SignalInfo) { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunLast = it.overrunCur + it.overrunCur = 0 + si.SetOverrun(saturateI32FromU64(it.overrunLast)) +} + +// Preconditions: it.target's signal mutex must be locked. +func (it *IntervalTimer) signalRejectedLocked() { + it.sigpending = false + if it.sigorphan { + return + } + it.overrunCur++ +} + +// Notify implements ktime.TimerListener.Notify. +func (it *IntervalTimer) Notify(exp uint64) { + if it.target == nil { + return + } + + it.target.tg.pidns.owner.mu.RLock() + defer it.target.tg.pidns.owner.mu.RUnlock() + it.target.tg.signalHandlers.mu.Lock() + defer it.target.tg.signalHandlers.mu.Unlock() + + if it.sigpending { + it.overrunCur += exp + return + } + + // sigpending must be set before sendSignalTimerLocked() so that it can be + // unset if the signal is discarded (in which case sendSignalTimerLocked() + // will return nil). + it.sigpending = true + it.sigorphan = false + it.overrunCur += exp - 1 + si := &arch.SignalInfo{ + Signo: int32(it.signo), + Code: arch.SignalInfoTimer, + } + si.SetTimerID(it.id) + si.SetSigval(it.sigval) + // si_overrun is set when the signal is dequeued. + if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil { + it.signalRejectedLocked() + } +} + +// Destroy implements ktime.TimerListener.Destroy. Users of Timer should call +// DestroyTimer instead. +func (it *IntervalTimer) Destroy() { +} + +// IntervalTimerCreate implements timer_create(2). +func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.TimerID, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + + // Allocate a timer ID. + var id linux.TimerID + end := t.tg.nextTimerID + for { + id = t.tg.nextTimerID + _, ok := t.tg.timers[id] + t.tg.nextTimerID++ + if t.tg.nextTimerID < 0 { + t.tg.nextTimerID = 0 + } + if !ok { + break + } + if t.tg.nextTimerID == end { + return 0, syserror.EAGAIN + } + } + + // "The implementation of the default case where evp [sic] is NULL is + // handled inside glibc, which invokes the underlying system call with a + // suitably populated sigevent structure." - timer_create(2). This is + // misleading; the timer_create syscall also handles a NULL sevp as + // described by the man page + // (kernel/time/posix-timers.c:sys_timer_create(), do_timer_create()). This + // must be handled here instead of the syscall wrapper since sigval is the + // timer ID, which isn't available until we allocate it in this function. + if sigev == nil { + sigev = &linux.Sigevent{ + Signo: int32(linux.SIGALRM), + Notify: linux.SIGEV_SIGNAL, + Value: uint64(id), + } + } + + // Construct the timer. + it := &IntervalTimer{ + id: id, + sigval: sigev.Value, + } + switch sigev.Notify { + case linux.SIGEV_NONE: + // leave it.target = nil + case linux.SIGEV_SIGNAL, linux.SIGEV_THREAD: + // POSIX SIGEV_THREAD semantics are implemented in userspace by libc; + // to the kernel, SIGEV_THREAD and SIGEV_SIGNAL are equivalent. (See + // Linux's kernel/time/posix-timers.c:good_sigevent().) + it.target = t.tg.leader + it.group = true + case linux.SIGEV_THREAD_ID: + t.tg.pidns.owner.mu.RLock() + target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] + t.tg.pidns.owner.mu.RUnlock() + if !ok || target.tg != t.tg { + return 0, syserror.EINVAL + } + it.target = target + default: + return 0, syserror.EINVAL + } + if sigev.Notify != linux.SIGEV_NONE { + it.signo = linux.Signal(sigev.Signo) + if !it.signo.IsValid() { + return 0, syserror.EINVAL + } + } + it.timer = ktime.NewTimer(c, it) + + t.tg.timers[id] = it + return id, nil +} + +// IntervalTimerDelete implements timer_delete(2). +func (t *Task) IntervalTimerDelete(id linux.TimerID) error { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return syserror.EINVAL + } + delete(t.tg.timers, id) + it.DestroyTimer() + return nil +} + +// IntervalTimerSettime implements timer_settime(2). +func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs bool) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) + if err != nil { + return linux.Itimerspec{}, err + } + tm, oldS := it.timer.SwapAnd(newS, it.timerSettingChanged) + its = ktime.ItimerspecFromSetting(tm, oldS) + return its, nil +} + +// IntervalTimerGettime implements timer_gettime(2). +func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return linux.Itimerspec{}, syserror.EINVAL + } + + tm, s := it.timer.Get() + its := ktime.ItimerspecFromSetting(tm, s) + return its, nil +} + +// IntervalTimerGetoverrun implements timer_getoverrun(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { + t.tg.timerMu.Lock() + defer t.tg.timerMu.Unlock() + it := t.tg.timers[id] + if it == nil { + return 0, syserror.EINVAL + } + // By timer_create(2) invariant, either it.target == nil (in which case + // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact + // that t is executing timer_getoverrun(2) means that t.tg can't be + // completing execve, so t.tg.signalHandlers can't be changing, allowing us + // to lock t.tg.signalHandlers.mu without holding the TaskSet mutex. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is consistent with Linux after 78c9c4dfbf8c ("posix-timers: + // Sanitize overrun handling"). + return saturateI32FromU64(it.overrunLast), nil +} + +func saturateI32FromU64(x uint64) int32 { + if x > math.MaxInt32 { + return math.MaxInt32 + } + return int32(x) +} diff --git a/pkg/sentry/kernel/process_group_list.go b/pkg/sentry/kernel/process_group_list.go new file mode 100755 index 000000000..853145237 --- /dev/null +++ b/pkg/sentry/kernel/process_group_list.go @@ -0,0 +1,173 @@ +package kernel + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type processGroupElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (processGroupElementMapper) linkerFor(elem *ProcessGroup) *ProcessGroup { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type processGroupList struct { + head *ProcessGroup + tail *ProcessGroup +} + +// Reset resets list l to the empty state. +func (l *processGroupList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *processGroupList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *processGroupList) Front() *ProcessGroup { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *processGroupList) Back() *ProcessGroup { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *processGroupList) PushFront(e *ProcessGroup) { + processGroupElementMapper{}.linkerFor(e).SetNext(l.head) + processGroupElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + processGroupElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *processGroupList) PushBack(e *ProcessGroup) { + processGroupElementMapper{}.linkerFor(e).SetNext(nil) + processGroupElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + processGroupElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *processGroupList) PushBackList(m *processGroupList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + processGroupElementMapper{}.linkerFor(l.tail).SetNext(m.head) + processGroupElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *processGroupList) InsertAfter(b, e *ProcessGroup) { + a := processGroupElementMapper{}.linkerFor(b).Next() + processGroupElementMapper{}.linkerFor(e).SetNext(a) + processGroupElementMapper{}.linkerFor(e).SetPrev(b) + processGroupElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + processGroupElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *processGroupList) InsertBefore(a, e *ProcessGroup) { + b := processGroupElementMapper{}.linkerFor(a).Prev() + processGroupElementMapper{}.linkerFor(e).SetNext(a) + processGroupElementMapper{}.linkerFor(e).SetPrev(b) + processGroupElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + processGroupElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *processGroupList) Remove(e *ProcessGroup) { + prev := processGroupElementMapper{}.linkerFor(e).Prev() + next := processGroupElementMapper{}.linkerFor(e).Next() + + if prev != nil { + processGroupElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + processGroupElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type processGroupEntry struct { + next *ProcessGroup + prev *ProcessGroup +} + +// Next returns the entry that follows e in the list. +func (e *processGroupEntry) Next() *ProcessGroup { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *processGroupEntry) Prev() *ProcessGroup { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *processGroupEntry) SetNext(elem *ProcessGroup) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *processGroupEntry) SetPrev(elem *ProcessGroup) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go new file mode 100644 index 000000000..4423e7efd --- /dev/null +++ b/pkg/sentry/kernel/ptrace.go @@ -0,0 +1,1105 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptraceOptions are the subset of options controlling a task's ptrace behavior +// that are set by ptrace(PTRACE_SETOPTIONS). +// +// +stateify savable +type ptraceOptions struct { + // ExitKill is true if the tracee should be sent SIGKILL when the tracer + // exits. + ExitKill bool + + // If SysGood is true, set bit 7 in the signal number for + // syscall-entry-stop and syscall-exit-stop traps delivered to this task's + // tracer. + SysGood bool + + // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE + // events. + TraceClone bool + + // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC + // events. + TraceExec bool + + // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT + // events. + TraceExit bool + + // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK + // events. + TraceFork bool + + // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP + // events. + TraceSeccomp bool + + // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK + // events. + TraceVfork bool + + // TraceVforkDone is true if the tracer wants to receive + // PTRACE_EVENT_VFORK_DONE events. + TraceVforkDone bool +} + +// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry +// and exit. +type ptraceSyscallMode int + +const ( + // ptraceSyscallNone indicates that the task has never ptrace-stopped, or + // that it was resumed from its last ptrace-stop by PTRACE_CONT or + // PTRACE_DETACH. The task's syscalls will not be intercepted. + ptraceSyscallNone ptraceSyscallMode = iota + + // ptraceSyscallIntercept indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a + // syscall, a ptrace-stop will occur. + ptraceSyscallIntercept + + // ptraceSyscallEmu indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time + // the task enters a syscall, the syscall will be skipped, and a + // ptrace-stop will occur. + ptraceSyscallEmu +) + +// CanTrace checks that t is permitted to access target's state, as defined by +// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it +// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access +// mode PTRACE_MODE_READ. +func (t *Task) CanTrace(target *Task, attach bool) bool { + // "1. If the calling thread and the target thread are in the same thread + // group, access is always allowed." - ptrace(2) + // + // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() + // should not deny sub-threads", first released in Linux 3.12), the rule + // only applies if t and target are the same task. But, as that commit + // message puts it, "[any] security check is pointless when the tasks share + // the same ->mm." + if t.tg == target.tg { + return true + } + + // """ + // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, + // doesn't exist until Linux 4.5). + // + // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the + // caller's real UID and GID for the checks in the next step. (Most APIs + // that check the caller's UID and GID use the effective IDs. For + // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs + // instead.) + // + // 3. Deny access if neither of the following is true: + // + // - The real, effective, and saved-set user IDs of the target match the + // caller's user ID, *and* the real, effective, and saved-set group IDs of + // the target match the caller's group ID. + // + // - The caller has the CAP_SYS_PTRACE capability in the user namespace of + // the target. + // + // 4. Deny access if the target process "dumpable" attribute has a value + // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in + // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in + // the user namespace of the target process. + // + // 5. The kernel LSM security_ptrace_access_check() interface is invoked to + // see if ptrace access is permitted. The results depend on the LSM(s). The + // implementation of this interface in the commoncap LSM performs the + // following steps: + // + // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the + // caller's effective capability set; otherwise (the access mode specifies + // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. + // + // b) Deny access if neither of the following is true: + // + // - The caller and the target process are in the same user namespace, and + // the caller's capabilities are a proper superset of the target process's + // permitted capabilities. + // + // - The caller has the CAP_SYS_PTRACE capability in the target process's + // user namespace. + // + // Note that the commoncap LSM does not distinguish between + // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this + // section: "the commoncap LSM ... is always invoked".) + // """ + callerCreds := t.Credentials() + targetCreds := target.Credentials() + if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { + return true + } + if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { + return false + } + if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { + return false + } + // TODO(b/31916171): dumpability check + if callerCreds.UserNamespace != targetCreds.UserNamespace { + return false + } + if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { + return false + } + // TODO: Yama LSM + return true +} + +// Tracer returns t's ptrace Tracer. +func (t *Task) Tracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +// hasTracer returns true if t has a ptrace tracer attached. +func (t *Task) hasTracer() bool { + // This isn't just inlined into callers so that if Task.Tracer() turns out + // to be too expensive because of e.g. interface conversion, we can switch + // to having a separate atomic flag more easily. + return t.Tracer() != nil +} + +// ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +// +// +stateify savable +type ptraceStop struct { + // If frozen is true, the stopped task's tracer is currently operating on + // it, so Task.Kill should not remove the stop. + frozen bool + + // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so + // ptraceFreeze should fail. + listen bool +} + +// Killable implements TaskStop.Killable. +func (s *ptraceStop) Killable() bool { + return !s.frozen +} + +// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been +// killed, the stop is skipped, and beginPtraceStopLocked returns false. +// +// beginPtraceStopLocked does not signal t's tracer or wake it if it is +// waiting. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) beginPtraceStopLocked() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => + // kernel/sched/core.c:__schedule() => signal_pending_state() check, which + // is what prevents tasks from entering ptrace-stops after being killed. + // Note that if t was SIGKILLed and beingPtraceStopLocked is being called + // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before + // entering the exit path, so t.killedLocked() will no longer return true. + // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still + // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be + // changed in the future; SIGKILL is meant to always immediately kill tasks + // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + if t.killedLocked() { + return false + } + t.beginInternalStopLocked(&ptraceStop{}) + return true +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceTrapLocked(code int32) { + // This is unconditional in ptrace_stop(). + t.tg.signalHandlers.mu.Lock() + t.trapStopPending = false + t.tg.signalHandlers.mu.Unlock() + t.ptraceCode = code + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: code, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + if t.beginPtraceStopLocked() { + tracer := t.Tracer() + tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } +} + +// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the +// ptraceStop, temporarily preventing it from being removed by a concurrent +// Task.Kill, and returns true. Otherwise it returns false. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine of t's tracer. +func (t *Task) ptraceFreeze() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return false + } + s, ok := t.stop.(*ptraceStop) + if !ok { + return false + } + if s.listen { + return false + } + s.frozen = true + return true +} + +// ptraceUnfreeze ends the effect of a previous successful call to +// ptraceFreeze. +// +// Preconditions: t must be in a frozen ptraceStop. +func (t *Task) ptraceUnfreeze() { + // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, + // preventing its thread group from completing execve. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.ptraceUnfreezeLocked() +} + +// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be +// locked. +func (t *Task) ptraceUnfreezeLocked() { + // Do this even if the task has been killed to ensure a panic if t.stop is + // nil or not a ptraceStop. + t.stop.(*ptraceStop).frozen = false + if t.killedLocked() { + t.endInternalStopLocked() + } +} + +// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, +// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on +// mode and singlestep. +// +// Preconditions: t must be in a frozen ptrace stop. +// +// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace +// stop. +func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.ptraceCode = int32(sig) + t.ptraceSyscallMode = mode + t.ptraceSinglestep = singlestep + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceTraceme() error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if t.hasTracer() { + return syserror.EPERM + } + if t.parent == nil { + // In Linux, only init can not have a parent, and init is assumed never + // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user + // application that may invoke PTRACE_TRACEME; having no parent can + // also occur if all tasks in the parent thread group have exited, and + // failed to find a living thread group to reparent to. The former case + // is treated as if TGID 1 has an exited parent in an invisible + // ancestor PID namespace that is an owner of the root user namespace + // (and consequently has CAP_SYS_PTRACE), and the latter case is a + // special form of the exited parent case below. In either case, + // returning nil here is correct. + return nil + } + if !t.parent.CanTrace(t, true) { + return syserror.EPERM + } + if t.parent.exitState != TaskExitNone { + // Fail silently, as if we were successfully attached but then + // immediately detached. This is consistent with Linux. + return nil + } + t.ptraceTracer.Store(t.parent) + t.parent.ptraceTracees[t] = struct{}{} + return nil +} + +// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and +// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. +func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { + if t.tg == target.tg { + return syserror.EPERM + } + if !t.CanTrace(target, true) { + return syserror.EPERM + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.hasTracer() { + return syserror.EPERM + } + // Attaching to zombies and dead tasks is not permitted; the exit + // notification logic relies on this. Linux allows attaching to PF_EXITING + // tasks, though. + if target.exitState >= TaskExitZombie { + return syserror.EPERM + } + if seize { + if err := target.ptraceSetOptionsLocked(opts); err != nil { + return syserror.EIO + } + } + target.ptraceTracer.Store(t) + t.ptraceTracees[target] = struct{}{} + target.ptraceSeized = seize + target.tg.signalHandlers.mu.Lock() + // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." - + // ptrace(2) + if !seize { + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + } + // Undocumented Linux feature: If the tracee is already group-stopped (and + // consequently will not report the SIGSTOP just sent), force it to leave + // and re-enter the stop so that it will switch to a ptrace-stop. + if target.stop == (*groupStop)(nil) { + target.trapStopPending = true + target.endInternalStopLocked() + // TODO(jamieliu): Linux blocks ptrace_attach() until the task has + // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. + } + target.tg.signalHandlers.mu.Unlock() + return nil +} + +// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the +// caller. +// +// Preconditions: target must be a tracee of t in a frozen ptrace stop. +// +// Postconditions: If ptraceDetach returns nil, target will no longer be in a +// ptrace stop. +func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + target.ptraceCode = int32(sig) + target.forgetTracerLocked() + delete(t.ptraceTracees, target) + return nil +} + +// exitPtrace is called in the exit path to detach all of t's tracees. +func (t *Task) exitPtrace() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + for target := range t.ptraceTracees { + if target.ptraceOpts.ExitKill { + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, false /* group */) + target.tg.signalHandlers.mu.Unlock() + } + // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it + // observes the ptraceCode it set before it entered the stop. I believe + // this is consistent with Linux. + target.forgetTracerLocked() + } + // "nil maps cannot be saved" + t.ptraceTracees = make(map[*Task]struct{}) +} + +// forgetTracerLocked detaches t's tracer and ensures that t is no longer +// ptrace-stopped. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) forgetTracerLocked() { + t.ptraceSeized = false + t.ptraceOpts = ptraceOptions{} + t.ptraceSyscallMode = ptraceSyscallNone + t.ptraceSinglestep = false + t.ptraceTracer.Store((*Task)(nil)) + if t.exitTracerNotified && !t.exitTracerAcked { + t.exitTracerAcked = true + t.exitNotifyLocked(true) + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If + // it wasn't, it will be reset via t.groupStopPending after the following. + t.trapStopPending = false + // If t's thread group is in a group stop and t is eligible to participate, + // make it do so. This is essentially the reverse of the special case in + // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling + // of restart from group-stop is currently buggy, but the "as planned" + // behavior is to leave tracee stopped and waiting for SIGCONT." - + // ptrace(2)) + if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { + t.groupStopPending = true + // t already participated in the group stop when it unset + // groupStopPending. + t.groupStopAcknowledged = true + t.interrupt() + } + if _, ok := t.stop.(*ptraceStop); ok { + t.endInternalStopLocked() + } +} + +// ptraceSignalLocked is called after signal dequeueing to check if t should +// enter ptrace signal-delivery-stop. +// +// Preconditions: The signal mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { + if linux.Signal(info.Signo) == linux.SIGKILL { + return false + } + if !t.hasTracer() { + return false + } + // The tracer might change this signal into a stop signal, in which case + // any SIGCONT received after the signal was originally dequeued should + // cancel it. This is consistent with Linux. + t.tg.groupStopDequeued = true + // This is unconditional in ptrace_stop(). + t.trapStopPending = false + // Can't lock the TaskSet mutex while holding a signal mutex. + t.tg.signalHandlers.mu.Unlock() + defer t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + tracer := t.Tracer() + if tracer == nil { + return false + } + t.ptraceCode = info.Signo + t.ptraceSiginfo = info + t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + return true +} + +// ptraceSeccomp is called when a seccomp-bpf filter returns action +// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data +// is the lower 16 bits of the filter's return value. +func (t *Task) ptraceSeccomp(data uint16) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceSeccomp { + return false + } + t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data)) + return true +} + +// ptraceSyscallEnter is called immediately before entering a syscall to check +// if t should enter ptrace syscall-enter-stop. +func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { + if !t.hasTracer() { + return nil, false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.ptraceSyscallMode { + case ptraceSyscallNone: + return nil, false + case ptraceSyscallIntercept: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSyscallEnterStop)(nil), true + case ptraceSyscallEmu: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSysemuStop)(nil), true + } + panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) +} + +// ptraceSyscallExit is called immediately after leaving a syscall to check if +// t should enter ptrace syscall-exit-stop. +func (t *Task) ptraceSyscallExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if t.ptraceSyscallMode != ptraceSyscallIntercept { + return + } + t.Debugf("Entering syscall-exit-stop") + t.ptraceSyscallStopLocked() +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceSyscallStopLocked() { + code := int32(linux.SIGTRAP) + if t.ptraceOpts.SysGood { + code |= 0x80 + } + t.ptraceTrapLocked(code) +} + +type ptraceCloneKind int32 + +const ( + // ptraceCloneKindClone represents a call to Task.Clone where + // TerminationSignal is not SIGCHLD and Vfork is false. + ptraceCloneKindClone ptraceCloneKind = iota + + // ptraceCloneKindFork represents a call to Task.Clone where + // TerminationSignal is SIGCHLD and Vfork is false. + ptraceCloneKindFork + + // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is + // true. + ptraceCloneKindVfork +) + +// ptraceClone is called at the end of a clone or fork syscall to check if t +// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK +// stop. child is the new task. +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + event := false + if !opts.Untraced { + switch kind { + case ptraceCloneKindClone: + if t.ptraceOpts.TraceClone { + t.Debugf("Entering PTRACE_EVENT_CLONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindFork: + if t.ptraceOpts.TraceFork { + t.Debugf("Entering PTRACE_EVENT_FORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindVfork: + if t.ptraceOpts.TraceVfork { + t.Debugf("Entering PTRACE_EVENT_VFORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + event = true + } + default: + panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) + } + } + // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE + // options are in effect, then children created by, respectively, vfork(2) + // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit + // signal set to SIGCHLD, and other kinds of clone(2), are automatically + // attached to the same tracer which traced their parent. SIGSTOP is + // delivered to the children, causing them to enter signal-delivery-stop + // after they exit the system call which created them." - ptrace(2) + // + // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is + // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => + // include/linux/ptrace.h:ptrace_init_task(). + if event || opts.InheritTracer { + tracer := t.Tracer() + if tracer != nil { + child.ptraceTracer.Store(tracer) + tracer.ptraceTracees[child] = struct{}{} + // "The "seized" behavior ... is inherited by children that are + // automatically attached using PTRACE_O_TRACEFORK, + // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) + child.ptraceSeized = t.ptraceSeized + // "Flags are inherited by new tracees created and "auto-attached" + // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or + // PTRACE_O_TRACECLONE options." - ptrace(2) + child.ptraceOpts = t.ptraceOpts + child.tg.signalHandlers.mu.Lock() + // "PTRACE_SEIZE: ... Automatically attached children stop with + // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead + // of having SIGSTOP signal delivered to them." - ptrace(2) + if child.ptraceSeized { + child.trapStopPending = true + } else { + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }, nil) + } + // The child will self-interrupt() when its task goroutine starts + // running, so we don't have to. + child.tg.signalHandlers.mu.Unlock() + } + } + return event +} + +// ptraceVforkDone is called after the end of a vfork stop to check if t should +// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's +// PID namespace. +func (t *Task) ptraceVforkDone(child ThreadID) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceVforkDone { + return false + } + t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child)) + return true +} + +// ptraceExec is called at the end of an execve syscall to check if t should +// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID +// namespace, prior to the execve. (If t did not have a tracer at the time +// oldTID was read, oldTID may be 0. This is consistent with Linux.) +func (t *Task) ptraceExec(oldTID ThreadID) { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + // Recheck with the TaskSet mutex locked. Most ptrace points don't need to + // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC + // is special because both TraceExec and !TraceExec do something if a + // tracer is attached. + if !t.hasTracer() { + return + } + if t.ptraceOpts.TraceExec { + t.Debugf("Entering PTRACE_EVENT_EXEC stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID)) + return + } + // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing + // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] + // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after + // execve(2) returns. This is an ordinary signal (similar to one which can + // be generated by `kill -TRAP`, not a special kind of ptrace-stop. + // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 + // (SI_USER). This signal may be blocked by signal mask, and thus may be + // delivered (much) later." - ptrace(2) + if t.ptraceSeized { + return + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: arch.SignalInfoUser, + }, false /* group */) +} + +// ptraceExit is called early in the task exit path to check if t should enter +// PTRACE_EVENT_EXIT stop. +func (t *Task) ptraceExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceExit { + return + } + t.tg.signalHandlers.mu.Lock() + status := t.exitStatus.Status() + t.tg.signalHandlers.mu.Unlock() + t.Debugf("Entering PTRACE_EVENT_EXIT stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceEventLocked(event int32, msg uint64) { + t.ptraceEventMsg = msg + // """ + // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning + // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An + // additional bit is set in the higher byte of the status word: the value + // status>>8 will be + // + // (SIGTRAP | PTRACE_EVENT_foo << 8). + // + // ... + // + // """ - ptrace(2) + t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) +} + +// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. +func (t *Task) ptraceKill(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + // "This operation is deprecated; do not use it! Instead, send a SIGKILL + // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is + // that it requires the tracee to be in signal-delivery-stop, otherwise it + // may not work (i.e., may complete successfully but won't kill the + // tracee)." - ptrace(2) + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + target.ptraceCode = int32(linux.SIGKILL) + target.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceInterrupt(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + if !target.ptraceSeized { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.killedLocked() || target.exitState >= TaskExitInitiated { + return nil + } + target.trapStopPending = true + if s, ok := target.stop.(*ptraceStop); ok && s.listen { + target.endInternalStopLocked() + } + target.interrupt() + return nil +} + +// Preconditions: The TaskSet mutex must be locked for writing. t must have a +// tracer. +func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { + const valid = uintptr(linux.PTRACE_O_EXITKILL | + linux.PTRACE_O_TRACESYSGOOD | + linux.PTRACE_O_TRACECLONE | + linux.PTRACE_O_TRACEEXEC | + linux.PTRACE_O_TRACEEXIT | + linux.PTRACE_O_TRACEFORK | + linux.PTRACE_O_TRACESECCOMP | + linux.PTRACE_O_TRACEVFORK | + linux.PTRACE_O_TRACEVFORKDONE) + if opts&^valid != 0 { + return syserror.EINVAL + } + t.ptraceOpts = ptraceOptions{ + ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, + SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, + TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, + TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, + TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, + TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil +} + +// Ptrace implements the ptrace system call. +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { + // PTRACE_TRACEME ignores all other arguments. + if req == linux.PTRACE_TRACEME { + return t.ptraceTraceme() + } + // All other ptrace requests operate on a current or future tracee + // specified by pid. + target := t.tg.pidns.TaskWithID(pid) + if target == nil { + return syserror.ESRCH + } + + // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already + // a tracee. + if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { + seize := req == linux.PTRACE_SEIZE + if seize && addr != 0 { + return syserror.EIO + } + return t.ptraceAttach(target, seize, uintptr(data)) + } + // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, + // but does not require that it is ptrace-stopped. + if req == linux.PTRACE_KILL { + return t.ptraceKill(target) + } + if req == linux.PTRACE_INTERRUPT { + return t.ptraceInterrupt(target) + } + // All other ptrace requests require that the target is a ptrace-stopped + // tracee, and freeze the ptrace-stop so the tracee can be operated on. + t.tg.pidns.owner.mu.RLock() + if target.Tracer() != t { + t.tg.pidns.owner.mu.RUnlock() + return syserror.ESRCH + } + if !target.ptraceFreeze() { + t.tg.pidns.owner.mu.RUnlock() + // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, + // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the + // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - + // ptrace(2) + return syserror.ESRCH + } + t.tg.pidns.owner.mu.RUnlock() + // Even if the target has a ptrace-stop active, the tracee's task goroutine + // may not yet have reached Task.doStop; wait for it to do so. This is safe + // because there's no way for target to initiate a ptrace-stop and then + // block (by calling Task.block) before entering it. + // + // Caveat: If tasks were just restored, the tracee's first call to + // Task.Activate (in Task.run) occurs before its first call to Task.doStop, + // which may block if the tracer's address space is active. + t.UninterruptibleSleepStart(true) + target.waitGoroutineStoppedOrExited() + t.UninterruptibleSleepFinish(true) + + // Resuming commands end the ptrace stop, but only if successful. + // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the + // target. + switch req { + case linux.PTRACE_DETACH: + if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_CONT: + if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSCALL: + if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU: + if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_LISTEN: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !target.ptraceSeized { + return syserror.EIO + } + if target.ptraceSiginfo == nil { + return syserror.EIO + } + if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.trapNotifyPending { + target.endInternalStopLocked() + } else { + target.stop.(*ptraceStop).listen = true + target.ptraceUnfreezeLocked() + } + return nil + } + + // All other ptrace requests expect us to unfreeze the stop. + defer target.ptraceUnfreeze() + + switch req { + case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA: + // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and + // PTRACE_PEEKUSER requests have a different API: they store the result + // at the address specified by the data parameter, and the return value + // is the error flag." - ptrace(2) + word := t.Arch().Native(0) + if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ + IgnorePermissions: true, + }); err != nil { + return err + } + _, err := t.CopyOut(data, word) + return err + + case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: + _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ + IgnorePermissions: true, + }) + return err + + case linux.PTRACE_GETREGSET: + // "Read the tracee's registers. addr specifies, in an + // architecture-dependent way, the type of registers to be read. ... + // data points to a struct iovec, which describes the destination + // buffer's location and length. On return, the kernel modifies iov.len + // to indicate the actual number of bytes returned." - ptrace(2) + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + + // Update iovecs to represent the range of the written register set. + end, ok := ar.Start.AddLength(uint64(n)) + if !ok { + panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length())) + } + ar.End = end + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_SETREGSET: + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_GETSIGINFO: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.ptraceSiginfo) + return err + + case linux.PTRACE_SETSIGINFO: + var info arch.SignalInfo + if _, err := t.CopyIn(data, &info); err != nil { + return err + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + target.ptraceSiginfo = &info + return nil + + case linux.PTRACE_GETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.SignalMask()) + return err + + case linux.PTRACE_SETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := t.CopyIn(data, &mask); err != nil { + return err + } + // The target's task goroutine is stopped, so this is safe: + target.SetSignalMask(mask &^ UnblockableSignals) + return nil + + case linux.PTRACE_SETOPTIONS: + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + return target.ptraceSetOptionsLocked(uintptr(data)) + + case linux.PTRACE_GETEVENTMSG: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + return err + + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + + default: + return t.ptraceArch(target, req, addr, data) + } +} diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go new file mode 100644 index 000000000..048eeaa3f --- /dev/null +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -0,0 +1,89 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + switch req { + case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER + n, err := target.Arch().PtracePeekUser(uintptr(addr)) + if err != nil { + return err + } + _, err = t.CopyOut(data, n) + return err + + case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER + return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) + + case linux.PTRACE_GETREGS: + // "Copy the tracee's general-purpose ... registers ... to the address + // data in the tracer. ... (addr is ignored.) Note that SPARC systems + // have the meaning of data and addr reversed ..." + _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_GETFPREGS: + _, err := target.Arch().PtraceGetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETREGS: + _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + case linux.PTRACE_SETFPREGS: + _, err := target.Arch().PtraceSetFPRegs(&usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: data, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }) + return err + + default: + return syserror.EIO + } +} diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go new file mode 100644 index 000000000..4899c813f --- /dev/null +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptraceArch implements arch-specific ptrace commands. +func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) error { + return syserror.EIO +} diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go new file mode 100644 index 000000000..c4fb2c56c --- /dev/null +++ b/pkg/sentry/kernel/rseq.go @@ -0,0 +1,120 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Restartable sequences, as described in https://lwn.net/Articles/650333/. + +// RSEQCriticalRegion describes a restartable sequence critical region. +// +// +stateify savable +type RSEQCriticalRegion struct { + // When a task in this thread group has its CPU preempted (as defined by + // platform.ErrContextCPUPreempted) or has a signal delivered to an + // application handler while its instruction pointer is in CriticalSection, + // set the instruction pointer to Restart and application register r10 (on + // amd64) to the former instruction pointer. + CriticalSection usermem.AddrRange + Restart usermem.Addr +} + +// RSEQAvailable returns true if t supports restartable sequences. +func (t *Task) RSEQAvailable() bool { + return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() +} + +// RSEQCriticalRegion returns a copy of t's thread group's current restartable +// sequence. +func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion { + return *t.tg.rscr.Load().(*RSEQCriticalRegion) +} + +// SetRSEQCriticalRegion replaces t's thread group's restartable sequence. +// +// Preconditions: t.RSEQAvailable() == true. +func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error { + // These checks are somewhat more lenient than in Linux, which (bizarrely) + // requires rscr.CriticalSection to be non-empty and rscr.Restart to be + // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0 + // (which disables the critical region). + if rscr.CriticalSection.Start == 0 { + rscr.CriticalSection.End = 0 + rscr.Restart = 0 + t.tg.rscr.Store(&rscr) + return nil + } + if rscr.CriticalSection.Start >= rscr.CriticalSection.End { + return syserror.EINVAL + } + if rscr.CriticalSection.Contains(rscr.Restart) { + return syserror.EINVAL + } + // TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in + // the application address range, for consistency with Linux + t.tg.rscr.Store(&rscr) + return nil +} + +// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU +// number. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) RSEQCPUAddr() usermem.Addr { + return t.rseqCPUAddr +} + +// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU +// number. +// +// Preconditions: t.RSEQAvailable() == true. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error { + t.rseqCPUAddr = addr + if addr != 0 { + t.rseqCPU = int32(hostcpu.GetCPU()) + if err := t.rseqCopyOutCPU(); err != nil { + t.rseqCPUAddr = 0 + t.rseqCPU = -1 + return syserror.EINVAL // yes, EINVAL, not err or EFAULT + } + } else { + t.rseqCPU = -1 + } + return nil +} + +// Preconditions: The caller must be running on the task goroutine. t's +// AddressSpace must be active. +func (t *Task) rseqCopyOutCPU() error { + buf := t.CopyScratchBuffer(4) + usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) + _, err := t.CopyOutBytes(t.rseqCPUAddr, buf) + return err +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) rseqInterrupt() { + rscr := t.tg.rscr.Load().(*RSEQCriticalRegion) + if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) { + t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart) + t.Arch().SetIP(uintptr(rscr.Restart)) + t.Arch().SetRSEQInterruptedIP(ip) + } +} diff --git a/pkg/sentry/kernel/sched/cpuset.go b/pkg/sentry/kernel/sched/cpuset.go new file mode 100644 index 000000000..c6c436690 --- /dev/null +++ b/pkg/sentry/kernel/sched/cpuset.go @@ -0,0 +1,105 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sched + +import "math/bits" + +const ( + bitsPerByte = 8 + bytesPerLong = 8 // only for 64-bit architectures +) + +// CPUSet contains a bitmap to record CPU information. +// +// Note that this definition is only correct for little-endian architectures, +// since Linux's cpumask_t uses unsigned long. +type CPUSet []byte + +// CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. +func CPUSetSize(num uint) uint { + // NOTE(b/68859821): Applications may expect that the size of a CPUSet in + // bytes is always a multiple of sizeof(unsigned long), since this is true + // in Linux. Thus we always round up. + bytes := (num + bitsPerByte - 1) / bitsPerByte + longs := (bytes + bytesPerLong - 1) / bytesPerLong + return longs * bytesPerLong +} + +// NewCPUSet returns a CPUSet for the given number of CPUs which initially +// contains no CPUs. +func NewCPUSet(num uint) CPUSet { + return CPUSet(make([]byte, CPUSetSize(num))) +} + +// NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which +// are present in the set. +func NewFullCPUSet(num uint) CPUSet { + c := NewCPUSet(num) + var i uint + for ; i < num/bitsPerByte; i++ { + c[i] = 0xff + } + if rem := num % bitsPerByte; rem != 0 { + c[i] = (1 << rem) - 1 + } + return c +} + +// Size returns the size of 'c' in bytes. +func (c CPUSet) Size() uint { + return uint(len(c)) +} + +// NumCPUs returns how many cpus are set in the CPUSet. +func (c CPUSet) NumCPUs() uint { + var n int + for _, b := range c { + n += bits.OnesCount8(b) + } + return uint(n) +} + +// Copy returns a copy of the CPUSet. +func (c CPUSet) Copy() CPUSet { + return append(CPUSet(nil), c...) +} + +// Set sets the bit corresponding to cpu. +func (c *CPUSet) Set(cpu uint) { + (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) +} + +// ClearAbove clears bits corresponding to cpu and all higher cpus. +func (c *CPUSet) ClearAbove(cpu uint) { + i := cpu / bitsPerByte + if i >= c.Size() { + return + } + (*c)[i] &^= 0xff << (cpu % bitsPerByte) + for i++; i < c.Size(); i++ { + (*c)[i] = 0 + } +} + +// ForEachCPU iterates over the CPUSet and calls fn with the cpu index if +// it's set. +func (c CPUSet) ForEachCPU(fn func(uint)) { + for i := uint(0); i < c.Size()*bitsPerByte; i++ { + bit := uint(1) << (i & (bitsPerByte - 1)) + if uint(c[i/bitsPerByte])&bit == bit { + fn(i) + } + } +} diff --git a/pkg/sentry/kernel/sched/sched.go b/pkg/sentry/kernel/sched/sched.go new file mode 100644 index 000000000..de18c9d02 --- /dev/null +++ b/pkg/sentry/kernel/sched/sched.go @@ -0,0 +1,16 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sched implements scheduler related features. +package sched diff --git a/pkg/sentry/kernel/sched/sched_state_autogen.go b/pkg/sentry/kernel/sched/sched_state_autogen.go new file mode 100755 index 000000000..2a482732e --- /dev/null +++ b/pkg/sentry/kernel/sched/sched_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package sched + diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go new file mode 100644 index 000000000..cc75eb08a --- /dev/null +++ b/pkg/sentry/kernel/seccomp.go @@ -0,0 +1,217 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const maxSyscallFilterInstructions = 1 << 15 + +// seccompData is equivalent to struct seccomp_data, which contains the data +// passed to seccomp-bpf filters. +type seccompData struct { + // nr is the system call number. + nr int32 + + // arch is an AUDIT_ARCH_* value indicating the system call convention. + arch uint32 + + // instructionPointer is the value of the instruction pointer at the time + // of the system call. + instructionPointer uint64 + + // args contains the first 6 system call arguments. + args [6]uint64 +} + +func (d *seccompData) asBPFInput() bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +} + +func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { + si := &arch.SignalInfo{ + Signo: int32(linux.SIGSYS), + Errno: errno, + Code: arch.SYS_SECCOMP, + } + si.SetCallAddr(uint64(ip)) + si.SetSyscall(sysno) + si.SetArch(t.SyscallTable().AuditNumber) + return si +} + +// checkSeccompSyscall applies the task's seccomp filters before the execution +// of syscall sysno at instruction pointer ip. (These parameters must be passed +// in because vsyscalls do not use the values in t.Arch().) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip usermem.Addr) linux.BPFAction { + result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) + action := result & linux.SECCOMP_RET_ACTION + switch action { + case linux.SECCOMP_RET_TRAP: + // "Results in the kernel sending a SIGSYS signal to the triggering + // task without executing the system call. ... The SECCOMP_RET_DATA + // portion of the return value will be passed as si_errno." - + // Documentation/prctl/seccomp_filter.txt + t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) + // "The return value register will contain an arch-dependent value." In + // practice, it's ~always the syscall number. + t.Arch().SetReturn(uintptr(sysno)) + + case linux.SECCOMP_RET_ERRNO: + // "Results in the lower 16-bits of the return value being passed to + // userland as the errno without executing the system call." + t.Arch().SetReturn(-uintptr(result.Data())) + + case linux.SECCOMP_RET_TRACE: + // "When returned, this value will cause the kernel to attempt to + // notify a ptrace()-based tracer prior to executing the system call. + // If there is no tracer present, -ENOSYS is returned to userland and + // the system call is not executed." + if !t.ptraceSeccomp(result.Data()) { + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + return linux.SECCOMP_RET_ERRNO + } + + case linux.SECCOMP_RET_ALLOW: + // "Results in the system call being executed." + + case linux.SECCOMP_RET_KILL_THREAD: + // "Results in the task exiting immediately without executing the + // system call. The exit status of the task will be SIGSYS, not + // SIGKILL." + + default: + // consistent with Linux + return linux.SECCOMP_RET_KILL_THREAD + } + return action +} + +func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { + data := seccompData{ + nr: sysno, + arch: t.tc.st.AuditNumber, + instructionPointer: uint64(ip), + } + // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so + // we can't do any slicing tricks or even use copy/append here. + for i, arg := range args { + if i >= len(data.args) { + break + } + data.args[i] = arg.Uint64() + } + input := data.asBPFInput() + + ret := uint32(linux.SECCOMP_RET_ALLOW) + f := t.syscallFilters.Load() + if f == nil { + return ret + } + + // "Every filter successfully installed will be evaluated (in reverse + // order) for each system call the task makes." - kernel/seccomp.c + for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { + thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) + if err != nil { + t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) + thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) + } + // "If multiple filters exist, the return value for the evaluation of a + // given system call will always use the highest precedent value." - + // Documentation/prctl/seccomp_filter.txt + // + // (Note that this contradicts prctl(2): "If the filters permit prctl() + // calls, then additional filters can be added; they are run in order + // until the first non-allow result is seen." prctl(2) is incorrect.) + // + // "The ordering ensures that a min_t() over composed return values + // always selects the least permissive choice." - + // include/uapi/linux/seccomp.h + if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { + ret = thisRet + } + } + + return ret +} + +// AppendSyscallFilter adds BPF program p as a system call filter. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { + // While syscallFilters are an atomic.Value we must take the mutex to prevent + // our read-copy-update from happening while another task is syncing syscall + // filters to us, this keeps the filters in a consistent state. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + // Cap the combined length of all syscall filters (plus a penalty of 4 + // instructions per filter beyond the first) to maxSyscallFilterInstructions. + // This restriction is inherited from Linux. + totalLength := p.Length() + var newFilters []bpf.Program + + if sf := t.syscallFilters.Load(); sf != nil { + oldFilters := sf.([]bpf.Program) + for _, f := range oldFilters { + totalLength += f.Length() + 4 + } + newFilters = append(newFilters, oldFilters...) + } + + if totalLength > maxSyscallFilterInstructions { + return syserror.ENOMEM + } + + newFilters = append(newFilters, p) + t.syscallFilters.Store(newFilters) + + if syncAll { + // Note: No new privs is always assumed to be set. + for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { + if ot != t { + var copiedFilters []bpf.Program + copiedFilters = append(copiedFilters, newFilters...) + ot.syscallFilters.Store(copiedFilters) + } + } + } + + return nil +} + +// SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current +// seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) +// and /proc/[pid]/status. +func (t *Task) SeccompMode() int { + f := t.syscallFilters.Load() + if f != nil && len(f.([]bpf.Program)) > 0 { + return linux.SECCOMP_MODE_FILTER + } + return linux.SECCOMP_MODE_NONE +} diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go new file mode 100644 index 000000000..9d0620e02 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -0,0 +1,571 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package semaphore implements System V semaphores. +package semaphore + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + valueMax = 32767 // SEMVMX + + // semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL). + semaphoresMax = 32000 + + // setMax is "system-wide limit on the number of semaphore sets" (SEMMNI). + setsMax = 32000 + + // semaphoresTotalMax is "system-wide limit on the number of semaphores" + // (SEMMNS = SEMMNI*SEMMSL). + semaphoresTotalMax = 1024000000 +) + +// Registry maintains a set of semaphores that can be found by key or ID. +// +// +stateify savable +type Registry struct { + // userNS owning the ipc name this registry belongs to. Immutable. + userNS *auth.UserNamespace + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + semaphores map[int32]*Set + lastIDUsed int32 +} + +// Set represents a set of semaphores that can be operated atomically. +// +// +stateify savable +type Set struct { + // registry owning this sem set. Immutable. + registry *Registry + + // Id is a handle that identifies the set. + ID int32 + + // key is an user provided key that can be shared between processes. + key int32 + + // creator is the user that created the set. Immutable. + creator fs.FileOwner + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + owner fs.FileOwner + perms fs.FilePermissions + opTime ktime.Time + changeTime ktime.Time + + // sems holds all semaphores in the set. The slice itself is immutable after + // it's been set, however each 'sem' object in the slice requires 'mu' lock. + sems []sem + + // dead is set to true when the set is removed and can't be reached anymore. + // All waiters must wake up and fail when set is dead. + dead bool +} + +// sem represents a single semanphore from a set. +// +// +stateify savable +type sem struct { + value int16 + waiters waiterList `state:"zerovalue"` + pid int32 +} + +// waiter represents a caller that is waiting for the semaphore value to +// become positive or zero. +// +// +stateify savable +type waiter struct { + waiterEntry + + // value represents how much resource the waiter needs to wake up. + value int16 + ch chan struct{} +} + +// NewRegistry creates a new semaphore set registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + semaphores: make(map[int32]*Set), + } +} + +// FindOrCreate searches for a semaphore set that matches 'key'. If not found, +// it may create a new one if requested. If private is true, key is ignored and +// a new set is always created. If create is false, it fails if a set cannot +// be found. If exclusive is true, it fails if a set with the same key already +// exists. +func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { + if nsems < 0 || nsems > semaphoresMax { + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if !private { + // Look up an existing semaphore. + if set := r.findByKey(key); set != nil { + set.mu.Lock() + defer set.mu.Unlock() + + // Check that caller can access semaphore set. + creds := auth.CredentialsFromContext(ctx) + if !set.checkPerms(creds, fs.PermsFromMode(mode)) { + return nil, syserror.EACCES + } + + // Validate parameters. + if nsems > int32(set.Size()) { + return nil, syserror.EINVAL + } + if create && exclusive { + return nil, syserror.EEXIST + } + return set, nil + } + + if !create { + // Semaphore not found and should not be created. + return nil, syserror.ENOENT + } + } + + // Zero is only valid if an existing set is found. + if nsems == 0 { + return nil, syserror.EINVAL + } + + // Apply system limits. + if len(r.semaphores) >= setsMax { + return nil, syserror.EINVAL + } + if r.totalSems() > int(semaphoresTotalMax-nsems) { + return nil, syserror.EINVAL + } + + // Finally create a new set. + owner := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newSet(ctx, key, owner, owner, perms, nsems) +} + +// RemoveID removes set with give 'id' from the registry and marks the set as +// dead. All waiters will be awakened and fail. +func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error { + r.mu.Lock() + defer r.mu.Unlock() + + set := r.semaphores[id] + if set == nil { + return syserror.EINVAL + } + + set.mu.Lock() + defer set.mu.Unlock() + + // "The effective user ID of the calling process must match the creator or + // owner of the semaphore set, or the caller must be privileged." + if !set.checkCredentials(creds) && !set.checkCapability(creds) { + return syserror.EACCES + } + + delete(r.semaphores, set.ID) + set.destroy() + return nil +} + +func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) { + set := &Set{ + registry: r, + key: key, + owner: owner, + creator: owner, + perms: perms, + changeTime: ktime.NowFromContext(ctx), + sems: make([]sem, nsems), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.semaphores[id] == nil { + r.lastIDUsed = id + r.semaphores[id] = set + set.ID = id + return set, nil + } + } + + log.Warningf("Semaphore map is full, they must be leaking") + return nil, syserror.ENOMEM +} + +// FindByID looks up a set given an ID. +func (r *Registry) FindByID(id int32) *Set { + r.mu.Lock() + defer r.mu.Unlock() + return r.semaphores[id] +} + +func (r *Registry) findByKey(key int32) *Set { + for _, v := range r.semaphores { + if v.key == key { + return v + } + } + return nil +} + +func (r *Registry) totalSems() int { + totalSems := 0 + for _, v := range r.semaphores { + totalSems += v.Size() + } + return totalSems +} + +func (s *Set) findSem(num int32) *sem { + if num < 0 || int(num) >= s.Size() { + return nil + } + return &s.sems[num] +} + +// Size returns the number of semaphores in the set. Size is immutable. +func (s *Set) Size() int { + return len(s.sems) +} + +// Change changes some fields from the set atomically. +func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error { + s.mu.Lock() + defer s.mu.Unlock() + + // "The effective UID of the calling process must match the owner or creator + // of the semaphore set, or the caller must be privileged." + if !s.checkCredentials(creds) && !s.checkCapability(creds) { + return syserror.EACCES + } + + s.owner = owner + s.perms = perms + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// SetVal overrides a semaphore value, waking up waiters as needed. +func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return syserror.ERANGE + } + + // TODO(b/29354920): Clear undo entries in all processes + sem.value = val + sem.pid = pid + s.changeTime = ktime.NowFromContext(ctx) + sem.wakeWaiters() + return nil +} + +// SetValAll overrides all semaphores values, waking up waiters as needed. It also +// sets semaphore's PID which was fixed in Linux 4.6. +// +// 'len(vals)' must be equal to 's.Size()'. +func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error { + if len(vals) != s.Size() { + panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size())) + } + + for _, val := range vals { + if val < 0 || val > valueMax { + return syserror.ERANGE + } + } + + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have alter permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Write: true}) { + return syserror.EACCES + } + + for i, val := range vals { + sem := &s.sems[i] + + // TODO(b/29354920): Clear undo entries in all processes + sem.value = int16(val) + sem.pid = pid + sem.wakeWaiters() + } + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +// GetVal returns a semaphore value. +func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.value, nil +} + +// GetValAll returns value for all semaphores. +func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return nil, syserror.EACCES + } + + vals := make([]uint16, s.Size()) + for i, sem := range s.sems { + vals[i] = uint16(sem.value) + } + return vals, nil +} + +// GetPID returns the PID set when performing operations in the semaphore. +func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return 0, syserror.EACCES + } + + sem := s.findSem(num) + if sem == nil { + return 0, syserror.ERANGE + } + return sem.pid, nil +} + +// ExecuteOps attempts to execute a list of operations to the set. It only +// succeeds when all operations can be applied. No changes are made if it fails. +// +// On failure, it may return an error (retries are hopeless) or it may return +// a channel that can be waited on before attempting again. +func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // Did it race with a removal operation? + if s.dead { + return nil, 0, syserror.EIDRM + } + + // Validate the operations. + readOnly := true + for _, op := range ops { + if s.findSem(int32(op.SemNum)) == nil { + return nil, 0, syserror.EFBIG + } + if op.SemOp != 0 { + readOnly = false + } + } + + if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) { + return nil, 0, syserror.EACCES + } + + ch, num, err := s.executeOps(ctx, ops, pid) + if err != nil { + return nil, 0, err + } + return ch, num, nil +} + +func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) { + // Changes to semaphores go to this slice temporarily until they all succeed. + tmpVals := make([]int16, len(s.sems)) + for i := range s.sems { + tmpVals[i] = s.sems[i].value + } + + for _, op := range ops { + sem := &s.sems[op.SemNum] + if op.SemOp == 0 { + // Handle 'wait for zero' operation. + if tmpVals[op.SemNum] != 0 { + // Semaphore isn't 0, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + if op.SemOp < 0 { + // Handle 'wait' operation. + if -op.SemOp > valueMax { + return nil, 0, syserror.ERANGE + } + if -op.SemOp > tmpVals[op.SemNum] { + // Not enough resources, must wait. + if op.SemFlg&linux.IPC_NOWAIT != 0 { + return nil, 0, syserror.ErrWouldBlock + } + + w := newWaiter(op.SemOp) + sem.waiters.PushBack(w) + return w.ch, int32(op.SemNum), nil + } + } else { + // op.SemOp > 0: Handle 'signal' operation. + if tmpVals[op.SemNum] > valueMax-op.SemOp { + return nil, 0, syserror.ERANGE + } + } + + tmpVals[op.SemNum] += op.SemOp + } + } + + // All operations succeeded, apply them. + // TODO(b/29354920): handle undo operations. + for i, v := range tmpVals { + s.sems[i].value = v + s.sems[i].wakeWaiters() + s.sems[i].pid = pid + } + s.opTime = ktime.NowFromContext(ctx) + return nil, 0, nil +} + +// AbortWait notifies that a waiter is giving up and will not wait on the +// channel anymore. +func (s *Set) AbortWait(num int32, ch chan struct{}) { + s.mu.Lock() + defer s.mu.Unlock() + + sem := &s.sems[num] + for w := sem.waiters.Front(); w != nil; w = w.Next() { + if w.ch == ch { + sem.waiters.Remove(w) + return + } + } + // Waiter may not be found in case it raced with wakeWaiters(). +} + +func (s *Set) checkCredentials(creds *auth.Credentials) bool { + return s.owner.UID == creds.EffectiveKUID || + s.owner.GID == creds.EffectiveKGID || + s.creator.UID == creds.EffectiveKUID || + s.creator.GID == creds.EffectiveKGID +} + +func (s *Set) checkCapability(creds *auth.Credentials) bool { + return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok() +} + +func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { + // Are we owner, or in group, or other? + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + + // Are permissions satisfied without capability checks? + if p.SupersetOf(reqPerms) { + return true + } + + return s.checkCapability(creds) +} + +// destroy destroys the set. Caller must hold 's.mu'. +func (s *Set) destroy() { + // Notify all waiters. They will fail on the next attempt to execute + // operations and return error. + s.dead = true + for _, s := range s.sems { + for w := s.waiters.Front(); w != nil; w = w.Next() { + w.ch <- struct{}{} + } + s.waiters.Reset() + } +} + +// wakeWaiters goes over all waiters and checks which of them can be notified. +func (s *sem) wakeWaiters() { + // Note that this will release all waiters waiting for 0 too. + for w := s.waiters.Front(); w != nil; { + if s.value < w.value { + // Still blocked, skip it. + continue + } + w.ch <- struct{}{} + old := w + w = w.Next() + s.waiters.Remove(old) + } +} + +func newWaiter(val int16) *waiter { + return &waiter{ + value: val, + ch: make(chan struct{}, 1), + } +} diff --git a/pkg/sentry/kernel/semaphore/semaphore_state_autogen.go b/pkg/sentry/kernel/semaphore/semaphore_state_autogen.go new file mode 100755 index 000000000..1551f792e --- /dev/null +++ b/pkg/sentry/kernel/semaphore/semaphore_state_autogen.go @@ -0,0 +1,115 @@ +// automatically generated by stateify. + +package semaphore + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *Registry) beforeSave() {} +func (x *Registry) save(m state.Map) { + x.beforeSave() + m.Save("userNS", &x.userNS) + m.Save("semaphores", &x.semaphores) + m.Save("lastIDUsed", &x.lastIDUsed) +} + +func (x *Registry) afterLoad() {} +func (x *Registry) load(m state.Map) { + m.Load("userNS", &x.userNS) + m.Load("semaphores", &x.semaphores) + m.Load("lastIDUsed", &x.lastIDUsed) +} + +func (x *Set) beforeSave() {} +func (x *Set) save(m state.Map) { + x.beforeSave() + m.Save("registry", &x.registry) + m.Save("ID", &x.ID) + m.Save("key", &x.key) + m.Save("creator", &x.creator) + m.Save("owner", &x.owner) + m.Save("perms", &x.perms) + m.Save("opTime", &x.opTime) + m.Save("changeTime", &x.changeTime) + m.Save("sems", &x.sems) + m.Save("dead", &x.dead) +} + +func (x *Set) afterLoad() {} +func (x *Set) load(m state.Map) { + m.Load("registry", &x.registry) + m.Load("ID", &x.ID) + m.Load("key", &x.key) + m.Load("creator", &x.creator) + m.Load("owner", &x.owner) + m.Load("perms", &x.perms) + m.Load("opTime", &x.opTime) + m.Load("changeTime", &x.changeTime) + m.Load("sems", &x.sems) + m.Load("dead", &x.dead) +} + +func (x *sem) beforeSave() {} +func (x *sem) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.waiters) { m.Failf("waiters is %v, expected zero", x.waiters) } + m.Save("value", &x.value) + m.Save("pid", &x.pid) +} + +func (x *sem) afterLoad() {} +func (x *sem) load(m state.Map) { + m.Load("value", &x.value) + m.Load("pid", &x.pid) +} + +func (x *waiter) beforeSave() {} +func (x *waiter) save(m state.Map) { + x.beforeSave() + m.Save("waiterEntry", &x.waiterEntry) + m.Save("value", &x.value) + m.Save("ch", &x.ch) +} + +func (x *waiter) afterLoad() {} +func (x *waiter) load(m state.Map) { + m.Load("waiterEntry", &x.waiterEntry) + m.Load("value", &x.value) + m.Load("ch", &x.ch) +} + +func (x *waiterList) beforeSave() {} +func (x *waiterList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *waiterList) afterLoad() {} +func (x *waiterList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *waiterEntry) beforeSave() {} +func (x *waiterEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *waiterEntry) afterLoad() {} +func (x *waiterEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func init() { + state.Register("semaphore.Registry", (*Registry)(nil), state.Fns{Save: (*Registry).save, Load: (*Registry).load}) + state.Register("semaphore.Set", (*Set)(nil), state.Fns{Save: (*Set).save, Load: (*Set).load}) + state.Register("semaphore.sem", (*sem)(nil), state.Fns{Save: (*sem).save, Load: (*sem).load}) + state.Register("semaphore.waiter", (*waiter)(nil), state.Fns{Save: (*waiter).save, Load: (*waiter).load}) + state.Register("semaphore.waiterList", (*waiterList)(nil), state.Fns{Save: (*waiterList).save, Load: (*waiterList).load}) + state.Register("semaphore.waiterEntry", (*waiterEntry)(nil), state.Fns{Save: (*waiterEntry).save, Load: (*waiterEntry).load}) +} diff --git a/pkg/sentry/kernel/semaphore/waiter_list.go b/pkg/sentry/kernel/semaphore/waiter_list.go new file mode 100755 index 000000000..33e29fb55 --- /dev/null +++ b/pkg/sentry/kernel/semaphore/waiter_list.go @@ -0,0 +1,173 @@ +package semaphore + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type waiterElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (waiterElementMapper) linkerFor(elem *waiter) *waiter { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type waiterList struct { + head *waiter + tail *waiter +} + +// Reset resets list l to the empty state. +func (l *waiterList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *waiterList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *waiterList) Front() *waiter { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *waiterList) Back() *waiter { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *waiterList) PushFront(e *waiter) { + waiterElementMapper{}.linkerFor(e).SetNext(l.head) + waiterElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + waiterElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *waiterList) PushBack(e *waiter) { + waiterElementMapper{}.linkerFor(e).SetNext(nil) + waiterElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + waiterElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *waiterList) PushBackList(m *waiterList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + waiterElementMapper{}.linkerFor(l.tail).SetNext(m.head) + waiterElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *waiterList) InsertAfter(b, e *waiter) { + a := waiterElementMapper{}.linkerFor(b).Next() + waiterElementMapper{}.linkerFor(e).SetNext(a) + waiterElementMapper{}.linkerFor(e).SetPrev(b) + waiterElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + waiterElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *waiterList) InsertBefore(a, e *waiter) { + b := waiterElementMapper{}.linkerFor(a).Prev() + waiterElementMapper{}.linkerFor(e).SetNext(a) + waiterElementMapper{}.linkerFor(e).SetPrev(b) + waiterElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + waiterElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *waiterList) Remove(e *waiter) { + prev := waiterElementMapper{}.linkerFor(e).Prev() + next := waiterElementMapper{}.linkerFor(e).Next() + + if prev != nil { + waiterElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + waiterElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type waiterEntry struct { + next *waiter + prev *waiter +} + +// Next returns the entry that follows e in the list. +func (e *waiterEntry) Next() *waiter { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *waiterEntry) Prev() *waiter { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *waiterEntry) SetNext(elem *waiter) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *waiterEntry) SetPrev(elem *waiter) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go new file mode 100755 index 000000000..4bf8719f2 --- /dev/null +++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go @@ -0,0 +1,55 @@ +package kernel + +import ( + "reflect" + "strings" + "unsafe" + + "fmt" + "gvisor.googlesource.com/gvisor/third_party/gvsync" +) + +// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race +// with any writer critical sections in sc. +func SeqAtomicLoadTaskGoroutineSchedInfo(sc *gvsync.SeqCount, ptr *TaskGoroutineSchedInfo) TaskGoroutineSchedInfo { + // This function doesn't use SeqAtomicTryLoad because doing so is + // measurably, significantly (~20%) slower; Go is awful at inlining. + var val TaskGoroutineSchedInfo + for { + epoch := sc.BeginRead() + if gvsync.RaceEnabled { + + gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + + val = *ptr + } + if sc.ReadOk(epoch) { + break + } + } + return val +} + +// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section +// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read +// would race with a writer critical section, SeqAtomicTryLoad returns +// (unspecified, false). +func SeqAtomicTryLoadTaskGoroutineSchedInfo(sc *gvsync.SeqCount, epoch gvsync.SeqCountEpoch, ptr *TaskGoroutineSchedInfo) (TaskGoroutineSchedInfo, bool) { + var val TaskGoroutineSchedInfo + if gvsync.RaceEnabled { + gvsync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + val = *ptr + } + return val, sc.ReadOk(epoch) +} + +func initTaskGoroutineSchedInfo() { + var val TaskGoroutineSchedInfo + typ := reflect.TypeOf(val) + name := typ.Name() + if ptrs := gvsync.PointersInType(typ, name); len(ptrs) != 0 { + panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) + } +} diff --git a/pkg/sentry/kernel/session_list.go b/pkg/sentry/kernel/session_list.go new file mode 100755 index 000000000..9ba27b164 --- /dev/null +++ b/pkg/sentry/kernel/session_list.go @@ -0,0 +1,173 @@ +package kernel + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type sessionElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (sessionElementMapper) linkerFor(elem *Session) *Session { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type sessionList struct { + head *Session + tail *Session +} + +// Reset resets list l to the empty state. +func (l *sessionList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *sessionList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *sessionList) Front() *Session { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *sessionList) Back() *Session { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *sessionList) PushFront(e *Session) { + sessionElementMapper{}.linkerFor(e).SetNext(l.head) + sessionElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + sessionElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *sessionList) PushBack(e *Session) { + sessionElementMapper{}.linkerFor(e).SetNext(nil) + sessionElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + sessionElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *sessionList) PushBackList(m *sessionList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + sessionElementMapper{}.linkerFor(l.tail).SetNext(m.head) + sessionElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *sessionList) InsertAfter(b, e *Session) { + a := sessionElementMapper{}.linkerFor(b).Next() + sessionElementMapper{}.linkerFor(e).SetNext(a) + sessionElementMapper{}.linkerFor(e).SetPrev(b) + sessionElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + sessionElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *sessionList) InsertBefore(a, e *Session) { + b := sessionElementMapper{}.linkerFor(a).Prev() + sessionElementMapper{}.linkerFor(e).SetNext(a) + sessionElementMapper{}.linkerFor(e).SetPrev(b) + sessionElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + sessionElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *sessionList) Remove(e *Session) { + prev := sessionElementMapper{}.linkerFor(e).Prev() + next := sessionElementMapper{}.linkerFor(e).Next() + + if prev != nil { + sessionElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + sessionElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type sessionEntry struct { + next *Session + prev *Session +} + +// Next returns the entry that follows e in the list. +func (e *sessionEntry) Next() *Session { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *sessionEntry) Prev() *Session { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *sessionEntry) SetNext(elem *Session) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *sessionEntry) SetPrev(elem *Session) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go new file mode 100644 index 000000000..610e199da --- /dev/null +++ b/pkg/sentry/kernel/sessions.go @@ -0,0 +1,508 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SessionID is the public identifier. +type SessionID ThreadID + +// ProcessGroupID is the public identifier. +type ProcessGroupID ThreadID + +// Session contains a leader threadgroup and a list of ProcessGroups. +// +// +stateify savable +type Session struct { + refs refs.AtomicRefCount + + // leader is the originator of the Session. + // + // Note that this may no longer be running (and may be reaped), so the + // ID is cached upon initial creation. The leader is still required + // however, since its PIDNamespace defines the scope of the Session. + // + // The leader is immutable. + leader *ThreadGroup + + // id is the cached identifier in the leader's namespace. + // + // The id is immutable. + id SessionID + + // ProcessGroups is a list of process groups in this Session. This is + // protected by TaskSet.mu. + processGroups processGroupList + + // sessionEntry is the embed for TaskSet.sessions. This is protected by + // TaskSet.mu. + sessionEntry +} + +// incRef grabs a reference. +func (s *Session) incRef() { + s.refs.IncRef() +} + +// decRef drops a reference. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (s *Session) decRef() { + s.refs.DecRefWithDestructor(func() { + // Remove translations from the leader. + for ns := s.leader.pidns; ns != nil; ns = ns.parent { + id := ns.sids[s] + delete(ns.sids, s) + delete(ns.sessions, id) + } + + // Remove from the list of global Sessions. + s.leader.pidns.owner.sessions.Remove(s) + }) +} + +// ProcessGroup contains an originator threadgroup and a parent Session. +// +// +stateify savable +type ProcessGroup struct { + refs refs.AtomicRefCount // not exported. + + // originator is the originator of the group. + // + // See note re: leader in Session. The same applies here. + // + // The originator is immutable. + originator *ThreadGroup + + // id is the cached identifier in the originator's namespace. + // + // The id is immutable. + id ProcessGroupID + + // Session is the parent Session. + // + // The session is immutable. + session *Session + + // ancestors is the number of thread groups in this process group whose + // parent is in a different process group in the same session. + // + // The name is derived from the fact that process groups where + // ancestors is zero are considered "orphans". + // + // ancestors is protected by TaskSet.mu. + ancestors uint32 + + // processGroupEntry is the embedded entry for Sessions.groups. This is + // protected by TaskSet.mu. + processGroupEntry +} + +// Originator retuns the originator of the process group. +func (pg *ProcessGroup) Originator() *ThreadGroup { + return pg.originator +} + +// IsOrphan returns true if this process group is an orphan. +func (pg *ProcessGroup) IsOrphan() bool { + pg.originator.TaskSet().mu.RLock() + defer pg.originator.TaskSet().mu.RUnlock() + return pg.ancestors == 0 +} + +// incRefWithParent grabs a reference. +// +// This function is called when this ProcessGroup is being associated with some +// new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent +// ThreadGroup. If tg is init, then parentPG may be nil. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) { + // We acquire an "ancestor" reference in the case of a nil parent. + // This is because the process being associated is init, and init can + // never be orphaned (we count it as always having an ancestor). + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors++ + } + + pg.refs.IncRef() +} + +// decRefWithParent drops a reference. +// +// parentPG is per incRefWithParent. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { + // See incRefWithParent regarding parent == nil. + if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { + pg.ancestors-- + } + + alive := true + pg.refs.DecRefWithDestructor(func() { + alive = false // don't bother with handleOrphan. + + // Remove translations from the originator. + for ns := pg.originator.pidns; ns != nil; ns = ns.parent { + id := ns.pgids[pg] + delete(ns.pgids, pg) + delete(ns.processGroups, id) + } + + // Remove the list of process groups. + pg.session.processGroups.Remove(pg) + pg.session.decRef() + }) + if alive { + pg.handleOrphan() + } +} + +// parentPG returns the parent process group. +// +// Precondition: callers must hold TaskSet.mu. +func (tg *ThreadGroup) parentPG() *ProcessGroup { + if tg.leader.parent != nil { + return tg.leader.parent.tg.processGroup + } + return nil +} + +// handleOrphan checks whether the process group is an orphan and has any +// stopped jobs. If yes, then appropriate signals are delivered to each thread +// group within the process group. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (pg *ProcessGroup) handleOrphan() { + // Check if this process is an orphan. + if pg.ancestors != 0 { + return + } + + // See if there are any stopped jobs. + hasStopped := false + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + if tg.groupStopComplete { + hasStopped = true + } + tg.signalHandlers.mu.Unlock() + }) + if !hasStopped { + return + } + + // Deliver appropriate signals to all thread groups. + pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if tg.processGroup != pg { + return + } + tg.signalHandlers.mu.Lock() + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGHUP), true /* group */) + tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGCONT), true /* group */) + tg.signalHandlers.mu.Unlock() + }) + + return +} + +// Session returns the process group's session without taking a reference. +func (pg *ProcessGroup) Session() *Session { + return pg.session +} + +// SendSignal sends a signal to all processes inside the process group. It is +// analagous to kernel/signal.c:kill_pgrp. +func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { + tasks := pg.originator.TaskSet() + tasks.mu.RLock() + defer tasks.mu.RUnlock() + + var lastErr error + for tg := range tasks.Root.tgids { + if tg.ProcessGroup() == pg { + tg.signalHandlers.mu.Lock() + infoCopy := *info + if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { + lastErr = err + } + tg.signalHandlers.mu.Unlock() + } + } + return lastErr +} + +// CreateSession creates a new Session, with the ThreadGroup as the leader. +// +// EPERM may be returned if either the given ThreadGroup is already a Session +// leader, or a ProcessGroup already exists for the ThreadGroup's ID. +func (tg *ThreadGroup) CreateSession() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + return tg.createSession() +} + +// createSession creates a new session for a threadgroup. +// +// Precondition: callers must hold TaskSet.mu for writing. +func (tg *ThreadGroup) createSession() error { + // Get the ID for this thread in the current namespace. + id := tg.pidns.tgids[tg] + + // Check if this ThreadGroup already leads a Session, or + // if the proposed group is already taken. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + if s.id == SessionID(id) { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new Session, with a single reference. + s := &Session{ + id: SessionID(id), + leader: tg, + } + + // Create a new ProcessGroup, belonging to that Session. + // This also has a single reference (assigned below). + // + // Note that since this is a new session and a new process group, there + // will be zero ancestors for this process group. (It is an orphan at + // this point.) + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: s, + ancestors: 0, + } + + // Tie them and return the result. + s.processGroups.PushBack(pg) + tg.pidns.owner.sessions.PushBack(s) + + // Leave the current group, and assign the new one. + if tg.processGroup != nil { + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = pg + } else { + // The current process group may be nil only in the case of an + // unparented thread group (i.e. the init process). This would + // not normally occur, but we allow it for the convenience of + // CreateSession working from that point. There will be no + // child processes. We always say that the very first group + // created has ancestors (avoids checks elsewhere). + // + // Note that this mirrors the parent == nil logic in + // incRef/decRef/reparent, which counts nil as an ancestor. + tg.processGroup = pg + tg.processGroup.ancestors++ + } + + // Ensure a translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tgids[tg] + ns.sids[s] = SessionID(local) + ns.sessions[SessionID(local)] = s + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + return nil +} + +// CreateProcessGroup creates a new process group. +// +// An EPERM error will be returned if the ThreadGroup belongs to a different +// Session, is a Session leader or the group already exists. +func (tg *ThreadGroup) CreateProcessGroup() error { + tg.pidns.owner.mu.Lock() + defer tg.pidns.owner.mu.Unlock() + + // Get the ID for this thread in the current namespace. + id := tg.pidns.tgids[tg] + + // Per above, check for a Session leader or existing group. + for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { + if s.leader.pidns != tg.pidns { + continue + } + if s.leader == tg { + return syserror.EPERM + } + for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { + if pg.id == ProcessGroupID(id) { + return syserror.EPERM + } + } + } + + // Create a new ProcessGroup, belonging to the current Session. + // + // We manually adjust the ancestors if the parent is in the same + // session. + tg.processGroup.session.incRef() + pg := &ProcessGroup{ + id: ProcessGroupID(id), + originator: tg, + session: tg.processGroup.session, + } + if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { + pg.ancestors++ + } + + // Assign the new process group; adjust children. + oldParentPG := tg.parentPG() + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(oldParentPG) + }) + tg.processGroup.decRefWithParent(oldParentPG) + tg.processGroup = pg + + // Add the new process group to the session. + pg.session.processGroups.PushBack(pg) + + // Ensure this translation is added to all namespaces. + for ns := tg.pidns; ns != nil; ns = ns.parent { + local := ns.tgids[tg] + ns.pgids[pg] = ProcessGroupID(local) + ns.processGroups[ProcessGroupID(local)] = pg + } + + return nil +} + +// JoinProcessGroup joins an existing process group. +// +// This function will return EACCES if an exec has been performed since fork +// by the given ThreadGroup, and EPERM if the Sessions are not the same or the +// group does not exist. +// +// If checkExec is set, then the join is not permitted after the process has +// executed exec at least once. +func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error { + pidns.owner.mu.Lock() + defer pidns.owner.mu.Unlock() + + // Lookup the ProcessGroup. + pg := pidns.processGroups[pgid] + if pg == nil { + return syserror.EPERM + } + + // Disallow the join if an execve has performed, per POSIX. + if checkExec && tg.execed { + return syserror.EACCES + } + + // See if it's in the same session as ours. + if pg.session != tg.processGroup.session { + return syserror.EPERM + } + + // Join the group; adjust children. + parentPG := tg.parentPG() + pg.incRefWithParent(parentPG) + tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { + childTG.processGroup.incRefWithParent(pg) + childTG.processGroup.decRefWithParent(tg.processGroup) + }) + tg.processGroup.decRefWithParent(parentPG) + tg.processGroup = pg + + return nil +} + +// Session returns the ThreadGroup's Session. +// +// A reference is not taken on the session. +func (tg *ThreadGroup) Session() *Session { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.processGroup.session +} + +// IDOfSession returns the Session assigned to s in PID namespace ns. +// +// If this group isn't visible in this namespace, zero will be returned. It is +// the callers responsibility to check that before using this function. +func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.sids[s] +} + +// SessionWithID returns the Session with the given ID in the PID namespace ns, +// or nil if that given ID is not defined in this namespace. +// +// A reference is not taken on the session. +func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.sessions[id] +} + +// ProcessGroup returns the ThreadGroup's ProcessGroup. +// +// A reference is not taken on the process group. +func (tg *ThreadGroup) ProcessGroup() *ProcessGroup { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.processGroup +} + +// IDOfProcessGroup returns the process group assigned to pg in PID namespace ns. +// +// The same constraints apply as IDOfSession. +func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.pgids[pg] +} + +// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID +// namespace ns, or nil if that given ID is not defined in this namespace. +// +// A reference is not taken on the process group. +func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup { + pidns.owner.mu.RLock() + defer pidns.owner.mu.RUnlock() + return pidns.processGroups[id] +} diff --git a/pkg/sentry/kernel/shm/device.go b/pkg/sentry/kernel/shm/device.go new file mode 100644 index 000000000..3cb759072 --- /dev/null +++ b/pkg/sentry/kernel/shm/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package shm + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// shmDevice is the kernel shm device. +var shmDevice = device.NewAnonDevice() diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go new file mode 100644 index 000000000..00393b5f0 --- /dev/null +++ b/pkg/sentry/kernel/shm/shm.go @@ -0,0 +1,671 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package shm implements sysv shared memory segments. +// +// Known missing features: +// +// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement +// memory locking in general. +// +// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy +// way to implement hugetlb support on a per-map basis, and it has no impact +// on correctness. +// +// - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap +// so it's meaningless to reserve space for swap. +// +// - No per-process segment size enforcement. This feature probably isn't used +// much anyways, since Linux sets the per-process limits to the system-wide +// limits by default. +// +// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock +package shm + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Key represents a shm segment key. Analogous to a file name. +type Key int32 + +// ID represents the opaque handle for a shm segment. Analogous to an fd. +type ID int32 + +// Registry tracks all shared memory segments in an IPC namespace. The registry +// provides the mechanisms for creating and finding segments, and reporting +// global shm parameters. +// +// +stateify savable +type Registry struct { + // userNS owns the IPC namespace this registry belong to. Immutable. + userNS *auth.UserNamespace + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // shms maps segment ids to segments. + shms map[ID]*Shm + + // keysToShms maps segment keys to segments. + keysToShms map[Key]*Shm + + // Sum of the sizes of all existing segments rounded up to page size, in + // units of page size. + totalPages uint64 + + // ID assigned to the last created segment. Used to quickly find the next + // unused ID. + lastIDUsed ID +} + +// NewRegistry creates a new shm registry. +func NewRegistry(userNS *auth.UserNamespace) *Registry { + return &Registry{ + userNS: userNS, + shms: make(map[ID]*Shm), + keysToShms: make(map[Key]*Shm), + } +} + +// FindByID looks up a segment given an ID. +func (r *Registry) FindByID(id ID) *Shm { + r.mu.Lock() + defer r.mu.Unlock() + return r.shms[id] +} + +// dissociateKey removes the association between a segment and its key, +// preventing it from being discovered in the registry. This doesn't necessarily +// mean the segment is about to be destroyed. This is analogous to unlinking a +// file; the segment can still be used by a process already referencing it, but +// cannot be discovered by a new process. +func (r *Registry) dissociateKey(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + if s.key != linux.IPC_PRIVATE { + delete(r.keysToShms, s.key) + s.key = linux.IPC_PRIVATE + } +} + +// FindOrCreate looks up or creates a segment in the registry. It's functionally +// analogous to open(2). +func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { + if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { + // "A new segment was to be created and size is less than SHMMIN or + // greater than SHMMAX." - man shmget(2) + // + // Note that 'private' always implies the creation of a new segment + // whether IPC_CREAT is specified or not. + return nil, syserror.EINVAL + } + + r.mu.Lock() + defer r.mu.Unlock() + + if len(r.shms) >= linux.SHMMNI { + // "All possible shared memory IDs have been taken (SHMMNI) ..." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + if !private { + // Look up an existing segment. + if shm := r.keysToShms[key]; shm != nil { + shm.mu.Lock() + defer shm.mu.Unlock() + + // Check that caller can access the segment. + if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { + // "The user does not have permission to access the shared + // memory segment, and does not have the CAP_IPC_OWNER + // capability in the user namespace that governs its IPC + // namespace." - man shmget(2) + return nil, syserror.EACCES + } + + if size > shm.size { + // "A segment for the given key exists, but size is greater than + // the size of that segment." - man shmget(2) + return nil, syserror.EINVAL + } + + if create && exclusive { + // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a + // shared memory segment already exists for key." + // - man shmget(2) + return nil, syserror.EEXIST + } + + return shm, nil + } + + if !create { + // "No segment exists for the given key, and IPC_CREAT was not + // specified." - man shmget(2) + return nil, syserror.ENOENT + } + } + + var sizeAligned uint64 + if val, ok := usermem.Addr(size).RoundUp(); ok { + sizeAligned = uint64(val) + } else { + return nil, syserror.EINVAL + } + + if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL { + // "... allocating a segment of the requested size would cause the + // system to exceed the system-wide limit on shared memory (SHMALL)." + // - man shmget(2) + return nil, syserror.ENOSPC + } + + // Need to create a new segment. + creator := fs.FileOwnerFromContext(ctx) + perms := fs.FilePermsFromMode(mode) + return r.newShm(ctx, pid, key, creator, perms, size) +} + +// newShm creates a new segment in the registry. +// +// Precondition: Caller must hold r.mu. +func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) + } + + effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) + fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) + if err != nil { + return nil, err + } + + shm := &Shm{ + mfp: mfp, + registry: r, + creator: creator, + size: size, + effectiveSize: effectiveSize, + fr: fr, + key: key, + perms: perms, + owner: creator, + creatorPID: pid, + changeTime: ktime.NowFromContext(ctx), + } + + // Find the next available ID. + for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { + // Handle wrap around. + if id < 0 { + id = 0 + continue + } + if r.shms[id] == nil { + r.lastIDUsed = id + + shm.ID = id + r.shms[id] = shm + r.keysToShms[key] = shm + + r.totalPages += effectiveSize / usermem.PageSize + + return shm, nil + } + } + + log.Warningf("Shm ids exhuasted, they may be leaking") + return nil, syserror.ENOSPC +} + +// IPCInfo reports global parameters for sysv shared memory segments on this +// system. See shmctl(IPC_INFO). +func (r *Registry) IPCInfo() *linux.ShmParams { + return &linux.ShmParams{ + ShmMax: linux.SHMMAX, + ShmMin: linux.SHMMIN, + ShmMni: linux.SHMMNI, + ShmSeg: linux.SHMSEG, + ShmAll: linux.SHMALL, + } +} + +// ShmInfo reports linux-specific global parameters for sysv shared memory +// segments on this system. See shmctl(SHM_INFO). +func (r *Registry) ShmInfo() *linux.ShmInfo { + r.mu.Lock() + defer r.mu.Unlock() + + return &linux.ShmInfo{ + UsedIDs: int32(r.lastIDUsed), + ShmTot: r.totalPages, + ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. + ShmSwp: 0, // No reclaim at the moment. + } +} + +// remove deletes a segment from this registry, deaccounting the memory used by +// the segment. +// +// Precondition: Must follow a call to r.dissociateKey(s). +func (r *Registry) remove(s *Shm) { + r.mu.Lock() + defer r.mu.Unlock() + s.mu.Lock() + defer s.mu.Unlock() + + if s.key != linux.IPC_PRIVATE { + panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) + } + + delete(r.shms, s.ID) + r.totalPages -= s.effectiveSize / usermem.PageSize +} + +// Shm represents a single shared memory segment. +// +// Shm segment are backed directly by an allocation from platform +// memory. Segments are always mapped as a whole, greatly simplifying how +// mappings are tracked. However note that mremap and munmap calls may cause the +// vma for a segment to become fragmented; which requires special care when +// unmapping a segment. See mm/shm.go. +// +// Segments persist until they are explicitly marked for destruction via +// shmctl(SHM_RMID). +// +// Shm implements memmap.Mappable and memmap.MappingIdentity. +// +// +stateify savable +type Shm struct { + // AtomicRefCount tracks the number of references to this segment from + // maps. A segment always holds a reference to itself, until it's marked for + // destruction. + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + + // registry points to the shm registry containing this segment. Immutable. + registry *Registry + + // ID is the kernel identifier for this segment. Immutable. + ID ID + + // creator is the user that created the segment. Immutable. + creator fs.FileOwner + + // size is the requested size of the segment at creation, in + // bytes. Immutable. + size uint64 + + // effectiveSize of the segment, rounding up to the next page + // boundary. Immutable. + // + // Invariant: effectiveSize must be a multiple of usermem.PageSize. + effectiveSize uint64 + + // fr is the offset into mfp.MemoryFile() that backs this contents of this + // segment. Immutable. + fr platform.FileRange + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // key is the public identifier for this segment. + key Key + + // perms is the access permissions for the segment. + perms fs.FilePermissions + + // owner of this segment. + owner fs.FileOwner + // attachTime is updated on every successful shmat. + attachTime ktime.Time + // detachTime is updated on every successful shmdt. + detachTime ktime.Time + // changeTime is updated on every successful changes to the segment via + // shmctl(IPC_SET). + changeTime ktime.Time + + // creatorPID is the PID of the process that created the segment. + creatorPID int32 + // lastAttachDetachPID is the pid of the process that issued the last shmat + // or shmdt syscall. + lastAttachDetachPID int32 + + // pendingDestruction indicates the segment was marked as destroyed through + // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found + // in the registry and can no longer be attached. When the last user + // detaches from the segment, it is destroyed. + pendingDestruction bool +} + +// Precondition: Caller must hold s.mu. +func (s *Shm) debugLocked() string { + return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", + s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (s *Shm) MappedName(ctx context.Context) string { + s.mu.Lock() + defer s.mu.Unlock() + return fmt.Sprintf("SYSV%08d", s.key) +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (s *Shm) DeviceID() uint64 { + return shmDevice.DeviceID() +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (s *Shm) InodeID() uint64 { + // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use + // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() + return uint64(s.ID) +} + +// DecRef overrides refs.RefCount.DecRef with a destructor. +// +// Precondition: Caller must not hold s.mu. +func (s *Shm) DecRef() { + s.DecRefWithDestructor(s.destroy) +} + +// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm +// segments. +func (s *Shm) Msync(context.Context, memmap.MappableRange) error { + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) error { + s.mu.Lock() + defer s.mu.Unlock() + s.attachTime = ktime.NowFromContext(ctx) + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + // AddMapping is called during a syscall, so ctx should always be a task + // context. + log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked()) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) { + s.mu.Lock() + defer s.mu.Unlock() + // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx + // is context.Background. Gracefully handle missing clocks. Failing to + // update the detach time in these cases is ok, since no one can observe the + // omission. + if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + s.detachTime = clock.Now() + } + + // If called from a non-task context we also won't have a threadgroup + // id. Silently skip updating the lastAttachDetachPid in that case. + if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { + s.lastAttachDetachPID = pid + } else { + log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked()) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > s.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: s.mfp.MemoryFile(), + Offset: s.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (s *Shm) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// AttachOpts describes various flags passed to shmat(2). +type AttachOpts struct { + Execute bool + Readonly bool + Remap bool +} + +// ConfigureAttach creates an mmap configuration for the segment with the +// requested attach options. +// +// ConfigureAttach returns with a ref on s on success. The caller should drop +// this once the map is installed. This reference prevents s from being +// destroyed before the returned configuration is used. +func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { + s.mu.Lock() + defer s.mu.Unlock() + if s.pendingDestruction && s.ReadRefs() == 0 { + return memmap.MMapOpts{}, syserror.EIDRM + } + + if !s.checkPermissions(ctx, fs.PermMask{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }) { + // "The calling process does not have the required permissions for the + // requested attach type, and does not have the CAP_IPC_OWNER capability + // in the user namespace that governs its IPC namespace." - man shmat(2) + return memmap.MMapOpts{}, syserror.EACCES + } + s.IncRef() + return memmap.MMapOpts{ + Length: s.size, + Offset: 0, + Addr: addr, + Fixed: opts.Remap, + Perms: usermem.AccessType{ + Read: true, + Write: !opts.Readonly, + Execute: opts.Execute, + }, + MaxPerms: usermem.AnyAccess, + Mappable: s, + MappingIdentity: s, + }, nil +} + +// EffectiveSize returns the size of the underlying shared memory segment. This +// may be larger than the requested size at creation, due to rounding to page +// boundaries. +func (s *Shm) EffectiveSize() uint64 { + return s.effectiveSize +} + +// IPCStat returns information about a shm. See shmctl(IPC_STAT). +func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The caller must have read permission on the shared memory segment." + // - man shmctl(2) + if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { + // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow + // read access for shmid, and the calling process does not have the + // CAP_IPC_OWNER capability in the user namespace that governs its IPC + // namespace." - man shmctl(2) + return nil, syserror.EACCES + } + + var mode uint16 + if s.pendingDestruction { + mode |= linux.SHM_DEST + } + creds := auth.CredentialsFromContext(ctx) + + nattach := uint64(s.ReadRefs()) + // Don't report the self-reference we keep prior to being marked for + // destruction. However, also don't report a count of -1 for segments marked + // as destroyed, with no mappings. + if !s.pendingDestruction { + nattach-- + } + + ds := &linux.ShmidDS{ + ShmPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: mode | uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequences not supported. + }, + ShmSegsz: s.size, + ShmAtime: s.attachTime.TimeT(), + ShmDtime: s.detachTime.TimeT(), + ShmCtime: s.changeTime.TimeT(), + ShmCpid: s.creatorPID, + ShmLpid: s.lastAttachDetachPID, + ShmNattach: nattach, + } + + return ds, nil +} + +// Set modifies attributes for a segment. See shmctl(IPC_SET). +func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { + s.mu.Lock() + defer s.mu.Unlock() + + if !s.checkOwnership(ctx) { + return syserror.EPERM + } + + creds := auth.CredentialsFromContext(ctx) + uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) + gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) + if !uid.Ok() || !gid.Ok() { + return syserror.EINVAL + } + + // User may only modify the lower 9 bits of the mode. All the other bits are + // always 0 for the underlying inode. + mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) + s.perms = fs.FilePermsFromMode(mode) + + s.owner.UID = uid + s.owner.GID = gid + + s.changeTime = ktime.NowFromContext(ctx) + return nil +} + +func (s *Shm) destroy() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) +} + +// MarkDestroyed marks a segment for destruction. The segment is actually +// destroyed once it has no references. MarkDestroyed may be called multiple +// times, and is safe to call after a segment has already been destroyed. See +// shmctl(IPC_RMID). +func (s *Shm) MarkDestroyed() { + s.registry.dissociateKey(s) + + s.mu.Lock() + // Only drop the segment's self-reference once, when destruction is + // requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a + // segment to be destroyed prematurely, potentially with active maps to the + // segment's address range. Remaining references are dropped when the + // segment is detached or unmaped. + if !s.pendingDestruction { + s.pendingDestruction = true + s.mu.Unlock() // Must release s.mu before calling s.DecRef. + s.DecRef() + return + } + s.mu.Unlock() +} + +// checkOwnership verifies whether a segment may be accessed by ctx as an +// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkOwnership(ctx context.Context) bool { + creds := auth.CredentialsFromContext(ctx) + if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { + return true + } + + // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux + // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented + // for use to "override IPC ownership checks". + return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) +} + +// checkPermissions verifies whether a segment is accessible by ctx for access +// described by req. See ipc/util.c:ipcperms() in Linux. +// +// Precondition: Caller must hold s.mu. +func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { + creds := auth.CredentialsFromContext(ctx) + + p := s.perms.Other + if s.owner.UID == creds.EffectiveKUID { + p = s.perms.User + } else if creds.InGroup(s.owner.GID) { + p = s.perms.Group + } + if p.SupersetOf(req) { + return true + } + + // Tasks with CAP_IPC_OWNER may bypass permission checks. + return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) +} diff --git a/pkg/sentry/kernel/shm/shm_state_autogen.go b/pkg/sentry/kernel/shm/shm_state_autogen.go new file mode 100755 index 000000000..d94d01fce --- /dev/null +++ b/pkg/sentry/kernel/shm/shm_state_autogen.go @@ -0,0 +1,74 @@ +// automatically generated by stateify. + +package shm + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *Registry) beforeSave() {} +func (x *Registry) save(m state.Map) { + x.beforeSave() + m.Save("userNS", &x.userNS) + m.Save("shms", &x.shms) + m.Save("keysToShms", &x.keysToShms) + m.Save("totalPages", &x.totalPages) + m.Save("lastIDUsed", &x.lastIDUsed) +} + +func (x *Registry) afterLoad() {} +func (x *Registry) load(m state.Map) { + m.Load("userNS", &x.userNS) + m.Load("shms", &x.shms) + m.Load("keysToShms", &x.keysToShms) + m.Load("totalPages", &x.totalPages) + m.Load("lastIDUsed", &x.lastIDUsed) +} + +func (x *Shm) beforeSave() {} +func (x *Shm) save(m state.Map) { + x.beforeSave() + m.Save("AtomicRefCount", &x.AtomicRefCount) + m.Save("mfp", &x.mfp) + m.Save("registry", &x.registry) + m.Save("ID", &x.ID) + m.Save("creator", &x.creator) + m.Save("size", &x.size) + m.Save("effectiveSize", &x.effectiveSize) + m.Save("fr", &x.fr) + m.Save("key", &x.key) + m.Save("perms", &x.perms) + m.Save("owner", &x.owner) + m.Save("attachTime", &x.attachTime) + m.Save("detachTime", &x.detachTime) + m.Save("changeTime", &x.changeTime) + m.Save("creatorPID", &x.creatorPID) + m.Save("lastAttachDetachPID", &x.lastAttachDetachPID) + m.Save("pendingDestruction", &x.pendingDestruction) +} + +func (x *Shm) afterLoad() {} +func (x *Shm) load(m state.Map) { + m.Load("AtomicRefCount", &x.AtomicRefCount) + m.Load("mfp", &x.mfp) + m.Load("registry", &x.registry) + m.Load("ID", &x.ID) + m.Load("creator", &x.creator) + m.Load("size", &x.size) + m.Load("effectiveSize", &x.effectiveSize) + m.Load("fr", &x.fr) + m.Load("key", &x.key) + m.Load("perms", &x.perms) + m.Load("owner", &x.owner) + m.Load("attachTime", &x.attachTime) + m.Load("detachTime", &x.detachTime) + m.Load("changeTime", &x.changeTime) + m.Load("creatorPID", &x.creatorPID) + m.Load("lastAttachDetachPID", &x.lastAttachDetachPID) + m.Load("pendingDestruction", &x.pendingDestruction) +} + +func init() { + state.Register("shm.Registry", (*Registry)(nil), state.Fns{Save: (*Registry).save, Load: (*Registry).load}) + state.Register("shm.Shm", (*Shm)(nil), state.Fns{Save: (*Shm).save, Load: (*Shm).load}) +} diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go new file mode 100644 index 000000000..b528ec0dc --- /dev/null +++ b/pkg/sentry/kernel/signal.go @@ -0,0 +1,76 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +// SignalPanic is used to panic the running threads. It is a signal which +// cannot be used by the application: it must be caught and ignored by the +// runtime (in order to catch possible races). +const SignalPanic = linux.SIGUSR2 + +// sendExternalSignal is called when an asynchronous signal is sent to the +// sentry ("in sentry context"). On some platforms, it may also be called when +// an asynchronous signal is sent to sandboxed application threads ("in +// application context"). +// +// context is used only for debugging to differentiate these cases. +// +// Preconditions: Kernel must have an init process. +func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { + switch linux.Signal(info.Signo) { + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case SignalPanic: + // SignalPanic is also specially handled in sentry setup to ensure that + // it causes a panic even after tasks exit, but SignalPanic may also + // be sent here if it is received while in app context. + panic("Signal-induced panic") + + default: + log.Infof("Received external signal %d in %s context", info.Signo, context) + if k.globalInit == nil { + panic(fmt.Sprintf("Received external signal %d before init created", info.Signo)) + } + k.globalInit.SendSignal(info) + } +} + +// SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV. +func SignalInfoPriv(sig linux.Signal) *arch.SignalInfo { + return &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoKernel, + } +} + +// SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO. +func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) + info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + return info +} diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go new file mode 100644 index 000000000..ce8bcb5e5 --- /dev/null +++ b/pkg/sentry/kernel/signal_handlers.go @@ -0,0 +1,89 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// SignalHandlers holds information about signal actions. +// +// +stateify savable +type SignalHandlers struct { + // mu protects actions, as well as the signal state of all tasks and thread + // groups using this SignalHandlers object. (See comment on + // ThreadGroup.signalHandlers.) + mu sync.Mutex `state:"nosave"` + + // actions is the action to be taken upon receiving each signal. + actions map[linux.Signal]arch.SignalAct +} + +// NewSignalHandlers returns a new SignalHandlers specifying all default +// actions. +func NewSignalHandlers() *SignalHandlers { + return &SignalHandlers{ + actions: make(map[linux.Signal]arch.SignalAct), + } +} + +// Fork returns a copy of sh for a new thread group. +func (sh *SignalHandlers) Fork() *SignalHandlers { + sh2 := NewSignalHandlers() + sh.mu.Lock() + defer sh.mu.Unlock() + for sig, act := range sh.actions { + sh2.actions[sig] = act + } + return sh2 +} + +// CopyForExec returns a copy of sh for a thread group that is undergoing an +// execve. (See comments in Task.finishExec.) +func (sh *SignalHandlers) CopyForExec() *SignalHandlers { + sh2 := NewSignalHandlers() + sh.mu.Lock() + defer sh.mu.Unlock() + for sig, act := range sh.actions { + if act.Handler == arch.SignalActIgnore { + sh2.actions[sig] = arch.SignalAct{ + Handler: arch.SignalActIgnore, + } + } + } + return sh2 +} + +// IsIgnored returns true if the signal is ignored. +func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool { + sh.mu.Lock() + defer sh.mu.Unlock() + sa, ok := sh.actions[sig] + return ok && sa.Handler == arch.SignalActIgnore +} + +// dequeueActionLocked returns the SignalAct that should be used to handle sig. +// +// Preconditions: sh.mu must be locked. +func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct { + act := sh.actions[sig] + if act.IsResetHandler() { + delete(sh.actions, sig) + } + return act +} diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go new file mode 100644 index 000000000..0572053db --- /dev/null +++ b/pkg/sentry/kernel/syscalls.go @@ -0,0 +1,307 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// maxSyscallNum is the highest supported syscall number. +// +// The types below create fast lookup slices for all syscalls. This maximum +// serves as a sanity check that we don't allocate huge slices for a very large +// syscall. +const maxSyscallNum = 2000 + +// SyscallFn is a syscall implementation. +type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) + +// MissingFn is a syscall to be called when an implementation is missing. +type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) + +// Possible flags for SyscallFlagsTable.enable. +const ( + // syscallPresent indicates that this is not a missing syscall. + // + // This flag is used internally in SyscallFlagsTable. + syscallPresent = 1 << iota + + // StraceEnableLog enables syscall log tracing. + StraceEnableLog + + // StraceEnableEvent enables syscall event tracing. + StraceEnableEvent + + // ExternalBeforeEnable enables the external hook before syscall execution. + ExternalBeforeEnable + + // ExternalAfterEnable enables the external hook after syscall execution. + ExternalAfterEnable +) + +// StraceEnableBits combines both strace log and event flags. +const StraceEnableBits = StraceEnableLog | StraceEnableEvent + +// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall +// basis. +type SyscallFlagsTable struct { + // mu protects writes to the fields below. + // + // Atomic loads are always allowed. Atomic stores are allowed only + // while mu is held. + mu sync.Mutex + + // enable contains the enable bits for each syscall. + // + // missing syscalls have the same value in enable as missingEnable to + // avoid an extra branch in Word. + enable []uint32 + + // missingEnable contains the enable bits for missing syscalls. + missingEnable uint32 +} + +// Init initializes the struct, with all syscalls in table set to enable. +// +// max is the largest syscall number in table. +func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) { + e.enable = make([]uint32, max+1) + for num := range table { + e.enable[num] = syscallPresent + } +} + +// Word returns the enable bitfield for sysno. +func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { + if sysno < uintptr(len(e.enable)) { + return atomic.LoadUint32(&e.enable[sysno]) + } + + return atomic.LoadUint32(&e.missingEnable) +} + +// Enable sets enable bit bit for all syscalls based on s. +// +// Syscalls missing from s are disabled. +// +// Syscalls missing from the initial table passed to Init cannot be added as +// individual syscalls. If present in s they will be ignored. +// +// Callers to Word may see either the old or new value while this function +// is executing. +func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + if missingEnable { + missingVal |= bit + } else { + missingVal &^= bit + } + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + if s[uintptr(num)] { + val |= bit + } else { + val &^= bit + } + atomic.StoreUint32(&e.enable[num], val) + } +} + +// EnableAll sets enable bit bit for all syscalls, present and missing. +func (e *SyscallFlagsTable) EnableAll(bit uint32) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + missingVal |= bit + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + val |= bit + atomic.StoreUint32(&e.enable[num], val) + } +} + +// Stracer traces syscall execution. +type Stracer interface { + // SyscallEnter is called on syscall entry. + // + // The returned private data is passed to SyscallExit. + // + // TODO(gvisor.dev/issue/155): remove kernel imports from the strace + // package so that the type can be used directly. + SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} + + // SyscallExit is called on syscall exit. + SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) +} + +// SyscallTable is a lookup table of system calls. Critically, a SyscallTable +// is *immutable*. In order to make supporting suspend and resume sane, they +// must be uniquely registered and may not change during operation. +// +// +stateify savable +type SyscallTable struct { + // OS is the operating system that this syscall table implements. + OS abi.OS `state:"wait"` + + // Arch is the architecture that this syscall table targets. + Arch arch.Arch `state:"wait"` + + // The OS version that this syscall table implements. + Version Version `state:"manual"` + + // AuditNumber is a numeric constant that represents the syscall table. If + // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by + // linux/audit.h. + AuditNumber uint32 `state:"manual"` + + // Table is the collection of functions. + Table map[uintptr]SyscallFn `state:"manual"` + + // lookup is a fixed-size array that holds the syscalls (indexed by + // their numbers). It is used for fast look ups. + lookup []SyscallFn `state:"manual"` + + // Emulate is a collection of instruction addresses to emulate. The + // keys are addresses, and the values are system call numbers. + Emulate map[usermem.Addr]uintptr `state:"manual"` + + // The function to call in case of a missing system call. + Missing MissingFn `state:"manual"` + + // Stracer traces this syscall table. + Stracer Stracer `state:"manual"` + + // External is used to handle an external callback. + External func(*Kernel) `state:"manual"` + + // ExternalFilterBefore is called before External is called before the syscall is executed. + // External is not called if it returns false. + ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + + // ExternalFilterAfter is called before External is called after the syscall is executed. + // External is not called if it returns false. + ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + + // FeatureEnable stores the strace and one-shot enable bits. + FeatureEnable SyscallFlagsTable `state:"manual"` +} + +// allSyscallTables contains all known tables. +var allSyscallTables []*SyscallTable + +// SyscallTables returns a read-only slice of registered SyscallTables. +func SyscallTables() []*SyscallTable { + return allSyscallTables +} + +// LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. +func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { + for _, s := range allSyscallTables { + if s.OS == os && s.Arch == a { + return s, true + } + } + return nil, false +} + +// RegisterSyscallTable registers a new syscall table for use by a Kernel. +func RegisterSyscallTable(s *SyscallTable) { + if s.Table == nil { + // Ensure non-nil lookup table. + s.Table = make(map[uintptr]SyscallFn) + } + if s.Emulate == nil { + // Ensure non-nil emulate table. + s.Emulate = make(map[usermem.Addr]uintptr) + } + + var max uintptr + for num := range s.Table { + if num > max { + max = num + } + } + + if max > maxSyscallNum { + panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) + } + + s.lookup = make([]SyscallFn, max+1) + + // Initialize the fast-lookup table. + for num, fn := range s.Table { + s.lookup[num] = fn + } + + s.FeatureEnable.init(s.Table, max) + + if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { + panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) + } + + // Save a reference to this table. + // + // This is required for a Kernel to find the table and for save/restore + // operations below. + allSyscallTables = append(allSyscallTables, s) +} + +// Lookup returns the syscall implementation, if one exists. +func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { + if sysno < uintptr(len(s.lookup)) { + return s.lookup[sysno] + } + + return nil +} + +// LookupEmulate looks up an emulation syscall number. +func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { + sysno, ok := s.Emulate[addr] + return sysno, ok +} + +// mapLookup is similar to Lookup, except that it only uses the syscall table, +// that is, it skips the fast look array. This is available for benchmarking. +func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { + return s.Table[sysno] +} diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go new file mode 100644 index 000000000..00358326b --- /dev/null +++ b/pkg/sentry/kernel/syscalls_state.go @@ -0,0 +1,29 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "fmt" + +// afterLoad is invoked by stateify. +func (s *SyscallTable) afterLoad() { + otherTable, ok := LookupSyscallTable(s.OS, s.Arch) + if !ok { + // Couldn't find a reference? + panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch)) + } + + // Copy the table. + *s = *otherTable +} diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go new file mode 100644 index 000000000..175d1b247 --- /dev/null +++ b/pkg/sentry/kernel/syslog.go @@ -0,0 +1,106 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "math/rand" + "sync" +) + +// syslog represents a sentry-global kernel log. +// +// Currently, it contains only fun messages for a dmesg easter egg. +// +// +stateify savable +type syslog struct { + // mu protects the below. + mu sync.Mutex `state:"nosave"` + + // msg is the syslog message buffer. It is lazily initialized. + msg []byte +} + +// Log returns a copy of the syslog. +func (s *syslog) Log() []byte { + s.mu.Lock() + defer s.mu.Unlock() + + if s.msg != nil { + // Already initialized, just return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o + } + + // Not initialized, create message. + allMessages := []string{ + "Synthesizing system calls...", + "Mounting deweydecimalfs...", + "Moving files to filing cabinet...", + "Digging up root...", + "Constructing home...", + "Segmenting fault lines...", + "Creating bureaucratic processes...", + "Searching for needles in stacks...", + "Preparing for the zombie uprising...", + "Feeding the init monster...", + "Creating cloned children...", + "Daemonizing children...", + "Waiting for children...", + "Gathering forks...", + "Committing treasure map to memory...", + "Reading process obituaries...", + "Searching for socket adapter...", + "Creating process schedule...", + "Generating random numbers by fair dice roll...", + "Rewriting operating system in Javascript...", + "Consulting tar man page...", + "Forking spaghetti code...", + "Checking naughty and nice process list...", + "Checking naughty and nice process list...", // Check it up to twice. + "Granting licence to kill(2)...", // British spelling for British movie. + "Letting the watchdogs out...", + } + + selectMessage := func() string { + i := rand.Intn(len(allMessages)) + m := allMessages[i] + + // Delete the selected message. + allMessages[i] = allMessages[len(allMessages)-1] + allMessages = allMessages[:len(allMessages)-1] + + return m + } + + const format = "<6>[%11.6f] %s\n" + + s.msg = append(s.msg, []byte(fmt.Sprintf(format, 0.0, "Starting gVisor..."))...) + + time := 0.1 + for i := 0; i < 10; i++ { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) + } + + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) + + // Return a copy. + o := make([]byte, len(s.msg)) + copy(o, s.msg) + return o +} diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go new file mode 100644 index 000000000..f9378c2de --- /dev/null +++ b/pkg/sentry/kernel/task.go @@ -0,0 +1,723 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/third_party/gvsync" +) + +// Task represents a thread of execution in the untrusted app. It +// includes registers and any thread-specific state that you would +// normally expect. +// +// Each task is associated with a goroutine, called the task goroutine, that +// executes code (application code, system calls, etc.) on behalf of that task. +// See Task.run (task_run.go). +// +// All fields that are "owned by the task goroutine" can only be mutated by the +// task goroutine while it is running. The task goroutine does not require +// synchronization to read these fields, although it still requires +// synchronization as described for those fields to mutate them. +// +// All fields that are "exclusive to the task goroutine" can only be accessed +// by the task goroutine while it is running. The task goroutine does not +// require synchronization to read or write these fields. +// +// +stateify savable +type Task struct { + taskNode + + // runState is what the task goroutine is executing if it is not stopped. + // If runState is nil, the task goroutine should exit or has exited. + // runState is exclusive to the task goroutine. + runState taskRunState + + // haveSyscallReturn is true if tc.Arch().Return() represents a value + // returned by a syscall (or set by ptrace after a syscall). + // + // haveSyscallReturn is exclusive to the task goroutine. + haveSyscallReturn bool + + // interruptChan is notified whenever the task goroutine is interrupted + // (usually by a pending signal). interruptChan is effectively a condition + // variable that can be used in select statements. + // + // interruptChan is not saved; because saving interrupts all tasks, + // interruptChan is always notified after restore (see Task.run). + interruptChan chan struct{} `state:"nosave"` + + // gosched contains the current scheduling state of the task goroutine. + // + // gosched is protected by goschedSeq. gosched is owned by the task + // goroutine. + goschedSeq gvsync.SeqCount `state:"nosave"` + gosched TaskGoroutineSchedInfo + + // yieldCount is the number of times the task goroutine has called + // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or + // Task.Yield(), voluntarily ceasing execution. + // + // yieldCount is accessed using atomic memory operations. yieldCount is + // owned by the task goroutine. + yieldCount uint64 + + // pendingSignals is the set of pending signals that may be handled only by + // this task. + // + // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu + // (hereafter "the signal mutex"); see comment on + // ThreadGroup.signalHandlers. + pendingSignals pendingSignals + + // signalMask is the set of signals whose delivery is currently blocked. + // + // signalMask is accessed using atomic memory operations, and is protected + // by the signal mutex (such that reading signalMask is safe if either the + // signal mutex is locked or if atomic memory operations are used, while + // writing signalMask requires both). signalMask is owned by the task + // goroutine. + signalMask linux.SignalSet + + // If the task goroutine is currently executing Task.sigtimedwait, + // realSignalMask is the previous value of signalMask, which has temporarily + // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0. + // + // realSignalMask is exclusive to the task goroutine. + realSignalMask linux.SignalSet + + // If haveSavedSignalMask is true, savedSignalMask is the signal mask that + // should be applied after the task has either delivered one signal to a + // user handler or is about to resume execution in the untrusted + // application. + // + // Both haveSavedSignalMask and savedSignalMask are exclusive to the task + // goroutine. + haveSavedSignalMask bool + savedSignalMask linux.SignalSet + + // signalStack is the alternate signal stack used by signal handlers for + // which the SA_ONSTACK flag is set. + // + // signalStack is exclusive to the task goroutine. + signalStack arch.SignalStack + + // If groupStopPending is true, the task should participate in a group + // stop in the interrupt path. + // + // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. + // + // groupStopPending is protected by the signal mutex. + groupStopPending bool + + // If groupStopAcknowledged is true, the task has already acknowledged that + // it is entering the most recent group stop that has been initiated on its + // thread group. + // + // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. + // + // groupStopAcknowledged is protected by the signal mutex. + groupStopAcknowledged bool + + // If trapStopPending is true, the task goroutine should enter a + // PTRACE_INTERRUPT-induced stop from the interrupt path. + // + // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects + // JOBCTL_STOP_PENDING. + // + // trapStopPending is protected by the signal mutex. + trapStopPending bool + + // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group + // stop has begun or ended since the last time the task entered a + // ptrace-stop from the group-stop path. + // + // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. + // + // trapNotifyPending is protected by the signal mutex. + trapNotifyPending bool + + // If stop is not nil, it is the internally-initiated condition that + // currently prevents the task goroutine from running. + // + // stop is protected by the signal mutex. + stop TaskStop + + // stopCount is the number of active external stops (calls to + // Task.BeginExternalStop that have not been paired with a call to + // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is + // non-zero if the task goroutine should stop. + // + // Mutating stopCount requires both locking the signal mutex and using + // atomic memory operations. Reading stopCount requires either locking the + // signal mutex or using atomic memory operations. This allows Task.doStop + // to require only a single atomic read in the common case where stopCount + // is 0. + // + // stopCount is not saved, because external stops cannot be retained across + // a save/restore cycle. (Suppose a sentryctl command issues an external + // stop; after a save/restore cycle, the restored sentry has no knowledge + // of the pre-save sentryctl command, and the stopped task would remain + // stopped forever.) + stopCount int32 `state:"nosave"` + + // endStopCond is signaled when stopCount transitions to 0. The combination + // of stopCount and endStopCond effectively form a sync.WaitGroup, but + // WaitGroup provides no way to read its counter value. + // + // Invariant: endStopCond.L is the signal mutex. (This is not racy because + // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine + // calls sync.Cond.Wait; and only the task goroutine can change the + // identity of the signal mutex, in Task.finishExec.) + endStopCond sync.Cond `state:"nosave"` + + // exitStatus is the task's exit status. + // + // exitStatus is protected by the signal mutex. + exitStatus ExitStatus + + // syscallRestartBlock represents a custom restart function to run in + // restart_syscall(2) to resume an interrupted syscall. + // + // syscallRestartBlock is exclusive to the task goroutine. + syscallRestartBlock SyscallRestartBlock + + // p provides the mechanism by which the task runs code in userspace. The p + // interface object is immutable. + p platform.Context `state:"nosave"` + + // k is the Kernel that this task belongs to. The k pointer is immutable. + k *Kernel + + // containerID has no equivalent in Linux; it's used by runsc to track all + // tasks that belong to a given containers since cgroups aren't implemented. + // It's inherited by the children, is immutable, and may be empty. + // + // NOTE: cgroups can be used to track this when implemented. + containerID string + + // mu protects some of the following fields. + mu sync.Mutex `state:"nosave"` + + // tc holds task data provided by the ELF loader. + // + // tc is protected by mu, and is owned by the task goroutine. + tc TaskContext + + // fsc is the task's filesystem context. + // + // fsc is protected by mu, and is owned by the task goroutine. + fsc *FSContext + + // fds is the task's file descriptor table. + // + // fds is protected by mu, and is owned by the task goroutine. + fds *FDMap + + // If vforkParent is not nil, it is the task that created this task with + // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when + // this TaskContext is released. + // + // vforkParent is protected by the TaskSet mutex. + vforkParent *Task + + // exitState is the task's progress through the exit path. + // + // exitState is protected by the TaskSet mutex. exitState is owned by the + // task goroutine. + exitState TaskExitState + + // exitTracerNotified is true if the exit path has either signaled the + // task's tracer to indicate the exit, or determined that no such signal is + // needed. exitTracerNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitTracerNotified is protected by the TaskSet mutex. + exitTracerNotified bool + + // exitTracerAcked is true if exitTracerNotified is true and either the + // task's tracer has acknowledged the exit notification, or the exit path + // has determined that no such notification is needed. + // + // exitTracerAcked is protected by the TaskSet mutex. + exitTracerAcked bool + + // exitParentNotified is true if the exit path has either signaled the + // task's parent to indicate the exit, or determined that no such signal is + // needed. exitParentNotified can only be true if exitState is + // TaskExitZombie or TaskExitDead. + // + // exitParentNotified is protected by the TaskSet mutex. + exitParentNotified bool + + // exitParentAcked is true if exitParentNotified is true and either the + // task's parent has acknowledged the exit notification, or the exit path + // has determined that no such acknowledgment is needed. + // + // exitParentAcked is protected by the TaskSet mutex. + exitParentAcked bool + + // goroutineStopped is a WaitGroup whose counter value is 1 when the task + // goroutine is running and 0 when the task goroutine is stopped or has + // exited. + goroutineStopped sync.WaitGroup `state:"nosave"` + + // ptraceTracer is the task that is ptrace-attached to this one. If + // ptraceTracer is nil, this task is not being traced. Note that due to + // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil + // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). + // + // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic + // operations. This allows paths that wouldn't otherwise lock the TaskSet + // mutex, notably the syscall path, to check if ptraceTracer is nil without + // additional synchronization. + ptraceTracer atomic.Value `state:".(*Task)"` + + // ptraceTracees is the set of tasks that this task is ptrace-attached to. + // + // ptraceTracees is protected by the TaskSet mutex. + ptraceTracees map[*Task]struct{} + + // ptraceSeized is true if ptraceTracer attached to this task with + // PTRACE_SEIZE. + // + // ptraceSeized is protected by the TaskSet mutex. + ptraceSeized bool + + // ptraceOpts contains ptrace options explicitly set by the tracer. If + // ptraceTracer is nil, ptraceOpts is expected to be the zero value. + // + // ptraceOpts is protected by the TaskSet mutex. + ptraceOpts ptraceOptions + + // ptraceSyscallMode controls ptrace behavior around syscall entry and + // exit. + // + // ptraceSyscallMode is protected by the TaskSet mutex. + ptraceSyscallMode ptraceSyscallMode + + // If ptraceSinglestep is true, the next time the task executes application + // code, single-stepping should be enabled. ptraceSinglestep is stored + // independently of the architecture-specific trap flag because tracer + // detaching (which can happen concurrently with the tracee's execution if + // the tracer exits) must disable single-stepping, and the task's + // architectural state is implicitly exclusive to the task goroutine (no + // synchronization occurs before passing registers to SwitchToApp). + // + // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. + // + // ptraceSinglestep is protected by the TaskSet mutex. + ptraceSinglestep bool + + // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the + // time that t entered the ptrace stop, reset to 0 when the tracer + // acknowledges the stop with a wait*() syscall. Otherwise, it is the + // signal number passed to the ptrace operation that ended the last ptrace + // stop on this task. In the latter case, the effect of ptraceCode depends + // on the nature of the ptrace stop; signal-delivery-stop uses it to + // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the + // signal to the task after leaving the stop, and PTRACE_EVENT stops and + // traced group stops ignore it entirely. + // + // Linux contextually stores the equivalent of ptraceCode in + // task_struct::exit_code. + // + // ptraceCode is protected by the TaskSet mutex. + ptraceCode int32 + + // ptraceSiginfo is the value returned to the tracer by + // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). + // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) + // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is + // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which + // is in turn required to distinguish group stops from other ptrace stops, + // per subsection "Group-stop" in ptrace(2)). + // + // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. + // + // ptraceSiginfo is protected by the TaskSet mutex. + ptraceSiginfo *arch.SignalInfo + + // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to + // the tracer by ptrace(PTRACE_GETEVENTMSG). + // + // ptraceEventMsg is protected by the TaskSet mutex. + ptraceEventMsg uint64 + + // The struct that holds the IO-related usage. The ioUsage pointer is + // immutable. + ioUsage *usage.IO + + // logPrefix is a string containing the task's thread ID in the root PID + // namespace, and is prepended to log messages emitted by Task.Infof etc. + logPrefix atomic.Value `state:".(string)"` + + // creds is the task's credentials. + // + // creds is protected by mu, however the value itself is immutable and can + // only be changed by a copy. After reading the pointer, access will + // proceed outside the scope of mu. creds is owned by the task goroutine. + creds *auth.Credentials + + // utsns is the task's UTS namespace. + // + // utsns is protected by mu. utsns is owned by the task goroutine. + utsns *UTSNamespace + + // ipcns is the task's IPC namespace. + // + // ipcns is protected by mu. ipcns is owned by the task goroutine. + ipcns *IPCNamespace + + // abstractSockets tracks abstract sockets that are in use. + // + // abstractSockets is protected by mu. + abstractSockets *AbstractSocketNamespace + + // parentDeathSignal is sent to this task's thread group when its parent exits. + // + // parentDeathSignal is protected by mu. + parentDeathSignal linux.Signal + + // syscallFilters is all seccomp-bpf syscall filters applicable to the + // task, in the order in which they were installed. The type of the atomic + // is []bpf.Program. Writing needs to be protected by the signal mutex. + // + // syscallFilters is owned by the task goroutine. + syscallFilters atomic.Value `state:".([]bpf.Program)"` + + // If cleartid is non-zero, treat it as a pointer to a ThreadID in the + // task's virtual address space; when the task exits, set the pointed-to + // ThreadID to 0, and wake any futex waiters. + // + // cleartid is exclusive to the task goroutine. + cleartid usermem.Addr + + // This is mostly a fake cpumask just for sched_set/getaffinity as we + // don't really control the affinity. + // + // Invariant: allowedCPUMask.Size() == + // sched.CPUMaskSize(Kernel.applicationCores). + // + // allowedCPUMask is protected by mu. + allowedCPUMask sched.CPUSet + + // cpu is the fake cpu number returned by getcpu(2). cpu is ignored + // entirely if Kernel.useHostCores is true. + // + // cpu is accessed using atomic memory operations. + cpu int32 + + // This is used to keep track of changes made to a process' priority/niceness. + // It is mostly used to provide some reasonable return value from + // getpriority(2) after a call to setpriority(2) has been made. + // We currently do not actually modify a process' scheduling priority. + // NOTE: This represents the userspace view of priority (nice). + // This means that the value should be in the range [-20, 19]. + // + // niceness is protected by mu. + niceness int + + // This is used to track the numa policy for the current thread. This can be + // modified through a set_mempolicy(2) syscall. Since we always report a + // single numa node, all policies are no-ops. We only track this information + // so that we can return reasonable values if the application calls + // get_mempolicy(2) after setting a non-default policy. Note that in the + // real syscall, nodemask can be longer than 4 bytes, but we always report a + // single node so never need to save more than a single bit. + // + // numaPolicy and numaNodeMask are protected by mu. + numaPolicy int32 + numaNodeMask uint32 + + // If netns is true, the task is in a non-root network namespace. Network + // namespaces aren't currently implemented in full; being in a network + // namespace simply prevents the task from observing any network devices + // (including loopback) or using abstract socket addresses (see unix(7)). + // + // netns is protected by mu. netns is owned by the task goroutine. + netns bool + + // If rseqPreempted is true, before the next call to p.Switch(), interrupt + // RSEQ critical regions as defined by tg.rseq and write the task + // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number + // written to rseqCPUAddr. + // + // If rseqCPUAddr is 0, rseqCPU is -1. + // + // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task + // goroutine. + rseqPreempted bool `state:"nosave"` + rseqCPUAddr usermem.Addr + rseqCPU int32 + + // copyScratchBuffer is a buffer available to CopyIn/CopyOut + // implementations that require an intermediate buffer to copy data + // into/out of. It prevents these buffers from being allocated/zeroed in + // each syscall and eventually garbage collected. + // + // copyScratchBuffer is exclusive to the task goroutine. + copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` + + // blockingTimer is used for blocking timeouts. blockingTimerChan is the + // channel that is sent to when blockingTimer fires. + // + // blockingTimer is exclusive to the task goroutine. + blockingTimer *ktime.Timer `state:"nosave"` + blockingTimerChan <-chan struct{} `state:"nosave"` + + // futexWaiter is used for futex(FUTEX_WAIT) syscalls. + // + // futexWaiter is exclusive to the task goroutine. + futexWaiter *futex.Waiter `state:"nosave"` + + // startTime is the real time at which the task started. It is set when + // a Task is created or invokes execve(2). + // + // startTime is protected by mu. + startTime ktime.Time +} + +func (t *Task) savePtraceTracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +func (t *Task) loadPtraceTracer(tracer *Task) { + t.ptraceTracer.Store(tracer) +} + +func (t *Task) saveLogPrefix() string { + return t.logPrefix.Load().(string) +} + +func (t *Task) loadLogPrefix(prefix string) { + t.logPrefix.Store(prefix) +} + +func (t *Task) saveSyscallFilters() []bpf.Program { + if f := t.syscallFilters.Load(); f != nil { + return f.([]bpf.Program) + } + return nil +} + +func (t *Task) loadSyscallFilters(filters []bpf.Program) { + t.syscallFilters.Store(filters) +} + +// afterLoad is invoked by stateify. +func (t *Task) afterLoad() { + t.interruptChan = make(chan struct{}, 1) + t.gosched.State = TaskGoroutineNonexistent + if t.stop != nil { + t.stopCount = 1 + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.p = t.k.Platform.NewContext() + t.rseqPreempted = true + t.futexWaiter = futex.NewWaiter() +} + +// copyScratchBufferLen is the length of Task.copyScratchBuffer. +const copyScratchBufferLen = 144 // sizeof(struct stat) + +// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut +// functions. It must only be used within those functions and can only be used +// by the task goroutine; it exists to improve performance and thus +// intentionally lacks any synchronization. +// +// Callers should pass a constant value as an argument if possible, which will +// allow the compiler to inline and optimize out the if statement below. +func (t *Task) CopyScratchBuffer(size int) []byte { + if size > copyScratchBufferLen { + return make([]byte, size) + } + return t.copyScratchBuffer[:size] +} + +// FutexWaiter returns the Task's futex.Waiter. +func (t *Task) FutexWaiter() *futex.Waiter { + return t.futexWaiter +} + +// Kernel returns the Kernel containing t. +func (t *Task) Kernel() *Kernel { + return t.k +} + +// Value implements context.Context.Value. +// +// Preconditions: The caller must be running on the task goroutine (as implied +// by the requirements of context.Context). +func (t *Task) Value(key interface{}) interface{} { + switch key { + case CtxCanTrace: + return t.CanTrace + case CtxKernel: + return t.k + case CtxPIDNamespace: + return t.tg.pidns + case CtxUTSNamespace: + return t.utsns + case CtxIPCNamespace: + return t.ipcns + case CtxTask: + return t + case auth.CtxCredentials: + return t.creds + case context.CtxThreadGroupID: + return int32(t.ThreadGroup().ID()) + case fs.CtxRoot: + return t.fsc.RootDirectory() + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter + case inet.CtxStack: + return t.NetworkContext() + case ktime.CtxRealtimeClock: + return t.k.RealtimeClock() + case limits.CtxLimits: + return t.tg.limits + case pgalloc.CtxMemoryFile: + return t.k.mf + case pgalloc.CtxMemoryFileProvider: + return t.k + case platform.CtxPlatform: + return t.k + case uniqueid.CtxGlobalUniqueID: + return t.k.UniqueID() + case uniqueid.CtxGlobalUniqueIDProvider: + return t.k + case uniqueid.CtxInotifyCookie: + return t.k.GenerateInotifyCookie() + case unimpl.CtxEvents: + return t.k + default: + return nil + } +} + +// SetClearTID sets t's cleartid. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) SetClearTID(addr usermem.Addr) { + t.cleartid = addr +} + +// SetSyscallRestartBlock sets the restart block for use in +// restart_syscall(2). After registering a restart block, a syscall should +// return ERESTART_RESTARTBLOCK to request a restart using the block. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { + t.syscallRestartBlock = r +} + +// SyscallRestartBlock returns the currently registered restart block for use in +// restart_syscall(2). This function is *not* idempotent and may be called once +// per syscall. This function must not be called if a restart block has not been +// registered for the current syscall. +// +// Precondition: The caller must be running on the task goroutine. +func (t *Task) SyscallRestartBlock() SyscallRestartBlock { + r := t.syscallRestartBlock + // Explicitly set the restart block to nil so that a future syscall can't + // accidentally reuse it. + t.syscallRestartBlock = nil + return r +} + +// IsChrooted returns true if the root directory of t's FSContext is not the +// root directory of t's MountNamespace. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) IsChrooted() bool { + realRoot := t.k.mounts.Root() + defer realRoot.DecRef() + root := t.fsc.RootDirectory() + if root != nil { + defer root.DecRef() + } + return root != realRoot +} + +// TaskContext returns t's TaskContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) TaskContext() *TaskContext { + return &t.tc +} + +// FSContext returns t's FSContext. FSContext does not take an additional +// reference on the returned FSContext. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) FSContext() *FSContext { + return t.fsc +} + +// FDMap returns t's FDMap. FDMap does not take an additional reference on the +// returned FDMap. +// +// Precondition: The caller must be running on the task goroutine, or t.mu must +// be locked. +func (t *Task) FDMap() *FDMap { + return t.fds +} + +// WithMuLocked executes f with t.mu locked. +func (t *Task) WithMuLocked(f func(*Task)) { + t.mu.Lock() + defer t.mu.Unlock() + f(t) +} + +// MountNamespace returns t's MountNamespace. MountNamespace does not take an +// additional reference on the returned MountNamespace. +func (t *Task) MountNamespace() *fs.MountNamespace { + return t.k.mounts +} + +// AbstractSockets returns t's AbstractSocketNamespace. +func (t *Task) AbstractSockets() *AbstractSocketNamespace { + return t.abstractSockets +} + +// ContainerID returns t's container ID. +func (t *Task) ContainerID() string { + return t.containerID +} diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go new file mode 100644 index 000000000..1ca2a82eb --- /dev/null +++ b/pkg/sentry/kernel/task_acct.go @@ -0,0 +1,196 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// Accounting, limits, timers. + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Getitimer implements getitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { + var tm ktime.Time + var s ktime.Setting + switch id { + case linux.ITIMER_REAL: + tm, s = t.tg.itimerRealTimer.Get() + case linux.ITIMER_VIRTUAL: + tm = t.tg.UserCPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerVirtSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + case linux.ITIMER_PROF: + tm = t.tg.CPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerProfSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + default: + return linux.ItimerVal{}, syserror.EINVAL + } + val, iv := ktime.SpecFromSetting(tm, s) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(val), + Interval: linux.DurationToTimeval(iv), + }, nil +} + +// Setitimer implements setitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) { + var tm ktime.Time + var olds ktime.Setting + switch id { + case linux.ITIMER_REAL: + news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock()) + if err != nil { + return linux.ItimerVal{}, err + } + tm, olds = t.tg.itimerRealTimer.Swap(news) + case linux.ITIMER_VIRTUAL: + c := t.tg.UserCPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerVirtSetting + t.tg.itimerVirtSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + case linux.ITIMER_PROF: + c := t.tg.CPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerProfSetting + t.tg.itimerProfSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + default: + return linux.ItimerVal{}, syserror.EINVAL + } + oldval, oldiv := ktime.SpecFromSetting(tm, olds) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(oldval), + Interval: linux.DurationToTimeval(oldiv), + }, nil +} + +// IOUsage returns the io usage of the thread. +func (t *Task) IOUsage() *usage.IO { + return t.ioUsage +} + +// IOUsage returns the total io usage of all dead and live threads in the group. +func (tg *ThreadGroup) IOUsage() *usage.IO { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + io := *tg.ioUsage + // Account for active tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + io.Accumulate(t.IOUsage()) + } + return &io +} + +// Name returns t's name. +func (t *Task) Name() string { + t.mu.Lock() + defer t.mu.Unlock() + return t.tc.Name +} + +// SetName changes t's name. +func (t *Task) SetName(name string) { + t.mu.Lock() + defer t.mu.Unlock() + t.tc.Name = name + t.Debugf("Set thread name to %q", name) +} + +// Limits implements context.Context.Limits. +func (t *Task) Limits() *limits.LimitSet { + return t.ThreadGroup().Limits() +} + +// StartTime returns t's start time. +func (t *Task) StartTime() ktime.Time { + t.mu.Lock() + defer t.mu.Unlock() + return t.startTime +} + +// MaxRSS returns the maximum resident set size of the task in bytes. which +// should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or +// RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these +// flags. +func (t *Task) MaxRSS(which int32) uint64 { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + + switch which { + case linux.RUSAGE_SELF, linux.RUSAGE_THREAD: + // If there's an active mm we can use its value. + if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS { + return mmMaxRSS + } + } + return t.tg.maxRSS + case linux.RUSAGE_CHILDREN: + return t.tg.childMaxRSS + case linux.RUSAGE_BOTH: + maxRSS := t.tg.maxRSS + if maxRSS < t.tg.childMaxRSS { + maxRSS = t.tg.childMaxRSS + } + if mm := t.MemoryManager(); mm != nil { + if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS { + return mmMaxRSS + } + } + return maxRSS + default: + // We'll only get here if which is invalid. + return 0 + } +} diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go new file mode 100644 index 000000000..1c76c4d84 --- /dev/null +++ b/pkg/sentry/kernel/task_block.go @@ -0,0 +1,212 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// BlockWithTimeout blocks t until an event is received from C, the application +// monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), +// or t is interrupted. It returns: +// +// - The remaining timeout, which is guaranteed to be 0 if the timeout expired, +// and is unspecified if haveTimeout is false. +// +// - An error which is nil if an event is received from C, ETIMEDOUT if the timeout +// expired, and syserror.ErrInterrupted if t is interrupted. +func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { + if !haveTimeout { + return timeout, t.block(C, nil) + } + + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(timeout) + err := t.BlockWithDeadline(C, true, deadline) + + // Timeout, explicitly return a remaining duration of 0. + if err == syserror.ETIMEDOUT { + return 0, err + } + + // Compute the remaining timeout. Note that even if block() above didn't + // return due to a timeout, we may have used up any of the remaining time + // since then. We cap the remaining timeout to 0 to make it easier to + // directly use the returned duration. + end := t.Kernel().MonotonicClock().Now() + remainingTimeout := timeout - end.Sub(start) + if remainingTimeout < 0 { + remainingTimeout = 0 + } + + return remainingTimeout, err +} + +// BlockWithDeadline blocks t until an event is received from C, the +// application monotonic clock indicates a time of deadline (only if +// haveDeadline is true), or t is interrupted. It returns nil if an event is +// received from C, ETIMEDOUT if the deadline expired, and +// syserror.ErrInterrupted if t is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error { + if !haveDeadline { + return t.block(C, nil) + } + + // Start the timeout timer. + t.blockingTimer.Swap(ktime.Setting{ + Enabled: true, + Next: deadline, + }) + + err := t.block(C, t.blockingTimerChan) + + // Stop the timeout timer and drain the channel. + t.blockingTimer.Swap(ktime.Setting{}) + select { + case <-t.blockingTimerChan: + default: + } + + return err +} + +// BlockWithTimer blocks t until an event is received from C or tchan, or t is +// interrupted. It returns nil if an event is received from C, ETIMEDOUT if an +// event is received from tchan, and syserror.ErrInterrupted if t is +// interrupted. +// +// Most clients should use BlockWithDeadline or BlockWithTimeout instead. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error { + return t.block(C, tchan) +} + +// Block blocks t until an event is received from C or t is interrupted. It +// returns nil if an event is received from C and syserror.ErrInterrupted if t +// is interrupted. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Block(C <-chan struct{}) error { + return t.block(C, nil) +} + +// block blocks a task on one of many events. +// N.B. defer is too expensive to be used here. +func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { + // Fast path if the request is already done. + select { + case <-C: + return nil + default: + } + + // Deactive our address space, we don't need it. + interrupt := t.SleepStart() + + select { + case <-C: + t.SleepFinish(true) + return nil + + case <-interrupt: + t.SleepFinish(false) + // Return the indicated error on interrupt. + return syserror.ErrInterrupted + + case <-timerChan: + // We've timed out. + t.SleepFinish(true) + return syserror.ETIMEDOUT + } +} + +// SleepStart implements amutex.Sleeper.SleepStart. +func (t *Task) SleepStart() <-chan struct{} { + t.Deactivate() + t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible) + return t.interruptChan +} + +// SleepFinish implements amutex.Sleeper.SleepFinish. +func (t *Task) SleepFinish(success bool) { + if !success { + // The interrupted notification is consumed only at the top-level + // (Run). Therefore we attempt to reset the pending notification. + // This will also elide our next entry back into the task, so we + // will process signals, state changes, etc. + t.interruptSelf() + } + t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible) + t.Activate() +} + +// Interrupted implements amutex.Sleeper.Interrupted +func (t *Task) Interrupted() bool { + return len(t.interruptChan) != 0 +} + +// UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. +func (t *Task) UninterruptibleSleepStart(deactivate bool) { + if deactivate { + t.Deactivate() + } + t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible) +} + +// UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish. +func (t *Task) UninterruptibleSleepFinish(activate bool) { + t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible) + if activate { + t.Activate() + } +} + +// interrupted returns true if interrupt or interruptSelf has been called at +// least once since the last call to interrupted. +func (t *Task) interrupted() bool { + select { + case <-t.interruptChan: + return true + default: + return false + } +} + +// interrupt unblocks the task and interrupts it if it's currently running in +// userspace. +func (t *Task) interrupt() { + t.interruptSelf() + t.p.Interrupt() +} + +// interruptSelf is like Interrupt, but can only be called by the task +// goroutine. +func (t *Task) interruptSelf() { + select { + case t.interruptChan <- struct{}{}: + t.Debugf("Interrupt queued") + default: + t.Debugf("Dropping duplicate interrupt") + } + // platform.Context.Interrupt() is unnecessary since a task goroutine + // calling interruptSelf() cannot also be blocked in + // platform.Context.Switch(). +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go new file mode 100644 index 000000000..bba8ddd39 --- /dev/null +++ b/pkg/sentry/kernel/task_clone.go @@ -0,0 +1,516 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bpf" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SharingOptions controls what resources are shared by a new task created by +// Task.Clone, or an existing task affected by Task.Unshare. +type SharingOptions struct { + // If NewAddressSpace is true, the task should have an independent virtual + // address space. + NewAddressSpace bool + + // If NewSignalHandlers is true, the task should use an independent set of + // signal handlers. + NewSignalHandlers bool + + // If NewThreadGroup is true, the task should be the leader of its own + // thread group. TerminationSignal is the signal that the thread group + // will send to its parent when it exits. If NewThreadGroup is false, + // TerminationSignal is ignored. + NewThreadGroup bool + TerminationSignal linux.Signal + + // If NewPIDNamespace is true: + // + // - In the context of Task.Clone, the new task should be the init task + // (TID 1) in a new PID namespace. + // + // - In the context of Task.Unshare, the task should create a new PID + // namespace, and all subsequent clones of the task should be members of + // the new PID namespace. + NewPIDNamespace bool + + // If NewUserNamespace is true, the task should have an independent user + // namespace. + NewUserNamespace bool + + // If NewNetworkNamespace is true, the task should have an independent + // network namespace. (Note that network namespaces are not really + // implemented; see comment on Task.netns for details.) + NewNetworkNamespace bool + + // If NewFiles is true, the task should use an independent file descriptor + // table. + NewFiles bool + + // If NewFSContext is true, the task should have an independent FSContext. + NewFSContext bool + + // If NewUTSNamespace is true, the task should have an independent UTS + // namespace. + NewUTSNamespace bool + + // If NewIPCNamespace is true, the task should have an independent IPC + // namespace. + NewIPCNamespace bool +} + +// CloneOptions controls the behavior of Task.Clone. +type CloneOptions struct { + // SharingOptions defines the set of resources that the new task will share + // with its parent. + SharingOptions + + // Stack is the initial stack pointer of the new task. If Stack is 0, the + // new task will start with the same stack pointer as its parent. + Stack usermem.Addr + + // If SetTLS is true, set the new task's TLS (thread-local storage) + // descriptor to TLS. If SetTLS is false, TLS is ignored. + SetTLS bool + TLS usermem.Addr + + // If ChildClearTID is true, when the child exits, 0 is written to the + // address ChildTID in the child's memory, and if the write is successful a + // futex wake on the same address is performed. + // + // If ChildSetTID is true, the child's thread ID (in the child's PID + // namespace) is written to address ChildTID in the child's memory. (As in + // Linux, failed writes are silently ignored.) + ChildClearTID bool + ChildSetTID bool + ChildTID usermem.Addr + + // If ParentSetTID is true, the child's thread ID (in the parent's PID + // namespace) is written to address ParentTID in the parent's memory. (As + // in Linux, failed writes are silently ignored.) + // + // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID + // causes the child's thread ID to be written to ptid in both the parent + // and child's memory, but this is a documentation error fixed by + // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). + ParentSetTID bool + ParentTID usermem.Addr + + // If Vfork is true, place the parent in vforkStop until the cloned task + // releases its TaskContext. + Vfork bool + + // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for + // this clone(), and do not ptrace-attach the caller's tracer to the new + // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). + Untraced bool + + // If InheritTracer is true, ptrace-attach the caller's tracer to the new + // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported + // for it. If both Untraced and InheritTracer are true, no event will be + // reported, but tracer inheritance will still occur. + InheritTracer bool +} + +// Clone implements the clone(2) syscall and returns the thread ID of the new +// task in t's PID namespace. Clone may return both a non-zero thread ID and a +// non-nil error. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { + // Since signal actions may refer to application signal handlers by virtual + // address, any set of signal handlers must refer to the same address + // space. + if !opts.NewSignalHandlers && opts.NewAddressSpace { + return 0, nil, syserror.EINVAL + } + // In order for the behavior of thread-group-directed signals to be sane, + // all tasks in a thread group must share signal handlers. + if !opts.NewThreadGroup && opts.NewSignalHandlers { + return 0, nil, syserror.EINVAL + } + // All tasks in a thread group must be in the same PID namespace. + if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { + return 0, nil, syserror.EINVAL + } + // The two different ways of specifying a new PID namespace are + // incompatible. + if opts.NewPIDNamespace && t.childPIDNamespace != nil { + return 0, nil, syserror.EINVAL + } + // Thread groups and FS contexts cannot span user namespaces. + if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { + return 0, nil, syserror.EINVAL + } + + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a + // single clone(2) or unshare(2) call, the user namespace is guaranteed to + // be created first, giving the child (clone(2)) or caller (unshare(2)) + // privileges over the remaining namespaces created by the call." - + // user_namespaces(7) + creds := t.Credentials() + userns := creds.UserNamespace + if opts.NewUserNamespace { + var err error + // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and + // the caller is in a chroot environment (i.e., the caller's root + // directory does not match the root directory of the mount namespace + // in which it resides)." - clone(2). Neither chroot(2) nor + // user_namespaces(7) document this. + if t.IsChrooted() { + return 0, nil, syserror.EPERM + } + userns, err = creds.NewChildUserNamespace() + if err != nil { + return 0, nil, err + } + } + if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { + return 0, nil, syserror.EPERM + } + + utsns := t.UTSNamespace() + if opts.NewUTSNamespace { + // Note that this must happen after NewUserNamespace so we get + // the new userns if there is one. + utsns = t.UTSNamespace().Clone(userns) + } + + ipcns := t.IPCNamespace() + if opts.NewIPCNamespace { + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + ipcns = NewIPCNamespace(userns) + } + + tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) + if err != nil { + return 0, nil, err + } + // clone() returns 0 in the child. + tc.Arch.SetReturn(0) + if opts.Stack != 0 { + tc.Arch.SetStack(uintptr(opts.Stack)) + } + if opts.SetTLS { + if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + return 0, nil, syserror.EPERM + } + } + + var fsc *FSContext + if opts.NewFSContext { + fsc = t.fsc.Fork() + } else { + fsc = t.fsc + fsc.IncRef() + } + + var fds *FDMap + if opts.NewFiles { + fds = t.fds.Fork() + } else { + fds = t.fds + fds.IncRef() + } + + pidns := t.tg.pidns + if t.childPIDNamespace != nil { + pidns = t.childPIDNamespace + } else if opts.NewPIDNamespace { + pidns = pidns.NewChild(userns) + } + tg := t.tg + if opts.NewThreadGroup { + sh := t.tg.signalHandlers + if opts.NewSignalHandlers { + sh = sh.Fork() + } + tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + } + + cfg := &TaskConfig{ + Kernel: t.k, + ThreadGroup: tg, + SignalMask: t.SignalMask(), + TaskContext: tc, + FSContext: fsc, + FDMap: fds, + Credentials: creds, + Niceness: t.Niceness(), + NetworkNamespaced: t.netns, + AllowedCPUMask: t.CPUMask(), + UTSNamespace: utsns, + IPCNamespace: ipcns, + AbstractSocketNamespace: t.abstractSockets, + ContainerID: t.ContainerID(), + } + if opts.NewThreadGroup { + cfg.Parent = t + } else { + cfg.InheritParent = t + } + if opts.NewNetworkNamespace { + cfg.NetworkNamespaced = true + } + nt, err := t.tg.pidns.owner.NewTask(cfg) + if err != nil { + if opts.NewThreadGroup { + tg.release() + } + return 0, nil, err + } + + // "A child process created via fork(2) inherits a copy of its parent's + // alternate signal stack settings" - sigaltstack(2). + // + // However kernel/fork.c:copy_process() adds a limitation to this: + // "sigaltstack should be cleared when sharing the same VM". + if opts.NewAddressSpace || opts.Vfork { + nt.SetSignalStack(t.SignalStack()) + } + + if userns != creds.UserNamespace { + if err := nt.SetUserNamespace(userns); err != nil { + // This shouldn't be possible: userns was created from nt.creds, so + // nt should have CAP_SYS_ADMIN in userns. + panic("Task.Clone: SetUserNamespace failed: " + err.Error()) + } + } + + // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to + // nt that it must receive before its task goroutine starts running. + tid := nt.k.tasks.Root.IDOfTask(nt) + defer nt.Start(tid) + + // "If fork/clone and execve are allowed by @prog, any child processes will + // be constrained to the same filters and system call ABI as the parent." - + // Documentation/prctl/seccomp_filter.txt + if f := t.syscallFilters.Load(); f != nil { + copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) + nt.syscallFilters.Store(copiedFilters) + } + if opts.Vfork { + nt.vforkParent = t + } + + if opts.ChildClearTID { + nt.SetClearTID(opts.ChildTID) + } + if opts.ChildSetTID { + // Can't use Task.CopyOut, which assumes AddressSpaceActive. + usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + } + ntid := t.tg.pidns.IDOfTask(nt) + if opts.ParentSetTID { + t.CopyOut(opts.ParentTID, ntid) + } + + kind := ptraceCloneKindClone + if opts.Vfork { + kind = ptraceCloneKindVfork + } else if opts.TerminationSignal == linux.SIGCHLD { + kind = ptraceCloneKindFork + } + if t.ptraceClone(kind, nt, opts) { + if opts.Vfork { + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil + } + return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil + } + if opts.Vfork { + t.maybeBeginVforkStop(nt) + return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil + } + return ntid, nil, nil +} + +// maybeBeginVforkStop checks if a previously-started vfork child is still +// running and has not yet released its MM, such that its parent t should enter +// a vforkStop. +// +// Preconditions: The caller must be running on t's task goroutine. +func (t *Task) maybeBeginVforkStop(child *Task) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.killedLocked() { + child.vforkParent = nil + return + } + if child.vforkParent == t { + t.beginInternalStopLocked((*vforkStop)(nil)) + } +} + +func (t *Task) unstopVforkParent() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if p := t.vforkParent; p != nil { + p.tg.signalHandlers.mu.Lock() + defer p.tg.signalHandlers.mu.Unlock() + if _, ok := p.stop.(*vforkStop); ok { + p.endInternalStopLocked() + } + // Parent no longer needs to be unstopped. + t.vforkParent = nil + } +} + +// +stateify savable +type runSyscallAfterPtraceEventClone struct { + vforkChild *Task + + // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's + // PID namespace. vforkChildTID must be stored since the child may exit and + // release its TID before the PTRACE_EVENT stop ends. + vforkChildTID ThreadID +} + +func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { + if r.vforkChild != nil { + t.maybeBeginVforkStop(r.vforkChild) + return &runSyscallAfterVforkStop{r.vforkChildTID} + } + return (*runSyscallExit)(nil) +} + +// +stateify savable +type runSyscallAfterVforkStop struct { + // childTID has the same meaning as + // runSyscallAfterPtraceEventClone.vforkChildTID. + childTID ThreadID +} + +func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { + t.ptraceVforkDone(r.childTID) + return (*runSyscallExit)(nil) +} + +// Unshare changes the set of resources t shares with other tasks, as specified +// by opts. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Unshare(opts *SharingOptions) error { + // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and + // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if + // t is the only task using its MM, which due to clone(2)'s rules imply + // that it is also the only task using its signal handlers / in its thread + // group, and cause EINVAL to be returned otherwise. + // + // Since we don't count the number of tasks using each address space or set + // of signal handlers, we reject NewSignalHandlers and NewAddressSpace + // altogether, and interpret NewThreadGroup as requiring that t be the only + // member of its thread group. This seems to be logically coherent, in the + // sense that clone(2) allows a task to share signal handlers and address + // spaces with tasks in other thread groups. + if opts.NewAddressSpace || opts.NewSignalHandlers { + return syserror.EINVAL + } + if opts.NewThreadGroup { + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return syserror.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + // This isn't racy because we're the only living task, and therefore + // the only task capable of creating new ones, in our thread group. + } + if opts.NewUserNamespace { + if t.IsChrooted() { + return syserror.EPERM + } + // This temporary is needed because Go. + creds := t.Credentials() + newUserNS, err := creds.NewChildUserNamespace() + if err != nil { + return err + } + err = t.SetUserNamespace(newUserNS) + if err != nil { + return err + } + } + haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) + if opts.NewPIDNamespace { + if !haveCapSysAdmin { + return syserror.EPERM + } + t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) + } + t.mu.Lock() + // Can't defer unlock: DecRefs must occur without holding t.mu. + if opts.NewNetworkNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + t.netns = true + } + if opts.NewUTSNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + // Note that this must happen after NewUserNamespace, so the + // new user namespace is used if there is one. + t.utsns = t.utsns.Clone(t.creds.UserNamespace) + } + if opts.NewIPCNamespace { + if !haveCapSysAdmin { + t.mu.Unlock() + return syserror.EPERM + } + // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC + // namespace" + t.ipcns = NewIPCNamespace(t.creds.UserNamespace) + } + var oldfds *FDMap + if opts.NewFiles { + oldfds = t.fds + t.fds = oldfds.Fork() + } + var oldfsc *FSContext + if opts.NewFSContext { + oldfsc = t.fsc + t.fsc = oldfsc.Fork() + } + t.mu.Unlock() + if oldfds != nil { + oldfds.DecRef() + } + if oldfsc != nil { + oldfsc.DecRef() + } + return nil +} + +// vforkStop is a TaskStop imposed on a task that creates a child with +// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its +// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so +// that the child and parent share mappings until the child execve()s into a +// new process image or exits.) +// +// +stateify savable +type vforkStop struct{} + +// StopIgnoresKill implements TaskStop.Killable. +func (*vforkStop) Killable() bool { return true } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go new file mode 100644 index 000000000..bbd294141 --- /dev/null +++ b/pkg/sentry/kernel/task_context.go @@ -0,0 +1,174 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/loader" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" +) + +var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) + +// Auxmap contains miscellaneous data for the task. +type Auxmap map[string]interface{} + +// TaskContext is the subset of a task's data that is provided by the loader. +// +// +stateify savable +type TaskContext struct { + // Name is the thread name set by the prctl(PR_SET_NAME) system call. + Name string + + // Arch is the architecture-specific context (registers, etc.) + Arch arch.Context + + // MemoryManager is the task's address space. + MemoryManager *mm.MemoryManager + + // fu implements futexes in the address space. + fu *futex.Manager + + // st is the task's syscall table. + st *SyscallTable +} + +// release releases all resources held by the TaskContext. release is called by +// the task when it execs into a new TaskContext or exits. +func (tc *TaskContext) release() { + // Nil out pointers so that if the task is saved after release, it doesn't + // follow the pointers to possibly now-invalid objects. + if tc.MemoryManager != nil { + // TODO(b/38173783) + tc.MemoryManager.DecUsers(context.Background()) + tc.MemoryManager = nil + } + tc.fu = nil +} + +// Fork returns a duplicate of tc. The copied TaskContext always has an +// independent arch.Context. If shareAddressSpace is true, the copied +// TaskContext shares an address space with the original; otherwise, the copied +// TaskContext has an independent address space that is initially a duplicate +// of the original's. +func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) { + newTC := &TaskContext{ + Name: tc.Name, + Arch: tc.Arch.Fork(), + st: tc.st, + } + if shareAddressSpace { + newTC.MemoryManager = tc.MemoryManager + if newTC.MemoryManager != nil { + if !newTC.MemoryManager.IncUsers() { + // Shouldn't be possible since tc.MemoryManager should be a + // counted user. + panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager")) + } + } + newTC.fu = tc.fu + } else { + newMM, err := tc.MemoryManager.Fork(ctx) + if err != nil { + return nil, err + } + newTC.MemoryManager = newMM + newTC.fu = k.futexes.Fork() + } + return newTC, nil +} + +// Arch returns t's arch.Context. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Arch() arch.Context { + return t.tc.Arch +} + +// MemoryManager returns t's MemoryManager. MemoryManager does not take an +// additional reference on the returned MM. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) MemoryManager() *mm.MemoryManager { + return t.tc.MemoryManager +} + +// SyscallTable returns t's syscall table. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) SyscallTable() *SyscallTable { + return t.tc.st +} + +// Stack returns the userspace stack. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Stack() *arch.Stack { + return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} +} + +// LoadTaskImage loads filename into a new TaskContext. +// +// It takes several arguments: +// * mounts: MountNamespace to lookup filename in +// * root: Root to lookup filename under +// * wd: Working directory to lookup filename under +// * maxTraversals: maximum number of symlinks to follow +// * filename: path to binary to load +// * argv: Binary argv +// * envv: Binary envv +// * fs: Binary FeatureSet +func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) { + // Prepare a new user address space to load into. + m := mm.NewMemoryManager(k, k) + defer m.DecUsers(ctx) + + os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso) + if err != nil { + return nil, err + } + + // Lookup our new syscall table. + st, ok := LookupSyscallTable(os, ac.Arch()) + if !ok { + // No syscall table found. This means that the ELF binary does not match + // the architecture. + return nil, errNoSyscalls + } + + if !m.IncUsers() { + panic("Failed to increment users count on new MM") + } + return &TaskContext{ + Name: name, + Arch: ac, + MemoryManager: m, + fu: k.futexes.Fork(), + st: st, + }, nil +} diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go new file mode 100644 index 000000000..5d1425d5c --- /dev/null +++ b/pkg/sentry/kernel/task_exec.go @@ -0,0 +1,262 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the machinery behind the execve() syscall. In brief, a +// thread executes an execve() by killing all other threads in its thread +// group, assuming the leader's identity, and then switching process images. +// +// This design is effectively mandated by Linux. From ptrace(2): +// +// """ +// execve(2) under ptrace +// When one thread in a multithreaded process calls execve(2), the +// kernel destroys all other threads in the process, and resets the +// thread ID of the execing thread to the thread group ID (process ID). +// (Or, to put things another way, when a multithreaded process does an +// execve(2), at completion of the call, it appears as though the +// execve(2) occurred in the thread group leader, regardless of which +// thread did the execve(2).) This resetting of the thread ID looks +// very confusing to tracers: +// +// * All other threads stop in PTRACE_EVENT_EXIT stop, if the +// PTRACE_O_TRACEEXIT option was turned on. Then all other threads +// except the thread group leader report death as if they exited via +// _exit(2) with exit code 0. +// +// * The execing tracee changes its thread ID while it is in the +// execve(2). (Remember, under ptrace, the "pid" returned from +// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.) +// That is, the tracee's thread ID is reset to be the same as its +// process ID, which is the same as the thread group leader's thread +// ID. +// +// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC +// option was turned on. +// +// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop +// by this time, it appears to the tracer that the dead thread leader +// "reappears from nowhere". (Note: the thread group leader does not +// report death via WIFEXITED(status) until there is at least one +// other live thread. This eliminates the possibility that the +// tracer will see it dying and then reappearing.) If the thread +// group leader was still alive, for the tracer this may look as if +// thread group leader returns from a different system call than it +// entered, or even "returned from a system call even though it was +// not in any system call". If the thread group leader was not +// traced (or was traced by a different tracer), then during +// execve(2) it will appear as if it has become a tracee of the +// tracer of the execing tracee. +// +// All of the above effects are the artifacts of the thread ID change in +// the tracee. +// """ + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// execStop is a TaskStop that a task sets on itself when it wants to execve +// and is waiting for the other tasks in its thread group to exit first. +// +// +stateify savable +type execStop struct{} + +// Killable implements TaskStop.Killable. +func (*execStop) Killable() bool { return true } + +// Execve implements the execve(2) syscall by killing all other tasks in its +// thread group and switching to newTC. Execve always takes ownership of newTC. +// +// Preconditions: The caller must be running Task.doSyscallInvoke on the task +// goroutine. +func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + + if t.tg.exiting || t.tg.execing != nil { + // We lost to a racing group-exit, kill, or exec from another thread + // and should just exit. + newTC.release() + return nil, syserror.EINTR + } + + // Cancel any racing group stops. + t.tg.endGroupStopLocked(false) + + // If the task has any siblings, they have to exit before the exec can + // continue. + t.tg.execing = t + if t.tg.tasks.Front() != t.tg.tasks.Back() { + // "[All] other threads except the thread group leader report death as + // if they exited via _exit(2) with exit code 0." - ptrace(2) + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if t != sibling { + sibling.killLocked() + } + } + // The last sibling to exit will wake t. + t.beginInternalStopLocked((*execStop)(nil)) + } + + return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil +} + +// The runSyscallAfterExecStop state continues execve(2) after all siblings of +// a thread in the execve syscall have exited. +// +// +stateify savable +type runSyscallAfterExecStop struct { + tc *TaskContext +} + +func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + t.tg.execing = nil + if t.killed() { + t.tg.pidns.owner.mu.Unlock() + r.tc.release() + return (*runInterrupt)(nil) + } + // We are the thread group leader now. Save our old thread ID for + // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this + // point it will get a PID of 0, but this is consistent with Linux. + oldTID := ThreadID(0) + if tracer := t.Tracer(); tracer != nil { + oldTID = tracer.tg.pidns.tids[t] + } + t.promoteLocked() + // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle + // this first since POSIX timers are protected by the signal mutex, which + // we're about to change. Note that we have to stop and destroy timers + // without holding any mutexes to avoid circular lock ordering. + var its []*IntervalTimer + t.tg.signalHandlers.mu.Lock() + for _, it := range t.tg.timers { + its = append(its, it) + } + t.tg.timers = make(map[linux.TimerID]*IntervalTimer) + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } + t.tg.pidns.owner.mu.Lock() + // "During an execve(2), the dispositions of handled signals are reset to + // the default; the dispositions of ignored signals are left unchanged. ... + // [The] signal mask is preserved across execve(2). ... [The] pending + // signal set is preserved across an execve(2)." - signal(7) + // + // Details: + // + // - If the thread group is sharing its signal handlers with another thread + // group via CLONE_SIGHAND, execve forces the signal handlers to be copied + // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal + // handlers, so we always make a copy. + // + // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags, + // restorer (if present), and mask are always reset. (See Linux's + // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.) + t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() + t.endStopCond.L = &t.tg.signalHandlers.mu + // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) + t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable} + // "The termination signal is reset to SIGCHLD (see clone(2))." + t.tg.terminationSignal = linux.SIGCHLD + // execed indicates that the process can no longer join a process group + // in some scenarios (namely, the parent call setpgid(2) on the child). + // See the JoinProcessGroup function in sessions.go for more context. + t.tg.execed = true + // Maximum RSS is preserved across execve(2). + t.updateRSSLocked() + // Restartable sequence state is discarded. + t.rseqPreempted = false + t.rseqCPUAddr = 0 + t.rseqCPU = -1 + t.tg.rscr.Store(&RSEQCriticalRegion{}) + t.tg.pidns.owner.mu.Unlock() + + // Remove FDs with the CloseOnExec flag set. + t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool { + return flags.CloseOnExec + }) + + // Switch to the new process. + t.MemoryManager().Deactivate() + t.mu.Lock() + // Update credentials to reflect the execve. This should precede switching + // MMs to ensure that dumpability has been reset first, if needed. + t.updateCredsForExecLocked() + t.tc.release() + t.tc = *r.tc + t.mu.Unlock() + t.unstopVforkParent() + // NOTE(b/30316266): All locks must be dropped prior to calling Activate. + t.MemoryManager().Activate() + + t.ptraceExec(oldTID) + return (*runSyscallExit)(nil) +} + +// promoteLocked makes t the leader of its thread group. If t is already the +// thread group leader, promoteLocked is a no-op. +// +// Preconditions: All other tasks in t's thread group, including the existing +// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must +// be locked for writing. +func (t *Task) promoteLocked() { + oldLeader := t.tg.leader + if t == oldLeader { + return + } + // Swap the leader's TIDs with the execing task's. The latter will be + // released when the old leader is reaped below. + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] + ns.tids[oldLeader] = oldTID + ns.tids[t] = leaderTID + ns.tasks[oldTID] = oldLeader + ns.tasks[leaderTID] = t + // Neither the ThreadGroup nor TGID change, so no need to + // update ns.tgids. + } + + // Inherit the old leader's start time. + oldStartTime := oldLeader.StartTime() + t.mu.Lock() + t.startTime = oldStartTime + t.mu.Unlock() + + t.tg.leader = t + t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) + t.updateLogPrefixLocked() + // Reap the original leader. If it has a tracer, detach it instead of + // waiting for it to acknowledge the original leader's death. + oldLeader.exitParentNotified = true + oldLeader.exitParentAcked = true + if tracer := oldLeader.Tracer(); tracer != nil { + delete(tracer.ptraceTracees, oldLeader) + oldLeader.forgetTracerLocked() + // Notify the tracer that it will no longer be receiving these events + // from the tracee. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) + } + oldLeader.exitNotifyLocked(false) +} diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go new file mode 100644 index 000000000..158e665d3 --- /dev/null +++ b/pkg/sentry/kernel/task_exit.go @@ -0,0 +1,1159 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements the task exit cycle: +// +// - Tasks are asynchronously requested to exit with Task.Kill. +// +// - When able, the task goroutine enters the exit path starting from state +// runExit. +// +// - Other tasks observe completed exits with Task.Wait (which implements the +// wait*() family of syscalls). + +import ( + "errors" + "fmt" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// An ExitStatus is a value communicated from an exiting task or thread group +// to the party that reaps it. +// +// +stateify savable +type ExitStatus struct { + // Code is the numeric value passed to the call to exit or exit_group that + // caused the exit. If the exit was not caused by such a call, Code is 0. + Code int + + // Signo is the signal that caused the exit. If the exit was not caused by + // a signal, Signo is 0. + Signo int +} + +// Signaled returns true if the ExitStatus indicates that the exiting task or +// thread group was killed by a signal. +func (es ExitStatus) Signaled() bool { + return es.Signo != 0 +} + +// Status returns the numeric representation of the ExitStatus returned by e.g. +// the wait4() system call. +func (es ExitStatus) Status() uint32 { + return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) +} + +// ShellExitCode returns the numeric exit code that Bash would return for an +// exit status of es. +func (es ExitStatus) ShellExitCode() int { + if es.Signaled() { + return 128 + es.Signo + } + return es.Code +} + +// TaskExitState represents a step in the task exit path. +// +// "Exiting" and "exited" are often ambiguous; prefer to name specific states. +type TaskExitState int + +const ( + // TaskExitNone indicates that the task has not begun exiting. + TaskExitNone TaskExitState = iota + + // TaskExitInitiated indicates that the task goroutine has entered the exit + // path, and the task is no longer eligible to participate in group stops + // or group signal handling. TaskExitInitiated is analogous to Linux's + // PF_EXITING. + TaskExitInitiated + + // TaskExitZombie indicates that the task has released its resources, and + // the task no longer prevents a sibling thread from completing execve. + TaskExitZombie + + // TaskExitDead indicates that the task's thread IDs have been released, + // and the task no longer prevents its thread group leader from being + // reaped. ("Reaping" refers to the transitioning of a task from + // TaskExitZombie to TaskExitDead.) + TaskExitDead +) + +// String implements fmt.Stringer. +func (t TaskExitState) String() string { + switch t { + case TaskExitNone: + return "TaskExitNone" + case TaskExitInitiated: + return "TaskExitInitiated" + case TaskExitZombie: + return "TaskExitZombie" + case TaskExitDead: + return "TaskExitDead" + default: + return strconv.Itoa(int(t)) + } +} + +// killLocked marks t as killed by enqueueing a SIGKILL, without causing the +// thread-group-affecting side effects SIGKILL usually has. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) killLocked() { + // Clear killable stops. + if t.stop != nil && t.stop.Killable() { + t.endInternalStopLocked() + } + t.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + // Linux just sets SIGKILL in the pending signal bitmask without + // enqueueing an actual siginfo, such that + // kernel/signal.c:collect_signal() initializes si_code to SI_USER. + Code: arch.SignalInfoUser, + }, nil) + t.interrupt() +} + +// killed returns true if t has a SIGKILL pending. killed is analogous to +// Linux's fatal_signal_pending(). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) killed() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.killedLocked() +} + +func (t *Task) killedLocked() bool { + return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 +} + +// PrepareExit indicates an exit with status es. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) PrepareExit(es ExitStatus) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.exitStatus = es +} + +// PrepareGroupExit indicates a group exit with status es to t's thread group. +// +// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it +// does not tail-call do_exit(), except that it *does* set Task.exitStatus. +// (Linux does not do so until within do_exit(), since it reuses exit_code for +// ptrace.) +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) PrepareGroupExit(es ExitStatus) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.tg.exiting || t.tg.execing != nil { + // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e. + // this "group exit" is being executed by the killed sibling of an + // execing task, then Task.Execve never set t.tg.exitStatus, so it's + // still the zero value. This is consistent with Linux, both in intent + // ("all other threads ... report death as if they exited via _exit(2) + // with exit code 0" - ptrace(2), "execve under ptrace") and in + // implementation (compare fs/exec.c:de_thread() => + // kernel/signal.c:zap_other_threads() and + // kernel/exit.c:do_group_exit() => + // include/linux/sched.h:signal_group_exit()). + t.exitStatus = t.tg.exitStatus + return + } + t.tg.exiting = true + t.tg.exitStatus = es + t.exitStatus = es + for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { + if sibling != t { + sibling.killLocked() + } + } +} + +// Kill requests that all tasks in ts exit as if group exiting with status es. +// Kill does not wait for tasks to exit. +// +// Kill has no analogue in Linux; it's provided for save/restore only. +func (ts *TaskSet) Kill(es ExitStatus) { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.Root.exiting = true + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + if !t.tg.exiting { + t.tg.exiting = true + t.tg.exitStatus = es + } + t.killLocked() + t.tg.signalHandlers.mu.Unlock() + } +} + +// advanceExitStateLocked checks that t's current exit state is oldExit, then +// sets it to newExit. If t's current exit state is not oldExit, +// advanceExitStateLocked panics. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { + if t.exitState != oldExit { + panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState)) + } + t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit) + t.exitState = newExit +} + +// runExit is the entry point into the task exit path. +// +// +stateify savable +type runExit struct{} + +func (*runExit) execute(t *Task) taskRunState { + t.ptraceExit() + return (*runExitMain)(nil) +} + +// +stateify savable +type runExitMain struct{} + +func (*runExitMain) execute(t *Task) taskRunState { + lastExiter := t.exitThreadGroup() + + // If the task has a cleartid, and the thread group wasn't killed by a + // signal, handle that before releasing the MM. + if t.cleartid != 0 { + t.tg.signalHandlers.mu.Lock() + signaled := t.tg.exiting && t.tg.exitStatus.Signaled() + t.tg.signalHandlers.mu.Unlock() + if !signaled { + if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) + } + // If the CopyOut fails, there's nothing we can do. + } + } + + // Deactivate the address space and update max RSS before releasing the + // task's MM. + t.Deactivate() + t.tg.pidns.owner.mu.Lock() + t.updateRSSLocked() + t.tg.pidns.owner.mu.Unlock() + t.mu.Lock() + t.tc.release() + t.mu.Unlock() + + // Releasing the MM unblocks a blocked CLONE_VFORK parent. + t.unstopVforkParent() + + t.fsc.DecRef() + t.fds.DecRef() + + // If this is the last task to exit from the thread group, release the + // thread group's resources. + if lastExiter { + t.tg.release() + } + + // Detach tracees. + t.exitPtrace() + + // Reparent the task's children. + t.exitChildren() + + // Don't tail-call runExitNotify, as exitChildren may have initiated a stop + // to wait for a PID namespace to die. + return (*runExitNotify)(nil) +} + +// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread +// group that it is no longer eligible to participate in group activities. It +// returns true if t is the last task in its thread group to call +// exitThreadGroup. +func (t *Task) exitThreadGroup() bool { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.tg.signalHandlers.mu.Lock() + // Can't defer unlock: see below. + + t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated) + t.tg.activeTasks-- + last := t.tg.activeTasks == 0 + + // Ensure that someone will handle the signals we can't. + t.setSignalMaskLocked(^linux.SignalSet(0)) + + // Check if this task's exit interacts with an initiated group stop. + if !t.groupStopPending { + t.tg.signalHandlers.mu.Unlock() + return last + } + t.groupStopPending = false + sig := t.tg.groupStopSignal + notifyParent := t.participateGroupStopLocked() + // signalStop must be called with t's signal mutex unlocked. + t.tg.signalHandlers.mu.Unlock() + if notifyParent && t.tg.leader.parent != nil { + t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + return last +} + +func (t *Task) exitChildren() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + newParent := t.findReparentTargetLocked() + if newParent == nil { + // "If the init process of a PID namespace terminates, the kernel + // terminates all of the processes in the namespace via a SIGKILL + // signal." - pid_namespaces(7) + t.Debugf("Init process terminating, killing namespace") + t.tg.pidns.exiting = true + for other := range t.tg.pidns.tgids { + if other == t.tg { + continue + } + other.signalHandlers.mu.Lock() + other.leader.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, true /* group */) + other.signalHandlers.mu.Unlock() + } + // TODO(b/37722272): The init process waits for all processes in the + // namespace to exit before completing its own exit + // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all + // other tasks in the namespace are dead, except possibly for this + // thread group's leader (which can't be reaped until this task exits). + } + // This is correct even if newParent is nil (it ensures that children don't + // wait for a parent to reap them.) + for c := range t.children { + if sig := c.ParentDeathSignal(); sig != 0 { + siginfo := &arch.SignalInfo{ + Signo: int32(sig), + Code: arch.SignalInfoUser, + } + siginfo.SetPid(int32(c.tg.pidns.tids[t])) + siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) + c.tg.signalHandlers.mu.Lock() + c.sendSignalLocked(siginfo, true /* group */) + c.tg.signalHandlers.mu.Unlock() + } + c.reparentLocked(newParent) + if newParent != nil { + newParent.children[c] = struct{}{} + } + } +} + +// findReparentTargetLocked returns the task to which t's children should be +// reparented. If no such task exists, findNewParentLocked returns nil. +// +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) findReparentTargetLocked() *Task { + // Reparent to any sibling in the same thread group that hasn't begun + // exiting. + if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil { + return t2 + } + // "A child process that is orphaned within the namespace will be + // reparented to [the init process for the namespace] ..." - + // pid_namespaces(7) + if init := t.tg.pidns.tasks[InitTID]; init != nil { + return init.tg.anyNonExitingTaskLocked() + } + return nil +} + +func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.exitState == TaskExitNone { + return t + } + } + return nil +} + +// reparentLocked changes t's parent. The new parent may be nil. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) reparentLocked(parent *Task) { + oldParent := t.parent + t.parent = parent + // If a thread group leader's parent changes, reset the thread group's + // termination signal to SIGCHLD and re-check exit notification. (Compare + // kernel/exit.c:reparent_leader().) + if t != t.tg.leader { + return + } + if oldParent == nil && parent == nil { + return + } + if oldParent != nil && parent != nil && oldParent.tg == parent.tg { + return + } + t.tg.terminationSignal = linux.SIGCHLD + if t.exitParentNotified && !t.exitParentAcked { + t.exitParentNotified = false + t.exitNotifyLocked(false) + } +} + +// When a task exits, other tasks in the system, notably the task's parent and +// ptracer, may want to be notified. The exit notification system ensures that +// interested tasks receive signals and/or are woken from blocking calls to +// wait*() syscalls; these notifications must be resolved before exiting tasks +// can be reaped and disappear from the system. +// +// Each task may have a parent task and/or a tracer task. If both a parent and +// a tracer exist, they may be the same task, different tasks in the same +// thread group, or tasks in different thread groups. (In the last case, Linux +// refers to the task as being ptrace-reparented due to an implementation +// detail; we avoid this terminology to avoid confusion.) +// +// A thread group is *empty* if all non-leader tasks in the thread group are +// dead, and the leader is either a zombie or dead. The exit of a thread group +// leader is never waitable - by either the parent or tracer - until the thread +// group is empty. +// +// There are a few ways for an exit notification to be resolved: +// +// - The exit notification may be acknowledged by a call to Task.Wait with +// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall). +// +// - If the notified party is the parent, and the parent thread group is not +// also the tracer thread group, and the notification signal is SIGCHLD, the +// parent may explicitly ignore the notification (see quote in exitNotify). +// Note that it's possible for the notified party to ignore the signal in other +// cases, but the notification is only resolved under the above conditions. +// (Actually, there is one exception; see the last paragraph of the "leader, +// has tracer, tracer thread group is parent thread group" case below.) +// +// - If the notified party is the parent, and the parent does not exist, the +// notification is resolved as if ignored. (This is only possible in the +// sentry. In Linux, the only task / thread group without a parent is global +// init, and killing global init causes a kernel panic.) +// +// - If the notified party is a tracer, the tracer may detach the traced task. +// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) +// +// In addition, if the notified party is the parent, the parent may exit and +// cause the notifying task to be reparented to another thread group. This does +// not resolve the notification; instead, the notification must be resent to +// the new parent. +// +// The series of notifications generated for a given task's exit depend on +// whether it is a thread group leader; whether the task is ptraced; and, if +// so, whether the tracer thread group is the same as the parent thread group. +// +// - Non-leader, no tracer: No notification is generated; the task is reaped +// immediately. +// +// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer +// notification is resolved (by waiting or detaching), the task is reaped. (For +// non-leaders, whether the tracer and parent thread groups are the same is +// irrelevant.) +// +// - Leader, no tracer: The task remains a zombie, with no notification sent, +// until all other tasks in the thread group are dead. (In Linux terms, this +// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks +// are removed from their thread_group list in kernel/exit.c:release_task() => +// __exit_signal() => __unhash_process().) Then the thread group's termination +// signal is sent to the parent. When the parent notification is resolved (by +// waiting or ignoring), the task is reaped. +// +// - Leader, has tracer, tracer thread group is not parent thread group: +// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by +// waiting or detaching), and all other tasks in the thread group are dead, the +// thread group's termination signal is sent to the parent. (Note that the +// tracer cannot resolve the exit notification by waiting until the thread +// group is empty.) When the parent notification is resolved, the task is +// reaped. +// +// - Leader, has tracer, tracer thread group is parent thread group: +// +// If all other tasks in the thread group are dead, the thread group's +// termination signal is sent to the parent. At this point, the notification +// can only be resolved by waiting. If the parent detaches from the task as a +// tracer, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// If at least one task in the thread group is not dead, SIGCHLD is sent to the +// parent. At this point, the notification cannot be resolved at all; once the +// thread group becomes empty, it can be resolved only by waiting. If the +// parent detaches from the task as a tracer before all remaining tasks die, +// then exit notification proceeds as in the case where the leader never had a +// tracer. If the parent detaches from the task as a tracer after all remaining +// tasks die, the notification is not resolved, but the notification can now be +// resolved by waiting or ignoring. When the parent notification is resolved, +// the task is reaped. +// +// In both of the above cases, when the parent detaches from the task as a +// tracer while the thread group is empty, whether or not the parent resolves +// the notification by ignoring it is based on the parent's SIGCHLD signal +// action, whether or not the thread group's termination signal is SIGCHLD +// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). +// +// There is one final wrinkle: A leader can become a non-leader due to a +// sibling execve. In this case, the execing thread detaches the leader's +// tracer (if one exists) and reaps the leader immediately. In Linux, this is +// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). + +// +stateify savable +type runExitNotify struct{} + +func (*runExitNotify) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie) + t.tg.liveTasks-- + // Check if this completes a sibling's execve. + if t.tg.execing != nil && t.tg.liveTasks == 1 { + // execing blocks the addition of new tasks to the thread group, so + // the sole living task must be the execing one. + e := t.tg.execing + e.tg.signalHandlers.mu.Lock() + if _, ok := e.stop.(*execStop); ok { + e.endInternalStopLocked() + } + e.tg.signalHandlers.mu.Unlock() + } + t.exitNotifyLocked(false) + // The task goroutine will now exit. + return nil +} + +// exitNotifyLocked is called after changes to t's state that affect exit +// notification. +// +// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace; +// thanks to Linux's haphazard implementation of this functionality, such cases +// determine whether parent notifications are ignored based on the parent's +// handling of SIGCHLD, regardless of what the exited task's thread group's +// termination signal is. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { + if t.exitState != TaskExitZombie { + return + } + if !t.exitTracerNotified { + t.exitTracerNotified = true + tracer := t.Tracer() + if tracer == nil { + t.exitTracerAcked = true + } else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg { + // Don't set exitParentNotified if t is non-leader, even if the + // tracer is in the parent thread group, so that if the parent + // detaches the following call to exitNotifyLocked passes through + // the !exitParentNotified case below and causes t to be reaped + // immediately. + // + // Tracer notification doesn't care about about + // SIG_IGN/SA_NOCLDWAIT. + tracer.tg.signalHandlers.mu.Lock() + tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */) + tracer.tg.signalHandlers.mu.Unlock() + // Wake EventTraceeStop waiters as well since this task will never + // ptrace-stop again. + tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop) + } else { + // t is a leader and the tracer is in the parent thread group. + t.exitParentNotified = true + sig := linux.SIGCHLD + if t.tg.tasksCount == 1 { + sig = t.tg.terminationSignal + } + // This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either + // (in Linux, the check in do_notify_parent() is gated by + // !tsk->ptrace.) + t.parent.tg.signalHandlers.mu.Lock() + t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */) + t.parent.tg.signalHandlers.mu.Unlock() + // See below for rationale for this event mask. + t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + if t.exitTracerAcked && !t.exitParentNotified { + if t != t.tg.leader { + t.exitParentNotified = true + t.exitParentAcked = true + } else if t.tg.tasksCount == 1 { + t.exitParentNotified = true + if t.parent == nil { + t.exitParentAcked = true + } else { + // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is + // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see + // sigaction(2)), then children that terminate do not become + // zombies and a call to wait() or waitpid() will block until all + // children have terminated, and then fail with errno set to + // ECHILD. (The original POSIX standard left the behavior of + // setting SIGCHLD to SIG_IGN unspecified. Note that even though + // the default disposition of SIGCHLD is "ignore", explicitly + // setting the disposition to SIG_IGN results in different + // treatment of zombie process children.) Linux 2.6 conforms to + // this specification." - wait(2) + // + // Some undocumented Linux-specific details: + // + // - All of the above is ignored if the termination signal isn't + // SIGCHLD. + // + // - SA_NOCLDWAIT causes the leader to be immediately reaped, but + // does not suppress the SIGCHLD. + signalParent := t.tg.terminationSignal.IsValid() + t.parent.tg.signalHandlers.mu.Lock() + if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach { + if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok { + if act.Handler == arch.SignalActIgnore { + t.exitParentAcked = true + signalParent = false + } else if act.Flags&arch.SignalFlagNoCldWait != 0 { + t.exitParentAcked = true + } + } + } + if signalParent { + t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */) + } + t.parent.tg.signalHandlers.mu.Unlock() + // If a task in the parent was waiting for a child group stop + // or continue, it needs to be notified of the exit, because + // there may be no remaining eligible tasks (so that wait + // should return ECHILD). + t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) + } + } + } + if t.exitTracerAcked && t.exitParentAcked { + t.advanceExitStateLocked(TaskExitZombie, TaskExitDead) + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid := ns.tids[t] + delete(ns.tasks, tid) + delete(ns.tids, t) + if t == t.tg.leader { + delete(ns.tgids, t.tg) + } + } + t.tg.exitedCPUStats.Accumulate(t.CPUStats()) + t.tg.ioUsage.Accumulate(t.ioUsage) + t.tg.signalHandlers.mu.Lock() + t.tg.tasks.Remove(t) + t.tg.tasksCount-- + tc := t.tg.tasksCount + t.tg.signalHandlers.mu.Unlock() + if tc == 1 && t != t.tg.leader { + // Our fromPtraceDetach doesn't matter here (in Linux terms, this + // is via a call to release_task()). + t.tg.leader.exitNotifyLocked(false) + } else if tc == 0 { + t.tg.processGroup.decRefWithParent(t.tg.parentPG()) + } + if t.parent != nil { + delete(t.parent.children, t) + t.parent = nil + } + } +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo { + info := &arch.SignalInfo{ + Signo: int32(sig), + } + info.SetPid(int32(receiver.tg.pidns.tids[t])) + info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) + if t.exitStatus.Signaled() { + info.Code = arch.CLD_KILLED + info.SetStatus(int32(t.exitStatus.Signo)) + } else { + info.Code = arch.CLD_EXITED + info.SetStatus(int32(t.exitStatus.Code)) + } + // TODO(b/72102453): Set utime, stime. + return info +} + +// ExitStatus returns t's exit status, which is only guaranteed to be +// meaningful if t.ExitState() != TaskExitNone. +func (t *Task) ExitStatus() ExitStatus { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.exitStatus +} + +// ExitStatus returns the exit status that would be returned by a consuming +// wait*() on tg. +func (tg *ThreadGroup) ExitStatus() ExitStatus { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting { + return tg.exitStatus + } + return tg.leader.exitStatus +} + +// TerminationSignal returns the thread group's termination signal. +func (tg *ThreadGroup) TerminationSignal() linux.Signal { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.terminationSignal +} + +// Task events that can be waited for. +const ( + // EventExit represents an exit notification generated for a child thread + // group leader or a tracee under the conditions specified in the comment + // above runExitNotify. + EventExit waiter.EventMask = 1 << iota + + // EventChildGroupStop occurs when a child thread group completes a group + // stop (i.e. all tasks in the child thread group have entered a stopped + // state as a result of a group stop). + EventChildGroupStop + + // EventTraceeStop occurs when a task that is ptraced by a task in the + // notified thread group enters a ptrace stop (see ptrace(2)). + EventTraceeStop + + // EventGroupContinue occurs when a child thread group, or a thread group + // whose leader is ptraced by a task in the notified thread group, that had + // initiated or completed a group stop leaves the group stop, due to the + // child thread group or any task in the child thread group being sent + // SIGCONT. + EventGroupContinue +) + +// WaitOptions controls the behavior of Task.Wait. +type WaitOptions struct { + // If SpecificTID is non-zero, only events from the task with thread ID + // SpecificTID are eligible to be waited for. SpecificTID is resolved in + // the PID namespace of the waiter (the method receiver of Task.Wait). If + // no such task exists, or that task would not otherwise be eligible to be + // waited for by the waiting task, then there are no waitable tasks and + // Wait will return ECHILD. + SpecificTID ThreadID + + // If SpecificPGID is non-zero, only events from ThreadGroups with a + // matching ProcessGroupID are eligible to be waited for. (Same + // constraints as SpecificTID apply.) + SpecificPGID ProcessGroupID + + // Terminology note: Per waitpid(2), "a clone child is one which delivers + // no signal, or a signal other than SIGCHLD to its parent upon + // termination." In Linux, termination signal is technically a per-task + // property rather than a per-thread-group property. However, clone() + // forces no termination signal for tasks created with CLONE_THREAD, and + // execve() resets the termination signal to SIGCHLD, so all + // non-group-leader threads have no termination signal and are therefore + // "clone tasks". + + // If NonCloneTasks is true, events from non-clone tasks are eligible to be + // waited for. + NonCloneTasks bool + + // If CloneTasks is true, events from clone tasks are eligible to be waited + // for. + CloneTasks bool + + // If SiblingChildren is true, events from children tasks of any task + // in the thread group of the waiter are eligible to be waited for. + SiblingChildren bool + + // Events is a bitwise combination of the events defined above that specify + // what events are of interest to the call to Wait. + Events waiter.EventMask + + // If ConsumeEvent is true, the Wait should consume the event such that it + // cannot be returned by a future Wait. Note that if a task exit is + // consumed in this way, in most cases the task will be reaped. + ConsumeEvent bool + + // If BlockInterruptErr is not nil, Wait will block until either an event + // is available or there are no tasks that could produce a waitable event; + // if that blocking is interrupted, Wait returns BlockInterruptErr. If + // BlockInterruptErr is nil, Wait will not block. + BlockInterruptErr error +} + +// Preconditions: The TaskSet mutex must be locked (for reading or writing). +func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool { + if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { + return false + } + if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { + return false + } + // Tracees are always eligible. + if tracee { + return true + } + if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { + return o.NonCloneTasks + } + return o.CloneTasks +} + +// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. +// waitpid(WNOHANG)) that find no waitable events, but determine that waitable +// events may exist in the future. (In contrast, if a non-blocking or blocking +// Wait determines that there are no tasks that can produce a waitable event, +// Task.Wait returns ECHILD.) +var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events") + +// WaitResult contains information about a waited-for event. +type WaitResult struct { + // Task is the task that reported the event. + Task *Task + + // TID is the thread ID of Task in the PID namespace of the task that + // called Wait (that is, the method receiver of the call to Task.Wait). TID + // is provided because consuming exit waits cause the thread ID to be + // deallocated. + TID ThreadID + + // UID is the real UID of Task in the user namespace of the task that + // called Wait. + UID auth.UID + + // Event is exactly one of the events defined above. + Event waiter.EventMask + + // Status is the numeric status associated with the event. + Status uint32 +} + +// Wait waits for an event from a thread group that is a child of t's thread +// group, or a task in such a thread group, or a task that is ptraced by t, +// subject to the options specified in opts. +func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { + if opts.BlockInterruptErr == nil { + return t.waitOnce(opts) + } + w, ch := waiter.NewChannelEntry(nil) + t.tg.eventQueue.EventRegister(&w, opts.Events) + defer t.tg.eventQueue.EventUnregister(&w) + for { + wr, err := t.waitOnce(opts) + if err != ErrNoWaitableEvent { + // This includes err == nil. + return wr, err + } + if err := t.Block(ch); err != nil { + return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) + } + } +} + +func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { + anyWaitableTasks := false + + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + + if opts.SiblingChildren { + // We can wait on the children and tracees of any task in the + // same thread group. + for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { + wr, any := t.waitParentLocked(opts, parent) + if wr != nil { + return wr, nil + } + anyWaitableTasks = anyWaitableTasks || any + } + } else { + // We can only wait on this task. + var wr *WaitResult + wr, anyWaitableTasks = t.waitParentLocked(opts, t) + if wr != nil { + return wr, nil + } + } + + if anyWaitableTasks { + return nil, ErrNoWaitableEvent + } + return nil, syserror.ECHILD +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) { + anyWaitableTasks := false + + for child := range parent.children { + if !opts.matchesTask(child, parent.tg.pidns, false) { + continue + } + // Non-leaders don't notify parents on exit and aren't eligible to + // be waited on. + if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { + return wr, anyWaitableTasks + } + } + // Check for group stops and continues. Tasks that have passed + // TaskExitInitiated can no longer participate in group stops. + if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { + continue + } + if child.exitState >= TaskExitInitiated { + continue + } + // If the waiter is in the same thread group as the task's + // tracer, do not report its group stops; they will be reported + // as ptrace stops instead. This also skips checking for group + // continues, but they'll be checked for when scanning tracees + // below. (Per kernel/exit.c:wait_consider_task(): "If a + // ptracer wants to distinguish the two events for its own + // children, it should create a separate process which takes + // the role of real parent.") + if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { + continue + } + anyWaitableTasks = true + if opts.Events&EventChildGroupStop != 0 { + if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { + return wr, anyWaitableTasks + } + } + } + for tracee := range parent.ptraceTracees { + if !opts.matchesTask(tracee, parent.tg.pidns, true) { + continue + } + // Non-leaders do notify tracers on exit. + if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { + anyWaitableTasks = true + if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { + continue + } + if tracee.exitState >= TaskExitInitiated { + continue + } + anyWaitableTasks = true + if opts.Events&EventTraceeStop != 0 { + if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + if opts.Events&EventGroupContinue != 0 { + if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { + return wr, anyWaitableTasks + } + } + } + + return nil, anyWaitableTasks +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { + if asPtracer && !target.exitTracerNotified { + return nil + } + if !asPtracer && !target.exitParentNotified { + return nil + } + // Zombied thread group leaders are never waitable until their thread group + // is otherwise empty. Usually this is caught by the + // target.exitParentNotified check above, but if t is both (in the thread + // group of) target's tracer and parent, asPtracer may be true. + if target == target.tg.leader && target.tg.tasksCount != 1 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + status := target.exitStatus.Status() + if !opts.ConsumeEvent { + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } + } + // Surprisingly, the exit status reported by a non-consuming wait can + // differ from that reported by a consuming wait; the latter will return + // the group exit code if one is available. + if target.tg.exiting { + status = target.tg.exitStatus.Status() + } + // t may be (in the thread group of) target's parent, tracer, or both. We + // don't need to check for !exitTracerAcked because tracees are detached + // here, and we don't need to check for !exitParentAcked because zombies + // will be reaped here. + if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { + target.exitTracerAcked = true + target.ptraceTracer.Store((*Task)(nil)) + delete(t.ptraceTracees, target) + } + if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { + target.exitParentAcked = true + if target == target.tg.leader { + // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, + // and won't until after target.exitNotifyLocked() (maybe). Include + // target.CPUStats() explicitly. This is consistent with Linux, + // which accounts an exited task's cputime to its thread group in + // kernel/exit.c:release_task() => __exit_signal(), and uses + // thread_group_cputime_adjusted() in wait_task_zombie(). + t.tg.childCPUStats.Accumulate(target.CPUStats()) + t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) + t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) + // Update t's child max resident set size. The size will be the maximum + // of this thread's size and all its childrens' sizes. + if t.tg.childMaxRSS < target.tg.maxRSS { + t.tg.childMaxRSS = target.tg.maxRSS + } + if t.tg.childMaxRSS < target.tg.childMaxRSS { + t.tg.childMaxRSS = target.tg.childMaxRSS + } + } + } + target.exitNotifyLocked(false) + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventExit, + Status: status, + } +} + +// updateRSSLocked updates t.tg.maxRSS. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) updateRSSLocked() { + if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { + t.tg.maxRSS = mmMaxRSS + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupStopWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + sig := target.tg.groupStopSignal + if opts.ConsumeEvent { + target.tg.groupStopWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventChildGroupStop, + // There is no name for these status constants. + Status: (uint32(sig)&0xff)<<8 | 0x7f, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if !target.tg.groupContWaitable { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + if opts.ConsumeEvent { + target.tg.groupContWaitable = false + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventGroupContinue, + Status: 0xffff, + } +} + +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + if target.ptraceCode == 0 { + return nil + } + pid := t.tg.pidns.tids[target] + uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() + code := target.ptraceCode + if opts.ConsumeEvent { + target.ptraceCode = 0 + } + return &WaitResult{ + Task: target, + TID: pid, + UID: uid, + Event: EventTraceeStop, + Status: uint32(code)<<8 | 0x7f, + } +} + +// ExitState returns t's current progress through the exit path. +func (t *Task) ExitState() TaskExitState { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.exitState +} + +// ParentDeathSignal returns t's parent death signal. +func (t *Task) ParentDeathSignal() linux.Signal { + t.mu.Lock() + defer t.mu.Unlock() + return t.parentDeathSignal +} + +// SetParentDeathSignal sets t's parent death signal. +func (t *Task) SetParentDeathSignal(sig linux.Signal) { + t.mu.Lock() + defer t.mu.Unlock() + t.parentDeathSignal = sig +} diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go new file mode 100644 index 000000000..f98097c2c --- /dev/null +++ b/pkg/sentry/kernel/task_futex.go @@ -0,0 +1,54 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Futex returns t's futex manager. +// +// Preconditions: The caller must be running on the task goroutine, or t.mu +// must be locked. +func (t *Task) Futex() *futex.Manager { + return t.tc.fu +} + +// SwapUint32 implements futex.Target.SwapUint32. +func (t *Task) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { + return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CompareAndSwapUint32 implemets futex.Target.CompareAndSwapUint32. +func (t *Task) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { + return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// LoadUint32 implemets futex.Target.LoadUint32. +func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { + return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// GetSharedKey implements futex.Target.GetSharedKey. +func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { + return t.MemoryManager().GetSharedFutexKey(t, addr) +} diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go new file mode 100644 index 000000000..17f08729a --- /dev/null +++ b/pkg/sentry/kernel/task_identity.go @@ -0,0 +1,568 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Credentials returns t's credentials. +// +// This value must be considered immutable. +func (t *Task) Credentials() *auth.Credentials { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds +} + +// UserNamespace returns the user namespace associated with the task. +func (t *Task) UserNamespace() *auth.UserNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.UserNamespace +} + +// HasCapabilityIn checks if the task has capability cp in user namespace ns. +func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.HasCapabilityIn(cp, ns) +} + +// HasCapability checks if the task has capability cp in its user namespace. +func (t *Task) HasCapability(cp linux.Capability) bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.creds.HasCapability(cp) +} + +// SetUID implements the semantics of setuid(2). +func (t *Task) SetUID(uid auth.UID) error { + // setuid considers -1 to be invalid. + if !uid.Ok() { + return syserror.EINVAL + } + t.mu.Lock() + defer t.mu.Unlock() + kuid := t.creds.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + // "setuid() sets the effective user ID of the calling process. If the + // effective UID of the caller is root (more precisely: if the caller has + // the CAP_SETUID capability), the real UID and saved set-user-ID are also + // set." - setuid(2) + if t.creds.HasCapability(linux.CAP_SETUID) { + t.setKUIDsUncheckedLocked(kuid, kuid, kuid) + return nil + } + // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID + // capability) and uid does not match the real UID or saved set-user-ID of + // the calling process." + if kuid != t.creds.RealKUID && kuid != t.creds.SavedKUID { + return syserror.EPERM + } + t.setKUIDsUncheckedLocked(t.creds.RealKUID, kuid, t.creds.SavedKUID) + return nil +} + +// SetREUID implements the semantics of setreuid(2). +func (t *Task) SetREUID(r, e auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Supplying a value of -1 for either the real or effective user ID forces + // the system to leave that ID unchanged." - setreuid(2) + newR := t.creds.RealKUID + if r.Ok() { + newR = t.creds.UserNamespace.MapToKUID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := t.creds.EffectiveKUID + if e.Ok() { + newE = t.creds.UserNamespace.MapToKUID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !t.creds.HasCapability(linux.CAP_SETUID) { + // "Unprivileged processes may only set the effective user ID to the + // real user ID, the effective user ID, or the saved set-user-ID." + if newE != t.creds.RealKUID && newE != t.creds.EffectiveKUID && newE != t.creds.SavedKUID { + return syserror.EPERM + } + // "Unprivileged users may only set the real user ID to the real user + // ID or the effective user ID." + if newR != t.creds.RealKUID && newR != t.creds.EffectiveKUID { + return syserror.EPERM + } + } + // "If the real user ID is set (i.e., ruid is not -1) or the effective user + // ID is set to a value not equal to the previous real user ID, the saved + // set-user-ID will be set to the new effective user ID." + newS := t.creds.SavedKUID + if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKUID) { + newS = newE + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESUID implements the semantics of the setresuid(2) syscall. +func (t *Task) SetRESUID(r, e, s auth.UID) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Unprivileged user processes may change the real UID, effective UID, and + // saved set-user-ID, each to one of: the current real UID, the current + // effective UID or the current saved set-user-ID. Privileged processes (on + // Linux, those having the CAP_SETUID capability) may set the real UID, + // effective UID, and saved set-user-ID to arbitrary values. If one of the + // arguments equals -1, the corresponding value is not changed." - + // setresuid(2) + var err error + newR := t.creds.RealKUID + if r.Ok() { + newR, err = t.creds.UseUID(r) + if err != nil { + return err + } + } + newE := t.creds.EffectiveKUID + if e.Ok() { + newE, err = t.creds.UseUID(e) + if err != nil { + return err + } + } + newS := t.creds.SavedKUID + if s.Ok() { + newS, err = t.creds.UseUID(s) + if err != nil { + return err + } + } + t.setKUIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// Preconditions: t.mu must be locked. +func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { + root := t.creds.UserNamespace.MapToKUID(auth.RootUID) + oldR, oldE, oldS := t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID + t.creds = t.creds.Fork() // See doc for creds. + t.creds.RealKUID, t.creds.EffectiveKUID, t.creds.SavedKUID = newR, newE, newS + + // "1. If one or more of the real, effective or saved set user IDs was + // previously 0, and as a result of the UID changes all of these IDs have a + // nonzero value, then all capabilities are cleared from the permitted and + // effective capability sets." - capabilities(7) + if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { + // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's + // "keep capabilities" flag, which determines whether the thread's permitted + // capability set is cleared when a change is made to the + // thread's user IDs such that the thread's real UID, effective + // UID, and saved set-user-ID all become nonzero when at least + // one of them previously had the value 0. By default, the + // permitted capability set is cleared when such a change is + // made; setting the "keep capabilities" flag prevents it from + // being cleared." (A thread's effective capability set is always + // cleared when such a credential change is made, + // regardless of the setting of the "keep capabilities" flag.) + if !t.creds.KeepCaps { + t.creds.PermittedCaps = 0 + t.creds.EffectiveCaps = 0 + } + } + // """ + // 2. If the effective user ID is changed from 0 to nonzero, then all + // capabilities are cleared from the effective set. + // + // 3. If the effective user ID is changed from nonzero to 0, then the + // permitted set is copied to the effective set. + // """ + if oldE == root && newE != root { + t.creds.EffectiveCaps = 0 + } else if oldE != root && newE == root { + t.creds.EffectiveCaps = t.creds.PermittedCaps + } + // "4. If the filesystem user ID is changed from 0 to nonzero (see + // setfsuid(2)), then the following capabilities are cleared from the + // effective set: ..." + // (filesystem UIDs aren't implemented, nor are any of the capabilities in + // question) + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + if oldE != newE { + t.parentDeathSignal = 0 + } +} + +// SetGID implements the semantics of setgid(2). +func (t *Task) SetGID(gid auth.GID) error { + if !gid.Ok() { + return syserror.EINVAL + } + t.mu.Lock() + defer t.mu.Unlock() + kgid := t.creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + if t.creds.HasCapability(linux.CAP_SETGID) { + t.setKGIDsUncheckedLocked(kgid, kgid, kgid) + return nil + } + if kgid != t.creds.RealKGID && kgid != t.creds.SavedKGID { + return syserror.EPERM + } + t.setKGIDsUncheckedLocked(t.creds.RealKGID, kgid, t.creds.SavedKGID) + return nil +} + +// SetREGID implements the semantics of setregid(2). +func (t *Task) SetREGID(r, e auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + newR := t.creds.RealKGID + if r.Ok() { + newR = t.creds.UserNamespace.MapToKGID(r) + if !newR.Ok() { + return syserror.EINVAL + } + } + newE := t.creds.EffectiveKGID + if e.Ok() { + newE = t.creds.UserNamespace.MapToKGID(e) + if !newE.Ok() { + return syserror.EINVAL + } + } + if !t.creds.HasCapability(linux.CAP_SETGID) { + if newE != t.creds.RealKGID && newE != t.creds.EffectiveKGID && newE != t.creds.SavedKGID { + return syserror.EPERM + } + if newR != t.creds.RealKGID && newR != t.creds.EffectiveKGID { + return syserror.EPERM + } + } + newS := t.creds.SavedKGID + if r.Ok() || (e.Ok() && newE != t.creds.EffectiveKGID) { + newS = newE + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +// SetRESGID implements the semantics of the setresgid(2) syscall. +func (t *Task) SetRESGID(r, e, s auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + var err error + newR := t.creds.RealKGID + if r.Ok() { + newR, err = t.creds.UseGID(r) + if err != nil { + return err + } + } + newE := t.creds.EffectiveKGID + if e.Ok() { + newE, err = t.creds.UseGID(e) + if err != nil { + return err + } + } + newS := t.creds.SavedKGID + if s.Ok() { + newS, err = t.creds.UseGID(s) + if err != nil { + return err + } + } + t.setKGIDsUncheckedLocked(newR, newE, newS) + return nil +} + +func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { + oldE := t.creds.EffectiveKGID + t.creds = t.creds.Fork() // See doc for creds. + t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). + if oldE != newE { + t.parentDeathSignal = 0 + } +} + +// SetExtraGIDs attempts to change t's supplemental groups. All IDs are +// interpreted as being in t's user namespace. +func (t *Task) SetExtraGIDs(gids []auth.GID) error { + t.mu.Lock() + defer t.mu.Unlock() + if !t.creds.HasCapability(linux.CAP_SETGID) { + return syserror.EPERM + } + kgids := make([]auth.KGID, len(gids)) + for i, gid := range gids { + kgid := t.creds.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + kgids[i] = kgid + } + t.creds = t.creds.Fork() // See doc for creds. + t.creds.ExtraKGIDs = kgids + return nil +} + +// SetCapabilitySets attempts to change t's permitted, inheritable, and +// effective capability sets. +func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { + t.mu.Lock() + defer t.mu.Unlock() + // "Permitted: This is a limiting superset for the effective capabilities + // that the thread may assume." - capabilities(7) + if effective & ^permitted != 0 { + return syserror.EPERM + } + // "It is also a limiting superset for the capabilities that may be added + // to the inheritable set by a thread that does not have the CAP_SETPCAP + // capability in its effective set." + if !t.creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(t.creds.InheritableCaps|t.creds.PermittedCaps) != 0) { + return syserror.EPERM + } + // "If a thread drops a capability from its permitted set, it can never + // reacquire that capability (unless it execve(2)s ..." + if permitted & ^t.creds.PermittedCaps != 0 { + return syserror.EPERM + } + // "... if a capability is not in the bounding set, then a thread can't add + // this capability to its inheritable set, even if it was in its permitted + // capabilities ..." + if inheritable & ^(t.creds.InheritableCaps|t.creds.BoundingCaps) != 0 { + return syserror.EPERM + } + t.creds = t.creds.Fork() // See doc for creds. + t.creds.PermittedCaps = permitted + t.creds.InheritableCaps = inheritable + t.creds.EffectiveCaps = effective + return nil +} + +// DropBoundingCapability attempts to drop capability cp from t's capability +// bounding set. +func (t *Task) DropBoundingCapability(cp linux.Capability) error { + t.mu.Lock() + defer t.mu.Unlock() + if !t.creds.HasCapability(linux.CAP_SETPCAP) { + return syserror.EPERM + } + t.creds = t.creds.Fork() // See doc for creds. + t.creds.BoundingCaps &^= auth.CapabilitySetOf(cp) + return nil +} + +// SetUserNamespace attempts to move c into ns. +func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { + t.mu.Lock() + defer t.mu.Unlock() + + // "A process reassociating itself with a user namespace must have the + // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) + // + // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN + // in ns (by rule 3 in auth.Credentials.HasCapability). + if !t.creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { + return syserror.EPERM + } + + t.creds = t.creds.Fork() // See doc for creds. + t.creds.UserNamespace = ns + // "The child process created by clone(2) with the CLONE_NEWUSER flag + // starts out with a complete set of capabilities in the new user + // namespace. Likewise, a process that creates a new user namespace using + // unshare(2) or joins an existing user namespace using setns(2) gains a + // full set of capabilities in that namespace." + t.creds.PermittedCaps = auth.AllCapabilities + t.creds.InheritableCaps = 0 + t.creds.EffectiveCaps = auth.AllCapabilities + t.creds.BoundingCaps = auth.AllCapabilities + // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER + // flag sets the "securebits" flags (see capabilities(7)) to their default + // values (all flags disabled) in the child (for clone(2)) or caller (for + // unshare(2), or setns(2)." - user_namespaces(7) + t.creds.KeepCaps = false + + return nil +} + +// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. +func (t *Task) SetKeepCaps(k bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.creds = t.creds.Fork() // See doc for creds. + t.creds.KeepCaps = k +} + +// updateCredsForExec updates t.creds to reflect an execve(). +// +// NOTE(b/30815691): We currently do not implement privileged executables +// (set-user/group-ID bits and file capabilities). This allows us to make a lot +// of simplifying assumptions: +// +// - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which +// disables the features we don't support anyway, is always set. This +// drastically simplifies this function. +// +// - We don't implement AT_SECURE, because no_new_privs always being set means +// that the conditions that require AT_SECURE never arise. (Compare Linux's +// security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) +// +// - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since +// seccomp-bpf is also allowed if the task has no_new_privs set. +// +// - Task.ptraceAttach does not serialize with execve as it does in Linux, +// since no_new_privs being set has the same effect as the presence of an +// unprivileged tracer. +// +// Preconditions: t.mu must be locked. +func (t *Task) updateCredsForExecLocked() { + // """ + // During an execve(2), the kernel calculates the new capabilities of + // the process using the following algorithm: + // + // P'(permitted) = (P(inheritable) & F(inheritable)) | + // (F(permitted) & cap_bset) + // + // P'(effective) = F(effective) ? P'(permitted) : 0 + // + // P'(inheritable) = P(inheritable) [i.e., unchanged] + // + // where: + // + // P denotes the value of a thread capability set before the + // execve(2) + // + // P' denotes the value of a thread capability set after the + // execve(2) + // + // F denotes a file capability set + // + // cap_bset is the value of the capability bounding set + // + // ... + // + // In order to provide an all-powerful root using capability sets, during + // an execve(2): + // + // 1. If a set-user-ID-root program is being executed, or the real user ID + // of the process is 0 (root) then the file inheritable and permitted sets + // are defined to be all ones (i.e. all capabilities enabled). + // + // 2. If a set-user-ID-root program is being executed, then the file + // effective bit is defined to be one (enabled). + // + // The upshot of the above rules, combined with the capabilities + // transformations described above, is that when a process execve(2)s a + // set-user-ID-root program, or when a process with an effective UID of 0 + // execve(2)s a program, it gains all capabilities in its permitted and + // effective capability sets, except those masked out by the capability + // bounding set. + // """ - capabilities(7) + // (ambient capability sets omitted) + // + // As the last paragraph implies, the case of "a set-user-ID root program + // is being executed" also includes the case where (namespace) root is + // executing a non-set-user-ID program; the actual check is just based on + // the effective user ID. + var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 + fileEffective := false + root := t.creds.UserNamespace.MapToKUID(auth.RootUID) + if t.creds.EffectiveKUID == root || t.creds.RealKUID == root { + newPermitted = t.creds.InheritableCaps | t.creds.BoundingCaps + if t.creds.EffectiveKUID == root { + fileEffective = true + } + } + + t.creds = t.creds.Fork() // See doc for creds. + + // Now we enter poorly-documented, somewhat confusing territory. (The + // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds + // is not very helpful.) My reading of it is: + // + // If at least one of the following is true: + // + // A1. The execing task is ptraced, and the tracer did not have + // CAP_SYS_PTRACE in the execing task's user namespace at the time of + // PTRACE_ATTACH. + // + // A2. The execing task shares its FS context with at least one task in + // another thread group. + // + // A3. The execing task has no_new_privs set. + // + // AND at least one of the following is true: + // + // B1. The new effective user ID (which may come from set-user-ID, or be the + // execing task's existing effective user ID) is not equal to the task's + // real UID. + // + // B2. The new effective group ID (which may come from set-group-ID, or be + // the execing task's existing effective group ID) is not equal to the + // task's real GID. + // + // B3. The new permitted capability set contains capabilities not in the + // task's permitted capability set. + // + // Then: + // + // C1. Limit the new permitted capability set to the task's permitted + // capability set. + // + // C2. If either the task does not have CAP_SETUID in its user namespace, or + // the task has no_new_privs set, force the new effective UID and GID to + // the task's real UID and GID. + // + // But since no_new_privs is always set (A3 is always true), this becomes + // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 + // is a no-op. So we can just do C1 and C2 unconditionally. + if t.creds.EffectiveKUID != t.creds.RealKUID || t.creds.EffectiveKGID != t.creds.RealKGID { + t.creds.EffectiveKUID = t.creds.RealKUID + t.creds.EffectiveKGID = t.creds.RealKGID + t.parentDeathSignal = 0 + } + // (Saved set-user-ID is always set to the new effective user ID, and saved + // set-group-ID is always set to the new effective group ID, regardless of + // the above.) + t.creds.SavedKUID = t.creds.RealKUID + t.creds.SavedKGID = t.creds.RealKGID + t.creds.PermittedCaps &= newPermitted + if fileEffective { + t.creds.EffectiveCaps = t.creds.PermittedCaps + } else { + t.creds.EffectiveCaps = 0 + } + + // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent + // calls to execve(2). + t.creds.KeepCaps = false + + // "The bounding set is inherited at fork(2) from the thread's parent, and + // is preserved across an execve(2)". So we're done. +} diff --git a/pkg/sentry/kernel/task_list.go b/pkg/sentry/kernel/task_list.go new file mode 100755 index 000000000..57d3f098d --- /dev/null +++ b/pkg/sentry/kernel/task_list.go @@ -0,0 +1,173 @@ +package kernel + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type taskElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (taskElementMapper) linkerFor(elem *Task) *Task { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type taskList struct { + head *Task + tail *Task +} + +// Reset resets list l to the empty state. +func (l *taskList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *taskList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *taskList) Front() *Task { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *taskList) Back() *Task { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *taskList) PushFront(e *Task) { + taskElementMapper{}.linkerFor(e).SetNext(l.head) + taskElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + taskElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *taskList) PushBack(e *Task) { + taskElementMapper{}.linkerFor(e).SetNext(nil) + taskElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + taskElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *taskList) PushBackList(m *taskList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + taskElementMapper{}.linkerFor(l.tail).SetNext(m.head) + taskElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *taskList) InsertAfter(b, e *Task) { + a := taskElementMapper{}.linkerFor(b).Next() + taskElementMapper{}.linkerFor(e).SetNext(a) + taskElementMapper{}.linkerFor(e).SetPrev(b) + taskElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + taskElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *taskList) InsertBefore(a, e *Task) { + b := taskElementMapper{}.linkerFor(a).Prev() + taskElementMapper{}.linkerFor(e).SetNext(a) + taskElementMapper{}.linkerFor(e).SetPrev(b) + taskElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + taskElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *taskList) Remove(e *Task) { + prev := taskElementMapper{}.linkerFor(e).Prev() + next := taskElementMapper{}.linkerFor(e).Next() + + if prev != nil { + taskElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + taskElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type taskEntry struct { + next *Task + prev *Task +} + +// Next returns the entry that follows e in the list. +func (e *taskEntry) Next() *Task { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *taskEntry) Prev() *Task { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *taskEntry) SetNext(elem *Task) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *taskEntry) SetPrev(elem *Task) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go new file mode 100644 index 000000000..e0e57e8bd --- /dev/null +++ b/pkg/sentry/kernel/task_log.go @@ -0,0 +1,137 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sort" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // maxStackDebugBytes is the maximum number of user stack bytes that may be + // printed by debugDumpStack. + maxStackDebugBytes = 1024 +) + +// Infof logs an formatted info message by calling log.Infof. +func (t *Task) Infof(fmt string, v ...interface{}) { + if log.IsLogging(log.Info) { + log.Infof(t.logPrefix.Load().(string)+fmt, v...) + } +} + +// Warningf logs a warning string by calling log.Warningf. +func (t *Task) Warningf(fmt string, v ...interface{}) { + if log.IsLogging(log.Warning) { + log.Warningf(t.logPrefix.Load().(string)+fmt, v...) + } +} + +// Debugf creates a debug string that includes the task ID. +func (t *Task) Debugf(fmt string, v ...interface{}) { + if log.IsLogging(log.Debug) { + log.Debugf(t.logPrefix.Load().(string)+fmt, v...) + } +} + +// IsLogging returns true iff this level is being logged. +func (t *Task) IsLogging(level log.Level) bool { + return log.IsLogging(level) +} + +// DebugDumpState logs task state at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) DebugDumpState() { + t.debugDumpRegisters() + t.debugDumpStack() + if mm := t.MemoryManager(); mm != nil { + t.Debugf("Mappings:\n%s", mm) + } + t.Debugf("FDMap:\n%s", t.fds) +} + +// debugDumpRegisters logs register state at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpRegisters() { + if !t.IsLogging(log.Debug) { + return + } + regmap, err := t.Arch().RegisterMap() + if err != nil { + t.Debugf("Registers: %v", err) + } else { + t.Debugf("Registers:") + var regs []string + for reg := range regmap { + regs = append(regs, reg) + } + sort.Strings(regs) + for _, reg := range regs { + t.Debugf("%-8s = %016x", reg, regmap[reg]) + } + } +} + +// debugDumpStack logs user stack contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpStack() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application stack dump.") + return + } + t.Debugf("Stack:") + start := usermem.Addr(t.Arch().Stack()) + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + +// updateLogPrefix updates the task's cached log prefix to reflect its +// current thread ID. +// +// Preconditions: The task's owning TaskSet.mu must be locked. +func (t *Task) updateLogPrefixLocked() { + // Use the task's TID in the root PID namespace for logging. + t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t])) +} diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go new file mode 100644 index 000000000..04c684c1a --- /dev/null +++ b/pkg/sentry/kernel/task_net.go @@ -0,0 +1,35 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" +) + +// IsNetworkNamespaced returns true if t is in a non-root network namespace. +func (t *Task) IsNetworkNamespaced() bool { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns +} + +// NetworkContext returns the network stack used by the task. NetworkContext +// may return nil if no network stack is available. +func (t *Task) NetworkContext() inet.Stack { + if t.IsNetworkNamespaced() { + return nil + } + return t.k.networkStack +} diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go new file mode 100644 index 000000000..a79101a18 --- /dev/null +++ b/pkg/sentry/kernel/task_run.go @@ -0,0 +1,340 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "bytes" + "runtime" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// A taskRunState is a reified state in the task state machine. See README.md +// for details. The canonical list of all run states, as well as transitions +// between them, is given in run_states.dot. +// +// The set of possible states is enumerable and completely defined by the +// kernel package, so taskRunState would ideally be represented by a +// discriminated union. However, Go does not support sum types. +// +// Hence, as with TaskStop, data-free taskRunStates should be represented as +// typecast nils to avoid unnecessary allocation. +type taskRunState interface { + // execute executes the code associated with this state over the given task + // and returns the following state. If execute returns nil, the task + // goroutine should exit. + // + // It is valid to tail-call a following state's execute to avoid the + // overhead of converting the following state to an interface object and + // checking for stops, provided that the tail-call cannot recurse. + execute(*Task) taskRunState +} + +// run runs the task goroutine. +// +// threadID a dummy value set to the task's TID in the root PID namespace to +// make it visible in stack dumps. A goroutine for a given task can be identified +// searching for Task.run()'s argument value. +func (t *Task) run(threadID uintptr) { + // Construct t.blockingTimer here. We do this here because we can't + // reconstruct t.blockingTimer during restore in Task.afterLoad(), because + // kernel.timekeeper.SetClocks() hasn't been called yet. + blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier() + t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier) + defer t.blockingTimer.Destroy() + t.blockingTimerChan = blockingTimerChan + + // Activate our address space. + t.Activate() + // The corresponding t.Deactivate occurs in the exit path + // (runExitMain.execute) so that when + // Platform.CooperativelySharesAddressSpace() == true, we give up the + // AddressSpace before the task goroutine finishes executing. + + // If this is a newly-started task, it should check for participation in + // group stops. If this is a task resuming after restore, it was + // interrupted by saving. In either case, the task is initially + // interrupted. + t.interruptSelf() + + for { + // Explanation for this ordering: + // + // - A freshly-started task that is stopped should not do anything + // before it enters the stop. + // + // - If taskRunState.execute returns nil, the task goroutine should + // exit without checking for a stop. + // + // - Task.Start won't start Task.run if t.runState is nil, so this + // ordering is safe. + t.doStop() + t.runState = t.runState.execute(t) + if t.runState == nil { + t.accountTaskGoroutineEnter(TaskGoroutineNonexistent) + t.goroutineStopped.Done() + t.tg.liveGoroutines.Done() + t.tg.pidns.owner.liveGoroutines.Done() + t.tg.pidns.owner.runningGoroutines.Done() + + // Keep argument alive because stack trace for dead variables may not be correct. + runtime.KeepAlive(threadID) + return + } + } +} + +// doStop is called by Task.run to block until the task is not stopped. +func (t *Task) doStop() { + if atomic.LoadInt32(&t.stopCount) == 0 { + return + } + t.Deactivate() + // NOTE(b/30316266): t.Activate() must be called without any locks held, so + // this defer must precede the defer for unlocking the signal mutex. + defer t.Activate() + t.accountTaskGoroutineEnter(TaskGoroutineStopped) + defer t.accountTaskGoroutineLeave(TaskGoroutineStopped) + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.runningGoroutines.Add(-1) + defer t.tg.pidns.owner.runningGoroutines.Add(1) + t.goroutineStopped.Add(-1) + defer t.goroutineStopped.Add(1) + for t.stopCount > 0 { + t.endStopCond.Wait() + } +} + +// The runApp state checks for interrupts before executing untrusted +// application code. +// +// +stateify savable +type runApp struct{} + +func (*runApp) execute(t *Task) taskRunState { + if t.interrupted() { + // Checkpointing instructs tasks to stop by sending an interrupt, so we + // must check for stops before entering runInterrupt (instead of + // tail-calling it). + return (*runInterrupt)(nil) + } + + // We're about to switch to the application again. If there's still a + // unhandled SyscallRestartErrno that wasn't translated to an EINTR, + // restart the syscall that was interrupted. If there's a saved signal + // mask, restore it. (Note that restoring the saved signal mask may unblock + // a pending signal, causing another interruption, but that signal should + // not interact with the interrupted syscall.) + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre == ERESTART_RESTARTBLOCK { + t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscallWithRestartBlock() + } else { + t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) + t.Arch().RestartSyscall() + } + } + t.haveSyscallReturn = false + } + if t.haveSavedSignalMask { + t.SetSignalMask(t.savedSignalMask) + t.haveSavedSignalMask = false + if t.interrupted() { + return (*runInterrupt)(nil) + } + } + + // Apply restartable sequences. + if t.rseqPreempted { + t.rseqPreempted = false + if t.rseqCPUAddr != 0 { + cpu := int32(hostcpu.GetCPU()) + if t.rseqCPU != cpu { + t.rseqCPU = cpu + if err := t.rseqCopyOutCPU(); err != nil { + t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err) + t.forceSignal(linux.SIGSEGV, false) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // Re-enter the task run loop for signal delivery. + return (*runApp)(nil) + } + } + } + t.rseqInterrupt() + } + + // Check if we need to enable single-stepping. Tracers expect that the + // kernel preserves the value of the single-step flag set by PTRACE_SETREGS + // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this + // includes our ptrace platform, by the way), so we should only clear the + // single-step flag if we're responsible for setting it. (clearSinglestep + // is therefore analogous to Linux's TIF_FORCED_TF.) + // + // Strictly speaking, we should also not clear the single-step flag if we + // single-step through an instruction that sets the single-step flag + // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their + // own TF. (Famous last words, I know.) + clearSinglestep := false + if t.hasTracer() { + t.tg.pidns.owner.mu.RLock() + if t.ptraceSinglestep { + clearSinglestep = !t.Arch().SingleStep() + t.Arch().SetSingleStep() + } + t.tg.pidns.owner.mu.RUnlock() + } + + t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) + info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) + + if clearSinglestep { + t.Arch().ClearSingleStep() + } + + switch err { + case nil: + // Handle application system call. + return t.doSyscall() + + case platform.ErrContextInterrupt: + // Interrupted by platform.Context.Interrupt(). Re-enter the run + // loop to figure out why. + return (*runApp)(nil) + + case platform.ErrContextSignalCPUID: + // Is this a CPUID instruction? + expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + + // Resume execution. + return (*runApp)(nil) + } + + // The instruction at the given RIP was not a CPUID, and we + // fallthrough to the default signal deliver behavior below. + fallthrough + + case platform.ErrContextSignal: + // Looks like a signal has been delivered to us. If it's a synchronous + // signal (SEGV, SIGBUS, etc.), it should be sent to the application + // thread that received it. + sig := linux.Signal(info.Signo) + + // Was it a fault that we should handle internally? If so, this wasn't + // an application-generated signal and we should continue execution + // normally. + if at.Any() { + addr := usermem.Addr(info.Addr()) + err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack())) + if err == nil { + // The fault was handled appropriately. + // We can resume running the application. + return (*runApp)(nil) + } + + // Is this a vsyscall that we need emulate? + if at.Execute { + if sysno, ok := t.tc.st.LookupEmulate(addr); ok { + return t.doVsyscall(addr, sysno) + } + } + + // Faults are common, log only at debug level. + t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) + t.DebugDumpState() + + // Continue to signal handling. + // + // Convert a BusError error to a SIGBUS from a SIGSEGV. All + // other info bits stay the same (address, etc.). + if _, ok := err.(*memmap.BusError); ok { + sig = linux.SIGBUS + info.Signo = int32(linux.SIGBUS) + } + } + + switch sig { + case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: + // Synchronous signal. Send it to ourselves. Assume the signal is + // legitimate and force it (work around the signal being ignored or + // blocked) like Linux does. Conveniently, this is even the correct + // behavior for SIGTRAP from single-stepping. + t.forceSignal(linux.Signal(sig), false /* unconditional */) + t.SendSignal(info) + + case platform.SignalInterrupt: + // Assume that a call to platform.Context.Interrupt() misfired. + + case linux.SIGPROF: + // It's a profiling interrupt: there's not much + // we can do. We've already paid a decent cost + // by intercepting the signal, at this point we + // simply ignore it. + + default: + // Asynchronous signal. Let the system deal with it. + t.k.sendExternalSignal(info, "application") + } + + return (*runApp)(nil) + + case platform.ErrContextCPUPreempted: + // Ensure that RSEQ critical sections are interrupted and per-thread + // CPU values are updated before the next platform.Context.Switch(). + t.rseqPreempted = true + return (*runApp)(nil) + + default: + // What happened? Can't continue. + t.Warningf("Unexpected SwitchToApp error: %v", err) + t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)}) + return (*runExit)(nil) + } +} + +// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. +func (t *Task) waitGoroutineStoppedOrExited() { + t.goroutineStopped.Wait() +} + +// WaitExited blocks until all task goroutines in tg have exited. +// +// WaitExited does not correspond to anything in Linux; it's provided so that +// external callers of Kernel.CreateProcess can wait for the created thread +// group to terminate. +func (tg *ThreadGroup) WaitExited() { + tg.liveGoroutines.Wait() +} + +// Yield yields the processor for the calling task. +func (t *Task) Yield() { + atomic.AddUint64(&t.yieldCount, 1) + runtime.Gosched() +} diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go new file mode 100644 index 000000000..5455f6ea9 --- /dev/null +++ b/pkg/sentry/kernel/task_sched.go @@ -0,0 +1,637 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// CPU scheduling, real and fake. + +import ( + "fmt" + "math/rand" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskGoroutineState is a coarse representation of the current execution +// status of a kernel.Task goroutine. +type TaskGoroutineState int + +const ( + // TaskGoroutineNonexistent indicates that the task goroutine has either + // not yet been created by Task.Start() or has returned from Task.run(). + // This must be the zero value for TaskGoroutineState. + TaskGoroutineNonexistent TaskGoroutineState = iota + + // TaskGoroutineRunningSys indicates that the task goroutine is executing + // sentry code. + TaskGoroutineRunningSys + + // TaskGoroutineRunningApp indicates that the task goroutine is executing + // application code. + TaskGoroutineRunningApp + + // TaskGoroutineBlockedInterruptible indicates that the task goroutine is + // blocked in Task.block(), and hence may be woken by Task.interrupt() + // (e.g. due to signal delivery). + TaskGoroutineBlockedInterruptible + + // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is + // stopped outside of Task.block() and Task.doStop(), and hence cannot be + // woken by Task.interrupt(). + TaskGoroutineBlockedUninterruptible + + // TaskGoroutineStopped indicates that the task goroutine is blocked in + // Task.doStop(). TaskGoroutineStopped is similar to + // TaskGoroutineBlockedUninterruptible, but is a separate state to make it + // possible to determine when Task.stop is meaningful. + TaskGoroutineStopped +) + +// TaskGoroutineSchedInfo contains task goroutine scheduling state which must +// be read and updated atomically. +// +// +stateify savable +type TaskGoroutineSchedInfo struct { + // Timestamp was the value of Kernel.cpuClock when this + // TaskGoroutineSchedInfo was last updated. + Timestamp uint64 + + // State is the current state of the task goroutine. + State TaskGoroutineState + + // UserTicks is the amount of time the task goroutine has spent executing + // its associated Task's application code, in units of linux.ClockTick. + UserTicks uint64 + + // SysTicks is the amount of time the task goroutine has spent executing in + // the sentry, in units of linux.ClockTick. + SysTicks uint64 +} + +// userTicksAt returns the extrapolated value of ts.UserTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause userTicksAt to adjust stats by too much, +// making the observed stats non-monotonic. +func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp { + // Update stats to reflect execution since the last update. + return ts.UserTicks + (now - ts.Timestamp) + } + return ts.UserTicks +} + +// sysTicksAt returns the extrapolated value of ts.SysTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: As for userTicksAt. +func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys { + return ts.SysTicks + (now - ts.Timestamp) + } + return ts.SysTicks +} + +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != TaskGoroutineRunningSys { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + t.gosched.SysTicks += now - t.gosched.Timestamp + t.gosched.Timestamp = now + t.gosched.State = state + t.goschedSeq.EndWrite() +} + +// Preconditions: The caller must be running on the task goroutine, and leaving +// a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). +func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { + now := t.k.CPUClockNow() + if t.gosched.State != state { + panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) + } + t.goschedSeq.BeginWrite() + // This function is very hot; avoid defer. + if state == TaskGoroutineRunningApp { + t.gosched.UserTicks += now - t.gosched.Timestamp + } + t.gosched.Timestamp = now + t.gosched.State = TaskGoroutineRunningSys + t.goschedSeq.EndWrite() +} + +// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. +// Most clients should use t.CPUStats() instead. +func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { + return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) +} + +// CPUStats returns the CPU usage statistics of t. +func (t *Task) CPUStats() usage.CPUStats { + return t.cpuStatsAt(t.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. +func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { + tsched := t.TaskGoroutineSchedInfo() + return usage.CPUStats{ + UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)), + VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), + } +} + +// CPUStats returns the combined CPU usage statistics of all past and present +// threads in tg. +func (tg *ThreadGroup) CPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + // Hack to get a pointer to the Kernel. + if tg.leader == nil { + // Per comment on tg.leader, this is only possible if nothing in the + // ThreadGroup has ever executed anyway. + return usage.CPUStats{} + } + return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex +// must be locked. +func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { + stats := tg.exitedCPUStats + // Account for live tasks. + for t := tg.tasks.Front(); t != nil; t = t.Next() { + stats.Accumulate(t.cpuStatsAt(now)) + } + return stats +} + +// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return +// resource usage statistics for all children of [tg] that have terminated and +// been waited for. These statistics will include the resources used by +// grandchildren, and further removed descendants, if all of the intervening +// descendants waited on their terminated children." +func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.childCPUStats +} + +// taskClock is a ktime.Clock that measures the time that a task has spent +// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID. +// +// +stateify savable +type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. +func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID. +// +// +stateify savable +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tgc *tgClock) Now() ktime.Time { + stats := tgc.tg.CPUStats() + if tgc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// WallTimeUntil implements ktime.Clock.WallTimeUntil. +func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { + // Thread group CPU time should not exceed wall time * live tasks, since + // task goroutines exit after the transition to TaskExitZombie in + // runExitNotify. + tgc.tg.pidns.owner.mu.RLock() + n := tgc.tg.liveTasks + tgc.tg.pidns.owner.mu.RUnlock() + if n == 0 { + if t.Before(now) { + return 0 + } + // The timer tick raced with thread group exit, after which no more + // tasks can enter the thread group. So tgc.Now() will never advance + // again. Return a large delay; the timer should be stopped long before + // it comes again anyway. + return time.Hour + } + // This is a lower bound on the amount of time that can elapse before an + // associated timer expires, so returning this value tends to result in a + // sequence of closely-spaced ticks just before timer expiry. To avoid + // this, round up to the nearest ClockTick; CPU usage measurements are + // limited to this resolution anyway. + remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond + return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. +func (tg *ThreadGroup) UserCPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: false} +} + +// CPUClock returns a ktime.Clock that measures the time that a thread group +// has spent executing, including sentry time. +func (tg *ThreadGroup) CPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: true} +} + +type kernelCPUClockTicker struct { + k *Kernel + + // These are essentially kernelCPUClockTicker.Notify local variables that + // are cached between calls to reduce allocations. + rng *rand.Rand + tgs []*ThreadGroup +} + +func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { + return &kernelCPUClockTicker{ + k: k, + rng: rand.New(rand.NewSource(rand.Int63())), + } +} + +// Notify implements ktime.TimerListener.Notify. +func (ticker *kernelCPUClockTicker) Notify(exp uint64) { + // Only increment cpuClock by 1 regardless of the number of expirations. + // This approximately compensates for cases where thread throttling or bad + // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and + // presumably task goroutines as well, from executing for a long period of + // time. It's also necessary to prevent CPU clocks from seeing large + // discontinuous jumps. + now := atomic.AddUint64(&ticker.k.cpuClock, 1) + + // Check thread group CPU timers. + tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs) + for _, tg := range tgs { + if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 { + continue + } + + ticker.k.tasks.mu.RLock() + if tg.leader == nil { + // No tasks have ever run in this thread group. + ticker.k.tasks.mu.RUnlock() + continue + } + // Accumulate thread group CPU stats, and randomly select running tasks + // using reservoir sampling to receive CPU timer signals. + var virtReceiver *Task + nrVirtCandidates := 0 + var profReceiver *Task + nrProfCandidates := 0 + tgUserTime := tg.exitedCPUStats.UserTime + tgSysTime := tg.exitedCPUStats.SysTime + for t := tg.tasks.Front(); t != nil; t = t.Next() { + tsched := t.TaskGoroutineSchedInfo() + tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)) + tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)) + switch tsched.State { + case TaskGoroutineRunningApp: + // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU + // timers. + nrVirtCandidates++ + if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 { + virtReceiver = t + } + fallthrough + case TaskGoroutineRunningSys: + // Considered by ITIMER_PROF and RLIMIT_CPU timers. + nrProfCandidates++ + if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 { + profReceiver = t + } + } + } + tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds()) + tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds()) + + // All of the following are standard (not real-time) signals, which are + // automatically deduplicated, so we ignore the number of expirations. + tg.signalHandlers.mu.Lock() + // It should only be possible for these timers to advance if we found + // at least one running task. + if virtReceiver != nil { + // ITIMER_VIRTUAL + newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) + tg.itimerVirtSetting = newItimerVirtSetting + if exp != 0 { + virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true) + } + } + if profReceiver != nil { + // ITIMER_PROF + newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) + tg.itimerProfSetting = newItimerProfSetting + if exp != 0 { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true) + } + // RLIMIT_CPU soft limit + newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) + tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting + if exp != 0 { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true) + } + // RLIMIT_CPU hard limit + rlimitCPUMax := tg.limits.Get(limits.CPU).Max + if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { + profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) + } + } + tg.signalHandlers.mu.Unlock() + + ticker.k.tasks.mu.RUnlock() + } + + // Retain tgs between calls to Notify to reduce allocations. + for i := range tgs { + tgs[i] = nil + } + ticker.tgs = tgs[:0] +} + +// Destroy implements ktime.TimerListener.Destroy. +func (ticker *kernelCPUClockTicker) Destroy() { +} + +// randInt31n returns a random integer in [0, n). +// +// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported. +// See that function for details. +func randInt31n(rng *rand.Rand, n int32) int32 { + v := rng.Uint32() + prod := uint64(v) * uint64(n) + low := uint32(prod) + if low < uint32(n) { + thresh := uint32(-n) % uint32(n) + for low < thresh { + v = rng.Uint32() + prod = uint64(v) * uint64(n) + low = uint32(prod) + } + } + return int32(prod >> 32) +} + +// NotifyRlimitCPUUpdated is called by setrlimit. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) NotifyRlimitCPUUpdated() { + t.k.cpuClockTicker.Atomically(func() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + rlimitCPU := t.tg.limits.Get(limits.CPU) + t.tg.rlimitCPUSoftSetting = ktime.Setting{ + Enabled: rlimitCPU.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + } + if rlimitCPU.Max != limits.Infinity { + // Check if tg is already over the hard limit. + tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) + tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) + if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { + t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) + } + } + t.tg.updateCPUTimersEnabledLocked() + }) +} + +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { + rlimitCPU := tg.limits.Get(limits.CPU) + if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity { + atomic.StoreUint32(&tg.cpuTimersEnabled, 1) + } else { + atomic.StoreUint32(&tg.cpuTimersEnabled, 0) + } +} + +// StateStatus returns a string representation of the task's current state, +// appropriate for /proc/[pid]/status. +func (t *Task) StateStatus() string { + switch s := t.TaskGoroutineSchedInfo().State; s { + case TaskGoroutineNonexistent: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.exitState { + case TaskExitZombie: + return "Z (zombie)" + case TaskExitDead: + return "X (dead)" + default: + // The task goroutine can't exit before passing through + // runExitNotify, so this indicates that the task has been created, + // but the task goroutine hasn't yet started. The Linux equivalent + // is struct task_struct::state == TASK_NEW + // (kernel/fork.c:copy_process() => + // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is + // masked out by TASK_REPORT for /proc/[pid]/status, leaving only + // TASK_RUNNING. + return "R (running)" + } + case TaskGoroutineRunningSys, TaskGoroutineRunningApp: + return "R (running)" + case TaskGoroutineBlockedInterruptible: + return "S (sleeping)" + case TaskGoroutineStopped: + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + switch t.stop.(type) { + case *groupStop: + return "T (stopped)" + case *ptraceStop: + return "t (tracing stop)" + } + fallthrough + case TaskGoroutineBlockedUninterruptible: + // This is the name Linux uses for TASK_UNINTERRUPTIBLE and + // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): + // fs/proc/array.c:task_state_array. + return "D (disk sleep)" + default: + panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) + } +} + +// CPUMask returns a copy of t's allowed CPU mask. +func (t *Task) CPUMask() sched.CPUSet { + t.mu.Lock() + defer t.mu.Unlock() + return t.allowedCPUMask.Copy() +} + +// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of +// mask. +// +// Preconditions: mask.Size() == +// sched.CPUSetSize(t.Kernel().ApplicationCores()). +func (t *Task) SetCPUMask(mask sched.CPUSet) error { + if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { + panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) + } + + // Remove CPUs in mask above Kernel.applicationCores. + mask.ClearAbove(t.k.applicationCores) + + // Ensure that at least 1 CPU is still allowed. + if mask.NumCPUs() == 0 { + return syserror.EINVAL + } + + if t.k.useHostCores { + // No-op; pretend the mask was immediately changed back. + return nil + } + + t.tg.pidns.owner.mu.RLock() + rootTID := t.tg.pidns.owner.Root.tids[t] + t.tg.pidns.owner.mu.RUnlock() + + t.mu.Lock() + defer t.mu.Unlock() + t.allowedCPUMask = mask + atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID)) + return nil +} + +// CPU returns the cpu id for a given task. +func (t *Task) CPU() int32 { + if t.k.useHostCores { + return int32(hostcpu.GetCPU()) + } + + return atomic.LoadInt32(&t.cpu) +} + +// assignCPU returns the virtualized CPU number for the task with global TID +// tid and allowedCPUMask allowed. +func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { + // To pretend that threads are evenly distributed to allowed CPUs, choose n + // to be less than the number of CPUs in allowed ... + n := int(tid) % int(allowed.NumCPUs()) + // ... then pick the nth CPU in allowed. + allowed.ForEachCPU(func(c uint) { + if n--; n == 0 { + cpu = int32(c) + } + }) + return cpu +} + +// Niceness returns t's niceness. +func (t *Task) Niceness() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness +} + +// Priority returns t's priority. +func (t *Task) Priority() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.niceness + 20 +} + +// SetNiceness sets t's niceness to n. +func (t *Task) SetNiceness(n int) { + t.mu.Lock() + defer t.mu.Unlock() + t.niceness = n +} + +// NumaPolicy returns t's current numa policy. +func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + return t.numaPolicy, t.numaNodeMask +} + +// SetNumaPolicy sets t's numa policy. +func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) { + t.mu.Lock() + defer t.mu.Unlock() + t.numaPolicy = policy + t.numaNodeMask = nodeMask +} diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go new file mode 100644 index 000000000..654cf7525 --- /dev/null +++ b/pkg/sentry/kernel/task_signals.go @@ -0,0 +1,1110 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file defines the behavior of task signal handling. + +import ( + "fmt" + "sync/atomic" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/eventchannel" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SignalAction is an internal signal action. +type SignalAction int + +// Available signal actions. +// Note that although we refer the complete set internally, +// the application is only capable of using the Default and +// Ignore actions from the system call interface. +const ( + SignalActionTerm SignalAction = iota + SignalActionCore + SignalActionStop + SignalActionIgnore + SignalActionHandler +) + +// Default signal handler actions. Note that for most signals, +// (except SIGKILL and SIGSTOP) these can be overridden by the app. +var defaultActions = map[linux.Signal]SignalAction{ + // POSIX.1-1990 standard. + linux.SIGHUP: SignalActionTerm, + linux.SIGINT: SignalActionTerm, + linux.SIGQUIT: SignalActionCore, + linux.SIGILL: SignalActionCore, + linux.SIGABRT: SignalActionCore, + linux.SIGFPE: SignalActionCore, + linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects + linux.SIGSEGV: SignalActionCore, + linux.SIGPIPE: SignalActionTerm, + linux.SIGALRM: SignalActionTerm, + linux.SIGTERM: SignalActionTerm, + linux.SIGUSR1: SignalActionTerm, + linux.SIGUSR2: SignalActionTerm, + linux.SIGCHLD: SignalActionIgnore, + linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects + linux.SIGSTOP: SignalActionStop, + linux.SIGTSTP: SignalActionStop, + linux.SIGTTIN: SignalActionStop, + linux.SIGTTOU: SignalActionStop, + // POSIX.1-2001 standard. + linux.SIGBUS: SignalActionCore, + linux.SIGPROF: SignalActionTerm, + linux.SIGSYS: SignalActionCore, + linux.SIGTRAP: SignalActionCore, + linux.SIGURG: SignalActionIgnore, + linux.SIGVTALRM: SignalActionTerm, + linux.SIGXCPU: SignalActionCore, + linux.SIGXFSZ: SignalActionCore, + // The rest on linux. + linux.SIGSTKFLT: SignalActionTerm, + linux.SIGIO: SignalActionTerm, + linux.SIGPWR: SignalActionTerm, + linux.SIGWINCH: SignalActionIgnore, +} + +// computeAction figures out what to do given a signal number +// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop, +// and SIGKILL always results in a SignalActionTerm. +// Signal 0 is always ignored as many programs use it for various internal functions +// and don't expect it to do anything. +// +// In the event the signal is not one of these, act.Handler determines what +// happens next. +// If act.Handler is: +// 0, the default action is taken; +// 1, the signal is ignored; +// anything else, the function returns SignalActionHandler. +func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { + switch sig { + case linux.SIGSTOP: + return SignalActionStop + case linux.SIGKILL: + return SignalActionTerm + case linux.Signal(0): + return SignalActionIgnore + } + + switch act.Handler { + case arch.SignalActDefault: + return defaultActions[sig] + case arch.SignalActIgnore: + return SignalActionIgnore + default: + return SignalActionHandler + } +} + +// UnblockableSignals contains the set of signals which cannot be blocked. +var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) + +// StopSignals is the set of signals whose default action is SignalActionStop. +var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) + +// dequeueSignalLocked returns a pending signal that is *not* included in mask. +// If there are no pending unmasked signals, dequeueSignalLocked returns nil. +// +// Preconditions: t.tg.signalHandlers.mu must be locked. +func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo { + if info := t.pendingSignals.dequeue(mask); info != nil { + return info + } + return t.tg.pendingSignals.dequeue(mask) +} + +// discardSpecificLocked removes all instances of the given signal from all +// signal queues in tg. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) { + tg.pendingSignals.discardSpecific(sig) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.pendingSignals.discardSpecific(sig) + } +} + +// PendingSignals returns the set of pending signals. +func (t *Task) PendingSignals() linux.SignalSet { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet +} + +// deliverSignal delivers the given signal and returns the following run state. +func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState { + sigact := computeAction(linux.Signal(info.Signo), act) + + if t.haveSyscallReturn { + if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + // Signals that are ignored, cause a thread group stop, or + // terminate the thread group do not interact with interrupted + // syscalls; in Linux terms, they are never returned to the signal + // handling path from get_signal => get_signal_to_deliver. The + // behavior of an interrupted syscall is determined by the first + // signal that is actually handled (by userspace). + if sigact == SignalActionHandler { + switch { + case sre == ERESTARTNOHAND: + fallthrough + case sre == ERESTART_RESTARTBLOCK: + fallthrough + case (sre == ERESTARTSYS && !act.IsRestart()): + t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1))) + default: + t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) + t.Arch().RestartSyscall() + } + } + } + } + + switch sigact { + case SignalActionTerm, SignalActionCore: + // "Default action is to terminate the process." - signal(7) + t.Debugf("Signal %d: terminating thread group", info.Signo) + + // Emit an event channel messages related to this uncaught signal. + ucs := &ucspb.UncaughtSignal{ + Tid: int32(t.Kernel().TaskSet().Root.IDOfTask(t)), + Pid: int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())), + Registers: t.Arch().StateData().Proto(), + SignalNumber: info.Signo, + } + + // Attach an fault address if appropriate. + switch linux.Signal(info.Signo) { + case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS: + ucs.FaultAddr = info.Addr() + } + + eventchannel.Emit(ucs) + + t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)}) + return (*runExit)(nil) + + case SignalActionStop: + // "Default action is to stop the process." + t.initiateGroupStop(info) + + case SignalActionIgnore: + // "Default action is to ignore the signal." + t.Debugf("Signal %d: ignored", info.Signo) + + case SignalActionHandler: + // Try to deliver the signal to the user-configured handler. + t.Debugf("Signal %d: delivering to handler", info.Signo) + if err := t.deliverSignalToHandler(info, act); err != nil { + // This is not a warning, it can occur during normal operation. + t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err) + + // Send a forced SIGSEGV. If the signal that couldn't be delivered + // was a SIGSEGV, force the handler to SIG_DFL. + t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + } + + default: + panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act))) + } + return (*runInterrupt)(nil) +} + +// deliverSignalToHandler changes the task's userspace state to enter the given +// user-configured handler for the given signal. +func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error { + // Signal delivery to an application handler interrupts restartable + // sequences. + t.rseqInterrupt() + + // Are executing on the main stack, + // or the provided alternate stack? + sp := usermem.Addr(t.Arch().Stack()) + + // N.B. This is a *copy* of the alternate stack that the user's signal + // handler expects to see in its ucontext (even if it's not in use). + alt := t.signalStack + if act.IsOnStack() && alt.IsEnabled() { + alt.SetOnStack() + if !alt.Contains(sp) { + sp = usermem.Addr(alt.Top()) + } + } + + // Set up the signal handler. If we have a saved signal mask, the signal + // handler should run with the current mask, but sigreturn should restore + // the saved one. + st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + mask := t.signalMask + if t.haveSavedSignalMask { + mask = t.savedSignalMask + } + if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { + return err + } + t.haveSavedSignalMask = false + + // Add our signal mask. + newMask := t.signalMask | act.Mask + if !act.IsNoDefer() { + newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) + } + t.SetSignalMask(newMask) + + return nil +} + +var ctrlResume = &SyscallControl{ignoreReturn: true} + +// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if +// rt is true). +func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { + st := t.Stack() + sigset, alt, err := t.Arch().SignalRestore(st, rt) + if err != nil { + return nil, err + } + + // Attempt to record the given signal stack. Note that we silently + // ignore failures here, as does Linux. Only an EFAULT may be + // generated, but SignalRestore has already deserialized the entire + // frame successfully. + t.SetSignalStack(alt) + + // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. + t.SetSignalMask(sigset &^ UnblockableSignals) + + return ctrlResume, nil +} + +// Sigtimedwait implements the semantics of sigtimedwait(2). +// +// Preconditions: The caller must be running on the task goroutine. t.exitState +// < TaskExitZombie. +func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { + // set is the set of signals we're interested in; invert it to get the set + // of signals to block. + mask := ^(set &^ UnblockableSignals) + + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + + if timeout == 0 { + return nil, syserror.EAGAIN + } + + // Unblock signals we're waiting for. Remember the original signal mask so + // that Task.sendSignalTimerLocked doesn't discard ignored signals that + // we're temporarily unblocking. + t.realSignalMask = t.signalMask + t.setSignalMaskLocked(t.signalMask & mask) + + // Wait for a timeout or new signal. + t.tg.signalHandlers.mu.Unlock() + _, err := t.BlockWithTimeout(nil, true, timeout) + t.tg.signalHandlers.mu.Lock() + + // Restore the original signal mask. + t.setSignalMaskLocked(t.realSignalMask) + t.realSignalMask = 0 + + if info := t.dequeueSignalLocked(mask); info != nil { + return info, nil + } + if err == syserror.ETIMEDOUT { + return nil, syserror.EAGAIN + } + return nil, err +} + +// SendSignal sends the given signal to t. +// +// The following errors may be returned: +// +// syserror.ESRCH - The task has exited. +// syserror.EINVAL - The signal is not valid. +// syserror.EAGAIN - THe signal is realtime, and cannot be queued. +// +func (t *Task) SendSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, false /* group */) +} + +// SendGroupSignal sends the given signal to t's thread group. +func (t *Task) SendGroupSignal(info *arch.SignalInfo) error { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + return t.sendSignalLocked(info, true /* group */) +} + +// SendSignal sends the given signal to tg, using tg's leader to determine if +// the signal is blocked. +func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + return tg.leader.sendSignalLocked(info, true /* group */) +} + +func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { + return t.sendSignalTimerLocked(info, group, nil) +} + +func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error { + if t.exitState == TaskExitDead { + return syserror.ESRCH + } + sig := linux.Signal(info.Signo) + if sig == 0 { + return nil + } + if !sig.IsValid() { + return syserror.EINVAL + } + + // Signal side effects apply even if the signal is ultimately discarded. + t.tg.applySignalSideEffectsLocked(sig) + + // TODO: "Only signals for which the "init" process has established a + // signal handler can be sent to the "init" process by other members of the + // PID namespace. This restriction applies even to privileged processes, + // and prevents other members of the PID namespace from accidentally + // killing the "init" process." - pid_namespaces(7). We don't currently do + // this for child namespaces, though we should; we also don't do this for + // the root namespace (the same restriction applies to global init on + // Linux), where whether or not we should is much murkier. In practice, + // most sandboxed applications are not prepared to function as an init + // process. + + // Unmasked, ignored signals are discarded without being queued, unless + // they will be visible to a tracer. Even for group signals, it's the + // originally-targeted task's signal mask and tracer that matter; compare + // Linux's kernel/signal.c:__send_signal() => prepare_signal() => + // sig_ignored(). + ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore + if sigset := linux.SignalSetOf(sig); sigset&t.signalMask == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() { + t.Debugf("Discarding ignored signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } + return nil + } + + q := &t.pendingSignals + if group { + q = &t.tg.pendingSignals + } + if !q.enqueue(info, timer) { + if sig.IsRealtime() { + return syserror.EAGAIN + } + t.Debugf("Discarding duplicate signal %d", sig) + if timer != nil { + timer.signalRejectedLocked() + } + return nil + } + + // Find a receiver to notify. Note that the task we choose to notify, if + // any, may not be the task that actually dequeues and handles the signal; + // e.g. a racing signal mask change may cause the notified task to become + // ineligible, or a racing sibling task may dequeue the signal first. + if t.canReceiveSignalLocked(sig) { + t.Debugf("Notified of signal %d", sig) + t.interrupt() + return nil + } + if group { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.Debugf("Notified of group signal %d", sig) + nt.interrupt() + return nil + } + } + t.Debugf("No task notified of signal %d", sig) + return nil +} + +func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { + switch { + case linux.SignalSetOf(sig)&StopSignals != 0: + // Stop signals cause all prior SIGCONT to be discarded. (This is + // despite the fact this has little effect since SIGCONT's most + // important effect is applied when the signal is sent in the branch + // below, not when the signal is delivered.) + tg.discardSpecificLocked(linux.SIGCONT) + case sig == linux.SIGCONT: + // "The SIGCONT signal has a side effect of waking up (all threads of) + // a group-stopped process. This side effect happens before + // signal-delivery-stop. The tracer can't suppress this side effect (it + // can only suppress signal injection, which only causes the SIGCONT + // handler to not be executed in the tracee, if such a handler is + // installed." - ptrace(2) + tg.endGroupStopLocked(true) + case sig == linux.SIGKILL: + // "SIGKILL does not generate signal-delivery-stop and therefore the + // tracer can't suppress it. SIGKILL kills even within system calls + // (syscall-exit-stop is not generated prior to death by SIGKILL)." - + // ptrace(2) + // + // Note that this differs from ThreadGroup.requestExit in that it + // ignores tg.execing. + if !tg.exiting { + tg.exiting = true + tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)} + } + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.killLocked() + } + } +} + +// canReceiveSignalLocked returns true if t should be interrupted to receive +// the given signal. canReceiveSignalLocked is analogous to Linux's +// kernel/signal.c:wants_signal(), but see below for divergences. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { + // - Do not choose tasks that are blocking the signal. + if linux.SignalSetOf(sig)&t.signalMask != 0 { + return false + } + // - No need to check Task.exitState, as the exit path sets every bit in the + // signal mask when it transitions from TaskExitNone to TaskExitInitiated. + // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the + // task group via applySignalSideEffects => killLocked. + // - Do not choose stopped tasks, which cannot handle signals. + if t.stop != nil { + return false + } + // - TODO(b/38173783): No special case for when t is also the sending task, + // because the identity of the sender is unknown. + // - Do not choose tasks that have already been interrupted, as they may be + // busy handling another signal. + if len(t.interruptChan) != 0 { + return false + } + return true +} + +// findSignalReceiverLocked returns a task in tg that should be interrupted to +// receive the given signal. If no such task exists, findSignalReceiverLocked +// returns nil. +// +// Linux actually records curr_target to balance the group signal targets. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if t.canReceiveSignalLocked(sig) { + return t + } + } + return nil +} + +// forceSignal ensures that the task is not ignoring or blocking the given +// signal. If unconditional is true, forceSignal takes action even if the +// signal isn't being ignored or blocked. +func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.forceSignalLocked(sig, unconditional) +} + +func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { + blocked := linux.SignalSetOf(sig)&t.signalMask != 0 + act := t.tg.signalHandlers.actions[sig] + ignored := act.Handler == arch.SignalActIgnore + if blocked || ignored || unconditional { + act.Handler = arch.SignalActDefault + t.tg.signalHandlers.actions[sig] = act + if blocked { + t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig)) + } + } +} + +// SignalMask returns a copy of t's signal mask. +func (t *Task) SignalMask() linux.SignalSet { + return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.signalMask))) +} + +// SetSignalMask sets t's signal mask. +// +// Preconditions: SetSignalMask can only be called by the task goroutine. +// t.exitState < TaskExitZombie. +func (t *Task) SetSignalMask(mask linux.SignalSet) { + // By precondition, t prevents t.tg from completing an execve and mutating + // t.tg.signalHandlers, so we can skip the TaskSet mutex. + t.tg.signalHandlers.mu.Lock() + t.setSignalMaskLocked(mask) + t.tg.signalHandlers.mu.Unlock() +} + +// Preconditions: The signal mutex must be locked. +func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { + oldMask := t.signalMask + atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask)) + + // If the new mask blocks any signals that were not blocked by the old + // mask, and at least one such signal is pending in tg.pendingSignals, and + // t has been woken, it could be the case that t was woken to handle that + // signal, but will no longer do so as a result of its new signal mask, so + // we have to pick a replacement. + blocked := mask &^ oldMask + blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet + if blockedGroupPending != 0 && t.interrupted() { + linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { + if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { + nt.interrupt() + return + } + }) + // We have to re-issue the interrupt consumed by t.interrupted() since + // it might have been for a different reason. + t.interruptSelf() + } + + // Conversely, if the new mask unblocks any signals that were blocked by + // the old mask, and at least one such signal is pending, we may now need + // to handle that signal. + unblocked := oldMask &^ mask + unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) + if unblockedPending != 0 { + t.interruptSelf() + } +} + +// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's +// comment). +// +// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { + t.savedSignalMask = mask + t.haveSavedSignalMask = true +} + +// SignalStack returns the task-private signal stack. +func (t *Task) SignalStack() arch.SignalStack { + alt := t.signalStack + if t.onSignalStack(alt) { + alt.Flags |= arch.SignalStackFlagOnStack + } + return alt +} + +// onSignalStack returns true if the task is executing on the given signal stack. +func (t *Task) onSignalStack(alt arch.SignalStack) bool { + sp := usermem.Addr(t.Arch().Stack()) + return alt.Contains(sp) +} + +// SetSignalStack sets the task-private signal stack. +// +// This value may not be changed if the task is currently executing on the +// signal stack, i.e. if t.onSignalStack returns true. In this case, this +// function will return false. Otherwise, true is returned. +func (t *Task) SetSignalStack(alt arch.SignalStack) bool { + // Check that we're not executing on the stack. + if t.onSignalStack(t.signalStack) { + return false + } + + if alt.Flags&arch.SignalStackFlagDisable != 0 { + // Don't record anything beyond the flags. + t.signalStack = arch.SignalStack{ + Flags: arch.SignalStackFlagDisable, + } + } else { + // Mask out irrelevant parts: only disable matters. + alt.Flags &= arch.SignalStackFlagDisable + t.signalStack = alt + } + return true +} + +// SetSignalAct atomically sets the thread group's signal action for signal sig +// to *actptr (if actptr is not nil) and returns the old signal action. +func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) { + if !sig.IsValid() { + return arch.SignalAct{}, syserror.EINVAL + } + + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + sh := tg.signalHandlers + sh.mu.Lock() + defer sh.mu.Unlock() + oldact := sh.actions[sig] + if actptr != nil { + if sig == linux.SIGKILL || sig == linux.SIGSTOP { + return oldact, syserror.EINVAL + } + + act := *actptr + act.Mask &^= UnblockableSignals + sh.actions[sig] = act + // From POSIX, by way of Linux: + // + // "Setting a signal action to SIG_IGN for a signal that is pending + // shall cause the pending signal to be discarded, whether or not it is + // blocked." + // + // "Setting a signal action to SIG_DFL for a signal that is pending and + // whose default action is to ignore the signal (for example, SIGCHLD), + // shall cause the pending signal to be discarded, whether or not it is + // blocked." + if computeAction(sig, act) == SignalActionIgnore { + tg.discardSpecificLocked(sig) + } + } + return oldact, nil +} + +// CopyOutSignalAct converts the given SignalAct into an architecture-specific +// type and then copies it out to task memory. +func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error { + n := t.Arch().NewSignalAct() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalAct copies an architecture-specific sigaction type from task +// memory and then converts it into a SignalAct. +func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) { + n := t.Arch().NewSignalAct() + var s arch.SignalAct + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// CopyOutSignalStack converts the given SignalStack into an +// architecture-specific type and then copies it out to task memory. +func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error { + n := t.Arch().NewSignalStack() + n.SerializeFrom(s) + _, err := t.CopyOut(addr, n) + return err +} + +// CopyInSignalStack copies an architecture-specific stack_t from task memory +// and then converts it into a SignalStack. +func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) { + n := t.Arch().NewSignalStack() + var s arch.SignalStack + if _, err := t.CopyIn(addr, n); err != nil { + return s, err + } + n.DeserializeTo(&s) + return s, nil +} + +// groupStop is a TaskStop placed on tasks that have received a stop signal +// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from +// the ptrace man page.) +// +// +stateify savable +type groupStop struct{} + +// Killable implements TaskStop.Killable. +func (*groupStop) Killable() bool { return true } + +// initiateGroupStop attempts to initiate a group stop based on a +// previously-dequeued stop signal. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.groupStopPending { + t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo) + return + } + if !t.tg.groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo) + return + } + if t.tg.exiting { + t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) + return + } + if t.tg.execing != nil { + t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) + return + } + if !t.tg.groupStopComplete { + t.tg.groupStopSignal = linux.Signal(info.Signo) + } + t.tg.groupStopPendingCount = 0 + for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { + if t2.killedLocked() || t2.exitState >= TaskExitInitiated { + t2.groupStopPending = false + continue + } + t2.groupStopPending = true + t2.groupStopAcknowledged = false + if t2.ptraceSeized { + t2.trapNotifyPending = true + if s, ok := t2.stop.(*ptraceStop); ok && s.listen { + t2.endInternalStopLocked() + } + } + t2.interrupt() + t.tg.groupStopPendingCount++ + } + t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount) +} + +// endGroupStopLocked ensures that all prior stop signals received by tg are +// not stopping tg and will not stop tg in the future. If broadcast is true, +// parent and tracer notification will be scheduled if appropriate. +// +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { + // Discard all previously-queued stop signals. + linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) + + if tg.groupStopPendingCount == 0 && !tg.groupStopComplete { + return + } + + completeStr := "incomplete" + if tg.groupStopComplete { + completeStr = "complete" + } + tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.groupStopPending = false + if t.ptraceSeized { + t.trapNotifyPending = true + if s, ok := t.stop.(*ptraceStop); ok && s.listen { + t.endInternalStopLocked() + } + } else { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() + } + } + } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so that + // one of the continuing tasks does so. (Linux does something similar.) + // The reason we do this is to keep locking sane. In order to send a + // signal to the parent, we need to lock its signal mutex, but we're + // already holding tg's signal mutex, and the TaskSet mutex must be + // locked for writing for us to hold two signal mutexes. Since we don't + // want to require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by releasing + // the mutexes we're already holding, just let the continuing thread + // group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = !tg.groupStopComplete + tg.groupContWaitable = true + } + // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop + // to recognize that the group stop has been cancelled. + tg.groupStopDequeued = false + tg.groupStopSignal = 0 + tg.groupStopPendingCount = 0 + tg.groupStopComplete = false + tg.groupStopWaitable = false +} + +// participateGroupStopLocked is called to handle thread group side effects +// after t unsets t.groupStopPending. The caller must handle task side effects +// (e.g. placing the task goroutine into the group stop). It returns true if +// the caller must notify t.tg.leader's parent of a completed group stop (which +// participateGroupStopLocked cannot do due to holding the wrong locks). +// +// Preconditions: The signal mutex must be locked. +func (t *Task) participateGroupStopLocked() bool { + if t.groupStopAcknowledged { + return false + } + t.groupStopAcknowledged = true + t.tg.groupStopPendingCount-- + if t.tg.groupStopPendingCount != 0 { + return false + } + if t.tg.groupStopComplete { + return false + } + t.Debugf("Completing group stop") + t.tg.groupStopComplete = true + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + return true +} + +// signalStop sends a signal to t's thread group of a new group stop, group +// continue, or ptrace stop, if appropriate. code and status are set in the +// signal sent to tg, if any. +// +// Preconditions: The TaskSet mutex must be locked (for reading or writing). +func (t *Task) signalStop(target *Task, code int32, status int32) { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] + if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) { + sigchld := &arch.SignalInfo{ + Signo: int32(linux.SIGCHLD), + Code: code, + } + sigchld.SetPid(int32(t.tg.pidns.tids[target])) + sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + sigchld.SetStatus(status) + // TODO(b/72102453): Set utime, stime. + t.sendSignalLocked(sigchld, true /* group */) + } +} + +// The runInterrupt state handles conditions indicated by interrupts. +// +// +stateify savable +type runInterrupt struct{} + +func (*runInterrupt) execute(t *Task) taskRunState { + // Interrupts are de-duplicated (if t is interrupted twice before + // t.interrupted() is called, t.interrupted() will only return true once), + // so early exits from this function must re-enter the runInterrupt state + // to check for more interrupt-signaled conditions. + + t.tg.signalHandlers.mu.Lock() + + // Did we just leave a group stop? + if t.tg.groupContNotify { + t.tg.groupContNotify = false + sig := t.tg.groupStopSignal + intr := t.tg.groupContInterrupted + t.tg.signalHandlers.mu.Unlock() + t.tg.pidns.owner.mu.RLock() + // For consistency with Linux, if the parent and (thread group + // leader's) tracer are in the same thread group, deduplicate + // notifications. + notifyParent := t.tg.leader.parent != nil + if tracer := t.tg.leader.Tracer(); tracer != nil { + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + } + // Sending CLD_STOPPED to the tracer doesn't really make any sense; + // the thread group leader may have already entered the stop and + // notified its tracer accordingly. But it's consistent with + // Linux... + if intr { + tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + if !notifyParent { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) + } else { + tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) + } + } else { + tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + tracer.tg.eventQueue.Notify(EventGroupContinue) + } + } + if notifyParent { + // If groupContInterrupted, do as Linux does and pretend the group + // stop completed just before it ended. The theoretical behavior in + // this case would be to send a SIGCHLD indicating the completed + // stop, followed by a SIGCHLD indicating the continue. However, + // SIGCHLD is a standard signal, so the latter would always be + // dropped. Hence sending only the former is equivalent. + if intr { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) + } else { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) + } + } + t.tg.pidns.owner.mu.RUnlock() + return (*runInterrupt)(nil) + } + + // Do we need to enter a group stop or related ptrace stop? This path is + // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop() + // (with ptrace enabled) and do_jobctl_trap(). + if t.groupStopPending || t.trapStopPending || t.trapNotifyPending { + sig := t.tg.groupStopSignal + notifyParent := false + if t.groupStopPending { + t.groupStopPending = false + // We care about t.tg.groupStopSignal (for tracer notification) + // even if this doesn't complete a group stop, so keep the + // value of sig we've already read. + notifyParent = t.participateGroupStopLocked() + } + t.trapStopPending = false + t.trapNotifyPending = false + // Drop the signal mutex so we can take the TaskSet mutex. + t.tg.signalHandlers.mu.Unlock() + + t.tg.pidns.owner.mu.RLock() + if t.tg.leader.parent == nil { + notifyParent = false + } + if tracer := t.Tracer(); tracer != nil { + if t.ptraceSeized { + if sig == 0 { + sig = linux.SIGTRAP + } + // "If tracee was attached using PTRACE_SEIZE, group-stop is + // indicated by PTRACE_EVENT_STOP: status>>16 == + // PTRACE_EVENT_STOP. This allows detection of group-stops + // without requiring an extra PTRACE_GETSIGINFO call." - + // "Group-stop", ptrace(2) + t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(sig), + Code: t.ptraceCode, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } else { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + } + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) + // For consistency with Linux, if the parent and tracer are in the + // same thread group, deduplicate notification signals. + if notifyParent && tracer.tg == t.tg.leader.parent.tg { + notifyParent = false + tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) + } else { + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + } + } else { + t.tg.signalHandlers.mu.Lock() + if !t.killedLocked() { + t.beginInternalStopLocked((*groupStop)(nil)) + } + t.tg.signalHandlers.mu.Unlock() + } + if notifyParent { + t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) + } + t.tg.pidns.owner.mu.RUnlock() + + return (*runInterrupt)(nil) + } + + // Are there signals pending? + if info := t.dequeueSignalLocked(t.signalMask); info != nil { + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { + // Indicate that we've dequeued a stop signal before unlocking the + // signal mutex; initiateGroupStop will check for races with + // endGroupStopLocked after relocking it. + t.tg.groupStopDequeued = true + } + if t.ptraceSignalLocked(info) { + // Dequeueing the signal action must wait until after the + // signal-delivery-stop ends since the tracer can change or + // suppress the signal. + t.tg.signalHandlers.mu.Unlock() + return (*runInterruptAfterSignalDeliveryStop)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) + } + + t.tg.signalHandlers.mu.Unlock() + return (*runApp)(nil) +} + +// +stateify savable +type runInterruptAfterSignalDeliveryStop struct{} + +func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { + t.tg.pidns.owner.mu.Lock() + // Can't defer unlock: deliverSignal must be called without holding TaskSet + // mutex. + sig := linux.Signal(t.ptraceCode) + defer func() { + t.ptraceSiginfo = nil + }() + if !sig.IsValid() { + t.tg.pidns.owner.mu.Unlock() + return (*runInterrupt)(nil) + } + info := t.ptraceSiginfo + if sig != linux.Signal(info.Signo) { + info.Signo = int32(sig) + info.Errno = 0 + info.Code = arch.SignalInfoUser + // pid isn't a valid field for all signal numbers, but Linux + // doesn't care (kernel/signal.c:ptrace_signal()). + // + // Linux uses t->parent for the tid and uid here, which is the tracer + // if it hasn't detached or the real parent otherwise. + parent := t.parent + if tracer := t.Tracer(); tracer != nil { + parent = tracer + } + if parent == nil { + // Tracer has detached and t was created by Kernel.CreateProcess(). + // Pretend the parent is in an ancestor PID + user namespace. + info.SetPid(0) + info.SetUid(int32(auth.OverflowUID)) + } else { + info.SetPid(int32(t.tg.pidns.tids[parent])) + info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } + } + t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.Unlock() + // If the signal is masked, re-queue it. + if linux.SignalSetOf(sig)&t.signalMask != 0 { + t.sendSignalLocked(info, false /* group */) + t.tg.signalHandlers.mu.Unlock() + return (*runInterrupt)(nil) + } + act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) + t.tg.signalHandlers.mu.Unlock() + return t.deliverSignal(info, act) +} diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go new file mode 100644 index 000000000..b42531e57 --- /dev/null +++ b/pkg/sentry/kernel/task_start.go @@ -0,0 +1,287 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// TaskConfig defines the configuration of a new Task (see below). +type TaskConfig struct { + // Kernel is the owning Kernel. + Kernel *Kernel + + // Parent is the new task's parent. Parent may be nil. + Parent *Task + + // If InheritParent is not nil, use InheritParent's parent as the new + // task's parent. + InheritParent *Task + + // ThreadGroup is the ThreadGroup the new task belongs to. + ThreadGroup *ThreadGroup + + // SignalMask is the new task's initial signal mask. + SignalMask linux.SignalSet + + // TaskContext is the TaskContext of the new task. Ownership of the + // TaskContext is transferred to TaskSet.NewTask, whether or not it + // succeeds. + TaskContext *TaskContext + + // FSContext is the FSContext of the new task. A reference must be held on + // FSContext, which is transferred to TaskSet.NewTask whether or not it + // succeeds. + FSContext *FSContext + + // FDMap is the FDMap of the new task. A reference must be held on FDMap, + // which is transferred to TaskSet.NewTask whether or not it succeeds. + FDMap *FDMap + + // Credentials is the Credentials of the new task. + Credentials *auth.Credentials + + // Niceness is the niceness of the new task. + Niceness int + + // If NetworkNamespaced is true, the new task should observe a non-root + // network namespace. + NetworkNamespaced bool + + // AllowedCPUMask contains the cpus that this task can run on. + AllowedCPUMask sched.CPUSet + + // UTSNamespace is the UTSNamespace of the new task. + UTSNamespace *UTSNamespace + + // IPCNamespace is the IPCNamespace of the new task. + IPCNamespace *IPCNamespace + + // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. + AbstractSocketNamespace *AbstractSocketNamespace + + // ContainerID is the container the new task belongs to. + ContainerID string +} + +// NewTask creates a new task defined by cfg. +// +// NewTask does not start the returned task; the caller must call Task.Start. +func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { + t, err := ts.newTask(cfg) + if err != nil { + cfg.TaskContext.release() + cfg.FSContext.DecRef() + cfg.FDMap.DecRef() + return nil, err + } + return t, nil +} + +// newTask is a helper for TaskSet.NewTask that only takes ownership of parts +// of cfg if it succeeds. +func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { + tg := cfg.ThreadGroup + tc := cfg.TaskContext + t := &Task{ + taskNode: taskNode{ + tg: tg, + parent: cfg.Parent, + children: make(map[*Task]struct{}), + }, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: cfg.SignalMask, + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + fsc: cfg.FSContext, + fds: cfg.FDMap, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + creds: cfg.Credentials, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespaced, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + abstractSockets: cfg.AbstractSocketNamespace, + rseqCPU: -1, + futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, + } + t.endStopCond.L = &t.tg.signalHandlers.mu + t.ptraceTracer.Store((*Task)(nil)) + // We don't construct t.blockingTimer until Task.run(); see that function + // for justification. + + // Make the new task (and possibly thread group) visible to the rest of + // the system atomically. + ts.mu.Lock() + defer ts.mu.Unlock() + tg.signalHandlers.mu.Lock() + defer tg.signalHandlers.mu.Unlock() + if tg.exiting || tg.execing != nil { + // If the caller is in the same thread group, then what we return + // doesn't matter too much since the caller will exit before it returns + // to userspace. If the caller isn't in the same thread group, then + // we're in uncharted territory and can return whatever we want. + return nil, syserror.EINTR + } + if err := ts.assignTIDsLocked(t); err != nil { + return nil, err + } + // Below this point, newTask is expected not to fail (there is no rollback + // of assignTIDsLocked or any of the following). + + // Logging on t's behalf will panic if t.logPrefix hasn't been initialized. + // This is the earliest point at which we can do so (since t now has thread + // IDs). + t.updateLogPrefixLocked() + + if cfg.InheritParent != nil { + t.parent = cfg.InheritParent.parent + } + if t.parent != nil { + t.parent.children[t] = struct{}{} + } + + if tg.leader == nil { + // New thread group. + tg.leader = t + if parentPG := tg.parentPG(); parentPG == nil { + tg.createSession() + } else { + // Inherit the process group. + parentPG.incRefWithParent(parentPG) + tg.processGroup = parentPG + } + } + tg.tasks.PushBack(t) + tg.tasksCount++ + tg.liveTasks++ + tg.activeTasks++ + + // Propagate external TaskSet stops to the new task. + t.stopCount = ts.stopCount + + t.mu.Lock() + defer t.mu.Unlock() + + t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) + + t.startTime = t.k.RealtimeClock().Now() + + return t, nil +} + +// assignTIDsLocked ensures that new task t is visible in all PID namespaces in +// which it should be visible. +// +// Preconditions: ts.mu must be locked for writing. +func (ts *TaskSet) assignTIDsLocked(t *Task) error { + type allocatedTID struct { + ns *PIDNamespace + tid ThreadID + } + var allocatedTIDs []allocatedTID + for ns := t.tg.pidns; ns != nil; ns = ns.parent { + tid, err := ns.allocateTID() + if err != nil { + // Failure. Remove the tids we already allocated in descendant + // namespaces. + for _, a := range allocatedTIDs { + delete(a.ns.tasks, a.tid) + delete(a.ns.tids, t) + if t.tg.leader == nil { + delete(a.ns.tgids, t.tg) + } + } + return err + } + ns.tasks[tid] = t + ns.tids[t] = tid + if t.tg.leader == nil { + // New thread group. + ns.tgids[t.tg] = tid + } + allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) + } + return nil +} + +// allocateTID returns an unused ThreadID from ns. +// +// Preconditions: ns.owner.mu must be locked for writing. +func (ns *PIDNamespace) allocateTID() (ThreadID, error) { + if ns.exiting { + // "In this case, a subsequent fork(2) into this PID namespace will + // fail with the error ENOMEM; it is not possible to create a new + // processes [sic] in a PID namespace whose init process has + // terminated." - pid_namespaces(7) + return 0, syserror.ENOMEM + } + tid := ns.last + for { + // Next. + tid++ + if tid > TasksLimit { + tid = InitTID + 1 + } + + // Is it available? + _, ok := ns.tasks[tid] + if !ok { + ns.last = tid + return tid, nil + } + + // Did we do a full cycle? + if tid == ns.last { + // No tid available. + return 0, syserror.EAGAIN + } + } +} + +// Start starts the task goroutine. Start must be called exactly once for each +// task returned by NewTask. +// +// 'tid' must be the task's TID in the root PID namespace and it's used for +// debugging purposes only (set as parameter to Task.run to make it visible +// in stack dumps). +func (t *Task) Start(tid ThreadID) { + // If the task was restored, it may be "starting" after having already exited. + if t.runState == nil { + return + } + t.goroutineStopped.Add(1) + t.tg.liveGoroutines.Add(1) + t.tg.pidns.owner.liveGoroutines.Add(1) + t.tg.pidns.owner.runningGoroutines.Add(1) + + // Task is now running in system mode. + t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) + + // Use the task's TID in the root PID namespace to make it visible in stack dumps. + go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops +} diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go new file mode 100644 index 000000000..e735a5dd0 --- /dev/null +++ b/pkg/sentry/kernel/task_stop.go @@ -0,0 +1,226 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// This file implements task stops, which represent the equivalent of Linux's +// uninterruptible sleep states in a way that is compatible with save/restore. +// Task stops comprise both internal stops (which form part of the task's +// "normal" control flow) and external stops (which do not); see README.md for +// details. +// +// There are multiple interfaces for interacting with stops because there are +// multiple cases to consider: +// +// - A task goroutine can begin a stop on its associated task (e.g. a +// vfork() syscall stopping the calling task until the child task releases its +// MM). In this case, calling Task.interrupt is both unnecessary (the task +// goroutine obviously cannot be blocked in Task.block or executing application +// code) and undesirable (as it may spuriously interrupt a in-progress +// syscall). +// +// Beginning internal stops in this case is implemented by +// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing, +// there are no instances of this case that begin external stops, except for +// autosave; however, autosave terminates the sentry without ending the +// external stop, so the spurious interrupt is moot. +// +// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all +// tasks being stopped in preparation for state checkpointing). If the task +// goroutine may be in Task.block or executing application code, it must be +// interrupted by Task.interrupt for it to actually enter the stop; since, +// strictly speaking, we have no way of determining this, we call +// Task.interrupt unconditionally. +// +// Beginning external stops in this case is implemented by +// Task.BeginExternalStop. As of this writing, there are no instances of this +// case that begin internal stops. +// +// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an +// exiting task resuming a sibling task that has been blocked in an execve() +// syscall waiting for other tasks to exit). In this case, Task.endStopCond +// must be notified to kick the task goroutine out of Task.doStop. +// +// Ending internal stops in this case is implemented by +// Task.endInternalStopLocked. Ending external stops in this case is +// implemented by Task.EndExternalStop. +// +// - Hypothetically, a task goroutine can end an internal stop on its +// associated task. As of this writing, there are no instances of this case. +// However, any instances of this case could still use the above functions, +// since notifying Task.endStopCond would be unnecessary but harmless. + +import ( + "fmt" + "sync/atomic" +) + +// A TaskStop is a condition visible to the task control flow graph that +// prevents a task goroutine from running or exiting, i.e. an internal stop. +// +// NOTE(b/30793614): Most TaskStops don't contain any data; they're +// distinguished by their type. The obvious way to implement such a TaskStop +// is: +// +// type groupStop struct{} +// func (groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop(groupStop{}) +// +// However, this doesn't work because the state package can't serialize values, +// only pointers. Furthermore, the correctness of save/restore depends on the +// ability to pass a TaskStop to endInternalStop that will compare equal to the +// TaskStop that was passed to beginInternalStop, even if a save/restore cycle +// occurred between the two. As a result, the current idiom is to always use a +// typecast nil for data-free TaskStops: +// +// type groupStop struct{} +// func (*groupStop) Killable() bool { return true } +// ... +// t.beginInternalStop((*groupStop)(nil)) +// +// This is pretty gross, but the alternatives seem grosser. +type TaskStop interface { + // Killable returns true if Task.Kill should end the stop prematurely. + // Killable is analogous to Linux's TASK_WAKEKILL. + Killable() bool +} + +// beginInternalStop indicates the start of an internal stop that applies to t. +// +// Preconditions: The task must not already be in an internal stop (i.e. t.stop +// == nil). The caller must be running on the task goroutine. +func (t *Task) beginInternalStop(s TaskStop) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginInternalStopLocked(s) +} + +// Preconditions: The signal mutex must be locked. All preconditions for +// Task.beginInternalStop also apply. +func (t *Task) beginInternalStopLocked(s TaskStop) { + if t.stop != nil { + panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) + } + t.Debugf("Entering internal stop %#v", s) + t.stop = s + t.beginStopLocked() +} + +// endInternalStopLocked indicates the end of an internal stop that applies to +// t. endInternalStopLocked does not wait for the task to resume. +// +// The caller is responsible for ensuring that the internal stop they expect +// actually applies to t; this requires holding the signal mutex which protects +// t.stop, which is why there is no endInternalStop that locks the signal mutex +// for you. +// +// Preconditions: The signal mutex must be locked. The task must be in an +// internal stop (i.e. t.stop != nil). +func (t *Task) endInternalStopLocked() { + if t.stop == nil { + panic("Attempting to leave non-existent internal stop") + } + t.Debugf("Leaving internal stop %#v", t.stop) + t.stop = nil + t.endStopLocked() +} + +// BeginExternalStop indicates the start of an external stop that applies to t. +// BeginExternalStop does not wait for t's task goroutine to stop. +func (t *Task) BeginExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.beginStopLocked() + t.interrupt() +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to Task.BeginExternalStop. EndExternalStop does not wait for t's task +// goroutine to resume. +func (t *Task) EndExternalStop() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endStopLocked() +} + +// beginStopLocked increments t.stopCount to indicate that a new internal or +// external stop applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) beginStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 { + // Most likely overflow. + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } +} + +// endStopLocked decerements t.stopCount to indicate that an existing internal +// or external stop no longer applies to t. +// +// Preconditions: The signal mutex must be locked. +func (t *Task) endStopLocked() { + if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", newval)) + } else if newval == 0 { + t.endStopCond.Signal() + } +} + +// BeginExternalStop indicates the start of an external stop that applies to +// all current and future tasks in ts. BeginExternalStop does not wait for +// task goroutines to stop. +func (ts *TaskSet) BeginExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount++ + if ts.stopCount <= 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.beginStopLocked() + t.tg.signalHandlers.mu.Unlock() + t.interrupt() + } +} + +// EndExternalStop indicates the end of an external stop started by a previous +// call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task +// goroutines to resume. +func (ts *TaskSet) EndExternalStop() { + ts.mu.Lock() + defer ts.mu.Unlock() + ts.stopCount-- + if ts.stopCount < 0 { + panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) + } + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.tg.signalHandlers.mu.Lock() + t.endStopLocked() + t.tg.signalHandlers.mu.Unlock() + } +} diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go new file mode 100644 index 000000000..a9283d0df --- /dev/null +++ b/pkg/sentry/kernel/task_syscall.go @@ -0,0 +1,447 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "os" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/metric" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel +// include/linux/errno.h. These errnos are never returned to userspace +// directly, but are used to communicate the expected behavior of an +// interrupted syscall from the syscall to signal handling. +type SyscallRestartErrno int + +// These numeric values are significant because ptrace syscall exit tracing can +// observe them. +// +// For all of the following errnos, if the syscall is not interrupted by a +// signal delivered to a user handler, the syscall is restarted. +const ( + // ERESTARTSYS is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler without SA_RESTART set, and restarted otherwise. + ERESTARTSYS = SyscallRestartErrno(512) + + // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it + // should always be restarted. + ERESTARTNOINTR = SyscallRestartErrno(513) + + // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it + // should be converted to EINTR if interrupted by a signal delivered to a + // user handler, and restarted otherwise. + ERESTARTNOHAND = SyscallRestartErrno(514) + + // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate + // that it should be restarted using a custom function. The interrupted + // syscall must register a custom restart function by calling + // Task.SetRestartSyscallFn. + ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) +) + +var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application") + +// Error implements error.Error. +func (e SyscallRestartErrno) Error() string { + // Descriptions are borrowed from strace. + switch e { + case ERESTARTSYS: + return "to be restarted if SA_RESTART is set" + case ERESTARTNOINTR: + return "to be restarted" + case ERESTARTNOHAND: + return "to be restarted if no handler" + case ERESTART_RESTARTBLOCK: + return "interrupted by signal" + default: + return "(unknown interrupt error)" + } +} + +// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by +// rv, the value in a syscall return register. +func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { + switch int(rv) { + case -int(ERESTARTSYS): + return ERESTARTSYS, true + case -int(ERESTARTNOINTR): + return ERESTARTNOINTR, true + case -int(ERESTARTNOHAND): + return ERESTARTNOHAND, true + case -int(ERESTART_RESTARTBLOCK): + return ERESTART_RESTARTBLOCK, true + default: + return 0, false + } +} + +// SyscallRestartBlock represents the restart block for a syscall restartable +// with a custom function. It encapsulates the state required to restart a +// syscall across a S/R. +type SyscallRestartBlock interface { + Restart(t *Task) (uintptr, error) +} + +// SyscallControl is returned by syscalls to control the behavior of +// Task.doSyscallInvoke. +type SyscallControl struct { + // next is the state that the task goroutine should switch to. If next is + // nil, the task goroutine should continue to syscall exit as usual. + next taskRunState + + // If ignoreReturn is true, Task.doSyscallInvoke should not store any value + // in the task's syscall return value register. + ignoreReturn bool +} + +var ( + // CtrlDoExit is returned by the implementations of the exit and exit_group + // syscalls to enter the task exit path directly, skipping syscall exit + // tracing. + CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} + + // ctrlStopAndReinvokeSyscall is returned by syscalls using the external + // feature before syscall execution. This causes Task.doSyscallInvoke + // to return runSyscallReinvoke, allowing Task.run to check for stops + // before immediately re-invoking the syscall (skipping the re-checking + // of seccomp filters and ptrace which would confuse userspace + // tracing). + ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} + + // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at + // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather + // than tail-calling it, allowing stops to be checked before syscall exit. + ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} +) + +func (t *Task) invokeExternal() { + t.BeginExternalStop() + go func() { // S/R-SAFE: External control flow. + defer t.EndExternalStop() + t.SyscallTable().External(t.Kernel()) + }() +} + +func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { + s := t.SyscallTable() + + fe := s.FeatureEnable.Word(sysno) + + var straceContext interface{} + if bits.IsAnyOn32(fe, StraceEnableBits) { + straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) + } + + if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { + t.invokeExternal() + // Ensure we check for stops, then invoke the syscall again. + ctrl = ctrlStopAndReinvokeSyscall + } else { + fn := s.Lookup(sysno) + if fn != nil { + // Call our syscall implementation. + rval, ctrl, err = fn(t, args) + } else { + // Use the missing function if not found. + rval, err = t.SyscallTable().Missing(t, sysno, args) + } + } + + if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { + t.invokeExternal() + // Don't reinvoke the syscall. + } + + if bits.IsAnyOn32(fe, StraceEnableBits) { + s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) + } + + return +} + +// doSyscall is the entry point for an invocation of a system call specified by +// the current state of t's registers. +// +// The syscall path is very hot; avoid defer. +func (t *Task) doSyscall() taskRunState { + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + + // Tracers expect to see this between when the task traps into the kernel + // to perform a syscall and when the syscall is actually invoked. + // This useless-looking temporary is needed because Go. + tmp := uintptr(syscall.ENOSYS) + t.Arch().SetReturn(-tmp) + + // Check seccomp filters. The nil check is for performance (as seccomp use + // is rare), not needed for correctness. + if t.syscallFilters.Load() != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, usermem.Addr(t.Arch().IP())); r { + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: + t.Debugf("Syscall %d: denied by seccomp", sysno) + return (*runSyscallExit)(nil) + case linux.SECCOMP_RET_ALLOW: + // ok + case linux.SECCOMP_RET_KILL_THREAD: + t.Debugf("Syscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + case linux.SECCOMP_RET_TRACE: + t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) + return (*runSyscallAfterPtraceEventSeccomp)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doSyscallEnter(sysno, args) +} + +type runSyscallAfterPtraceEventSeccomp struct{} + +func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." - + // ptrace(2) + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "The tracer can skip the system call by changing the syscall number to + // -1." - Documentation/prctl/seccomp_filter.txt + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil).execute(t) + } + args := t.Arch().SyscallArgs() + return t.doSyscallEnter(sysno, args) +} + +func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { + if next, ok := t.ptraceSyscallEnter(); ok { + return next + } + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallAfterSyscallEnterStop struct{} + +func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + if sysno == ^uintptr(0) { + return (*runSyscallExit)(nil) + } + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallAfterSysemuStop struct{} + +func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { + if sig := linux.Signal(t.ptraceCode); sig.IsValid() { + t.tg.signalHandlers.mu.Lock() + t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) + t.tg.signalHandlers.mu.Unlock() + } + if t.killed() { + return (*runInterrupt)(nil) + } + return (*runSyscallExit)(nil).execute(t) +} + +func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + + if ctrl != nil { + if !ctrl.ignoreReturn { + t.Arch().SetReturn(rval) + } + if ctrl.next != nil { + return ctrl.next + } + } else if err != nil { + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + t.haveSyscallReturn = true + } else { + t.Arch().SetReturn(rval) + } + + return (*runSyscallExit)(nil).execute(t) +} + +// +stateify savable +type runSyscallReinvoke struct{} + +func (*runSyscallReinvoke) execute(t *Task) taskRunState { + if t.killed() { + // It's possible that since the last execution, the task has + // been forcible killed. Invoking the system call here could + // result in an infinite loop if it is again preempted by an + // external stop and reinvoked. + return (*runInterrupt)(nil) + } + + sysno := t.Arch().SyscallNo() + args := t.Arch().SyscallArgs() + return t.doSyscallInvoke(sysno, args) +} + +// +stateify savable +type runSyscallExit struct{} + +func (*runSyscallExit) execute(t *Task) taskRunState { + t.ptraceSyscallExit() + return (*runApp)(nil) +} + +// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as +// indicated by an execution fault at address addr. doVsyscall returns the +// task's next run state. +func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { + vsyscallCount.Increment() + + // Grab the caller up front, to make sure there's a sensible stack. + caller := t.Arch().Native(uintptr(0)) + if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + return (*runApp)(nil) + } + + // For _vsyscalls_, there is no need to translate System V calling convention + // to syscall ABI because they both use RDI, RSI, and RDX for the first three + // arguments and none of the vsyscalls uses more than two arguments. + args := t.Arch().SyscallArgs() + if t.syscallFilters.Load() != nil { + switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { + case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: + t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) + return (*runApp)(nil) + case linux.SECCOMP_RET_ALLOW: + // ok + case linux.SECCOMP_RET_TRACE: + t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) + return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} + case linux.SECCOMP_RET_KILL_THREAD: + t.Debugf("vsyscall %d: killed by seccomp", sysno) + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + default: + panic(fmt.Sprintf("Unknown seccomp result %d", r)) + } + } + + return t.doVsyscallInvoke(sysno, args, caller) +} + +type runVsyscallAfterPtraceEventSeccomp struct { + addr usermem.Addr + sysno uintptr + caller interface{} +} + +func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { + if t.killed() { + return (*runInterrupt)(nil) + } + sysno := t.Arch().SyscallNo() + // "... the syscall may not be changed to another system call using the + // orig_rax register. It may only be changed to -1 order [sic] to skip the + // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - + // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip + // causes do_exit(SIGSYS), and changing sp is ignored. + if (sysno != ^uintptr(0) && sysno != r.sysno) || usermem.Addr(t.Arch().IP()) != r.addr { + t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)}) + return (*runExit)(nil) + } + if sysno == ^uintptr(0) { + return (*runApp)(nil) + } + return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) +} + +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { + rval, ctrl, err := t.executeSyscall(sysno, args) + if ctrl != nil { + t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(0) + } else if err == nil { + t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) + // Set the return value. The stack has already been adjusted. + t.Arch().SetReturn(uintptr(rval)) + } else { + t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) + if err == syserror.EFAULT { + t.forceSignal(linux.SIGSEGV, false /* unconditional */) + t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) + // A return is not emulated in this case. + return (*runApp)(nil) + } + t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno)))) + } + t.Arch().SetIP(t.Arch().Value(caller)) + t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) + return (*runApp)(nil) +} + +// ExtractErrno extracts an integer error number from the error. +// The syscall number is purely for context in the error case. Use -1 if +// syscall number is unknown. +func (t *Task) ExtractErrno(err error, sysno int) int { + switch err := err.(type) { + case nil: + return 0 + case syscall.Errno: + return int(err) + case SyscallRestartErrno: + return int(err) + case *memmap.BusError: + // Bus errors may generate SIGBUS, but for syscalls they still + // return EFAULT. See case in task_run.go where the fault is + // handled (and the SIGBUS is delivered). + return int(syscall.EFAULT) + case *os.PathError: + return t.ExtractErrno(err.Err, sysno) + case *os.LinkError: + return t.ExtractErrno(err.Err, sysno) + case *os.SyscallError: + return t.ExtractErrno(err.Err, sysno) + default: + if errno, ok := syserror.TranslateError(err); ok { + return int(errno) + } + } + panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) +} diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go new file mode 100644 index 000000000..461bd7316 --- /dev/null +++ b/pkg/sentry/kernel/task_usermem.go @@ -0,0 +1,301 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// Reads and writes that exceed this size may be silently truncated. +// (Linux: include/linux/fs.h:MAX_RW_COUNT) +var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) + +// Activate ensures that the task has an active address space. +func (t *Task) Activate() { + if mm := t.MemoryManager(); mm != nil { + if err := mm.Activate(); err != nil { + panic("unable to activate mm: " + err.Error()) + } + } +} + +// Deactivate relinquishes the task's active address space. +func (t *Task) Deactivate() { + if mm := t.MemoryManager(); mm != nil { + mm.Deactivate() + } +} + +// CopyIn copies a fixed-size value or slice of fixed-size values in from the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { + return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInBytes is a fast version of CopyIn if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOut copies a fixed-size value or slice of fixed-size values out to the +// task's memory. The copy will fail with syscall.EFAULT if it traverses user +// memory that is unmapped or not writeable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) { + return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyOutBytes is a fast version of CopyOut if the caller can serialize the +// data without reflection and pass in a byte slice. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInString copies a NUL-terminated string of length at most maxlen in from +// the task's memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error) { + return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ + AddressSpaceActive: true, + }) +} + +// CopyInVector copies a NULL-terminated vector of strings from the task's +// memory. The copy will fail with syscall.EFAULT if it traverses +// user memory that is unmapped or not readable by the user. +// +// maxElemSize is the maximum size of each individual element. +// +// maxTotalSize is the maximum total length of all elements plus the total +// number of elements. For example, the following strings correspond to +// the following set of sizes: +// +// { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) +// { "abc" } => 4 (3 for length, 1 for elements) +// +// This Task's AddressSpace must be active. +func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error) { + var v []string + for { + argAddr := t.Arch().Native(0) + if _, err := t.CopyIn(addr, argAddr); err != nil { + return v, err + } + if t.Arch().Value(argAddr) == 0 { + break + } + // Each string has a zero terminating byte counted, so copying out a string + // requires at least one byte of space. Also, see the calculation below. + if maxTotalSize <= 0 { + return nil, syserror.ENOMEM + } + thisMax := maxElemSize + if maxTotalSize < thisMax { + thisMax = maxTotalSize + } + arg, err := t.CopyInString(usermem.Addr(t.Arch().Value(argAddr)), thisMax) + if err != nil { + return v, err + } + v = append(v, arg) + addr += usermem.Addr(t.Arch().Width()) + maxTotalSize -= len(arg) + 1 + } + return v, nil +} + +// CopyOutIovecs converts src to an array of struct iovecs and copies it to the +// memory mapped at addr. +// +// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { + return syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for ; !src.IsEmpty(); src = src.Tail() { + ar := src.Head() + usermem.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) + usermem.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) + if _, err := t.CopyOutBytes(addr, b); err != nil { + return err + } + addr += itemLen + } + + default: + return syserror.ENOSYS + } + + return nil +} + +// CopyInIovecs copies an array of numIovecs struct iovecs from the memory +// mapped at addr, converts them to usermem.AddrRanges, and returns them as a +// usermem.AddrRangeSeq. +// +// CopyInIovecs shares the following properties with Linux's +// lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): +// +// - If the length of any AddrRange would exceed the range of an ssize_t, +// CopyInIovecs returns EINVAL. +// +// - If the length of any AddrRange would cause its end to overflow, +// CopyInIovecs returns EFAULT. +// +// - If any AddrRange would include addresses outside the application address +// range, CopyInIovecs returns EFAULT. +// +// - The combined length of all AddrRanges is limited to MAX_RW_COUNT. If the +// combined length of all AddrRanges would otherwise exceed this amount, ranges +// beyond MAX_RW_COUNT are silently truncated. +// +// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the +// task goroutine. t's AddressSpace must be active. +func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { + if numIovecs == 0 { + return usermem.AddrRangeSeq{}, nil + } + + var dst []usermem.AddrRange + if numIovecs > 1 { + dst = make([]usermem.AddrRange, 0, numIovecs) + } + + switch t.Arch().Width() { + case 8: + const itemLen = 16 + if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + b := t.CopyScratchBuffer(itemLen) + for i := 0; i < numIovecs; i++ { + if _, err := t.CopyInBytes(addr, b); err != nil { + return usermem.AddrRangeSeq{}, err + } + + base := usermem.Addr(usermem.ByteOrder.Uint64(b[0:8])) + length := usermem.ByteOrder.Uint64(b[8:16]) + if length > math.MaxInt64 { + return usermem.AddrRangeSeq{}, syserror.EINVAL + } + ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) + if !ok { + return usermem.AddrRangeSeq{}, syserror.EFAULT + } + + if numIovecs == 1 { + // Special case to avoid allocating dst. + return usermem.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil + } + dst = append(dst, ar) + + addr += itemLen + } + + default: + return usermem.AddrRangeSeq{}, syserror.ENOSYS + } + + // Truncate to MAX_RW_COUNT. + var total uint64 + for i := range dst { + dstlen := uint64(dst[i].Length()) + if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { + dst[i].End -= usermem.Addr(dstlen - rem) + dstlen = rem + } + total += dstlen + } + + return usermem.AddrRangeSeqFromSlice(dst), nil +} + +// SingleIOSequence returns a usermem.IOSequence representing [addr, +// addr+length) in t's address space. If this contains addresses outside the +// application address range, it returns EFAULT. If length exceeds +// MAX_RW_COUNT, the range is silently truncated. +// +// SingleIOSequence is analogous to Linux's +// lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and +// write syscalls in Linux do not use import_single_range(). However they check +// access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address +// ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) +func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if length > MAX_RW_COUNT { + length = MAX_RW_COUNT + } + ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) + if !ok { + return usermem.IOSequence{}, syserror.EFAULT + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: usermem.AddrRangeSeqOf(ar), + Opts: opts, + }, nil +} + +// IovecsIOSequence returns a usermem.IOSequence representing the array of +// iovcnt struct iovecs at addr in t's address space. opts applies to the +// returned IOSequence, not the reading of the struct iovec array. +// +// IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). +// +// Preconditions: As for Task.CopyInIovecs. +func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { + if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { + return usermem.IOSequence{}, syserror.EINVAL + } + ars, err := t.CopyInIovecs(addr, iovcnt) + if err != nil { + return usermem.IOSequence{}, err + } + return usermem.IOSequence{ + IO: t.MemoryManager(), + Addrs: ars, + Opts: opts, + }, nil +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go new file mode 100644 index 000000000..8bd53928e --- /dev/null +++ b/pkg/sentry/kernel/thread_group.go @@ -0,0 +1,330 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// A ThreadGroup is a logical grouping of tasks that has widespread +// significance to other kernel features (e.g. signal handling). ("Thread +// groups" are usually called "processes" in userspace documentation.) +// +// ThreadGroup is a superset of Linux's struct signal_struct. +// +// +stateify savable +type ThreadGroup struct { + threadGroupNode + + // signalHandlers is the set of signal handlers used by every task in this + // thread group. (signalHandlers may also be shared with other thread + // groups.) + // + // signalHandlers.mu (hereafter "the signal mutex") protects state related + // to signal handling, as well as state that usually needs to be atomic + // with signal handling, for all ThreadGroups and Tasks using + // signalHandlers. (This is analogous to Linux's use of struct + // sighand_struct::siglock.) + // + // The signalHandlers pointer can only be mutated during an execve + // (Task.finishExec). Consequently, when it's possible for a task in the + // thread group to be completing an execve, signalHandlers is protected by + // the owning TaskSet.mu. Otherwise, it is possible to read the + // signalHandlers pointer without synchronization. In particular, + // completing an execve requires that all other tasks in the thread group + // have exited, so task goroutines do not need the owning TaskSet.mu to + // read the signalHandlers pointer of their thread groups. + signalHandlers *SignalHandlers + + // pendingSignals is the set of pending signals that may be handled by any + // task in this thread group. + // + // pendingSignals is protected by the signal mutex. + pendingSignals pendingSignals + + // If groupStopDequeued is true, a task in the thread group has dequeued a + // stop signal, but has not yet initiated the group stop. + // + // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED. + // + // groupStopDequeued is protected by the signal mutex. + groupStopDequeued bool + + // groupStopSignal is the signal that caused a group stop to be initiated. + // + // groupStopSignal is protected by the signal mutex. + groupStopSignal linux.Signal + + // groupStopPendingCount is the number of active tasks in the thread group + // for which Task.groupStopPending is set. + // + // groupStopPendingCount is analogous to Linux's + // signal_struct::group_stop_count. + // + // groupStopPendingCount is protected by the signal mutex. + groupStopPendingCount int + + // If groupStopComplete is true, groupStopPendingCount transitioned from + // non-zero to zero without an intervening SIGCONT. + // + // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED. + // + // groupStopComplete is protected by the signal mutex. + groupStopComplete bool + + // If groupStopWaitable is true, the thread group is indicating a waitable + // group stop event (as defined by EventChildGroupStop). + // + // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set + // and group_exit_code being non-zero. + // + // groupStopWaitable is protected by the signal mutex. + groupStopWaitable bool + + // If groupContNotify is true, then a SIGCONT has recently ended a group + // stop on this thread group, and the first task to observe it should + // notify its parent. groupContInterrupted is true iff SIGCONT ended an + // incomplete group stop. If groupContNotify is false, groupContInterrupted is + // meaningless. + // + // Analogues in Linux: + // + // - groupContNotify && groupContInterrupted is represented by + // SIGNAL_CLD_STOPPED. + // + // - groupContNotify && !groupContInterrupted is represented by + // SIGNAL_CLD_CONTINUED. + // + // - !groupContNotify is represented by neither flag being set. + // + // groupContNotify and groupContInterrupted are protected by the signal + // mutex. + groupContNotify bool + groupContInterrupted bool + + // If groupContWaitable is true, the thread group is indicating a waitable + // continue event (as defined by EventGroupContinue). + // + // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED. + // + // groupContWaitable is protected by the signal mutex. + groupContWaitable bool + + // exiting is true if all tasks in the ThreadGroup should exit. exiting is + // analogous to Linux's SIGNAL_GROUP_EXIT. + // + // exiting is protected by the signal mutex. exiting can only transition + // from false to true. + exiting bool + + // exitStatus is the thread group's exit status. + // + // While exiting is false, exitStatus is protected by the signal mutex. + // When exiting becomes true, exitStatus becomes immutable. + exitStatus ExitStatus + + // terminationSignal is the signal that this thread group's leader will + // send to its parent when it exits. + // + // terminationSignal is protected by the TaskSet mutex. + terminationSignal linux.Signal + + // liveGoroutines is the number of non-exited task goroutines in the thread + // group. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + timerMu sync.Mutex `state:"nosave"` + + // itimerRealTimer implements ITIMER_REAL for the thread group. + itimerRealTimer *ktime.Timer + + // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group. + // + // itimerVirtSetting is protected by the signal mutex. + itimerVirtSetting ktime.Setting + + // itimerProfSetting is the ITIMER_PROF setting for the thread group. + // + // itimerProfSetting is protected by the signal mutex. + itimerProfSetting ktime.Setting + + // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit + // notifications for the thread group. + // + // rlimitCPUSoftSetting is protected by the signal mutex. + rlimitCPUSoftSetting ktime.Setting + + // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true, + // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true, + // or limits.Get(CPU) is finite. + // + // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is + // accessed using atomic memory operations. + cpuTimersEnabled uint32 + + // timers is the thread group's POSIX interval timers. nextTimerID is the + // TimerID at which allocation should begin searching for an unused ID. + // + // timers and nextTimerID are protected by timerMu. + timers map[linux.TimerID]*IntervalTimer + nextTimerID linux.TimerID + + // exitedCPUStats is the CPU usage for all exited tasks in the thread + // group. exitedCPUStats is protected by the TaskSet mutex. + exitedCPUStats usage.CPUStats + + // childCPUStats is the CPU usage of all joined descendants of this thread + // group. childCPUStats is protected by the TaskSet mutex. + childCPUStats usage.CPUStats + + // ioUsage is the I/O usage for all exited tasks in the thread group. + // The ioUsage pointer is immutable. + ioUsage *usage.IO + + // maxRSS is the historical maximum resident set size of the thread group, updated when: + // + // - A task in the thread group exits, since after all tasks have + // exited the MemoryManager is no longer reachable. + // + // - The thread group completes an execve, since this changes + // MemoryManagers. + // + // maxRSS is protected by the TaskSet mutex. + maxRSS uint64 + + // childMaxRSS is the maximum resident set size in bytes of all joined + // descendants of this thread group. + // + // childMaxRSS is protected by the TaskSet mutex. + childMaxRSS uint64 + + // Resource limits for this ThreadGroup. The limits pointer is immutable. + limits *limits.LimitSet + + // processGroup is the processGroup for this thread group. + // + // processGroup is protected by the TaskSet mutex. + processGroup *ProcessGroup + + // execed indicates an exec has occurred since creation. This will be + // set by finishExec, and new TheadGroups will have this field cleared. + // When execed is set, the processGroup may no longer be changed. + // + // execed is protected by the TaskSet mutex. + execed bool + + // rscr is the thread group's RSEQ critical region. + rscr atomic.Value `state:".(*RSEQCriticalRegion)"` +} + +// newThreadGroup returns a new, empty thread group in PID namespace ns. The +// thread group leader will send its parent terminationSignal when it exits. +// The new thread group isn't visible to the system until a task has been +// created inside of it by a successful call to TaskSet.NewTask. +func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { + tg := &ThreadGroup{ + threadGroupNode: threadGroupNode{ + pidns: ns, + }, + signalHandlers: sh, + terminationSignal: terminationSignal, + ioUsage: &usage.IO{}, + limits: limits, + } + tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) + tg.timers = make(map[linux.TimerID]*IntervalTimer) + tg.rscr.Store(&RSEQCriticalRegion{}) + return tg +} + +// saveRscr is invopked by stateify. +func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion { + return tg.rscr.Load().(*RSEQCriticalRegion) +} + +// loadRscr is invoked by stateify. +func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) { + tg.rscr.Store(rscr) +} + +// SignalHandlers returns the signal handlers used by tg. +// +// Preconditions: The caller must provide the synchronization required to read +// tg.signalHandlers, as described in the field's comment. +func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { + return tg.signalHandlers +} + +// Limits returns tg's limits. +func (tg *ThreadGroup) Limits() *limits.LimitSet { + return tg.limits +} + +// release releases the thread group's resources. +func (tg *ThreadGroup) release() { + // Timers must be destroyed without holding the TaskSet or signal mutexes + // since timers send signals with Timer.mu locked. + tg.itimerRealTimer.Destroy() + var its []*IntervalTimer + tg.pidns.owner.mu.Lock() + tg.signalHandlers.mu.Lock() + for _, it := range tg.timers { + its = append(its, it) + } + tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved + tg.signalHandlers.mu.Unlock() + tg.pidns.owner.mu.Unlock() + for _, it := range its { + it.DestroyTimer() + } +} + +// forEachChildThreadGroupLocked indicates over all child ThreadGroups. +// +// Precondition: TaskSet.mu must be held. +func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { + for t := tg.tasks.Front(); t != nil; t = t.Next() { + for child := range t.children { + if child == child.tg.leader { + fn(child.tg) + } + } + } +} + +// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. +// +// +stateify savable +type itimerRealListener struct { + tg *ThreadGroup +} + +// Notify implements ktime.TimerListener.Notify. +func (l *itimerRealListener) Notify(exp uint64) { + l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM)) +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l *itimerRealListener) Destroy() { +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go new file mode 100644 index 000000000..656bbd46c --- /dev/null +++ b/pkg/sentry/kernel/threads.go @@ -0,0 +1,465 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TasksLimit is the maximum number of threads for untrusted application. +// Linux doesn't really limit this directly, rather it is limited by total +// memory size, stacks allocated and a global maximum. There's no real reason +// for us to limit it either, (esp. since threads are backed by go routines), +// and we would expect to hit resource limits long before hitting this number. +// However, for correctness, we still check that the user doesn't exceed this +// number. +// +// Note that because of the way futexes are implemented, there *are* in fact +// serious restrictions on valid thread IDs. They are limited to 2^30 - 1 +// (kernel/fork.c:MAX_THREADS). +const TasksLimit = (1 << 16) + +// ThreadID is a generic thread identifier. +type ThreadID int32 + +// String returns a decimal representation of the ThreadID. +func (tid ThreadID) String() string { + return fmt.Sprintf("%d", tid) +} + +// InitTID is the TID given to the first task added to each PID namespace. The +// thread group led by InitTID is called the namespace's init process. The +// death of a PID namespace's init process causes all tasks visible in that +// namespace to be killed. +const InitTID ThreadID = 1 + +// A TaskSet comprises all tasks in a system. +// +// +stateify savable +type TaskSet struct { + // mu protects all relationships betweens tasks and thread groups in the + // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) + mu sync.RWMutex `state:"nosave"` + + // Root is the root PID namespace, in which all tasks in the TaskSet are + // visible. The Root pointer is immutable. + Root *PIDNamespace + + // sessions is the set of all sessions. + sessions sessionList + + // stopCount is the number of active external stops applicable to all tasks + // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been + // paired with a call to TaskSet.EndExternalStop). stopCount is protected + // by mu. + // + // stopCount is not saved for the same reason as Task.stopCount; it is + // always reset to zero after restore. + stopCount int32 `state:"nosave"` + + // liveGoroutines is the number of non-exited task goroutines in the + // TaskSet. + // + // liveGoroutines is not saved; it is reset as task goroutines are + // restarted by Task.Start. + liveGoroutines sync.WaitGroup `state:"nosave"` + + // runningGoroutines is the number of running task goroutines in the + // TaskSet. + // + // runningGoroutines is not saved; its counter value is required to be zero + // at time of save (but note that this is not necessarily the same thing as + // sync.WaitGroup's zero value). + runningGoroutines sync.WaitGroup `state:"nosave"` +} + +// newTaskSet returns a new, empty TaskSet. +func newTaskSet() *TaskSet { + ts := &TaskSet{} + ts.Root = newPIDNamespace(ts, nil /* parent */, auth.NewRootUserNamespace()) + return ts +} + +// forEachThreadGroupLocked applies f to each thread group in ts. +// +// Preconditions: ts.mu must be locked (for reading or writing). +func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { + for tg := range ts.Root.tgids { + f(tg) + } +} + +// A PIDNamespace represents a PID namespace, a bimap between thread IDs and +// tasks. See the pid_namespaces(7) man page for further details. +// +// N.B. A task is said to be visible in a PID namespace if the PID namespace +// contains a thread ID that maps to that task. +// +// +stateify savable +type PIDNamespace struct { + // owner is the TaskSet that this PID namespace belongs to. The owner + // pointer is immutable. + owner *TaskSet + + // parent is the PID namespace of the process that created this one. If + // this is the root PID namespace, parent is nil. The parent pointer is + // immutable. + // + // Invariant: All tasks that are visible in this namespace are also visible + // in all ancestor namespaces. + parent *PIDNamespace + + // userns is the user namespace with which this PID namespace is + // associated. Privileged operations on this PID namespace must have + // appropriate capabilities in userns. The userns pointer is immutable. + userns *auth.UserNamespace + + // The following fields are protected by owner.mu. + + // last is the last ThreadID to be allocated in this namespace. + last ThreadID + + // tasks is a mapping from ThreadIDs in this namespace to tasks visible in + // the namespace. + tasks map[ThreadID]*Task + + // tids is a mapping from tasks visible in this namespace to their + // identifiers in this namespace. + tids map[*Task]ThreadID + + // tgids is a mapping from thread groups visible in this namespace to + // their identifiers in this namespace. + // + // The content of tgids is equivalent to tids[tg.leader]. This exists + // primarily as an optimization to quickly find all thread groups. + tgids map[*ThreadGroup]ThreadID + + // sessions is a mapping from SessionIDs in this namespace to sessions + // visible in the namespace. + sessions map[SessionID]*Session + + // sids is a mapping from sessions visible in this namespace to their + // identifiers in this namespace. + sids map[*Session]SessionID + + // processGroups is a mapping from ProcessGroupIDs in this namespace to + // process groups visible in the namespace. + processGroups map[ProcessGroupID]*ProcessGroup + + // pgids is a mapping from process groups visible in this namespace to + // their identifiers in this namespace. + pgids map[*ProcessGroup]ProcessGroupID + + // exiting indicates that the namespace's init process is exiting or has + // exited. + exiting bool +} + +func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { + return &PIDNamespace{ + owner: ts, + parent: parent, + userns: userns, + tasks: make(map[ThreadID]*Task), + tids: make(map[*Task]ThreadID), + tgids: make(map[*ThreadGroup]ThreadID), + sessions: make(map[SessionID]*Session), + sids: make(map[*Session]SessionID), + processGroups: make(map[ProcessGroupID]*ProcessGroup), + pgids: make(map[*ProcessGroup]ProcessGroupID), + } +} + +// NewChild returns a new, empty PID namespace that is a child of ns. Authority +// over the new PID namespace is controlled by userns. +func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { + return newPIDNamespace(ns.owner, ns, userns) +} + +// TaskWithID returns the task with thread ID tid in PID namespace ns. If no +// task has that TID, TaskWithID returns nil. +func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { + ns.owner.mu.RLock() + t := ns.tasks[tid] + ns.owner.mu.RUnlock() + return t +} + +// ThreadGroupWithID returns the thread group lead by the task with thread ID +// tid in PID namespace ns. If no task has that TID, or if the task with that +// TID is not a thread group leader, ThreadGroupWithID returns nil. +func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + t := ns.tasks[tid] + if t == nil { + return nil + } + if t != t.tg.leader { + return nil + } + return t.tg +} + +// IDOfTask returns the TID assigned to the given task in PID namespace ns. If +// the task is not visible in that namespace, IDOfTask returns 0. (This return +// value is significant in some cases, e.g. getppid() is documented as +// returning 0 if the caller's parent is in an ancestor namespace and +// consequently not visible to the caller.) If the task is nil, IDOfTask returns +// 0. +func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { + ns.owner.mu.RLock() + id := ns.tids[t] + ns.owner.mu.RUnlock() + return id +} + +// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. +// If the task is not visible in that namespace, IDOfThreadGroup returns 0. +func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { + ns.owner.mu.RLock() + id := ns.tgids[tg] + ns.owner.mu.RUnlock() + return id +} + +// Tasks returns a snapshot of the tasks in ns. +func (ns *PIDNamespace) Tasks() []*Task { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + tasks := make([]*Task, 0, len(ns.tasks)) + for t := range ns.tids { + tasks = append(tasks, t) + } + return tasks +} + +// ThreadGroups returns a snapshot of the thread groups in ns. +func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + return ns.ThreadGroupsAppend(nil) +} + +// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. +func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + for tg := range ns.tgids { + tgs = append(tgs, tg) + } + return tgs +} + +// UserNamespace returns the user namespace associated with PID namespace ns. +func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { + return ns.userns +} + +// A threadGroupNode defines the relationship between a thread group and the +// rest of the system. Conceptually, threadGroupNode is data belonging to the +// owning TaskSet, as if TaskSet contained a field `nodes +// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, +// threadGroupNode is embedded in the ThreadGroup it represents. +// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose +// threadGroupEntry's methods on ThreadGroup to make it implement +// threadGroupLinker.) +// +// +stateify savable +type threadGroupNode struct { + // pidns is the PID namespace containing the thread group and all of its + // member tasks. The pidns pointer is immutable. + pidns *PIDNamespace + + // eventQueue is notified whenever a event of interest to Task.Wait occurs + // in a child of this thread group, or a ptrace tracee of a task in this + // thread group. Events are defined in task_exit.go. + // + // Note that we cannot check and save this wait queue similarly to other + // wait queues, as the queue will not be empty by the time of saving, due + // to the wait sourced from Exec(). + eventQueue waiter.Queue `state:"nosave"` + + // leader is the thread group's leader, which is the oldest task in the + // thread group; usually the last task in the thread group to call + // execve(), or if no such task exists then the first task in the thread + // group, which was created by a call to fork() or clone() without + // CLONE_THREAD. Once a thread group has been made visible to the rest of + // the system by TaskSet.newTask, leader is never nil. + // + // Note that it's possible for the leader to exit without causing the rest + // of the thread group to exit; in such a case, leader will still be valid + // and non-nil, but leader will not be in tasks. + // + // leader is protected by the TaskSet mutex. + leader *Task + + // If execing is not nil, it is a task in the thread group that has killed + // all other tasks so that it can become the thread group leader and + // perform an execve. (execing may already be the thread group leader.) + // + // execing is analogous to Linux's signal_struct::group_exit_task. + // + // execing is protected by the TaskSet mutex. + execing *Task + + // tasks is all tasks in the thread group that have not yet been reaped. + // + // tasks is protected by both the TaskSet mutex and the signal mutex: + // Mutating tasks requires locking the TaskSet mutex for writing *and* + // locking the signal mutex. Reading tasks requires locking the TaskSet + // mutex *or* locking the signal mutex. + tasks taskList + + // tasksCount is the number of tasks in the thread group that have not yet + // been reaped; equivalently, tasksCount is the number of tasks in tasks. + // + // tasksCount is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + tasksCount int + + // liveTasks is the number of tasks in the thread group that have not yet + // reached TaskExitZombie. + // + // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). + liveTasks int + + // activeTasks is the number of tasks in the thread group that have not yet + // reached TaskExitInitiated. + // + // activeTasks is protected by both the TaskSet mutex and the signal mutex, + // as with tasks. + activeTasks int +} + +// PIDNamespace returns the PID namespace containing tg. +func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { + return tg.pidns +} + +// TaskSet returns the TaskSet containing tg. +func (tg *ThreadGroup) TaskSet() *TaskSet { + return tg.pidns.owner +} + +// Leader returns tg's leader. +func (tg *ThreadGroup) Leader() *Task { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + return tg.leader +} + +// Count returns the number of non-exited threads in the group. +func (tg *ThreadGroup) Count() int { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + var count int + for t := tg.tasks.Front(); t != nil; t = t.Next() { + count++ + } + return count +} + +// MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for +// all tasks in tg. +func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { + tg.pidns.owner.mu.RLock() + defer tg.pidns.owner.mu.RUnlock() + + var tasks []ThreadID + for t := tg.tasks.Front(); t != nil; t = t.Next() { + if id, ok := pidns.tids[t]; ok { + tasks = append(tasks, id) + } + } + return tasks +} + +// ID returns tg's leader's thread ID in its own PID namespace. If tg's leader +// is dead, ID returns 0. +func (tg *ThreadGroup) ID() ThreadID { + tg.pidns.owner.mu.RLock() + id := tg.pidns.tgids[tg] + tg.pidns.owner.mu.RUnlock() + return id +} + +// A taskNode defines the relationship between a task and the rest of the +// system. The comments on threadGroupNode also apply to taskNode. +// +// +stateify savable +type taskNode struct { + // tg is the thread group that this task belongs to. The tg pointer is + // immutable. + tg *ThreadGroup `state:"wait"` + + // taskEntry links into tg.tasks. Note that this means that + // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread + // group. See threadGroupNode.tasks for synchronization info. + taskEntry + + // parent is the task's parent. parent may be nil. + // + // parent is protected by the TaskSet mutex. + parent *Task + + // children is this task's children. + // + // children is protected by the TaskSet mutex. + children map[*Task]struct{} + + // If childPIDNamespace is not nil, all new tasks created by this task will + // be members of childPIDNamespace rather than this one. (As a corollary, + // this task becomes unable to create sibling tasks in the same thread + // group.) + // + // childPIDNamespace is exclusive to the task goroutine. + childPIDNamespace *PIDNamespace +} + +// ThreadGroup returns the thread group containing t. +func (t *Task) ThreadGroup() *ThreadGroup { + return t.tg +} + +// PIDNamespace returns the PID namespace containing t. +func (t *Task) PIDNamespace() *PIDNamespace { + return t.tg.pidns +} + +// TaskSet returns the TaskSet containing t. +func (t *Task) TaskSet() *TaskSet { + return t.tg.pidns.owner +} + +// Timekeeper returns the system Timekeeper. +func (t *Task) Timekeeper() *Timekeeper { + return t.k.timekeeper +} + +// Parent returns t's parent. +func (t *Task) Parent() *Task { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + return t.parent +} + +// ThreadID returns t's thread ID in its own PID namespace. If the task is +// dead, ThreadID returns 0. +func (t *Task) ThreadID() ThreadID { + return t.tg.pidns.IDOfTask(t) +} diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go new file mode 100644 index 000000000..c0660d362 --- /dev/null +++ b/pkg/sentry/kernel/time/context.go @@ -0,0 +1,44 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is the time package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxRealtimeClock is a Context.Value key for the current real time. + CtxRealtimeClock contextID = iota +) + +// RealtimeClockFromContext returns the real time clock associated with context +// ctx. +func RealtimeClockFromContext(ctx context.Context) Clock { + if v := ctx.Value(CtxRealtimeClock); v != nil { + return v.(Clock) + } + return nil +} + +// NowFromContext returns the current real time associated with context ctx. +func NowFromContext(ctx context.Context) Time { + if clk := RealtimeClockFromContext(ctx); clk != nil { + return clk.Now() + } + panic("encountered context without RealtimeClock") +} diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go new file mode 100644 index 000000000..3846cf1ea --- /dev/null +++ b/pkg/sentry/kernel/time/time.go @@ -0,0 +1,691 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package time defines the Timer type, which provides a periodic timer that +// works by sampling a user-provided clock. +package time + +import ( + "fmt" + "math" + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Events that may be generated by a Clock. +const ( + // ClockEventSet occurs when a Clock undergoes a discontinuous change. + ClockEventSet waiter.EventMask = 1 << iota + + // ClockEventRateIncrease occurs when the rate at which a Clock advances + // increases significantly, such that values returned by previous calls to + // Clock.WallTimeUntil may be too large. + ClockEventRateIncrease +) + +// Time represents an instant in time with nanosecond precision. +// +// Time may represent time with respect to any clock and may not have any +// meaning in the real world. +// +// +stateify savable +type Time struct { + ns int64 +} + +var ( + // MinTime is the zero time instant, the lowest possible time that can + // be represented by Time. + MinTime = Time{ns: math.MinInt64} + + // MaxTime is the highest possible time that can be represented by + // Time. + MaxTime = Time{ns: math.MaxInt64} + + // ZeroTime represents the zero time in an unspecified Clock's domain. + ZeroTime = Time{ns: 0} +) + +const ( + // MinDuration is the minimum duration representable by time.Duration. + MinDuration = time.Duration(math.MinInt64) + + // MaxDuration is the maximum duration representable by time.Duration. + MaxDuration = time.Duration(math.MaxInt64) +) + +// FromNanoseconds returns a Time representing the point ns nanoseconds after +// an unspecified Clock's zero time. +func FromNanoseconds(ns int64) Time { + return Time{ns} +} + +// FromSeconds returns a Time representing the point s seconds after an +// unspecified Clock's zero time. +func FromSeconds(s int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + return Time{s * 1e9} +} + +// FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real +// time Unix clock domain. +func FromUnix(s int64, ns int64) Time { + if s > math.MaxInt64/time.Second.Nanoseconds() { + return MaxTime + } + t := s * 1e9 + if t > math.MaxInt64-ns { + return MaxTime + } + return Time{t + ns} +} + +// FromTimespec converts from Linux Timespec to Time. +func FromTimespec(ts linux.Timespec) Time { + return Time{ts.ToNsecCapped()} +} + +// FromTimeval converts a Linux Timeval to Time. +func FromTimeval(tv linux.Timeval) Time { + return Time{tv.ToNsecCapped()} +} + +// Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock +// domain. If t represents walltime, this is nanoseconds since the Unix epoch. +func (t Time) Nanoseconds() int64 { + return t.ns +} + +// Seconds returns seconds elapsed since the zero time in t's Clock domain. If +// t represents walltime, this is seconds since Unix epoch. +func (t Time) Seconds() int64 { + return t.Nanoseconds() / time.Second.Nanoseconds() +} + +// Timespec converts Time to a Linux timespec. +func (t Time) Timespec() linux.Timespec { + return linux.NsecToTimespec(t.Nanoseconds()) +} + +// Unix returns the (seconds, nanoseconds) representation of t such that +// seconds*1e9 + nanoseconds = t. +func (t Time) Unix() (s int64, ns int64) { + s = t.ns / 1e9 + ns = t.ns % 1e9 + return +} + +// TimeT converts Time to a Linux time_t. +func (t Time) TimeT() linux.TimeT { + return linux.NsecToTimeT(t.Nanoseconds()) +} + +// Timeval converts Time to a Linux timeval. +func (t Time) Timeval() linux.Timeval { + return linux.NsecToTimeval(t.Nanoseconds()) +} + +// Add adds the duration of d to t. +func (t Time) Add(d time.Duration) Time { + if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { + return MaxTime + } + if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { + return MinTime + } + return Time{int64(t.ns) + d.Nanoseconds()} +} + +// AddTime adds the duration of u to t. +func (t Time) AddTime(u Time) Time { + return t.Add(time.Duration(u.ns)) +} + +// Equal reports whether the two times represent the same instant in time. +func (t Time) Equal(u Time) bool { + return t.ns == u.ns +} + +// Before reports whether the instant t is before the instant u. +func (t Time) Before(u Time) bool { + return t.ns < u.ns +} + +// After reports whether the instant t is after the instant u. +func (t Time) After(u Time) bool { + return t.ns > u.ns +} + +// Sub returns the duration of t - u. +// +// N.B. This measure may not make sense for every Time returned by ktime.Clock. +// Callers who need wall time duration can use ktime.Clock.WallTimeUntil to +// estimate that wall time. +func (t Time) Sub(u Time) time.Duration { + dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond + switch { + case u.Add(dur).Equal(t): + return dur + case t.Before(u): + return MinDuration + default: + return MaxDuration + } +} + +// IsMin returns whether t represents the lowest possible time instant. +func (t Time) IsMin() bool { + return t == MinTime +} + +// IsZero returns whether t represents the zero time instant in t's Clock domain. +func (t Time) IsZero() bool { + return t == ZeroTime +} + +// String returns the time represented in nanoseconds as a string. +func (t Time) String() string { + return fmt.Sprintf("%dns", t.Nanoseconds()) +} + +// A Clock is an abstract time source. +type Clock interface { + // Now returns the current time in nanoseconds according to the Clock. + Now() Time + + // WallTimeUntil returns the estimated wall time until Now will return a + // value greater than or equal to t, given that a recent call to Now + // returned now. If t has already passed, WallTimeUntil may return 0 or a + // negative value. + // + // WallTimeUntil must be abstract to support Clocks that do not represent + // wall time (e.g. thread group execution timers). Clocks that represent + // wall times may embed the WallRateClock type to obtain an appropriate + // trivial implementation of WallTimeUntil. + // + // WallTimeUntil is used to determine when associated Timers should next + // check for expirations. Returning too small a value may result in + // spurious Timer goroutine wakeups, while returning too large a value may + // result in late expirations. Implementations should usually err on the + // side of underestimating. + WallTimeUntil(t, now Time) time.Duration + + // Waitable methods may be used to subscribe to Clock events. Waiters will + // not be preserved by Save and must be re-established during restore. + // + // Since Clock events are transient, implementations of + // waiter.Waitable.Readiness should return 0. + waiter.Waitable +} + +// WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the +// same rate as wall time. +type WallRateClock struct{} + +// WallTimeUntil implements Clock.WallTimeUntil. +func (WallRateClock) WallTimeUntil(t, now Time) time.Duration { + return t.Sub(now) +} + +// NoClockEvents implements waiter.Waitable for Clocks that do not generate +// events. +type NoClockEvents struct{} + +// Readiness implements waiter.Waitable.Readiness. +func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (NoClockEvents) EventUnregister(e *waiter.Entry) { +} + +// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and +// defining waiter.Waitable.Readiness as required by Clock. +type ClockEventsQueue struct { + waiter.Queue +} + +// Readiness implements waiter.Waitable.Readiness. +func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { + return 0 +} + +// A TimerListener receives expirations from a Timer. +type TimerListener interface { + // Notify is called when its associated Timer expires. exp is the number of + // expirations. + // + // Notify is called with the associated Timer's mutex locked, so Notify + // must not take any locks that precede Timer.mu in lock order. + // + // Preconditions: exp > 0. + Notify(exp uint64) + + // Destroy is called when the timer is destroyed. + Destroy() +} + +// Setting contains user-controlled mutable Timer properties. +// +// +stateify savable +type Setting struct { + // Enabled is true if the timer is running. + Enabled bool + + // Next is the time in nanoseconds of the next expiration. + Next Time + + // Period is the time in nanoseconds between expirations. If Period is + // zero, the timer will not automatically restart after expiring. + // + // Invariant: Period >= 0. + Period time.Duration +} + +// SettingFromSpec converts a (value, interval) pair to a Setting based on a +// reading from c. value is interpreted as a time relative to c.Now(). +func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + return SettingFromSpecAt(value, interval, c.Now()) +} + +// SettingFromSpecAt converts a (value, interval) pair to a Setting. value is +// interpreted as a time relative to now. +func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { + if value < 0 { + return Setting{}, syserror.EINVAL + } + if value == 0 { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: now.Add(value), + Period: interval, + }, nil +} + +// SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is +// interpreted as an absolute time. +func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { + if value.Before(ZeroTime) { + return Setting{}, syserror.EINVAL + } + if value.IsZero() { + return Setting{Period: interval}, nil + } + return Setting{ + Enabled: true, + Next: value, + Period: interval, + }, nil +} + +// SettingFromItimerspec converts a linux.Itimerspec to a Setting. If abs is +// true, its.Value is interpreted as an absolute time. Otherwise, it is +// interpreted as a time relative to c.Now(). +func SettingFromItimerspec(its linux.Itimerspec, abs bool, c Clock) (Setting, error) { + if abs { + return SettingFromAbsSpec(FromTimespec(its.Value), its.Interval.ToDuration()) + } + return SettingFromSpec(its.Value.ToDuration(), its.Interval.ToDuration(), c) +} + +// SpecFromSetting converts a timestamp and a Setting to a (relative value, +// interval) pair, as used by most Linux syscalls that return a struct +// itimerval or struct itimerspec. +func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { + if !s.Enabled { + return 0, s.Period + } + return s.Next.Sub(now), s.Period +} + +// ItimerspecFromSetting converts a Setting to a linux.Itimerspec. +func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { + val, iv := SpecFromSetting(now, s) + return linux.Itimerspec{ + Interval: linux.DurationToTimespec(iv), + Value: linux.DurationToTimespec(val), + } +} + +// At returns an updated Setting and a number of expirations after the +// associated Clock indicates a time of now. +// +// Settings may be created by successive calls to At with decreasing +// values of now (i.e. time may appear to go backward). Supporting this is +// required to support non-monotonic clocks, as well as allowing +// Timer.clock.Now() to be called without holding Timer.mu. +func (s Setting) At(now Time) (Setting, uint64) { + if !s.Enabled { + return s, 0 + } + if s.Next.After(now) { + return s, 0 + } + if s.Period == 0 { + s.Enabled = false + return s, 1 + } + exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) + s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) + return s, exp +} + +// Timer is an optionally-periodic timer driven by sampling a user-specified +// Clock. Timer's semantics support the requirements of Linux's interval timers +// (setitimer(2), timer_create(2), timerfd_create(2)). +// +// Timers should be created using NewTimer and must be cleaned up by calling +// Timer.Destroy when no longer used. +// +// +stateify savable +type Timer struct { + // clock is the time source. clock is immutable. + clock Clock + + // listener is notified of expirations. listener is immutable. + listener TimerListener + + // mu protects the following mutable fields. + mu sync.Mutex `state:"nosave"` + + // setting is the timer setting. setting is protected by mu. + setting Setting + + // paused is true if the Timer is paused. paused is protected by mu. + paused bool + + // kicker is used to wake the Timer goroutine. The kicker pointer is + // immutable, but its state is protected by mu. + kicker *time.Timer `state:"nosave"` + + // entry is registered with clock.EventRegister. entry is immutable. + // + // Per comment in Clock, entry must be re-registered after restore; per + // comment in Timer.Load, this is done in Timer.Resume. + entry waiter.Entry `state:"nosave"` + + // events is the channel that will be notified whenever entry receives an + // event. It is also closed by Timer.Destroy to instruct the Timer + // goroutine to exit. + events chan struct{} `state:"nosave"` +} + +// timerTickEvents are Clock events that require the Timer goroutine to Tick +// prematurely. +const timerTickEvents = ClockEventSet | ClockEventRateIncrease + +// NewTimer returns a new Timer that will obtain time from clock and send +// expirations to listener. The Timer is initially stopped and has no first +// expiration or period configured. +func NewTimer(clock Clock, listener TimerListener) *Timer { + t := &Timer{ + clock: clock, + listener: listener, + } + t.init() + return t +} + +// After waits for the duration to elapse according to clock and then sends a +// notification on the returned channel. The timer is started immediately and +// will fire exactly once. The second return value is the start time used with +// the duration. +// +// Callers must call Timer.Destroy. +func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) { + notifier, tchan := NewChannelNotifier() + t := NewTimer(clock, notifier) + now := clock.Now() + + t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: now.Add(duration), + }) + return t, now, tchan +} + +// init initializes Timer state that is not preserved across save/restore. If +// init has already been called, calling it again is a no-op. +// +// Preconditions: t.mu must be locked, or the caller must have exclusive access +// to t. +func (t *Timer) init() { + if t.kicker != nil { + return + } + // If t.kicker is nil, the Timer goroutine can't be running, so we can't + // race with it. + t.kicker = time.NewTimer(0) + t.entry, t.events = waiter.NewChannelEntry(nil) + t.clock.EventRegister(&t.entry, timerTickEvents) + go t.runGoroutine() // S/R-SAFE: synchronized by t.mu +} + +// Destroy releases resources owned by the Timer. A Destroyed Timer must not be +// used again; in particular, a Destroyed Timer should not be Saved. +func (t *Timer) Destroy() { + // Stop the Timer, ensuring that the Timer goroutine will not call + // t.kicker.Reset, before calling t.kicker.Stop. + t.mu.Lock() + t.setting.Enabled = false + t.mu.Unlock() + t.kicker.Stop() + // Unregister t.entry, ensuring that the Clock will not send to t.events, + // before closing t.events to instruct the Timer goroutine to exit. + t.clock.EventUnregister(&t.entry) + close(t.events) + t.listener.Destroy() +} + +func (t *Timer) runGoroutine() { + for { + select { + case <-t.kicker.C: + case _, ok := <-t.events: + if !ok { + // Channel closed by Destroy. + return + } + } + t.Tick() + } +} + +// Tick requests that the Timer immediately check for expirations and +// re-evaluate when it should next check for expirations. +func (t *Timer) Tick() { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + return + } + s, exp := t.setting.At(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) +} + +// Pause pauses the Timer, ensuring that it does not generate any further +// expirations until Resume is called. If the Timer is already paused, Pause +// has no effect. +func (t *Timer) Pause() { + t.mu.Lock() + defer t.mu.Unlock() + t.paused = true + // t.kicker may be nil if we were restored but never resumed. + if t.kicker != nil { + t.kicker.Stop() + } +} + +// Resume ends the effect of Pause. If the Timer is not paused, Resume has no +// effect. +func (t *Timer) Resume() { + t.mu.Lock() + defer t.mu.Unlock() + if !t.paused { + return + } + t.paused = false + + // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume + // because save/restore will restore Timers before + // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed + // by a kernel.Timekeeper then the Timer goroutine will panic if it calls + // t.clock.Now(). + t.init() + + // Kick the Timer goroutine in case it was already initialized, but the + // Timer goroutine was sleeping. + t.kicker.Reset(0) +} + +// Get returns a snapshot of the Timer's current Setting and the time +// (according to the Timer's Clock) at which the snapshot was taken. +// +// Preconditions: The Timer must not be paused (since its Setting cannot +// be advanced to the current time while it is paused.) +func (t *Timer) Get() (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) + } + s, exp := t.setting.At(now) + t.setting = s + if exp > 0 { + t.listener.Notify(exp) + } + t.resetKickerLocked(now) + return now, s +} + +// Swap atomically changes the Timer's Setting and returns the Timer's previous +// Setting and the time (according to the Timer's Clock) at which the snapshot +// was taken. Setting s.Enabled to true starts the Timer, while setting +// s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. +func (t *Timer) Swap(s Setting) (Time, Setting) { + return t.SwapAnd(s, nil) +} + +// SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, +// and returns the Timer's previous Setting and the time (according to the +// Timer's Clock) at which the Setting was changed. Setting s.Enabled to true +// starts the timer, while setting s.Enabled to false stops it. +// +// Preconditions: The Timer must not be paused. f cannot call any Timer methods +// since it is called with the Timer mutex locked. +func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { + now := t.clock.Now() + t.mu.Lock() + defer t.mu.Unlock() + if t.paused { + panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) + } + oldS, oldExp := t.setting.At(now) + if oldExp > 0 { + t.listener.Notify(oldExp) + } + if f != nil { + f() + } + newS, newExp := s.At(now) + t.setting = newS + if newExp > 0 { + t.listener.Notify(newExp) + } + t.resetKickerLocked(now) + return now, oldS +} + +// Atomically invokes f atomically with respect to expirations of t; that is, t +// cannot generate expirations while f is being called. +// +// Preconditions: f cannot call any Timer methods since it is called with the +// Timer mutex locked. +func (t *Timer) Atomically(f func()) { + t.mu.Lock() + defer t.mu.Unlock() + f() +} + +// Preconditions: t.mu must be locked. +func (t *Timer) resetKickerLocked(now Time) { + if t.setting.Enabled { + // Clock.WallTimeUntil may return a negative value. This is fine; + // time.when treats negative Durations as 0. + t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) + } + // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases + // resetKickerLocked will be called from the Timer goroutine itself, in + // which case t.kicker has already fired and t.kicker.Stop will be an + // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer + // => runtime.deltimer). +} + +// Clock returns the Clock used by t. +func (t *Timer) Clock() Clock { + return t.clock +} + +// ChannelNotifier is a TimerListener that sends a message on an empty struct +// channel. +// +// ChannelNotifier cannot be saved or loaded. +type ChannelNotifier struct { + // tchan must be a buffered channel. + tchan chan struct{} +} + +// NewChannelNotifier creates a new channel notifier. +// +// If the notifier is used with a timer, Timer.Destroy will close the channel +// returned here. +func NewChannelNotifier() (TimerListener, <-chan struct{}) { + tchan := make(chan struct{}, 1) + return &ChannelNotifier{tchan}, tchan +} + +// Notify implements ktime.TimerListener.Notify. +func (c *ChannelNotifier) Notify(uint64) { + select { + case c.tchan <- struct{}{}: + default: + } +} + +// Destroy implements ktime.TimerListener.Destroy and will close the channel. +func (c *ChannelNotifier) Destroy() { + close(c.tchan) +} diff --git a/pkg/sentry/kernel/time/time_state_autogen.go b/pkg/sentry/kernel/time/time_state_autogen.go new file mode 100755 index 000000000..1750b55d6 --- /dev/null +++ b/pkg/sentry/kernel/time/time_state_autogen.go @@ -0,0 +1,56 @@ +// automatically generated by stateify. + +package time + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *Time) beforeSave() {} +func (x *Time) save(m state.Map) { + x.beforeSave() + m.Save("ns", &x.ns) +} + +func (x *Time) afterLoad() {} +func (x *Time) load(m state.Map) { + m.Load("ns", &x.ns) +} + +func (x *Setting) beforeSave() {} +func (x *Setting) save(m state.Map) { + x.beforeSave() + m.Save("Enabled", &x.Enabled) + m.Save("Next", &x.Next) + m.Save("Period", &x.Period) +} + +func (x *Setting) afterLoad() {} +func (x *Setting) load(m state.Map) { + m.Load("Enabled", &x.Enabled) + m.Load("Next", &x.Next) + m.Load("Period", &x.Period) +} + +func (x *Timer) beforeSave() {} +func (x *Timer) save(m state.Map) { + x.beforeSave() + m.Save("clock", &x.clock) + m.Save("listener", &x.listener) + m.Save("setting", &x.setting) + m.Save("paused", &x.paused) +} + +func (x *Timer) afterLoad() {} +func (x *Timer) load(m state.Map) { + m.Load("clock", &x.clock) + m.Load("listener", &x.listener) + m.Load("setting", &x.setting) + m.Load("paused", &x.paused) +} + +func init() { + state.Register("time.Time", (*Time)(nil), state.Fns{Save: (*Time).save, Load: (*Time).load}) + state.Register("time.Setting", (*Setting)(nil), state.Fns{Save: (*Setting).save, Load: (*Setting).load}) + state.Register("time.Timer", (*Timer)(nil), state.Fns{Save: (*Timer).save, Load: (*Timer).load}) +} diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go new file mode 100644 index 000000000..505a4fa4f --- /dev/null +++ b/pkg/sentry/kernel/timekeeper.go @@ -0,0 +1,306 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// Timekeeper manages all of the kernel clocks. +// +// +stateify savable +type Timekeeper struct { + // clocks are the clock sources. + // + // These are not saved directly, as the new machine's clock may behave + // differently. + // + // It is set only once, by SetClocks. + clocks sentrytime.Clocks `state:"nosave"` + + // bootTime is the realtime when the system "booted". i.e., when + // SetClocks was called in the initial (not restored) run. + bootTime ktime.Time + + // monotonicOffset is the offset to apply to the monotonic clock output + // from clocks. + // + // It is set only once, by SetClocks. + monotonicOffset int64 `state:"nosave"` + + // restored, if non-nil, indicates that this Timekeeper was restored + // from a state file. The clocks are not set until restored is closed. + restored chan struct{} `state:"nosave"` + + // saveMonotonic is the (offset) value of the monotonic clock at the + // time of save. + // + // It is only valid if restored is non-nil. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveMonotonic int64 + + // saveRealtime is the value of the realtime clock at the time of save. + // + // It is only valid if restored is non-nil. + // + // It is only used in SetClocks after restore to compute the new + // monotonicOffset. + saveRealtime int64 + + // params manages the parameter page. + params *VDSOParamPage + + // mu protects destruction with stop and wg. + mu sync.Mutex `state:"nosave"` + + // stop is used to tell the update goroutine to exit. + stop chan struct{} `state:"nosave"` + + // wg is used to indicate that the update goroutine has exited. + wg sync.WaitGroup `state:"nosave"` +} + +// NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. +// NewTimekeeper does not take ownership of paramPage. +// +// SetClocks must be called on the returned Timekeeper before it is usable. +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { + return &Timekeeper{ + params: NewVDSOParamPage(mfp, paramPage), + }, nil +} + +// SetClocks the backing clock source. +// +// SetClocks must be called before the Timekeeper is used, and it may not be +// called more than once, as changing the clock source without extra correction +// could cause time discontinuities. +// +// It must also be called after Load. +func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { + // Update the params, marking them "not ready", as we may need to + // restart calibration on this new machine. + if t.restored != nil { + if err := t.params.Write(func() vdsoParams { + return vdsoParams{} + }); err != nil { + panic("unable to reset VDSO params: " + err.Error()) + } + } + + if t.clocks != nil { + panic("SetClocks called on previously-initialized Timekeeper") + } + + t.clocks = c + + // Compute the offset of the monotonic clock from the base Clocks. + // + // In a fresh (not restored) sentry, monotonic time starts at zero. + // + // In a restored sentry, monotonic time jumps forward by approximately + // the same amount as real time. There are no guarantees here, we are + // just making a best-effort attempt to to make it appear that the app + // was simply not scheduled for a long period, rather than that the + // real time clock was changed. + // + // If real time went backwards, it remains the same. + wantMonotonic := int64(0) + + nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic) + if err != nil { + panic("Unable to get current monotonic time: " + err.Error()) + } + + nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime) + if err != nil { + panic("Unable to get current realtime: " + err.Error()) + } + + if t.restored != nil { + wantMonotonic = t.saveMonotonic + elapsed := nowRealtime - t.saveRealtime + if elapsed > 0 { + wantMonotonic += elapsed + } + } + + t.monotonicOffset = wantMonotonic - nowMonotonic + + if t.restored == nil { + // Hold on to the initial "boot" time. + t.bootTime = ktime.FromNanoseconds(nowRealtime) + } + + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() + + if t.restored != nil { + close(t.restored) + } +} + +// startUpdater starts an update goroutine that keeps the clocks updated. +// +// mu must be held. +func (t *Timekeeper) startUpdater() { + if t.stop != nil { + // Timekeeper already started + return + } + t.stop = make(chan struct{}) + + // Keep the clocks up to date. + // + // Note that the Go runtime uses host CLOCK_MONOTONIC to service the + // timer, so it may run at a *slightly* different rate from the + // application CLOCK_MONOTONIC. That is fine, as we only need to update + // at approximately this rate. + timer := time.NewTicker(sentrytime.ApproxUpdateInterval) + t.wg.Add(1) + go func() { // S/R-SAFE: stopped during save. + for { + // Start with an update immediately, so the clocks are + // ready ASAP. + + // Call Update within a Write block to prevent the VDSO + // from using the old params between Update and + // Write. + if err := t.params.Write(func() vdsoParams { + monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() + + var p vdsoParams + if monotonicOk { + p.monotonicReady = 1 + p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) + p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset + p.monotonicFrequency = monotonicParams.Frequency + } + if realtimeOk { + p.realtimeReady = 1 + p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) + p.realtimeBaseRef = int64(realtimeParams.BaseRef) + p.realtimeFrequency = realtimeParams.Frequency + } + + log.Debugf("Updating VDSO parameters: %+v", p) + + return p + }); err != nil { + log.Warningf("Unable to update VDSO parameter page: %v", err) + } + + select { + case <-timer.C: + case <-t.stop: + t.wg.Done() + return + } + } + }() +} + +// stopUpdater stops the update goroutine, blocking until it exits. +// +// mu must be held. +func (t *Timekeeper) stopUpdater() { + if t.stop == nil { + // Updater not running. + return + } + + close(t.stop) + t.wg.Wait() + t.stop = nil +} + +// Destroy destroys the Timekeeper, freeing all associated resources. +func (t *Timekeeper) Destroy() { + t.mu.Lock() + defer t.mu.Unlock() + + t.stopUpdater() +} + +// PauseUpdates stops clock parameter updates. This should only be used when +// Tasks are not running and thus cannot access the clock. +func (t *Timekeeper) PauseUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.stopUpdater() +} + +// ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. +func (t *Timekeeper) ResumeUpdates() { + t.mu.Lock() + defer t.mu.Unlock() + t.startUpdater() +} + +// GetTime returns the current time in nanoseconds. +func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { + if t.clocks == nil { + if t.restored == nil { + panic("Timekeeper used before initialized with SetClocks") + } + <-t.restored + } + now, err := t.clocks.GetTime(c) + if err == nil && c == sentrytime.Monotonic { + now += t.monotonicOffset + } + return now, err +} + +// BootTime returns the system boot real time. +func (t *Timekeeper) BootTime() ktime.Time { + return t.bootTime +} + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +// +// +stateify savable +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. + ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} diff --git a/pkg/sentry/kernel/timekeeper_state.go b/pkg/sentry/kernel/timekeeper_state.go new file mode 100644 index 000000000..6ce358a05 --- /dev/null +++ b/pkg/sentry/kernel/timekeeper_state.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// beforeSave is invoked by stateify. +func (t *Timekeeper) beforeSave() { + if t.stop != nil { + panic("pauseUpdates must be called before Save") + } + + // N.B. we want the *offset* monotonic time. + var err error + if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil { + panic("unable to get current monotonic time: " + err.Error()) + } + + if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil { + panic("unable to get current realtime: " + err.Error()) + } +} + +// afterLoad is invoked by stateify. +func (t *Timekeeper) afterLoad() { + t.restored = make(chan struct{}) +} diff --git a/pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go b/pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go new file mode 100755 index 000000000..6f5580ebe --- /dev/null +++ b/pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go @@ -0,0 +1,119 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// source: pkg/sentry/kernel/uncaught_signal.proto + +package gvisor + +import ( + fmt "fmt" + proto "github.com/golang/protobuf/proto" + registers_go_proto "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" + math "math" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +type UncaughtSignal struct { + Tid int32 `protobuf:"varint,1,opt,name=tid,proto3" json:"tid,omitempty"` + Pid int32 `protobuf:"varint,2,opt,name=pid,proto3" json:"pid,omitempty"` + Registers *registers_go_proto.Registers `protobuf:"bytes,3,opt,name=registers,proto3" json:"registers,omitempty"` + SignalNumber int32 `protobuf:"varint,4,opt,name=signal_number,json=signalNumber,proto3" json:"signal_number,omitempty"` + FaultAddr uint64 `protobuf:"varint,5,opt,name=fault_addr,json=faultAddr,proto3" json:"fault_addr,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *UncaughtSignal) Reset() { *m = UncaughtSignal{} } +func (m *UncaughtSignal) String() string { return proto.CompactTextString(m) } +func (*UncaughtSignal) ProtoMessage() {} +func (*UncaughtSignal) Descriptor() ([]byte, []int) { + return fileDescriptor_5ca9e03e13704688, []int{0} +} + +func (m *UncaughtSignal) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_UncaughtSignal.Unmarshal(m, b) +} +func (m *UncaughtSignal) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_UncaughtSignal.Marshal(b, m, deterministic) +} +func (m *UncaughtSignal) XXX_Merge(src proto.Message) { + xxx_messageInfo_UncaughtSignal.Merge(m, src) +} +func (m *UncaughtSignal) XXX_Size() int { + return xxx_messageInfo_UncaughtSignal.Size(m) +} +func (m *UncaughtSignal) XXX_DiscardUnknown() { + xxx_messageInfo_UncaughtSignal.DiscardUnknown(m) +} + +var xxx_messageInfo_UncaughtSignal proto.InternalMessageInfo + +func (m *UncaughtSignal) GetTid() int32 { + if m != nil { + return m.Tid + } + return 0 +} + +func (m *UncaughtSignal) GetPid() int32 { + if m != nil { + return m.Pid + } + return 0 +} + +func (m *UncaughtSignal) GetRegisters() *registers_go_proto.Registers { + if m != nil { + return m.Registers + } + return nil +} + +func (m *UncaughtSignal) GetSignalNumber() int32 { + if m != nil { + return m.SignalNumber + } + return 0 +} + +func (m *UncaughtSignal) GetFaultAddr() uint64 { + if m != nil { + return m.FaultAddr + } + return 0 +} + +func init() { + proto.RegisterType((*UncaughtSignal)(nil), "gvisor.UncaughtSignal") +} + +func init() { + proto.RegisterFile("pkg/sentry/kernel/uncaught_signal.proto", fileDescriptor_5ca9e03e13704688) +} + +var fileDescriptor_5ca9e03e13704688 = []byte{ + // 210 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x4c, 0x8e, 0x4d, 0x4a, 0xc6, 0x30, + 0x10, 0x86, 0x89, 0xfd, 0x81, 0xc6, 0x1f, 0x34, 0xab, 0x20, 0x88, 0x45, 0x17, 0x76, 0xd5, 0x80, + 0x9e, 0xc0, 0x0b, 0xb8, 0x88, 0xb8, 0x2e, 0x69, 0x13, 0xd3, 0xd0, 0x9a, 0x86, 0x49, 0x22, 0x78, + 0x24, 0x6f, 0x29, 0x4d, 0xd4, 0xef, 0xdb, 0x0d, 0xcf, 0xbc, 0xf3, 0xcc, 0x8b, 0x1f, 0xdc, 0xa2, + 0x99, 0x57, 0x36, 0xc0, 0x17, 0x5b, 0x14, 0x58, 0xb5, 0xb2, 0x68, 0x27, 0x11, 0xf5, 0x1c, 0x06, + 0x6f, 0xb4, 0x15, 0x6b, 0xef, 0x60, 0x0b, 0x1b, 0xa9, 0xf5, 0xa7, 0xf1, 0x1b, 0x5c, 0xdf, 0x1e, + 0x1d, 0x08, 0x98, 0x66, 0x06, 0x4a, 0x1b, 0x1f, 0x14, 0xf8, 0x1c, 0xbc, 0xfb, 0x46, 0xf8, 0xe2, + 0xed, 0x57, 0xf1, 0x9a, 0x0c, 0xe4, 0x12, 0x17, 0xc1, 0x48, 0x8a, 0x5a, 0xd4, 0x55, 0x7c, 0x1f, + 0x77, 0xe2, 0x8c, 0xa4, 0x27, 0x99, 0x38, 0x23, 0x09, 0xc3, 0xcd, 0xbf, 0x89, 0x16, 0x2d, 0xea, + 0x4e, 0x1f, 0xaf, 0xfa, 0xfc, 0xb3, 0xe7, 0x7f, 0x0b, 0x7e, 0xc8, 0x90, 0x7b, 0x7c, 0x9e, 0x0b, + 0x0e, 0x36, 0x7e, 0x8c, 0x0a, 0x68, 0x99, 0x64, 0x67, 0x19, 0xbe, 0x24, 0x46, 0x6e, 0x30, 0x7e, + 0x17, 0x71, 0x0d, 0x83, 0x90, 0x12, 0x68, 0xd5, 0xa2, 0xae, 0xe4, 0x4d, 0x22, 0xcf, 0x52, 0xc2, + 0x58, 0xa7, 0xca, 0x4f, 0x3f, 0x01, 0x00, 0x00, 0xff, 0xff, 0xfd, 0x62, 0x54, 0xdf, 0x06, 0x01, + 0x00, 0x00, +} diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go new file mode 100644 index 000000000..96fe3cbb9 --- /dev/null +++ b/pkg/sentry/kernel/uts_namespace.go @@ -0,0 +1,102 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// UTSNamespace represents a UTS namespace, a holder of two system identifiers: +// the hostname and domain name. +// +// +stateify savable +type UTSNamespace struct { + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + hostName string + domainName string + + // userns is the user namespace associated with the UTSNamespace. + // Privileged operations on this UTSNamespace must have appropriate + // capabilities in userns. + // + // userns is immutable. + userns *auth.UserNamespace +} + +// NewUTSNamespace creates a new UTS namespace. +func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { + return &UTSNamespace{ + hostName: hostName, + domainName: domainName, + userns: userns, + } +} + +// UTSNamespace returns the task's UTS namespace. +func (t *Task) UTSNamespace() *UTSNamespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.utsns +} + +// HostName returns the host name of this UTS namespace. +func (u *UTSNamespace) HostName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.hostName +} + +// SetHostName sets the host name of this UTS namespace. +func (u *UTSNamespace) SetHostName(host string) { + u.mu.Lock() + defer u.mu.Unlock() + u.hostName = host +} + +// DomainName returns the domain name of this UTS namespace. +func (u *UTSNamespace) DomainName() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.domainName +} + +// SetDomainName sets the domain name of this UTS namespace. +func (u *UTSNamespace) SetDomainName(domain string) { + u.mu.Lock() + defer u.mu.Unlock() + u.domainName = domain +} + +// UserNamespace returns the user namespace associated with this UTS namespace. +func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return u.userns +} + +// Clone makes a copy of this UTS namespace, associating the given user +// namespace. +func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { + u.mu.Lock() + defer u.mu.Unlock() + return &UTSNamespace{ + hostName: u.hostName, + domainName: u.domainName, + userns: userns, + } +} diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go new file mode 100644 index 000000000..d40ad74f4 --- /dev/null +++ b/pkg/sentry/kernel/vdso.go @@ -0,0 +1,148 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// vdsoParams are the parameters exposed to the VDSO. +// +// They are exposed to the VDSO via a parameter page managed by VDSOParamPage, +// which also includes a sequence counter. +type vdsoParams struct { + monotonicReady uint64 + monotonicBaseCycles int64 + monotonicBaseRef int64 + monotonicFrequency uint64 + + realtimeReady uint64 + realtimeBaseCycles int64 + realtimeBaseRef int64 + realtimeFrequency uint64 +} + +// VDSOParamPage manages a VDSO parameter page. +// +// Its memory layout looks like: +// +// type page struct { +// // seq is a sequence counter that protects the fields below. +// seq uint64 +// vdsoParams +// } +// +// Everything in the struct is 8 bytes for easy alignment. +// +// It must be kept in sync with params in vdso/vdso_time.cc. +// +// +stateify savable +type VDSOParamPage struct { + // The parameter page is fr, allocated from mfp.MemoryFile(). + mfp pgalloc.MemoryFileProvider + fr platform.FileRange + + // seq is the current sequence count written to the page. + // + // A write is in progress if bit 1 of the counter is set. + // + // Timekeeper's updater goroutine may call Write before equality is + // checked in state_test_util tests, causing this field to change across + // save / restore. + seq uint64 +} + +// NewVDSOParamPage returns a VDSOParamPage. +// +// Preconditions: +// +// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does +// not take ownership of fr; it must remain allocated for the lifetime of the +// VDSOParamPage. +// +// * VDSOParamPage must be the only writer to fr. +// +// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { + return &VDSOParamPage{mfp: mfp, fr: fr} +} + +// access returns a mapping of the param page. +func (v *VDSOParamPage) access() (safemem.Block, error) { + bs, err := v.mfp.MemoryFile().MapInternal(v.fr, usermem.ReadWrite) + if err != nil { + return safemem.Block{}, err + } + if bs.NumBlocks() != 1 { + panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks())) + } + return bs.Head(), nil +} + +// incrementSeq increments the sequence counter in the param page. +func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { + next := v.seq + 1 + old, err := safemem.SwapUint64(paramPage, next) + if err != nil { + return err + } + + if old != v.seq { + return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq) + } + + v.seq = next + return nil +} + +// Write updates the VDSO parameters. +// +// Write starts a write block, calls f to get the new parameters, writes +// out the new parameters, then ends the write block. +func (v *VDSOParamPage) Write(f func() vdsoParams) error { + paramPage, err := v.access() + if err != nil { + return err + } + + // Write begin. + next := v.seq + 1 + if next%2 != 1 { + panic("Out-of-order sequence count") + } + + err = v.incrementSeq(paramPage) + if err != nil { + return err + } + + // Get the new params. + p := f() + buf := binary.Marshal(nil, usermem.ByteOrder, p) + + // Skip the sequence counter. + if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { + panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err)) + } + + // Write end. + return v.incrementSeq(paramPage) +} diff --git a/pkg/sentry/kernel/version.go b/pkg/sentry/kernel/version.go new file mode 100644 index 000000000..5640dd71d --- /dev/null +++ b/pkg/sentry/kernel/version.go @@ -0,0 +1,33 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// Version defines the application-visible system version. +type Version struct { + // Operating system name (e.g. "Linux"). + Sysname string + + // Operating system release (e.g. "4.4-amd64"). + Release string + + // Operating system version. On Linux this takes the shape + // "#VERSION CONFIG_FLAGS TIMESTAMP" + // where: + // - VERSION is a sequence counter incremented on every successful build + // - CONFIG_FLAGS is a space-separated list of major enabled kernel features + // (e.g. "SMP" and "PREEMPT") + // - TIMESTAMP is the build timestamp as returned by `date` + Version string +} |