// Copyright 2018 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package shm implements sysv shared memory segments. // // Known missing features: // // - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement // memory locking in general. // // - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy // way to implement hugetlb support on a per-map basis, and it has no impact // on correctness. // // - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap // so it's meaningless to reserve space for swap. // // - No per-process segment size enforcement. This feature probably isn't used // much anyways, since Linux sets the per-process limits to the system-wide // limits by default. // // Lock ordering: mm.mappingMu -> shm registry lock -> shm lock package shm import ( "fmt" "sync" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/refs" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. // // +stateify savable type Registry struct { // userNS owns the IPC namespace this registry belong to. Immutable. userNS *auth.UserNamespace mu sync.Mutex `state:"nosave"` // shms maps segment ids to segments. Protected by mu. shms map[int32]*Shm // Sum of the sizes of all existing segments rounded up to page size, in // units of page size. Protected by mu. totalPages uint64 // lastIDUsed is protected by mu. lastIDUsed int32 } // NewRegistry creates a new shm registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ userNS: userNS, shms: make(map[int32]*Shm), } } // FindByID looks up a segment given an ID. func (r *Registry) FindByID(id int32) *Shm { r.mu.Lock() defer r.mu.Unlock() return r.shms[id] } // Precondition: Caller must hold r.mu. func (r *Registry) findByKey(key int32) *Shm { for _, v := range r.shms { if v.key == key { return v } } return nil } // FindOrCreate looks up or creates a segment in the registry. It's functionally // analogous to open(2). func (r *Registry) FindOrCreate(ctx context.Context, pid, key int32, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." - man shmget(2) // // Note that 'private' always implies the creation of a new segment // whether IPC_CREAT is specified or not. return nil, syserror.EINVAL } r.mu.Lock() defer r.mu.Unlock() if len(r.shms) >= linux.SHMMNI { // "All possible shared memory IDs have been taken (SHMMNI) ..." // - man shmget(2) return nil, syserror.ENOSPC } if !private { // Look up an existing segment. if shm := r.findByKey(key); shm != nil { shm.mu.Lock() defer shm.mu.Unlock() // Check that caller can access the segment. if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { // "The user does not have permission to access the shared // memory segment, and does not have the CAP_IPC_OWNER // capability in the user namespace that governs its IPC // namespace." - man shmget(2) return nil, syserror.EACCES } if size > shm.size { // "A segment for the given key exists, but size is greater than // the size of that segment." - man shmget(2) return nil, syserror.EINVAL } if create && exclusive { // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a // shared memory segment already exists for key." // - man shmget(2) return nil, syserror.EEXIST } return shm, nil } if !create { // "No segment exists for the given key, and IPC_CREAT was not // specified." - man shmget(2) return nil, syserror.ENOENT } } var sizeAligned uint64 if val, ok := usermem.Addr(size).RoundUp(); ok { sizeAligned = uint64(val) } else { return nil, syserror.EINVAL } if numPages := sizeAligned / usermem.PageSize; r.totalPages+numPages > linux.SHMALL { // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) return nil, syserror.ENOSPC } // Need to create a new segment. creator := fs.FileOwnerFromContext(ctx) perms := fs.FilePermsFromMode(mode) return r.newShm(ctx, pid, key, creator, perms, size) } // newShm creates a new segment in the registry. func (r *Registry) newShm(ctx context.Context, pid, key int32, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { p := platform.FromContext(ctx) if p == nil { panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) } effectiveSize := uint64(usermem.Addr(size).MustRoundUp()) fr, err := p.Memory().Allocate(effectiveSize, usage.Anonymous) if err != nil { return nil, err } shm := &Shm{ p: p, registry: r, creator: creator, size: size, effectiveSize: effectiveSize, fr: fr, key: key, perms: perms, owner: creator, creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { // Handle wrap around. if id < 0 { id = 0 continue } if r.shms[id] == nil { r.lastIDUsed = id r.shms[id] = shm shm.ID = id r.totalPages += effectiveSize / usermem.PageSize return shm, nil } } log.Warningf("Shm ids exhuasted, they may be leaking") return nil, syserror.ENOSPC } // IPCInfo reports global parameters for sysv shared memory segments on this // system. See shmctl(IPC_INFO). func (r *Registry) IPCInfo() *linux.ShmParams { return &linux.ShmParams{ ShmMax: linux.SHMMAX, ShmMin: linux.SHMMIN, ShmMni: linux.SHMMNI, ShmSeg: linux.SHMSEG, ShmAll: linux.SHMALL, } } // ShmInfo reports linux-specific global parameters for sysv shared memory // segments on this system. See shmctl(SHM_INFO). func (r *Registry) ShmInfo() *linux.ShmInfo { r.mu.Lock() defer r.mu.Unlock() return &linux.ShmInfo{ UsedIDs: int32(r.lastIDUsed), ShmTot: r.totalPages, ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. ShmSwp: 0, // No reclaim at the moment. } } // remove unregisters a segment from this registry, preventing it from being // discovered in the future. Caller is responsible for ensuring s is destroyed. // // Precondition: To preserve lock ordering, caller must not hold s.mu. func (r *Registry) remove(s *Shm) { r.mu.Lock() defer r.mu.Unlock() delete(r.shms, s.ID) r.totalPages -= s.effectiveSize / usermem.PageSize } // Shm represents a single shared memory segment. // // Shm segment are backed directly by an allocation from platform // memory. Segments are always mapped as a whole, greatly simplifying how // mappings are tracked. However note that mremap and munmap calls may cause the // vma for a segment to become fragmented; which requires special care when // unmapping a segment. See mm/shm.go. // // Segments persist until they are explicitly marked for destruction via // shmctl(SHM_RMID). // // Shm implements memmap.Mappable and memmap.MappingIdentity. // // +stateify savable type Shm struct { // AtomicRefCount tracks the number of references to this segment from // maps. A segment always holds a reference to itself, until it's marked for // destruction. refs.AtomicRefCount p platform.Platform // registry points to the shm registry containing this segment. Immutable. registry *Registry // ID is the kernel identifier for this segment. Immutable. ID int32 // creator is the user that created the segment. Immutable. creator fs.FileOwner // size is the requested size of the segment at creation, in // bytes. Immutable. size uint64 // effectiveSize of the segment, rounding up to the next page // boundary. Immutable. // // Invariant: effectiveSize must be a multiple of usermem.PageSize. effectiveSize uint64 // fr is the offset into platform.Memory() that backs this contents of this // segment. Immutable. fr platform.FileRange // key is the public identifier for this segment. key int32 // mu protects all fields below. mu sync.Mutex `state:"nosave"` // perms is the access permissions for the segment. perms fs.FilePermissions // owner of this segment. owner fs.FileOwner // attachTime is updated on every successful shmat. attachTime ktime.Time // detachTime is updated on every successful shmdt. detachTime ktime.Time // changeTime is updated on every successful changes to the segment via // shmctl(IPC_SET). changeTime ktime.Time // creatorPID is the PID of the process that created the segment. creatorPID int32 // lastAttachDetachPID is the pid of the process that issued the last shmat // or shmdt syscall. lastAttachDetachPID int32 // pendingDestruction indicates the segment was marked as destroyed through // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found // in the registry and can no longer be attached. When the last user // detaches from the segment, it is destroyed. Protected by mu. pendingDestruction bool } // MappedName implements memmap.MappingIdentity.MappedName. func (s *Shm) MappedName(ctx context.Context) string { return fmt.Sprintf("SYSV%08d", s.key) } // DeviceID implements memmap.MappingIdentity.DeviceID. func (s *Shm) DeviceID() uint64 { return shmDevice.DeviceID() } // InodeID implements memmap.MappingIdentity.InodeID. func (s *Shm) InodeID() uint64 { // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() return uint64(s.ID) } // DecRef overrides refs.RefCount.DecRef with a destructor. func (s *Shm) DecRef() { s.DecRefWithDestructor(s.destroy) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm // segments. func (s *Shm) Msync(context.Context, memmap.MappableRange) error { return nil } // AddMapping implements memmap.Mappable.AddMapping. func (s *Shm) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { s.mu.Lock() defer s.mu.Unlock() s.attachTime = ktime.NowFromContext(ctx) if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { s.lastAttachDetachPID = pid } else { // AddMapping is called during a syscall, so ctx should always be a task // context. log.Warningf("Adding mapping to shm %+v but couldn't get the current pid; not updating the last attach pid", s) } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (s *Shm) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { s.mu.Lock() defer s.mu.Unlock() // TODO: RemoveMapping may be called during task exit, when ctx // is context.Background. Gracefully handle missing clocks. Failing to // update the detach time in these cases is ok, since no one can observe the // omission. if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { s.detachTime = clock.Now() } // If called from a non-task context we also won't have a threadgroup // id. Silently skip updating the lastAttachDetachPid in that case. if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { s.lastAttachDetachPID = pid } else { log.Debugf("Couldn't obtain pid when removing mapping to shm %+v, not updating the last detach pid.", s) } } // CopyMapping implements memmap.Mappable.CopyMapping. func (s *Shm) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { return nil } // Translate implements memmap.Mappable.Translate. func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { var err error if required.End > s.fr.Length() { err = &memmap.BusError{syserror.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: s.p.Memory(), Offset: s.fr.Start + source.Start, }, }, err } return nil, err } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (s *Shm) InvalidateUnsavable(ctx context.Context) error { return nil } // AttachOpts describes various flags passed to shmat(2). type AttachOpts struct { Execute bool Readonly bool Remap bool } // ConfigureAttach creates an mmap configuration for the segment with the // requested attach options. // // ConfigureAttach returns with a ref on s on success. The caller should drop // this once the map is installed. This reference prevents s from being // destroyed before the returned configuration is used. func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) { s.mu.Lock() defer s.mu.Unlock() if s.pendingDestruction && s.ReadRefs() == 0 { return memmap.MMapOpts{}, syserror.EIDRM } if !s.checkPermissions(ctx, fs.PermMask{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, }) { // "The calling process does not have the required permissions for the // requested attach type, and does not have the CAP_IPC_OWNER capability // in the user namespace that governs its IPC namespace." - man shmat(2) return memmap.MMapOpts{}, syserror.EACCES } s.IncRef() return memmap.MMapOpts{ Length: s.size, Offset: 0, Addr: addr, Fixed: opts.Remap, Perms: usermem.AccessType{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, }, MaxPerms: usermem.AnyAccess, Mappable: s, MappingIdentity: s, }, nil } // EffectiveSize returns the size of the underlying shared memory segment. This // may be larger than the requested size at creation, due to rounding to page // boundaries. func (s *Shm) EffectiveSize() uint64 { return s.effectiveSize } // IPCStat returns information about a shm. See shmctl(IPC_STAT). func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { s.mu.Lock() defer s.mu.Unlock() // "The caller must have read permission on the shared memory segment." // - man shmctl(2) if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow // read access for shmid, and the calling process does not have the // CAP_IPC_OWNER capability in the user namespace that governs its IPC // namespace." - man shmctl(2) return nil, syserror.EACCES } var mode uint16 if s.pendingDestruction { mode |= linux.SHM_DEST } creds := auth.CredentialsFromContext(ctx) nattach := uint64(s.ReadRefs()) // Don't report the self-reference we keep prior to being marked for // destruction. However, also don't report a count of -1 for segments marked // as destroyed, with no mappings. if !s.pendingDestruction { nattach-- } ds := &linux.ShmidDS{ ShmPerm: linux.IPCPerm{ Key: uint32(s.key), UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), Mode: mode | uint16(s.perms.LinuxMode()), Seq: 0, // IPC sequences not supported. }, ShmSegsz: s.size, ShmAtime: s.attachTime.TimeT(), ShmDtime: s.detachTime.TimeT(), ShmCtime: s.changeTime.TimeT(), ShmCpid: s.creatorPID, ShmLpid: s.lastAttachDetachPID, ShmNattach: nattach, } return ds, nil } // Set modifies attributes for a segment. See shmctl(IPC_SET). func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { s.mu.Lock() defer s.mu.Unlock() if !s.checkOwnership(ctx) { return syserror.EPERM } creds := auth.CredentialsFromContext(ctx) uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) if !uid.Ok() || !gid.Ok() { return syserror.EINVAL } // User may only modify the lower 9 bits of the mode. All the other bits are // always 0 for the underlying inode. mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) s.perms = fs.FilePermsFromMode(mode) s.owner.UID = uid s.owner.GID = gid s.changeTime = ktime.NowFromContext(ctx) return nil } func (s *Shm) destroy() { s.registry.remove(s) s.p.Memory().DecRef(s.fr) } // MarkDestroyed marks a shm for destruction. The shm is actually destroyed once // it has no references. See shmctl(IPC_RMID). func (s *Shm) MarkDestroyed() { s.mu.Lock() defer s.mu.Unlock() // Prevent the segment from being found in the registry. s.key = linux.IPC_PRIVATE s.pendingDestruction = true s.DecRef() } // checkOwnership verifies whether a segment may be accessed by ctx as an // owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. // // Precondition: Caller must hold s.mu. func (s *Shm) checkOwnership(ctx context.Context) bool { creds := auth.CredentialsFromContext(ctx) if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { return true } // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented // for use to "override IPC ownership checks". return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) } // checkPermissions verifies whether a segment is accessible by ctx for access // described by req. See ipc/util.c:ipcperms() in Linux. // // Precondition: Caller must hold s.mu. func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { creds := auth.CredentialsFromContext(ctx) p := s.perms.Other if s.owner.UID == creds.EffectiveKUID { p = s.perms.User } else if creds.InGroup(s.owner.GID) { p = s.perms.Group } if p.SupersetOf(req) { return true } // Tasks with CAP_IPC_OWNER may bypass permission checks. return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) }