summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/control/BUILD8
-rw-r--r--pkg/sentry/control/control.proto40
-rw-r--r--pkg/sentry/devices/quotedev/BUILD16
-rw-r--r--pkg/sentry/devices/quotedev/quotedev.go52
-rw-r--r--pkg/sentry/fsimpl/gofer/handle.go41
-rw-r--r--pkg/sentry/fsimpl/gofer/save_restore.go4
-rw-r--r--pkg/sentry/fsimpl/gofer/special_file.go131
-rw-r--r--pkg/sentry/fsimpl/verity/BUILD14
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go47
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go261
-rw-r--r--pkg/sentry/kernel/ipc/object.go35
-rw-r--r--pkg/sentry/kernel/msgqueue/msgqueue.go109
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore.go12
-rw-r--r--pkg/sentry/kernel/shm/shm.go19
-rw-r--r--pkg/sentry/kernel/thread_group.go15
-rw-r--r--pkg/sentry/mm/aio_context.go18
-rw-r--r--pkg/sentry/platform/kvm/bluepill_fault.go6
-rw-r--r--pkg/sentry/syscalls/linux/linux64.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_msgqueue.go53
-rw-r--r--pkg/sentry/syscalls/linux/sys_sem.go19
-rw-r--r--pkg/sentry/vfs/README.md4
21 files changed, 672 insertions, 236 deletions
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index a4934a565..cfb33a398 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -1,7 +1,13 @@
-load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "proto_library")
package(licenses = ["notice"])
+proto_library(
+ name = "control",
+ srcs = ["control.proto"],
+ visibility = ["//visibility:public"],
+)
+
go_library(
name = "control",
srcs = [
diff --git a/pkg/sentry/control/control.proto b/pkg/sentry/control/control.proto
new file mode 100644
index 000000000..72dda3fbc
--- /dev/null
+++ b/pkg/sentry/control/control.proto
@@ -0,0 +1,40 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package gvisor;
+
+// ControlConfig configures the permission of controls.
+message ControlConfig {
+ // Names for individual control URPC service objects.
+ // Any new service object that should be given conditional access should be
+ // named here and conditionally added based on presence in allowed_controls.
+ enum Endpoint {
+ UNKNOWN = 0;
+ EVENTS = 1;
+ FS = 2;
+ LIFECYCLE = 3;
+ LOGGING = 4;
+ PROFILE = 5;
+ USAGE = 6;
+ PROC = 7;
+ STATE = 8;
+ DEBUG = 9;
+ }
+
+ // allowed_controls represents which endpoints may be registered to the
+ // server.
+ repeated Endpoint allowed_controls = 1;
+}
diff --git a/pkg/sentry/devices/quotedev/BUILD b/pkg/sentry/devices/quotedev/BUILD
deleted file mode 100644
index ee946610a..000000000
--- a/pkg/sentry/devices/quotedev/BUILD
+++ /dev/null
@@ -1,16 +0,0 @@
-load("//tools:defs.bzl", "go_library")
-
-licenses(["notice"])
-
-go_library(
- name = "quotedev",
- srcs = ["quotedev.go"],
- visibility = ["//pkg/sentry:internal"],
- deps = [
- "//pkg/abi/linux",
- "//pkg/context",
- "//pkg/errors/linuxerr",
- "//pkg/sentry/fsimpl/devtmpfs",
- "//pkg/sentry/vfs",
- ],
-)
diff --git a/pkg/sentry/devices/quotedev/quotedev.go b/pkg/sentry/devices/quotedev/quotedev.go
deleted file mode 100644
index 140856a4a..000000000
--- a/pkg/sentry/devices/quotedev/quotedev.go
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright 2021 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package quotedev implements a vfs.Device for /dev/gvisor_quote.
-package quotedev
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/errors/linuxerr"
- "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-const (
- quoteDevMinor = 0
-)
-
-// quoteDevice implements vfs.Device for /dev/gvisor_quote
-//
-// +stateify savable
-type quoteDevice struct{}
-
-// Open implements vfs.Device.Open.
-// TODO(b/157161182): Add support for attestation ioctls.
-func (quoteDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- return nil, linuxerr.EIO
-}
-
-// Register registers all devices implemented by this package in vfsObj.
-func Register(vfsObj *vfs.VirtualFilesystem) error {
- return vfsObj.RegisterDevice(vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, quoteDevice{}, &vfs.RegisterDeviceOptions{
- GroupName: "gvisor_quote",
- })
-}
-
-// CreateDevtmpfsFiles creates device special files in dev representing all
-// devices implemented by this package.
-func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
- return dev.CreateDeviceFile(ctx, "gvisor_quote", vfs.CharDevice, linux.UNNAMED_MAJOR, quoteDevMinor, 0666 /* mode */)
-}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 5c57f6fea..02540a754 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
+ "gvisor.dev/gvisor/pkg/sync"
)
// handle represents a remote "open file descriptor", consisting of an opened
@@ -130,3 +131,43 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
}
return uint64(n), cperr
}
+
+type handleReadWriter struct {
+ ctx context.Context
+ h *handle
+ off uint64
+}
+
+var handleReadWriterPool = sync.Pool{
+ New: func() interface{} {
+ return &handleReadWriter{}
+ },
+}
+
+func getHandleReadWriter(ctx context.Context, h *handle, offset int64) *handleReadWriter {
+ rw := handleReadWriterPool.Get().(*handleReadWriter)
+ rw.ctx = ctx
+ rw.h = h
+ rw.off = uint64(offset)
+ return rw
+}
+
+func putHandleReadWriter(rw *handleReadWriter) {
+ rw.ctx = nil
+ rw.h = nil
+ handleReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ n, err := rw.h.readToBlocksAt(rw.ctx, dsts, rw.off)
+ rw.off += n
+ return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ n, err := rw.h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+ rw.off += n
+ return n, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go
index e67422a2f..8dcbc61ed 100644
--- a/pkg/sentry/fsimpl/gofer/save_restore.go
+++ b/pkg/sentry/fsimpl/gofer/save_restore.go
@@ -158,6 +158,10 @@ func (d *dentryPlatformFile) afterLoad() {
// afterLoad is invoked by stateify.
func (fd *specialFileFD) afterLoad() {
fd.handle.fd = -1
+ if fd.hostFileMapper.IsInited() {
+ // Ensure that we don't call fd.hostFileMapper.Init() again.
+ fd.hostFileMapperInitOnce.Do(func() {})
+ }
}
// CompleteRestore implements
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 144a1045e..a8d47b65b 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -22,10 +22,13 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/fsmetric"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
@@ -75,6 +78,16 @@ type specialFileFD struct {
bufMu sync.Mutex `state:"nosave"`
haveBuf uint32
buf []byte
+
+ // If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and
+ // hostFileMapperInitOnce is used to initialize it on first use.
+ hostFileMapperInitOnce sync.Once `state:"nosave"`
+ hostFileMapper fsutil.HostFileMapper
+
+ // If handle.fd >= 0, fileRefs counts references on memmap.File offsets.
+ // fileRefs is protected by fileRefsMu.
+ fileRefsMu sync.Mutex `state:"nosave"`
+ fileRefs fsutil.FrameRefSet
}
func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) {
@@ -229,23 +242,13 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
}
}
- // Going through dst.CopyOutFrom() would hold MM locks around file
- // operations of unknown duration. For regularFileFD, doing so is necessary
- // to support mmap due to lock ordering; MM locks precede dentry.dataMu.
- // That doesn't hold here since specialFileFD doesn't client-cache data.
- // Just buffer the read instead.
- buf := make([]byte, dst.NumBytes())
- n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+ rw := getHandleReadWriter(ctx, &fd.handle, offset)
+ n, err := dst.CopyOutFrom(ctx, rw)
+ putHandleReadWriter(rw)
if linuxerr.Equals(linuxerr.EAGAIN, err) {
err = linuxerr.ErrWouldBlock
}
- if n == 0 {
- return bufN, err
- }
- if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
- return bufN + int64(cp), cperr
- }
- return bufN + int64(n), err
+ return bufN + n, err
}
// Read implements vfs.FileDescriptionImpl.Read.
@@ -316,20 +319,15 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
}
}
- // Do a buffered write. See rationale in PRead.
- buf := make([]byte, src.NumBytes())
- copied, copyErr := src.CopyIn(ctx, buf)
- if copied == 0 && copyErr != nil {
- // Only return the error if we didn't get any data.
- return 0, offset, copyErr
- }
- n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset))
+ rw := getHandleReadWriter(ctx, &fd.handle, offset)
+ n, err := src.CopyInTo(ctx, rw)
+ putHandleReadWriter(rw)
if linuxerr.Equals(linuxerr.EAGAIN, err) {
err = linuxerr.ErrWouldBlock
}
// Update offset if the offset is valid.
if offset >= 0 {
- offset += int64(n)
+ offset += n
}
// Update file size for regular files.
if fd.isRegularFile {
@@ -340,10 +338,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
atomic.StoreUint64(&d.size, uint64(offset))
}
}
- if err != nil {
- return int64(n), offset, err
- }
- return int64(n), offset, copyErr
+ return int64(n), offset, err
}
// Write implements vfs.FileDescriptionImpl.Write.
@@ -411,3 +406,85 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error
}
return nil
}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ if fd.handle.fd < 0 || fd.filesystem().opts.forcePageCache {
+ return linuxerr.ENODEV
+ }
+ // After this point, fd may be used as a memmap.Mappable and memmap.File.
+ fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init)
+ return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts)
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
+ fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())})
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
+ fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())})
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
+ return fd.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
+ mr := optional
+ if fd.filesystem().opts.limitHostFDTranslation {
+ mr = maxFillRange(required, optional)
+ }
+ return []memmap.Translation{
+ {
+ Source: mr,
+ File: fd,
+ Offset: mr.Start,
+ Perms: hostarch.AnyAccess,
+ },
+ }, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error {
+ return nil
+}
+
+// IncRef implements memmap.File.IncRef.
+func (fd *specialFileFD) IncRef(fr memmap.FileRange) {
+ fd.fileRefsMu.Lock()
+ defer fd.fileRefsMu.Unlock()
+ fd.fileRefs.IncRefAndAccount(fr)
+}
+
+// DecRef implements memmap.File.DecRef.
+func (fd *specialFileFD) DecRef(fr memmap.FileRange) {
+ fd.fileRefsMu.Lock()
+ defer fd.fileRefsMu.Unlock()
+ fd.fileRefs.DecRefAndAccount(fr)
+}
+
+// MapInternal implements memmap.File.MapInternal.
+func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
+ fd.requireHostFD()
+ return fd.hostFileMapper.MapInternal(fr, int(fd.handle.fd), at.Write)
+}
+
+// FD implements memmap.File.FD.
+func (fd *specialFileFD) FD() int {
+ fd.requireHostFD()
+ return int(fd.handle.fd)
+}
+
+func (fd *specialFileFD) requireHostFD() {
+ if fd.handle.fd < 0 {
+ // This is possible if fd was successfully mmapped before saving, then
+ // was restored without a host FD. This is unrecoverable: without a
+ // host FD, we can't mmap this file post-restore.
+ panic("gofer.specialFileFD can no longer be memory-mapped without a host FD")
+ }
+}
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 5955948f0..c12abdf33 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -1,10 +1,24 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "verity",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*dentry",
+ "Linker": "*dentry",
+ },
+)
+
go_library(
name = "verity",
srcs = [
+ "dentry_list.go",
"filesystem.go",
"save_restore.go",
"verity.go",
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index e147d6b07..52d47994d 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -66,40 +66,23 @@ func putDentrySlice(ds *[]*dentry) {
dentrySlicePool.Put(ds)
}
-// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
-// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls
+// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for
// writing.
//
// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
// but dentry slices are allocated lazily, and it's much easier to say "defer
-// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
-// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
// +checklocksrelease:fs.renameMu
-func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
fs.renameMu.RUnlock()
if *ds == nil {
return
}
- if len(**ds) != 0 {
- fs.renameMu.Lock()
- for _, d := range **ds {
- d.checkDropLocked(ctx)
- }
- fs.renameMu.Unlock()
- }
- putDentrySlice(*ds)
-}
-
-// +checklocksrelease:fs.renameMu
-func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
- if *ds == nil {
- fs.renameMu.Unlock()
- return
- }
for _, d := range **ds {
- d.checkDropLocked(ctx)
+ d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
}
- fs.renameMu.Unlock()
putDentrySlice(*ds)
}
@@ -700,7 +683,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
}
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return err
@@ -712,7 +695,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -733,7 +716,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
if err != nil {
@@ -770,7 +753,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if rp.Done() {
@@ -952,7 +935,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
@@ -982,7 +965,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return linux.Statx{}, err
@@ -1028,7 +1011,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
return nil, err
}
@@ -1039,7 +1022,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -1055,7 +1038,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 23841ecf7..d2526263c 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -23,10 +23,12 @@
// Lock order:
//
// filesystem.renameMu
-// dentry.dirMu
-// fileDescription.mu
-// filesystem.verityMu
-// dentry.hashMu
+// dentry.cachingMu
+// filesystem.cacheMu
+// dentry.dirMu
+// fileDescription.mu
+// filesystem.verityMu
+// dentry.hashMu
//
// Locking dentry.dirMu in multiple dentries requires that parent dentries are
// locked before child dentries, and that filesystem.renameMu is locked to
@@ -96,6 +98,9 @@ const (
// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
// extended attributes. The maximum value of a 32 bit integer has 10 digits.
sizeOfStringInt32 = 10
+
+ // defaultMaxCachedDentries is the default limit of dentry cache.
+ defaultMaxCachedDentries = uint64(1000)
)
var (
@@ -106,9 +111,10 @@ var (
// Mount option names for verityfs.
const (
- moptLowerPath = "lower_path"
- moptRootHash = "root_hash"
- moptRootName = "root_name"
+ moptLowerPath = "lower_path"
+ moptRootHash = "root_hash"
+ moptRootName = "root_name"
+ moptDentryCacheLimit = "dentry_cache_limit"
)
// HashAlgorithm is a type specifying the algorithm used to hash the file
@@ -188,6 +194,17 @@ type filesystem struct {
// dentries.
renameMu sync.RWMutex `state:"nosave"`
+ // cachedDentries contains all dentries with 0 references. (Due to race
+ // conditions, it may also contain dentries with non-zero references.)
+ // cachedDentriesLen is the number of dentries in cachedDentries. These
+ // fields are protected by cacheMu.
+ cacheMu sync.Mutex `state:"nosave"`
+ cachedDentries dentryList
+ cachedDentriesLen uint64
+
+ // maxCachedDentries is the maximum size of filesystem.cachedDentries.
+ maxCachedDentries uint64
+
// verityMu synchronizes enabling verity files, protects files or
// directories from being enabled by different threads simultaneously.
// It also ensures that verity does not access files that are being
@@ -198,6 +215,10 @@ type filesystem struct {
// is for the whole file system to ensure that no more than one file is
// enabled the same time.
verityMu sync.RWMutex `state:"nosave"`
+
+ // released is nonzero once filesystem.Release has been called. It is accessed
+ // with atomic memory operations.
+ released int32
}
// InternalFilesystemOptions may be passed as
@@ -266,6 +287,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
delete(mopts, moptRootName)
rootName = root
}
+ maxCachedDentries := defaultMaxCachedDentries
+ if str, ok := mopts[moptDentryCacheLimit]; ok {
+ delete(mopts, moptDentryCacheLimit)
+ maxCD, err := strconv.ParseUint(str, 10, 64)
+ if err != nil {
+ ctx.Warningf("verity.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str)
+ return nil, nil, linuxerr.EINVAL
+ }
+ maxCachedDentries = maxCD
+ }
// Check for unparsed options.
if len(mopts) != 0 {
@@ -339,12 +370,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
action: iopts.Action,
opts: opts.Data,
allowRuntimeEnable: iopts.AllowRuntimeEnable,
+ maxCachedDentries: maxCachedDentries,
}
fs.vfsfs.Init(vfsObj, &fstype, fs)
// Construct the root dentry.
d := fs.newDentry()
- d.refs = 1
+ // Set the root's reference count to 2. One reference is returned to
+ // the caller, and the other is held by fs to prevent the root from
+ // being "cached" and subsequently evicted.
+ d.refs = 2
lowerVD := vfs.MakeVirtualDentry(lowerMount, lowerMount.Root())
lowerVD.IncRef()
d.lowerVD = lowerVD
@@ -519,7 +554,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
+ atomic.StoreInt32(&fs.released, 1)
fs.lowerMount.DecRef(ctx)
+
+ fs.renameMu.Lock()
+ fs.evictAllCachedDentriesLocked(ctx)
+ fs.renameMu.Unlock()
+
+ // An extra reference was held by the filesystem on the root to prevent
+ // it from being cached/evicted.
+ fs.rootDentry.DecRef(ctx)
}
// MountOptions implements vfs.FilesystemImpl.MountOptions.
@@ -533,6 +577,11 @@ func (fs *filesystem) MountOptions() string {
type dentry struct {
vfsd vfs.Dentry
+ // refs is the reference count. Each dentry holds a reference on its
+ // parent, even if disowned. When refs reaches 0, the dentry may be
+ // added to the cache or destroyed. If refs == -1, the dentry has
+ // already been destroyed. refs is accessed using atomic memory
+ // operations.
refs int64
// fs is the owning filesystem. fs is immutable.
@@ -587,13 +636,23 @@ type dentry struct {
// is protected by hashMu.
hashMu sync.RWMutex `state:"nosave"`
hash []byte
+
+ // cachingMu is used to synchronize concurrent dentry caching attempts on
+ // this dentry.
+ cachingMu sync.Mutex `state:"nosave"`
+
+ // If cached is true, dentryEntry links dentry into
+ // filesystem.cachedDentries. cached and dentryEntry are protected by
+ // cachingMu.
+ cached bool
+ dentryEntry
}
// newDentry creates a new dentry representing the given verity file. The
-// dentry initially has no references; it is the caller's responsibility to set
-// the dentry's reference count and/or call dentry.destroy() as appropriate.
-// The dentry is initially invalid in that it contains no underlying dentry;
-// the caller is responsible for setting them.
+// dentry initially has no references, but is not cached; it is the caller's
+// responsibility to set the dentry's reference count and/or call
+// dentry.destroy() as appropriate. The dentry is initially invalid in that it
+// contains no underlying dentry; the caller is responsible for setting them.
func (fs *filesystem) newDentry() *dentry {
d := &dentry{
fs: fs,
@@ -629,42 +688,23 @@ func (d *dentry) TryIncRef() bool {
// DecRef implements vfs.DentryImpl.DecRef.
func (d *dentry) DecRef(ctx context.Context) {
- r := atomic.AddInt64(&d.refs, -1)
- if d.LogRefs() {
- refsvfs2.LogDecRef(d, r)
- }
- if r == 0 {
- d.fs.renameMu.Lock()
- d.checkDropLocked(ctx)
- d.fs.renameMu.Unlock()
- } else if r < 0 {
- panic("verity.dentry.DecRef() called without holding a reference")
+ if d.decRefNoCaching() == 0 {
+ d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
}
}
-func (d *dentry) decRefLocked(ctx context.Context) {
+// decRefNoCaching decrements d's reference count without calling
+// d.checkCachingLocked, even if d's reference count reaches 0; callers are
+// responsible for ensuring that d.checkCachingLocked will be called later.
+func (d *dentry) decRefNoCaching() int64 {
r := atomic.AddInt64(&d.refs, -1)
if d.LogRefs() {
refsvfs2.LogDecRef(d, r)
}
- if r == 0 {
- d.checkDropLocked(ctx)
- } else if r < 0 {
- panic("verity.dentry.decRefLocked() called without holding a reference")
+ if r < 0 {
+ panic("verity.dentry.decRefNoCaching() called without holding a reference")
}
-}
-
-// checkDropLocked should be called after d's reference count becomes 0 or it
-// becomes deleted.
-func (d *dentry) checkDropLocked(ctx context.Context) {
- // Dentries with a positive reference count must be retained. Dentries
- // with a negative reference count have already been destroyed.
- if atomic.LoadInt64(&d.refs) != 0 {
- return
- }
- // Refs is still zero; destroy it.
- d.destroyLocked(ctx)
- return
+ return r
}
// destroyLocked destroys the dentry.
@@ -683,6 +723,12 @@ func (d *dentry) destroyLocked(ctx context.Context) {
panic("verity.dentry.destroyLocked() called with references on the dentry")
}
+ // Drop the reference held by d on its parent without recursively
+ // locking d.fs.renameMu.
+ if d.parent != nil && d.parent.decRefNoCaching() == 0 {
+ d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
+ }
+
if d.lowerVD.Ok() {
d.lowerVD.DecRef(ctx)
}
@@ -695,7 +741,6 @@ func (d *dentry) destroyLocked(ctx context.Context) {
delete(d.parent.children, d.name)
}
d.parent.dirMu.Unlock()
- d.parent.decRefLocked(ctx)
}
refsvfs2.Unregister(d)
}
@@ -734,6 +779,140 @@ func (d *dentry) OnZeroWatches(context.Context) {
//TODO(b/159261227): Implement OnZeroWatches.
}
+// checkCachingLocked should be called after d's reference count becomes 0 or
+// it becomes disowned.
+//
+// For performance, checkCachingLocked can also be called after d's reference
+// count becomes non-zero, so that d can be removed from the LRU cache. This
+// may help in reducing the size of the cache and hence reduce evictions. Note
+// that this is not necessary for correctness.
+//
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
+// Preconditions: d.fs.renameMu must be locked for writing if
+// renameMuWriteLocked is true; it may be temporarily unlocked.
+func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) {
+ d.cachingMu.Lock()
+ refs := atomic.LoadInt64(&d.refs)
+ if refs == -1 {
+ // Dentry has already been destroyed.
+ d.cachingMu.Unlock()
+ return
+ }
+ if refs > 0 {
+ // fs.cachedDentries is permitted to contain dentries with non-zero refs,
+ // which are skipped by fs.evictCachedDentryLocked() upon reaching the end
+ // of the LRU. But it is still beneficial to remove d from the cache as we
+ // are already holding d.cachingMu. Keeping a cleaner cache also reduces
+ // the number of evictions (which is expensive as it acquires fs.renameMu).
+ d.removeFromCacheLocked()
+ d.cachingMu.Unlock()
+ return
+ }
+
+ if atomic.LoadInt32(&d.fs.released) != 0 {
+ d.cachingMu.Unlock()
+ if !renameMuWriteLocked {
+ // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as
+ // needed by d.destroyLocked() later.
+ d.fs.renameMu.Lock()
+ defer d.fs.renameMu.Unlock()
+ }
+ if d.parent != nil {
+ d.parent.dirMu.Lock()
+ delete(d.parent.children, d.name)
+ d.parent.dirMu.Unlock()
+ }
+ d.destroyLocked(ctx) // +checklocksforce: see above.
+ return
+ }
+
+ d.fs.cacheMu.Lock()
+ // If d is already cached, just move it to the front of the LRU.
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentries.PushFront(d)
+ d.fs.cacheMu.Unlock()
+ d.cachingMu.Unlock()
+ return
+ }
+ // Cache the dentry, then evict the least recently used cached dentry if
+ // the cache becomes over-full.
+ d.fs.cachedDentries.PushFront(d)
+ d.fs.cachedDentriesLen++
+ d.cached = true
+ shouldEvict := d.fs.cachedDentriesLen > d.fs.maxCachedDentries
+ d.fs.cacheMu.Unlock()
+ d.cachingMu.Unlock()
+
+ if shouldEvict {
+ if !renameMuWriteLocked {
+ // Need to lock d.fs.renameMu for writing as needed by
+ // d.evictCachedDentryLocked().
+ d.fs.renameMu.Lock()
+ defer d.fs.renameMu.Unlock()
+ }
+ d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above.
+ }
+}
+
+// Preconditions: d.cachingMu must be locked.
+func (d *dentry) removeFromCacheLocked() {
+ if d.cached {
+ d.fs.cacheMu.Lock()
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentriesLen--
+ d.fs.cacheMu.Unlock()
+ d.cached = false
+ }
+}
+
+// Precondition: fs.renameMu must be locked for writing; it may be temporarily
+// unlocked.
+// +checklocks:fs.renameMu
+func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
+ for fs.cachedDentriesLen != 0 {
+ fs.evictCachedDentryLocked(ctx)
+ }
+}
+
+// Preconditions:
+// * fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// +checklocks:fs.renameMu
+func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
+ fs.cacheMu.Lock()
+ victim := fs.cachedDentries.Back()
+ fs.cacheMu.Unlock()
+ if victim == nil {
+ // fs.cachedDentries may have become empty between when it was
+ // checked and when we locked fs.cacheMu.
+ return
+ }
+
+ victim.cachingMu.Lock()
+ victim.removeFromCacheLocked()
+ // victim.refs may have become non-zero from an earlier path resolution
+ // since it was inserted into fs.cachedDentries.
+ if atomic.LoadInt64(&victim.refs) != 0 {
+ victim.cachingMu.Unlock()
+ return
+ }
+ if victim.parent != nil {
+ victim.parent.dirMu.Lock()
+ // Note that victim can't be a mount point (in any mount
+ // namespace), since VFS holds references on mount points.
+ fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
+ delete(victim.parent.children, victim.name)
+ victim.parent.dirMu.Unlock()
+ }
+ victim.cachingMu.Unlock()
+ victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs.
+}
+
func (d *dentry) isSymlink() bool {
return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
}
diff --git a/pkg/sentry/kernel/ipc/object.go b/pkg/sentry/kernel/ipc/object.go
index 387b35e7e..facd157c7 100644
--- a/pkg/sentry/kernel/ipc/object.go
+++ b/pkg/sentry/kernel/ipc/object.go
@@ -19,6 +19,8 @@ package ipc
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
@@ -113,3 +115,36 @@ func (o *Object) CheckPermissions(creds *auth.Credentials, req fs.PermMask) bool
}
return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, o.UserNS)
}
+
+// Set modifies attributes for an IPC object. See *ctl(IPC_SET).
+//
+// Precondition: Mechanism.mu must be held.
+func (o *Object) Set(ctx context.Context, perm *linux.IPCPerm) error {
+ creds := auth.CredentialsFromContext(ctx)
+ uid := creds.UserNamespace.MapToKUID(auth.UID(perm.UID))
+ gid := creds.UserNamespace.MapToKGID(auth.GID(perm.GID))
+ if !uid.Ok() || !gid.Ok() {
+ // The man pages don't specify an errno for invalid uid/gid, but EINVAL
+ // is generally used for invalid arguments.
+ return linuxerr.EINVAL
+ }
+
+ if !o.CheckOwnership(creds) {
+ // "The argument cmd has the value IPC_SET or IPC_RMID, but the
+ // effective user ID of the calling process is not the creator (as
+ // found in msg_perm.cuid) or the owner (as found in msg_perm.uid)
+ // of the message queue, and the caller is not privileged (Linux:
+ // does not have the CAP_SYS_ADMIN capability)."
+ return linuxerr.EPERM
+ }
+
+ // User may only modify the lower 9 bits of the mode. All the other bits are
+ // always 0 for the underlying inode.
+ mode := linux.FileMode(perm.Mode & 0x1ff)
+
+ o.Perms = fs.FilePermsFromMode(mode)
+ o.Owner.UID = uid
+ o.Owner.GID = gid
+
+ return nil
+}
diff --git a/pkg/sentry/kernel/msgqueue/msgqueue.go b/pkg/sentry/kernel/msgqueue/msgqueue.go
index fab396d7c..7c459d076 100644
--- a/pkg/sentry/kernel/msgqueue/msgqueue.go
+++ b/pkg/sentry/kernel/msgqueue/msgqueue.go
@@ -206,6 +206,48 @@ func (r *Registry) FindByID(id ipc.ID) (*Queue, error) {
return mech.(*Queue), nil
}
+// IPCInfo reports global parameters for message queues. See msgctl(IPC_INFO).
+func (r *Registry) IPCInfo(ctx context.Context) *linux.MsgInfo {
+ return &linux.MsgInfo{
+ MsgPool: linux.MSGPOOL,
+ MsgMap: linux.MSGMAP,
+ MsgMax: linux.MSGMAX,
+ MsgMnb: linux.MSGMNB,
+ MsgMni: linux.MSGMNI,
+ MsgSsz: linux.MSGSSZ,
+ MsgTql: linux.MSGTQL,
+ MsgSeg: linux.MSGSEG,
+ }
+}
+
+// MsgInfo reports global parameters for message queues. See msgctl(MSG_INFO).
+func (r *Registry) MsgInfo(ctx context.Context) *linux.MsgInfo {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ var messages, bytes uint64
+ r.reg.ForAllObjects(
+ func(o ipc.Mechanism) {
+ q := o.(*Queue)
+ q.mu.Lock()
+ messages += q.messageCount
+ bytes += q.byteCount
+ q.mu.Unlock()
+ },
+ )
+
+ return &linux.MsgInfo{
+ MsgPool: int32(r.reg.ObjectCount()),
+ MsgMap: int32(messages),
+ MsgTql: int32(bytes),
+ MsgMax: linux.MSGMAX,
+ MsgMnb: linux.MSGMNB,
+ MsgMni: linux.MSGMNI,
+ MsgSsz: linux.MSGSSZ,
+ MsgSeg: linux.MSGSEG,
+ }
+}
+
// Send appends a message to the message queue, and returns an error if sending
// fails. See msgsnd(2).
func (q *Queue) Send(ctx context.Context, m Message, b Blocker, wait bool, pid int32) error {
@@ -465,6 +507,73 @@ func (q *Queue) msgAtIndex(mType int64) *Message {
return msg
}
+// Set modifies some values of the queue. See msgctl(IPC_SET).
+func (q *Queue) Set(ctx context.Context, ds *linux.MsqidDS) error {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ creds := auth.CredentialsFromContext(ctx)
+ if ds.MsgQbytes > maxQueueBytes && !creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, q.obj.UserNS) {
+ // "An attempt (IPC_SET) was made to increase msg_qbytes beyond the
+ // system parameter MSGMNB, but the caller is not privileged (Linux:
+ // does not have the CAP_SYS_RESOURCE capability)."
+ return linuxerr.EPERM
+ }
+
+ if err := q.obj.Set(ctx, &ds.MsgPerm); err != nil {
+ return err
+ }
+
+ q.maxBytes = ds.MsgQbytes
+ q.changeTime = ktime.NowFromContext(ctx)
+ return nil
+}
+
+// Stat returns a MsqidDS object filled with information about the queue. See
+// msgctl(IPC_STAT) and msgctl(MSG_STAT).
+func (q *Queue) Stat(ctx context.Context) (*linux.MsqidDS, error) {
+ return q.stat(ctx, fs.PermMask{Read: true})
+}
+
+// StatAny is similar to Queue.Stat, but doesn't require read permission. See
+// msgctl(MSG_STAT_ANY).
+func (q *Queue) StatAny(ctx context.Context) (*linux.MsqidDS, error) {
+ return q.stat(ctx, fs.PermMask{})
+}
+
+// stat returns a MsqidDS object filled with information about the queue. An
+// error is returned if the user doesn't have the specified permissions.
+func (q *Queue) stat(ctx context.Context, mask fs.PermMask) (*linux.MsqidDS, error) {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ creds := auth.CredentialsFromContext(ctx)
+ if !q.obj.CheckPermissions(creds, mask) {
+ // "The caller must have read permission on the message queue."
+ return nil, linuxerr.EACCES
+ }
+
+ return &linux.MsqidDS{
+ MsgPerm: linux.IPCPerm{
+ Key: uint32(q.obj.Key),
+ UID: uint32(creds.UserNamespace.MapFromKUID(q.obj.Owner.UID)),
+ GID: uint32(creds.UserNamespace.MapFromKGID(q.obj.Owner.GID)),
+ CUID: uint32(creds.UserNamespace.MapFromKUID(q.obj.Creator.UID)),
+ CGID: uint32(creds.UserNamespace.MapFromKGID(q.obj.Creator.GID)),
+ Mode: uint16(q.obj.Perms.LinuxMode()),
+ Seq: 0, // IPC sequences not supported.
+ },
+ MsgStime: q.sendTime.TimeT(),
+ MsgRtime: q.receiveTime.TimeT(),
+ MsgCtime: q.changeTime.TimeT(),
+ MsgCbytes: q.byteCount,
+ MsgQnum: q.messageCount,
+ MsgQbytes: q.maxBytes,
+ MsgLspid: q.sendPID,
+ MsgLrpid: q.receivePID,
+ }, nil
+}
+
// Lock implements ipc.Mechanism.Lock.
func (q *Queue) Lock() {
q.mu.Lock()
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 8a5c81a68..28e466948 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -336,19 +336,15 @@ func (s *Set) Size() int {
return len(s.sems)
}
-// Change changes some fields from the set atomically.
-func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error {
+// Set modifies attributes for a semaphore set. See semctl(IPC_SET).
+func (s *Set) Set(ctx context.Context, ds *linux.SemidDS) error {
s.mu.Lock()
defer s.mu.Unlock()
- // "The effective UID of the calling process must match the owner or creator
- // of the semaphore set, or the caller must be privileged."
- if !s.obj.CheckOwnership(creds) {
- return linuxerr.EACCES
+ if err := s.obj.Set(ctx, &ds.SemPerm); err != nil {
+ return err
}
- s.obj.Owner = owner
- s.obj.Perms = perms
s.changeTime = ktime.NowFromContext(ctx)
return nil
}
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index b8da0c76c..ab938fa3c 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -618,25 +618,10 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
s.mu.Lock()
defer s.mu.Unlock()
- creds := auth.CredentialsFromContext(ctx)
- if !s.obj.CheckOwnership(creds) {
- return linuxerr.EPERM
- }
-
- uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID))
- gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID))
- if !uid.Ok() || !gid.Ok() {
- return linuxerr.EINVAL
+ if err := s.obj.Set(ctx, &ds.ShmPerm); err != nil {
+ return err
}
- // User may only modify the lower 9 bits of the mode. All the other bits are
- // always 0 for the underlying inode.
- mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff)
- s.obj.Perms = fs.FilePermsFromMode(mode)
-
- s.obj.Owner.UID = uid
- s.obj.Owner.GID = gid
-
s.changeTime = ktime.NowFromContext(ctx)
return nil
}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 2eda15303..5814a4eca 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -489,11 +489,6 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID)
tg.signalHandlers.mu.Lock()
defer tg.signalHandlers.mu.Unlock()
- // TODO(gvisor.dev/issue/6148): "If tcsetpgrp() is called by a member of a
- // background process group in its session, and the calling process is not
- // blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all members of
- // this background process group."
-
// tty must be the controlling terminal.
if tg.tty != tty {
return -1, linuxerr.ENOTTY
@@ -516,6 +511,16 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID)
return -1, linuxerr.EPERM
}
+ signalAction := tg.signalHandlers.actions[linux.SIGTTOU]
+ // If the calling process is a member of a background group, a SIGTTOU
+ // signal is sent to all members of this background process group.
+ // We need also need to check whether it is ignoring or blocking SIGTTOU.
+ ignored := signalAction.Handler == linux.SIG_IGN
+ blocked := tg.leader.signalMask == linux.SignalSetOf(linux.SIGTTOU)
+ if tg.processGroup.id != tg.processGroup.session.foreground.id && !ignored && !blocked {
+ tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGTTOU), true)
+ }
+
tg.processGroup.session.foreground.id = pgid
return 0, nil
}
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index b7f765cd7..d71d64580 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -77,15 +77,6 @@ func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64)
return nil
}
- // Only unmaps after it assured that the address is a valid aio context to
- // prevent random memory from been unmapped.
- //
- // Note: It's possible to unmap this address and map something else into
- // the same address. Then it would be unmapping memory that it doesn't own.
- // This is, however, the way Linux implements AIO. Keeps the same [weird]
- // semantics in case anyone relies on it.
- mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize)
-
delete(mm.aioManager.contexts, id)
aioCtx.destroy()
return aioCtx
@@ -411,6 +402,15 @@ func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOC
return nil
}
+ // Only unmaps after it assured that the address is a valid aio context to
+ // prevent random memory from been unmapped.
+ //
+ // Note: It's possible to unmap this address and map something else into
+ // the same address. Then it would be unmapping memory that it doesn't own.
+ // This is, however, the way Linux implements AIO. Keeps the same [weird]
+ // semantics in case anyone relies on it.
+ mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize)
+
mm.aioManager.mu.Lock()
defer mm.aioManager.mu.Unlock()
return mm.destroyAIOContextLocked(ctx, id)
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
index 8fd8287b3..7a3c97c5a 100644
--- a/pkg/sentry/platform/kvm/bluepill_fault.go
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -55,11 +55,7 @@ func calculateBluepillFault(physical uintptr, phyRegions []physicalRegion) (virt
}
// Adjust the block to match our size.
- physicalStart = alignedPhysical & faultBlockMask
- if physicalStart < pr.physical {
- // Bound the starting point to the start of the region.
- physicalStart = pr.physical
- }
+ physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask
virtualStart = pr.virtual + (physicalStart - pr.physical)
physicalEnd := physicalStart + faultBlockSize
if physicalEnd > end {
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 56f90d952..2046a48b9 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -123,7 +123,7 @@ var AMD64 = &kernel.SyscallTable{
68: syscalls.Supported("msgget", Msgget),
69: syscalls.Supported("msgsnd", Msgsnd),
70: syscalls.Supported("msgrcv", Msgrcv),
- 71: syscalls.PartiallySupported("msgctl", Msgctl, "Only supports IPC_RMID option.", []string{"gvisor.dev/issue/135"}),
+ 71: syscalls.Supported("msgctl", Msgctl),
72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil),
73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil),
74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil),
@@ -616,7 +616,7 @@ var ARM64 = &kernel.SyscallTable{
184: syscalls.ErrorWithEvent("mq_notify", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
185: syscalls.ErrorWithEvent("mq_getsetattr", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
186: syscalls.Supported("msgget", Msgget),
- 187: syscalls.PartiallySupported("msgctl", Msgctl, "Only supports IPC_RMID option.", []string{"gvisor.dev/issue/135"}),
+ 187: syscalls.Supported("msgctl", Msgctl),
188: syscalls.Supported("msgrcv", Msgrcv),
189: syscalls.Supported("msgsnd", Msgsnd),
190: syscalls.Supported("semget", Semget),
diff --git a/pkg/sentry/syscalls/linux/sys_msgqueue.go b/pkg/sentry/syscalls/linux/sys_msgqueue.go
index 5259ade90..60b989ee7 100644
--- a/pkg/sentry/syscalls/linux/sys_msgqueue.go
+++ b/pkg/sentry/syscalls/linux/sys_msgqueue.go
@@ -130,12 +130,63 @@ func receive(t *kernel.Task, id ipc.ID, mType int64, maxSize int64, msgCopy, wai
func Msgctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
id := ipc.ID(args[0].Int())
cmd := args[1].Int()
+ buf := args[2].Pointer()
creds := auth.CredentialsFromContext(t)
+ r := t.IPCNamespace().MsgqueueRegistry()
+
switch cmd {
+ case linux.IPC_INFO:
+ info := r.IPCInfo(t)
+ _, err := info.CopyOut(t, buf)
+ return 0, nil, err
+ case linux.MSG_INFO:
+ msgInfo := r.MsgInfo(t)
+ _, err := msgInfo.CopyOut(t, buf)
+ return 0, nil, err
case linux.IPC_RMID:
- return 0, nil, t.IPCNamespace().MsgqueueRegistry().Remove(id, creds)
+ return 0, nil, r.Remove(id, creds)
+ }
+
+ // Remaining commands use a queue.
+ queue, err := r.FindByID(id)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ switch cmd {
+ case linux.MSG_STAT:
+ // Technically, we should be treating id as "an index into the kernel's
+ // internal array that maintains information about all shared memory
+ // segments on the system". Since we don't track segments in an array,
+ // we'll just pretend the msqid is the index and do the same thing as
+ // IPC_STAT. Linux also uses the index as the msqid.
+ fallthrough
+ case linux.IPC_STAT:
+ stat, err := queue.Stat(t)
+ if err != nil {
+ return 0, nil, err
+ }
+ _, err = stat.CopyOut(t, buf)
+ return 0, nil, err
+
+ case linux.MSG_STAT_ANY:
+ stat, err := queue.StatAny(t)
+ if err != nil {
+ return 0, nil, err
+ }
+ _, err = stat.CopyOut(t, buf)
+ return 0, nil, err
+
+ case linux.IPC_SET:
+ var ds linux.MsqidDS
+ if _, err := ds.CopyIn(t, buf); err != nil {
+ return 0, nil, linuxerr.EINVAL
+ }
+ err := queue.Set(t, &ds)
+ return 0, nil, err
+
default:
return 0, nil, linuxerr.EINVAL
}
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index f61cc466c..5a119b21c 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
@@ -166,8 +165,7 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, err
}
- perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777))
- return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms)
+ return 0, nil, ipcSet(t, id, &s)
case linux.GETPID:
v, err := getPID(t, id, num)
@@ -243,24 +241,13 @@ func remove(t *kernel.Task, id ipc.ID) error {
return r.Remove(id, creds)
}
-func ipcSet(t *kernel.Task, id ipc.ID, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error {
+func ipcSet(t *kernel.Task, id ipc.ID, ds *linux.SemidDS) error {
r := t.IPCNamespace().SemaphoreRegistry()
set := r.FindByID(id)
if set == nil {
return linuxerr.EINVAL
}
-
- creds := auth.CredentialsFromContext(t)
- kuid := creds.UserNamespace.MapToKUID(uid)
- if !kuid.Ok() {
- return linuxerr.EINVAL
- }
- kgid := creds.UserNamespace.MapToKGID(gid)
- if !kgid.Ok() {
- return linuxerr.EINVAL
- }
- owner := fs.FileOwner{UID: kuid, GID: kgid}
- return set.Change(t, creds, owner, perms)
+ return set.Set(t, ds)
}
func ipcStat(t *kernel.Task, id ipc.ID) (*linux.SemidDS, error) {
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 5aad31b78..82ee2c521 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -1,9 +1,5 @@
# The gVisor Virtual Filesystem
-THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION
-USE. For the filesystem implementation currently used by gVisor, see the `fs`
-package.
-
## Implementation Notes
### Reference Counting