summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/sentry/fs/tmpfs/fs.go25
-rwxr-xr-xpkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go4
-rwxr-xr-xpkg/sentry/platform/ring0/defs_impl.go4
-rwxr-xr-xpkg/sentry/time/seqatomic_parameters.go4
-rw-r--r--runsc/boot/controller.go2
-rw-r--r--runsc/boot/fs.go281
-rw-r--r--runsc/boot/loader.go14
-rw-r--r--runsc/specutils/fs.go40
8 files changed, 339 insertions, 35 deletions
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index b7c29a4d1..83e1bf247 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -34,6 +34,16 @@ const (
// GID for the root directory.
rootGIDKey = "gid"
+ // cacheKey sets the caching policy for the mount.
+ cacheKey = "cache"
+
+ // cacheAll uses the virtual file system cache for everything (default).
+ cacheAll = "cache"
+
+ // cacheRevalidate allows dirents to be cached, but revalidates them on each
+ // lookup.
+ cacheRevalidate = "revalidate"
+
// TODO(edahlgren/mpratt): support a tmpfs size limit.
// size = "size"
@@ -122,15 +132,24 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
delete(options, rootGIDKey)
}
+ // Construct a mount which will follow the cache options provided.
+ var msrc *fs.MountSource
+ switch options[cacheKey] {
+ case "", cacheAll:
+ msrc = fs.NewCachingMountSource(f, flags)
+ case cacheRevalidate:
+ msrc = fs.NewRevalidatingMountSource(f, flags)
+ default:
+ return nil, fmt.Errorf("invalid cache policy option %q", options[cacheKey])
+ }
+ delete(options, cacheKey)
+
// Fail if the caller passed us more options than we can parse. They may be
// expecting us to set something we can't set.
if len(options) > 0 {
return nil, fmt.Errorf("unsupported mount options: %v", options)
}
- // Construct a mount which will cache dirents.
- msrc := fs.NewCachingMountSource(f, flags)
-
// Construct the tmpfs root.
return NewDir(ctx, nil, owner, perms, msrc), nil
}
diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go
index 580fd9d86..5b4ad3062 100755
--- a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go
+++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go
@@ -1,12 +1,12 @@
package kernel
import (
+ "fmt"
+ "reflect"
"strings"
"unsafe"
- "fmt"
"gvisor.googlesource.com/gvisor/third_party/gvsync"
- "reflect"
)
// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go
index 582553bc7..2fcc15f50 100755
--- a/pkg/sentry/platform/ring0/defs_impl.go
+++ b/pkg/sentry/platform/ring0/defs_impl.go
@@ -1,14 +1,14 @@
package ring0
import (
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "reflect"
"syscall"
"fmt"
- "gvisor.googlesource.com/gvisor/pkg/cpuid"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"io"
- "reflect"
)
var (
diff --git a/pkg/sentry/time/seqatomic_parameters.go b/pkg/sentry/time/seqatomic_parameters.go
index 24d6db1c6..54152a39a 100755
--- a/pkg/sentry/time/seqatomic_parameters.go
+++ b/pkg/sentry/time/seqatomic_parameters.go
@@ -1,12 +1,12 @@
package time
import (
+ "fmt"
+ "reflect"
"strings"
"unsafe"
- "fmt"
"gvisor.googlesource.com/gvisor/third_party/gvsync"
- "reflect"
)
// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index a277145b1..416e5355d 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -340,7 +340,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k)
+ mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
renv, err := mntr.createRestoreEnvironment(cm.l.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 939f2419c..2fa0725d1 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -18,6 +18,7 @@ import (
"fmt"
"path"
"path/filepath"
+ "sort"
"strconv"
"strings"
"syscall"
@@ -50,6 +51,9 @@ const (
// Device name for root mount.
rootDevice = "9pfs-/"
+ // MountPrefix is the annotation prefix for mount hints.
+ MountPrefix = "gvisor.dev/spec/mount"
+
// ChildContainersDir is the directory where child container root
// filesystems are mounted.
ChildContainersDir = "/__runsc_containers__"
@@ -292,6 +296,174 @@ func (f *fdDispenser) empty() bool {
return len(f.fds) == 0
}
+type shareType int
+
+const (
+ invalid shareType = iota
+
+ // container shareType indicates that the mount is used by a single container.
+ container
+
+ // pod shareType indicates that the mount is used by more than one container
+ // inside the pod.
+ pod
+
+ // shared shareType indicates that the mount can also be shared with a process
+ // outside the pod, e.g. NFS.
+ shared
+)
+
+func parseShare(val string) (shareType, error) {
+ switch val {
+ case "container":
+ return container, nil
+ case "pod":
+ return pod, nil
+ case "shared":
+ return shared, nil
+ default:
+ return 0, fmt.Errorf("invalid share value %q", val)
+ }
+}
+
+func (s shareType) String() string {
+ switch s {
+ case invalid:
+ return "invalid"
+ case container:
+ return "container"
+ case pod:
+ return "pod"
+ case shared:
+ return "shared"
+ default:
+ return fmt.Sprintf("invalid share value %d", s)
+ }
+}
+
+// mountHint represents extra information about mounts that are provided via
+// annotations. They can override mount type, and provide sharing information
+// so that mounts can be correctly shared inside the pod.
+type mountHint struct {
+ name string
+ share shareType
+ mount specs.Mount
+
+ // root is the inode where the volume is mounted. For mounts with 'pod' share
+ // the volume is mounted once and then bind mounted inside the containers.
+ root *fs.Inode
+}
+
+func (m *mountHint) setField(key, val string) error {
+ switch key {
+ case "source":
+ if len(val) == 0 {
+ return fmt.Errorf("source cannot be empty")
+ }
+ m.mount.Source = val
+ case "type":
+ return m.setType(val)
+ case "share":
+ share, err := parseShare(val)
+ if err != nil {
+ return err
+ }
+ m.share = share
+ case "options":
+ return m.setOptions(val)
+ default:
+ return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
+ }
+ return nil
+}
+
+func (m *mountHint) setType(val string) error {
+ switch val {
+ case "tmpfs", "bind":
+ m.mount.Type = val
+ default:
+ return fmt.Errorf("invalid type %q", val)
+ }
+ return nil
+}
+
+func (m *mountHint) setOptions(val string) error {
+ opts := strings.Split(val, ",")
+ if err := specutils.ValidateMountOptions(opts); err != nil {
+ return err
+ }
+ // Sort options so it can be compared with container mount options later on.
+ sort.Strings(opts)
+ m.mount.Options = opts
+ return nil
+}
+
+func (m *mountHint) isSupported() bool {
+ return m.mount.Type == tmpfs && m.share == pod
+}
+
+// podMountHints contains a collection of mountHints for the pod.
+type podMountHints struct {
+ mounts map[string]*mountHint
+}
+
+func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
+ mnts := make(map[string]*mountHint)
+ for k, v := range spec.Annotations {
+ // Look for 'gvisor.dev/spec/mount' annotations and parse them.
+ if strings.HasPrefix(k, MountPrefix) {
+ parts := strings.Split(k, "/")
+ if len(parts) != 5 {
+ return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
+ }
+ name := parts[3]
+ if len(name) == 0 || path.Clean(name) != name {
+ return nil, fmt.Errorf("invalid mount name: %s", name)
+ }
+ mnt := mnts[name]
+ if mnt == nil {
+ mnt = &mountHint{name: name}
+ mnts[name] = mnt
+ }
+ if err := mnt.setField(parts[4], v); err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ // Validate all hints after done parsing.
+ for name, m := range mnts {
+ log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
+ if m.share == invalid {
+ return nil, fmt.Errorf("share field for %q has not been set", m.name)
+ }
+ if len(m.mount.Source) == 0 {
+ return nil, fmt.Errorf("source field for %q has not been set", m.name)
+ }
+ if len(m.mount.Type) == 0 {
+ return nil, fmt.Errorf("type field for %q has not been set", m.name)
+ }
+
+ // Check for duplicate mount sources.
+ for name2, m2 := range mnts {
+ if name != name2 && m.mount.Source == m2.mount.Source {
+ return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
+ }
+ }
+ }
+
+ return &podMountHints{mounts: mnts}, nil
+}
+
+func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
+ for _, m := range p.mounts {
+ if m.mount.Source == mount.Source {
+ return m
+ }
+ }
+ return nil
+}
+
type containerMounter struct {
// cid is the container ID. May be set to empty for the root container.
cid string
@@ -306,15 +478,18 @@ type containerMounter struct {
fds fdDispenser
k *kernel.Kernel
+
+ hints *podMountHints
}
-func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel) *containerMounter {
+func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
return &containerMounter{
cid: cid,
root: spec.Root,
mounts: compileMounts(spec),
fds: fdDispenser{fds: goferFDs},
k: k,
+ hints: hints,
}
}
@@ -476,6 +651,15 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error
// 'setMountNS' is called after namespace is created. It must set the mount NS
// to 'rootCtx'.
func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
+ for _, hint := range c.hints.mounts {
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ inode, err := c.mountSharedMaster(rootCtx, conf, hint)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.root = inode
+ }
+
// Create a tmpfs mount where we create and mount a root filesystem for
// each child container.
c.mounts = append(c.mounts, specs.Mount{
@@ -498,21 +682,57 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c
return c.mountSubmounts(rootCtx, conf, mns, root)
}
+// mountSharedMaster mounts the master of a volume that is shared among
+// containers in a pod. It returns the root mount's inode.
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
+ if err != nil {
+ return nil, err
+ }
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+
+ // Mount with revalidate because it's shared among containers.
+ opts = append(opts, "cache=revalidate")
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(hint.mount.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of shared mount %q", hint.name)
+ inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return inode, nil
+}
+
// createRootMount creates the root filesystem.
func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
// First construct the filesystem from the spec.Root.
mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
- var (
- rootInode *fs.Inode
- err error
- )
-
fd := c.fds.remove()
log.Infof("Mounting root over 9P, ioFD: %d", fd)
p9FS := mustFindFilesystem("9p")
opts := p9MountOptions(fd, conf.FileAccess)
- rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+ rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
if err != nil {
return nil, fmt.Errorf("creating root mount point: %v", err)
}
@@ -579,8 +799,14 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
for _, m := range c.mounts {
- if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
- return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+ if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+ }
+ } else {
+ if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+ return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ }
}
}
@@ -653,6 +879,37 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
return nil
}
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
+ // For now enforce that all options are the same. Once bind mount is properly
+ // supported, then we should ensure the master is less restrictive than the
+ // container, e.g. master can be 'rw' while container mounts as 'ro'.
+ if len(mount.Options) != len(source.mount.Options) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ sort.Strings(mount.Options)
+ for i, opt := range mount.Options {
+ if opt != source.mount.Options[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ }
+
+ maxTraversals := uint(0)
+ target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
+ }
+ defer target.DecRef()
+
+ if err := mns.Mount(ctx, target, source.root); err != nil {
+ return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
+ }
+
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return nil
+}
+
// addRestoreMount adds a mount to the MountSources map used for restoring a
// checkpointed container.
func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
@@ -678,8 +935,8 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
return nil
}
-// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
-// to the environment.
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
+// the mounts to the environment.
func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
renv := &fs.RestoreEnvironment{
MountSources: make(map[string][]fs.MountArgs),
@@ -730,7 +987,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
-// 1. /tmp is mounted explictly: we should not override user's wish
+// 1. /tmp is mounted explicitly: we should not override user's wish
// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 42bddb2e8..3e6095fdc 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -117,6 +117,10 @@ type Loader struct {
//
// processes is guardded by mu.
processes map[execID]*execProcess
+
+ // mountHints provides extra information about mounts for containers that
+ // apply to the entire pod.
+ mountHints *podMountHints
}
// execID uniquely identifies a sentry process that is executed in a container.
@@ -299,6 +303,11 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing compat logs: %v", err)
}
+ mountHints, err := newPodMountHints(args.Spec)
+ if err != nil {
+ return nil, fmt.Errorf("creating pod mount hints: %v", err)
+ }
+
eid := execID{cid: args.ID}
l := &Loader{
k: k,
@@ -311,6 +320,7 @@ func New(args Args) (*Loader, error) {
rootProcArgs: procArgs,
sandboxID: args.ID,
processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
}
// We don't care about child signals; some platforms can generate a
@@ -502,7 +512,7 @@ func (l *Loader) run() error {
// cid for root container can be empty. Only subcontainers need it to set
// the mount location.
- mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k)
+ mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
return err
}
@@ -623,7 +633,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
goferFDs = append(goferFDs, fd)
}
- mntr := newContainerMounter(spec, cid, goferFDs, l.k)
+ mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
return fmt.Errorf("configuring container FS: %v", err)
}
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index 1f3afb4e4..6e6902e9f 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -16,6 +16,7 @@ package specutils
import (
"fmt"
+ "math/bits"
"path"
"syscall"
@@ -105,22 +106,30 @@ func optionsToFlags(opts []string, source map[string]mapping) uint32 {
return rv
}
-// ValidateMount validates that spec mounts are correct.
+// validateMount validates that spec mounts are correct.
func validateMount(mnt *specs.Mount) error {
if !path.IsAbs(mnt.Destination) {
return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
}
-
if mnt.Type == "bind" {
- for _, o := range mnt.Options {
- if ContainsStr(invalidOptions, o) {
- return fmt.Errorf("mount option %q is not supported: %v", o, mnt)
- }
- _, ok1 := optionsMap[o]
- _, ok2 := propOptionsMap[o]
- if !ok1 && !ok2 {
- return fmt.Errorf("unknown mount option %q", o)
- }
+ return ValidateMountOptions(mnt.Options)
+ }
+ return nil
+}
+
+// ValidateMountOptions validates that mount options are correct.
+func ValidateMountOptions(opts []string) error {
+ for _, o := range opts {
+ if ContainsStr(invalidOptions, o) {
+ return fmt.Errorf("mount option %q is not supported", o)
+ }
+ _, ok1 := optionsMap[o]
+ _, ok2 := propOptionsMap[o]
+ if !ok1 && !ok2 {
+ return fmt.Errorf("unknown mount option %q", o)
+ }
+ if err := validatePropagation(o); err != nil {
+ return err
}
}
return nil
@@ -133,5 +142,14 @@ func validateRootfsPropagation(opt string) error {
if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
}
+ return validatePropagation(opt)
+}
+
+func validatePropagation(opt string) error {
+ flags := PropOptionsToFlags([]string{opt})
+ exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE)
+ if bits.OnesCount32(exclusive) > 1 {
+ return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt)
+ }
return nil
}