diff options
author | Fabricio Voznika <fvoznika@google.com> | 2019-03-18 12:29:43 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2019-03-18 12:30:43 -0700 |
commit | e420cc3e5d2066674d32d16ad885bee6b30da210 (patch) | |
tree | 991b119af7c8816a539318560338b3e5f065a2f8 /runsc | |
parent | eb69542807a87491fd4e6405bdab1c0f64db536d (diff) |
Add support for mount propagation
Properly handle propagation options for root and mounts. Using the mount
options shared, rshared, or noexec now causes an error at start. shared/
rshared would break sandbox=>host isolation. slave, however, can be supported
because changes propagate from host to sandbox.
Root FS setup has moved inside the gofer. Apart from simplifying the code,
this keeps all mounts inside the namespace, so they are torn down when
the namespace is destroyed (destroyFS is no longer needed).
PiperOrigin-RevId: 239037661
Change-Id: I8b5ee4d50da33c042ea34fa68e56514ebe20e6e0
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/cmd/BUILD | 1 | ||||
-rw-r--r-- | runsc/cmd/boot.go | 16 | ||||
-rw-r--r-- | runsc/cmd/gofer.go | 279 | ||||
-rw-r--r-- | runsc/cmd/gofer_test.go (renamed from runsc/container/fs_test.go) | 16 | ||||
-rw-r--r-- | runsc/container/BUILD | 2 | ||||
-rw-r--r-- | runsc/container/container.go | 75 | ||||
-rw-r--r-- | runsc/container/container_test.go | 165 | ||||
-rw-r--r-- | runsc/container/fs.go | 287 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 23 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 1 | ||||
-rw-r--r-- | runsc/specutils/fs.go | 139 | ||||
-rw-r--r-- | runsc/specutils/namespace.go | 16 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 52 | ||||
-rw-r--r-- | runsc/specutils/specutils_test.go | 31 |
14 files changed, 681 insertions, 422 deletions
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 9e2be0d37..dabf18c5f 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -60,6 +60,7 @@ go_test( "capability_test.go", "delete_test.go", "exec_test.go", + "gofer_test.go", ], data = [ "//runsc", diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 3039b389f..ff2fa2fb9 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -76,6 +76,11 @@ type Boot struct { // startSyncFD is the file descriptor to synchronize runsc and sandbox. startSyncFD int + // mountsFD is the file descriptor to read list of mounts after they have + // been resolved (direct paths, no symlinks). They are resolved outside the + // sandbox (e.g. gofer) and sent through this FD. + mountsFD int + // pidns is set if the sanadbox is in its own pid namespace. pidns bool } @@ -111,6 +116,7 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") + f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a @@ -191,6 +197,16 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("setCapsAndCallSelf must never return success") } + // Read resolved mount list and replace the original one from the spec. + mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file") + cleanMounts, err := specutils.ReadMounts(mountsFile) + if err != nil { + mountsFile.Close() + Fatalf("Error reading mounts file: %v", err) + } + mountsFile.Close() + spec.Mounts = cleanMounts + // Create the loader. 
bootArgs := boot.Args{ ID: f.Arg(0), diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 6f9711518..e712244ef 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,7 +16,11 @@ package cmd import ( "context" + "encoding/json" + "fmt" "os" + "path/filepath" + "strings" "sync" "syscall" @@ -59,6 +63,7 @@ type Gofer struct { panicOnWrite bool specFD int + mountsFD int } // Name implements subcommands.Command. @@ -84,6 +89,7 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") + f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") } // Execute implements subcommands.Command. @@ -100,45 +106,13 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("reading spec: %v", err) } - // Find what path is going to be served by this gofer. - root := spec.Root.Path - conf := args[0].(*boot.Config) - if g.setUpRoot && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - // Convert all shared mounts into slave to be sure that nothing will be - // propagated outside of our namespace. - if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { - Fatalf("error converting mounts: %v", err) - } - - // FIXME: runsc can't be re-executed without - // /proc, so we create a tmpfs mount, mount ./proc and ./root - // there, then move this mount to the root and after - // setCapsAndCallSelf, runsc will chroot into /root. - // - // We need a directory to construct a new root and we know that - // runsc can't start without /proc, so we can use it for this. 
- flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC) - if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil { - Fatalf("error mounting tmpfs: %v", err) - } - os.Mkdir("/proc/proc", 0755) - os.Mkdir("/proc/root", 0755) - if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil { - Fatalf("error mounting proc: %v", err) - } - if err := syscall.Mount(root, "/proc/root", "", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { - Fatalf("error mounting root: %v", err) - } - if err := pivotRoot("/proc"); err != nil { - Fatalf("faild to change the root file system: %v", err) - } - if err := os.Chdir("/"); err != nil { - Fatalf("failed to change working directory") + if g.setUpRoot { + if err := setupRootFS(spec, conf); err != nil { + Fatalf("Error setting up root FS: %v", err) } } - if g.applyCaps { // Disable caps when calling myself again. // Note: minimal argument handling for the default case to keep it simple. @@ -150,15 +124,34 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("unreachable") } + // Find what path is going to be served by this gofer. + root := spec.Root.Path + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + root = "/root" + } + + // Resolve mount points paths, then replace mounts from our spec and send the + // mount list over to the sandbox, so they are both in sync. + // + // Note that all mount points have been mounted in the proper location in + // setupRootFS(). 
+ cleanMounts, err := resolveMounts(spec.Mounts, root) + if err != nil { + Fatalf("Failure to resolve mounts: %v", err) + } + spec.Mounts = cleanMounts + go func() { + if err := g.writeMounts(cleanMounts); err != nil { + panic(fmt.Sprintf("Failed to write mounts: %v", err)) + } + }() + specutils.LogSpec(spec) // fsgofer should run with a umask of 0, because we want to preserve file // modes exactly as sent by the sandbox, which will have applied its own umask. syscall.Umask(0) - if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - root = "/root" - } if err := syscall.Chroot(root); err != nil { Fatalf("failed to chroot to %q: %v", root, err) } @@ -232,6 +225,25 @@ func runServers(ats []p9.Attacher, ioFDs []int) { log.Infof("All 9P servers exited.") } +func (g *Gofer) writeMounts(mounts []specs.Mount) error { + bytes, err := json.Marshal(mounts) + if err != nil { + return err + } + + f := os.NewFile(uintptr(g.mountsFD), "mounts file") + defer f.Close() + + for written := 0; written < len(bytes); { + w, err := f.Write(bytes[written:]) + if err != nil { + return err + } + written += w + } + return nil +} + func isReadonlyMount(opts []string) bool { for _, o := range opts { if o == "ro" { @@ -240,3 +252,194 @@ func isReadonlyMount(opts []string) bool { } return false } + +func setupRootFS(spec *specs.Spec, conf *boot.Config) error { + // Convert all shared mounts into slaves to be sure that nothing will be + // propagated outside of our namespace. + if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + Fatalf("error converting mounts: %v", err) + } + + root := spec.Root.Path + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + // FIXME: runsc can't be re-executed without + // /proc, so we create a tmpfs mount, mount ./proc and ./root + // there, then move this mount to the root and after + // setCapsAndCallSelf, runsc will chroot into /root. 
+ // + // We need a directory to construct a new root and we know that + // runsc can't start without /proc, so we can use it for this. + flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC) + if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil { + Fatalf("error mounting tmpfs: %v", err) + } + + // Prepare tree structure for pivot_root(2). + os.Mkdir("/proc/proc", 0755) + os.Mkdir("/proc/root", 0755) + if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil { + Fatalf("error mounting proc: %v", err) + } + root = "/proc/root" + } + + // Mount root path followed by submounts. + if err := syscall.Mount(spec.Root.Path, root, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting root on root (%q) err: %v", spec.Root.Path, err) + } + flags := uint32(syscall.MS_SLAVE | syscall.MS_REC) + if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { + flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation}) + } + if err := syscall.Mount("", spec.Root.Path, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", spec.Root.Path, flags, err) + } + + // Replace the current spec, with the clean spec with symlinks resolved. + if err := setupMounts(spec.Mounts, root); err != nil { + Fatalf("error setting up FS: %v", err) + } + + // Create working directory if needed. + if spec.Process.Cwd != "" { + dst, err := resolveSymlinks(root, spec.Process.Cwd) + if err != nil { + return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) + } + if err := os.MkdirAll(dst, 0755); err != nil { + return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err) + } + } + + // Check if root needs to be remounted as readonly. + if spec.Root.Readonly { + // If root is a mount point but not read-only, we can change mount options + // to make it read-only for extra safety. 
+ log.Infof("Remounting root as readonly: %q", spec.Root.Path) + flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) + if err := syscall.Mount(spec.Root.Path, spec.Root.Path, "bind", flags, ""); err != nil { + return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) + } + } + + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + if err := pivotRoot("/proc"); err != nil { + Fatalf("faild to change the root file system: %v", err) + } + if err := os.Chdir("/"); err != nil { + Fatalf("failed to change working directory") + } + } + return nil +} + +// setupMounts binds mount all mounts specified in the spec in their correct +// location inside root. It will resolve relative paths and symlinks. It also +// creates directories as needed. +func setupMounts(mounts []specs.Mount, root string) error { + for _, m := range mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + continue + } + + dst, err := resolveSymlinks(root, m.Destination) + if err != nil { + return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) + } + + flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND + log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) + if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { + return fmt.Errorf("mounting %v: %v", m, err) + } + + // Set propagation options that cannot be set together with other options. + flags = specutils.PropOptionsToFlags(m.Options) + if flags != 0 { + if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err) + } + } + } + return nil +} + +// resolveMounts resolved relative paths and symlinks to mount points. +// +// Note: mount points must already be in place for resolution to work. 
+// Otherwise, it may follow symlinks to locations that would be overwritten +// with another mount point and return the wrong location. In short, make sure +// setupMounts() has been called before. +func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) { + cleanMounts := make([]specs.Mount, 0, len(mounts)) + for _, m := range mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + cleanMounts = append(cleanMounts, m) + continue + } + dst, err := resolveSymlinks(root, m.Destination) + if err != nil { + return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) + } + relDst, err := filepath.Rel(root, dst) + if err != nil { + panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err)) + } + cpy := m + cpy.Destination = filepath.Join("/", relDst) + cleanMounts = append(cleanMounts, cpy) + } + return cleanMounts, nil +} + +// ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are +// symlinks, they are evaluated relative to 'root' to ensure the end result is +// the same as if the process was running inside the container. +func resolveSymlinks(root, rel string) (string, error) { + return resolveSymlinksImpl(root, root, rel, 255) +} + +func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { + if followCount == 0 { + return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) + } + + rel = filepath.Clean(rel) + for _, name := range strings.Split(rel, string(filepath.Separator)) { + if name == "" { + continue + } + // Note that Join() resolves things like ".." and returns a clean path. + path := filepath.Join(base, name) + if !strings.HasPrefix(path, root) { + // One cannot '..' their way out of root. + path = root + continue + } + fi, err := os.Lstat(path) + if err != nil { + if !os.IsNotExist(err) { + return "", err + } + // Not found means there is no symlink to check. Just keep walking dirs. 
+ base = path + continue + } + if fi.Mode()&os.ModeSymlink != 0 { + link, err := os.Readlink(path) + if err != nil { + return "", err + } + if filepath.IsAbs(link) { + base = root + } + base, err = resolveSymlinksImpl(root, base, link, followCount-1) + if err != nil { + return "", err + } + continue + } + base = path + } + return base, nil +} diff --git a/runsc/container/fs_test.go b/runsc/cmd/gofer_test.go index 87cdb078e..8e692feb9 100644 --- a/runsc/container/fs_test.go +++ b/runsc/cmd/gofer_test.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package container +package cmd import ( "fmt" @@ -21,10 +21,16 @@ import ( "path" "path/filepath" "testing" - - "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) +func tmpDir() string { + dir := os.Getenv("TEST_TMPDIR") + if dir == "" { + dir = "/tmp" + } + return dir +} + type dir struct { rel string link string @@ -50,7 +56,7 @@ func construct(root string, dirs []dir) error { } func TestResolveSymlinks(t *testing.T) { - root, err := ioutil.TempDir(testutil.TmpDir(), "root") + root, err := ioutil.TempDir(tmpDir(), "root") if err != nil { t.Fatal("ioutil.TempDir() failed:", err) } @@ -141,7 +147,7 @@ func TestResolveSymlinks(t *testing.T) { } func TestResolveSymlinksLoop(t *testing.T) { - root, err := ioutil.TempDir(testutil.TmpDir(), "root") + root, err := ioutil.TempDir(tmpDir(), "root") if err != nil { t.Fatal("ioutil.TempDir() failed:", err) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 3b25ff79a..2936b7cdf 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -6,7 +6,6 @@ go_library( name = "container", srcs = [ "container.go", - "fs.go", "hook.go", "status.go", ], @@ -34,7 +33,6 @@ go_test( srcs = [ "console_test.go", "container_test.go", - "fs_test.go", "multi_container_test.go", "shared_volume_test.go", ], diff --git a/runsc/container/container.go b/runsc/container/container.go index 
6f092a5ce..fdcf8d7b7 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -281,18 +281,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo if specutils.ShouldCreateSandbox(spec) { log.Debugf("Creating new sandbox for container %q", id) - // Setup rootfs and mounts. It returns a new mount list with destination - // paths resolved. Since the spec for the root container is read from disk, - // Write the new spec to a new file that will be used by the sandbox. - cleanMounts, err := setupFS(spec, conf, bundleDir) - if err != nil { - return nil, fmt.Errorf("setup mounts: %v", err) - } - spec.Mounts = cleanMounts - if err := specutils.WriteCleanSpec(bundleDir, spec); err != nil { - return nil, fmt.Errorf("writing clean spec: %v", err) - } - // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all tneir children processes). cg, err := cgroup.New(spec) @@ -306,14 +294,14 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo } } if err := runInCgroup(cg, func() error { - ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) + ioFiles, specFile, err := c.createGoferProcess(spec, conf, bundleDir) if err != nil { return err } // Start a new sandbox for this container. Any errors after this point // must destroy the container. - c.Sandbox, err = sandbox.New(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles, cg) + c.Sandbox, err = sandbox.New(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles, specFile, cg) return err }); err != nil { return nil, err @@ -387,26 +375,22 @@ func (c *Container) Start(conf *boot.Config) error { return err } } else { - // Setup rootfs and mounts. It returns a new mount list with destination - // paths resolved. Replace the original spec with new mount list and start - // container. 
- cleanMounts, err := setupFS(c.Spec, conf, c.BundleDir) - if err != nil { - return fmt.Errorf("setup mounts: %v", err) - } - c.Spec.Mounts = cleanMounts - if err := specutils.WriteCleanSpec(c.BundleDir, c.Spec); err != nil { - return fmt.Errorf("writing clean spec: %v", err) - } - // Join cgroup to strt gofer process to ensure it's part of the cgroup from // the start (and all tneir children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) if err != nil { return err } + defer mountsFile.Close() + + cleanMounts, err := specutils.ReadMounts(mountsFile) + if err != nil { + return fmt.Errorf("reading mounts file: %v", err) + } + c.Spec.Mounts = cleanMounts + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) }); err != nil { return err @@ -665,12 +649,6 @@ func (c *Container) Destroy() error { errs = append(errs, err.Error()) } - if err := destroyFS(c.Spec); err != nil { - err = fmt.Errorf("destroying container fs: %v", err) - log.Warningf("%v", err) - errs = append(errs, err.Error()) - } - if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err) log.Warningf("%v", err) @@ -787,7 +765,7 @@ func (c *Container) waitForStopped() error { return backoff.Retry(op, b) } -func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) { // Start with the general config flags. 
args := conf.ToFlags() @@ -800,7 +778,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if conf.LogFilename != "" { logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) if err != nil { - return nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) + return nil, nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) } defer logFile.Close() goferEnds = append(goferEnds, logFile) @@ -811,7 +789,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund if conf.DebugLog != "" { debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer") if err != nil { - return nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) + return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) } defer debugLogFile.Close() goferEnds = append(goferEnds, debugLogFile) @@ -825,30 +803,39 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund } // Open the spec file to donate to the sandbox. - specFile, err := specutils.OpenCleanSpec(bundleDir) + specFile, err := specutils.OpenSpec(bundleDir) if err != nil { - return nil, fmt.Errorf("opening spec file: %v", err) + return nil, nil, fmt.Errorf("opening spec file: %v", err) } defer specFile.Close() goferEnds = append(goferEnds, specFile) args = append(args, "--spec-fd="+strconv.Itoa(nextFD)) nextFD++ + // Create pipe that allows gofer to send mount list to sandbox after all paths + // have been resolved. + mountsSand, mountsGofer, err := os.Pipe() + if err != nil { + return nil, nil, err + } + defer mountsGofer.Close() + goferEnds = append(goferEnds, mountsGofer) + args = append(args, fmt.Sprintf("--mounts-fd=%d", nextFD)) + nextFD++ + // Add root mount and then add any other additional mounts. mountCount := 1 - - // Add additional mounts. 
for _, m := range spec.Mounts { if specutils.Is9PMount(m) { mountCount++ } } - sandEnds := make([]*os.File, 0, mountCount) + sandEnds := make([]*os.File, 0, mountCount) for i := 0; i < mountCount; i++ { fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) if err != nil { - return nil, err + return nil, nil, err } sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) @@ -884,12 +871,12 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) if err := specutils.StartInNS(cmd, nss); err != nil { - return nil, err + return nil, nil, err } log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid c.goferIsChild = true - return sandEnds, nil + return sandEnds, mountsSand, nil } // changeStatus transitions from one status to another ensuring that the diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 06a25de6d..f17155175 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1594,6 +1594,171 @@ func TestCreateWorkingDir(t *testing.T) { } } +// TestMountPropagation verifies that mount propagates to slave but not to +// private mounts. 
+func TestMountPropagation(t *testing.T) { + // Setup dir structure: + // - src: is mounted as shared and is used as source for both private and + // slave mounts + // - dir: will be bind mounted inside src and should propagate to slave + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + src := filepath.Join(tmpDir, "src") + srcMnt := filepath.Join(src, "mnt") + dir := filepath.Join(tmpDir, "dir") + for _, path := range []string{src, srcMnt, dir} { + if err := os.MkdirAll(path, 0777); err != nil { + t.Fatalf("MkdirAll(%q): %v", path, err) + } + } + dirFile := filepath.Join(dir, "file") + f, err := os.Create(dirFile) + if err != nil { + t.Fatalf("os.Create(%q): %v", dirFile, err) + } + f.Close() + + // Setup src as a shared mount. + if err := syscall.Mount(src, src, "bind", syscall.MS_BIND, ""); err != nil { + t.Fatalf("mount(%q, %q, MS_BIND): %v", dir, srcMnt, err) + } + if err := syscall.Mount("", src, "", syscall.MS_SHARED, ""); err != nil { + t.Fatalf("mount(%q, MS_SHARED): %v", srcMnt, err) + } + + spec := testutil.NewSpecWithArgs("sleep", "1000") + + priv := filepath.Join(tmpDir, "priv") + slave := filepath.Join(tmpDir, "slave") + spec.Mounts = []specs.Mount{ + { + Source: src, + Destination: priv, + Type: "bind", + Options: []string{"private"}, + }, + { + Source: src, + Destination: slave, + Type: "bind", + Options: []string{"slave"}, + }, + } + + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + // After the container is started, mount dir 
inside source and check what + // happens to both destinations. + if err := syscall.Mount(dir, srcMnt, "bind", syscall.MS_BIND, ""); err != nil { + t.Fatalf("mount(%q, %q, MS_BIND): %v", dir, srcMnt, err) + } + + // Check that mount didn't propagate to private mount. + privFile := filepath.Join(priv, "mnt", "file") + args := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "!", "-f", privFile}, + } + if ws, err := cont.executeSync(args); err != nil || ws != 0 { + t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err) + } + + // Check that mount propagated to slave mount. + slaveFile := filepath.Join(slave, "mnt", "file") + args = &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", slaveFile}, + } + if ws, err := cont.executeSync(args); err != nil || ws != 0 { + t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err) + } +} + +func TestMountSymlink(t *testing.T) { + for _, conf := range configs(overlay) { + t.Logf("Running test with conf: %+v", conf) + + dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + + source := path.Join(dir, "source") + target := path.Join(dir, "target") + for _, path := range []string{source, target} { + if err := os.MkdirAll(path, 0777); err != nil { + t.Fatalf("os.MkdirAll(): %v", err) + } + } + f, err := os.Create(path.Join(source, "file")) + if err != nil { + t.Fatalf("os.Create(): %v", err) + } + f.Close() + + link := path.Join(dir, "link") + if err := os.Symlink(target, link); err != nil { + t.Fatalf("os.Symlink(%q, %q): %v", target, link, err) + } + + spec := testutil.NewSpecWithArgs("/bin/sleep", "1000") + + // Mount to a symlink to ensure the mount code will follow it and mount + // at the symlink target. 
+ spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: link, + Source: source, + }) + + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + cont, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + // Check that symlink was resolved and mount was created where the symlink + // is pointing to. + file := path.Join(target, "file") + args := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", file}, + } + if ws, err := cont.executeSync(args); err != nil || ws != 0 { + t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) + } + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/fs.go b/runsc/container/fs.go deleted file mode 100644 index 998160487..000000000 --- a/runsc/container/fs.go +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package container - -import ( - "bufio" - "fmt" - "os" - "path/filepath" - "strings" - "syscall" - - specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/runsc/boot" - "gvisor.googlesource.com/gvisor/runsc/specutils" -) - -type mapping struct { - set bool - val uint32 -} - -var optionsMap = map[string]mapping{ - "acl": {set: true, val: syscall.MS_POSIXACL}, - "async": {set: false, val: syscall.MS_SYNCHRONOUS}, - "atime": {set: false, val: syscall.MS_NOATIME}, - "bind": {set: true, val: syscall.MS_BIND}, - "defaults": {set: true, val: 0}, - "dev": {set: false, val: syscall.MS_NODEV}, - "diratime": {set: false, val: syscall.MS_NODIRATIME}, - "dirsync": {set: true, val: syscall.MS_DIRSYNC}, - "exec": {set: false, val: syscall.MS_NOEXEC}, - "iversion": {set: true, val: syscall.MS_I_VERSION}, - "loud": {set: false, val: syscall.MS_SILENT}, - "mand": {set: true, val: syscall.MS_MANDLOCK}, - "noacl": {set: false, val: syscall.MS_POSIXACL}, - "noatime": {set: true, val: syscall.MS_NOATIME}, - "nodev": {set: true, val: syscall.MS_NODEV}, - "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, - "noexec": {set: true, val: syscall.MS_NOEXEC}, - "noiversion": {set: false, val: syscall.MS_I_VERSION}, - "nomand": {set: false, val: syscall.MS_MANDLOCK}, - "norelatime": {set: false, val: syscall.MS_RELATIME}, - "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, - "nosuid": {set: true, val: syscall.MS_NOSUID}, - "private": {set: true, val: syscall.MS_PRIVATE}, - "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, - "relatime": {set: true, val: syscall.MS_RELATIME}, - "remount": {set: true, val: syscall.MS_REMOUNT}, - "ro": {set: true, val: syscall.MS_RDONLY}, - "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, - "rw": {set: false, val: syscall.MS_RDONLY}, - "silent": {set: true, val: syscall.MS_SILENT}, - "strictatime": {set: true, val: syscall.MS_STRICTATIME}, - 
"suid": {set: false, val: syscall.MS_NOSUID}, - "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, -} - -// setupFS creates the container directory structure under 'spec.Root.Path'. -// This allows the gofer serving the containers to be chroot under this -// directory to create an extra layer to security in case the gofer gets -// compromised. -// Returns list of mounts equivalent to 'spec.Mounts' with all destination paths -// cleaned and with symlinks resolved. -func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mount, error) { - rv := make([]specs.Mount, 0, len(spec.Mounts)) - for _, m := range spec.Mounts { - if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { - rv = append(rv, m) - continue - } - - // It's possible that 'm.Destination' follows symlinks inside the - // container. - dst, err := resolveSymlinks(spec.Root.Path, m.Destination) - if err != nil { - return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) - } - - flags := optionsToFlags(m.Options) - flags |= syscall.MS_BIND - log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) - if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { - return nil, fmt.Errorf("mounting %v: %v", m, err) - } - - // Make the mount a slave, so that for recursive bind mount, umount won't - // propagate to the source. 
- flags = syscall.MS_SLAVE | syscall.MS_REC - if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { - return nil, fmt.Errorf("mount rslave dst: %q, flags: %#x, err: %v", dst, flags, err) - } - - cpy := m - relDst, err := filepath.Rel(spec.Root.Path, dst) - if err != nil { - panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, spec.Root.Path, err)) - } - cpy.Destination = filepath.Join("/", relDst) - rv = append(rv, cpy) - } - - if spec.Process.Cwd != "" { - dst, err := resolveSymlinks(spec.Root.Path, spec.Process.Cwd) - if err != nil { - return nil, fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) - } - if err := os.MkdirAll(dst, 0755); err != nil { - return nil, err - } - } - - // If root is read only, check if it needs to be remounted as readonly. - if spec.Root.Readonly { - isMountPoint, readonly, err := mountInfo(spec.Root.Path) - if err != nil { - return nil, err - } - if readonly { - return rv, nil - } - if !isMountPoint { - // Readonly root is not a mount point nor read-only. Can't do much other - // than just logging a warning. The gofer will prevent files to be open - // in write mode. - log.Warningf("Mount where root is located is not read-only and cannot be changed: %q", spec.Root.Path) - return rv, nil - } - - // If root is a mount point but not read-only, we can change mount options - // to make it read-only for extra safety. - log.Infof("Remounting root as readonly: %q", spec.Root.Path) - flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) - src := spec.Root.Path - if err := syscall.Mount(src, src, "bind", flags, ""); err != nil { - return nil, fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", spec.Root.Path, spec.Root.Path, flags, err) - } - } - return rv, nil -} - -// mountInfo returns whether the path is a mount point and whether the mount -// that path belongs to is read-only. 
-func mountInfo(path string) (bool, bool, error) { - // Mounts are listed by their real paths. - realPath, err := filepath.EvalSymlinks(path) - if err != nil { - return false, false, err - } - f, err := os.Open("/proc/mounts") - if err != nil { - return false, false, err - } - scanner := bufio.NewScanner(f) - - var mountPoint string - var readonly bool - for scanner.Scan() { - line := scanner.Text() - parts := strings.Split(line, " ") - if len(parts) < 4 { - return false, false, fmt.Errorf("invalid /proc/mounts line format %q", line) - } - mp := parts[1] - opts := strings.Split(parts[3], ",") - - // Find the closest submount to the path. - if strings.Contains(realPath, mp) && len(mp) > len(mountPoint) { - mountPoint = mp - readonly = specutils.ContainsStr(opts, "ro") - } - } - if err := scanner.Err(); err != nil { - return false, false, err - } - return mountPoint == realPath, readonly, nil -} - -// destroyFS unmounts mounts done by runsc under `spec.Root.Path`. This -// recovers the container rootfs into the original state. -func destroyFS(spec *specs.Spec) error { - for _, m := range spec.Mounts { - if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { - continue - } - - // It's possible that 'm.Destination' follows symlinks inside the - // container. - dst, err := resolveSymlinks(spec.Root.Path, m.Destination) - if err != nil { - return err - } - - flags := syscall.MNT_DETACH - log.Infof("Unmounting dst: %q, flags: %#x", dst, flags) - // Do not return error if dst is not a mountpoint. - // Based on http://man7.org/linux/man-pages/man2/umount.2.html - // For kernel version 2.6+ and MNT_DETACH flag, EINVAL means - // the dst is not a mount point. - if err := syscall.Unmount(dst, flags); err != nil && - !os.IsNotExist(err) && err != syscall.EINVAL { - return err - } - } - return nil -} - -// resolveSymlinks walks 'rel' having 'root' as the root directory. 
If there are -// symlinks, they are evaluated relative to 'root' to ensure the end result is -// the same as if the process was running inside the container. -func resolveSymlinks(root, rel string) (string, error) { - return resolveSymlinksImpl(root, root, rel, 255) -} - -func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { - if followCount == 0 { - return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) - } - - rel = filepath.Clean(rel) - for _, name := range strings.Split(rel, string(filepath.Separator)) { - if name == "" { - continue - } - // Note that Join() resolves things like ".." and returns a clean path. - path := filepath.Join(base, name) - if !strings.HasPrefix(path, root) { - // One cannot '..' their way out of root. - path = root - continue - } - fi, err := os.Lstat(path) - if err != nil { - if !os.IsNotExist(err) { - return "", err - } - // Not found means there is no symlink to check. Just keep walking dirs. - base = path - continue - } - if fi.Mode()&os.ModeSymlink != 0 { - link, err := os.Readlink(path) - if err != nil { - return "", err - } - if filepath.IsAbs(link) { - base = root - } - base, err = resolveSymlinksImpl(root, base, link, followCount-1) - if err != nil { - return "", err - } - continue - } - base = path - } - return base, nil -} - -func optionsToFlags(opts []string) uint32 { - var rv uint32 - for _, opt := range opts { - if m, ok := optionsMap[opt]; ok { - if m.set { - rv |= m.val - } else { - rv ^= m.val - } - } else { - log.Warningf("Ignoring mount option %q", opt) - } - } - return rv -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2698e3f86..ae6375e13 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -75,7 +75,7 @@ type Sandbox struct { // New creates the sandbox process. The caller must call Destroy() on the // sandbox. 
-func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, cg *cgroup.Cgroup) (*Sandbox, error) { +func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, specFile *os.File, cg *cgroup.Cgroup) (*Sandbox, error) { s := &Sandbox{ID: id, Cgroup: cg} // The Cleanup object cleans up partially created sandboxes when an error // occurs. Any errors occurring during cleanup itself are ignored. @@ -86,17 +86,14 @@ func New(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke defer c.Clean() // Create pipe to synchronize when sandbox process has been booted. - fds := make([]int, 2) - if err := syscall.Pipe(fds); err != nil { + clientSyncFile, sandboxSyncFile, err := os.Pipe() + if err != nil { return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) } - clientSyncFile := os.NewFile(uintptr(fds[0]), "client sandbox sync") defer clientSyncFile.Close() - sandboxSyncFile := os.NewFile(uintptr(fds[1]), "sandbox sync") - // Create the sandbox process. - err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, sandboxSyncFile) + err = s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, userLog, ioFiles, specFile, sandboxSyncFile) // sandboxSyncFile has to be closed to be able to detect when the sandbox // process exits unexpectedly. sandboxSyncFile.Close() @@ -294,7 +291,7 @@ func (s *Sandbox) connError(err error) error { // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. 
-func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, startSyncFile *os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, userLog string, ioFiles []*os.File, mountsFile, startSyncFile *os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -345,10 +342,14 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD)) nextFD++ - // Open the spec file to donate to the sandbox. - specFile, err := specutils.OpenCleanSpec(bundleDir) + defer mountsFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, mountsFile) + cmd.Args = append(cmd.Args, "--mounts-fd="+strconv.Itoa(nextFD)) + nextFD++ + + specFile, err := specutils.OpenSpec(bundleDir) if err != nil { - return fmt.Errorf("opening spec file: %v", err) + return err } defer specFile.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, specFile) diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 372799850..15476de6f 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "specutils", srcs = [ + "fs.go", "namespace.go", "specutils.go", ], diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go new file mode 100644 index 000000000..b812a5fbd --- /dev/null +++ b/runsc/specutils/fs.go @@ -0,0 +1,139 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "path" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +type mapping struct { + set bool + val uint32 +} + +// optionsMap maps non-propagation OCI filesystem mount options to mount(2) +// syscall flags. +var optionsMap = map[string]mapping{ + "acl": {set: true, val: syscall.MS_POSIXACL}, + "async": {set: false, val: syscall.MS_SYNCHRONOUS}, + "atime": {set: false, val: syscall.MS_NOATIME}, + "bind": {set: true, val: syscall.MS_BIND}, + "defaults": {set: true, val: 0}, + "dev": {set: false, val: syscall.MS_NODEV}, + "diratime": {set: false, val: syscall.MS_NODIRATIME}, + "dirsync": {set: true, val: syscall.MS_DIRSYNC}, + "exec": {set: false, val: syscall.MS_NOEXEC}, + "iversion": {set: true, val: syscall.MS_I_VERSION}, + "loud": {set: false, val: syscall.MS_SILENT}, + "mand": {set: true, val: syscall.MS_MANDLOCK}, + "noacl": {set: false, val: syscall.MS_POSIXACL}, + "noatime": {set: true, val: syscall.MS_NOATIME}, + "nodev": {set: true, val: syscall.MS_NODEV}, + "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, + "noiversion": {set: false, val: syscall.MS_I_VERSION}, + "nomand": {set: false, val: syscall.MS_MANDLOCK}, + "norelatime": {set: false, val: syscall.MS_RELATIME}, + "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, + "nosuid": {set: true, val: syscall.MS_NOSUID}, + "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, + "relatime": {set: true, val: syscall.MS_RELATIME}, + "remount": {set: true, val: 
syscall.MS_REMOUNT}, + "ro": {set: true, val: syscall.MS_RDONLY}, + "rw": {set: false, val: syscall.MS_RDONLY}, + "silent": {set: true, val: syscall.MS_SILENT}, + "strictatime": {set: true, val: syscall.MS_STRICTATIME}, + "suid": {set: false, val: syscall.MS_NOSUID}, + "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, +} + +// propOptionsMap is similar to optionsMap, but it lists propagation options +// that cannot be used together with other flags. +var propOptionsMap = map[string]mapping{ + "private": {set: true, val: syscall.MS_PRIVATE}, + "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, + "slave": {set: true, val: syscall.MS_SLAVE}, + "rslave": {set: true, val: syscall.MS_SLAVE | syscall.MS_REC}, + "unbindable": {set: true, val: syscall.MS_UNBINDABLE}, + "runbindable": {set: true, val: syscall.MS_UNBINDABLE | syscall.MS_REC}, +} + +// invalidOptions lists options not allowed. +// - shared: sandbox must be isolated from the host. Propagating mount changes +// from the sandbox to the host breaks the isolation. +// - noexec: not yet supported. Don't ignore it since it could break +// in-sandbox security. +var invalidOptions = []string{"shared", "rshared", "noexec"} + +// OptionsToFlags converts mount options to syscall flags. +func OptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, optionsMap) +} + +// PropOptionsToFlags converts propagation mount options to syscall flags. +// Propagation options cannot be set together with other options and must be +// handled separately. +func PropOptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, propOptionsMap) +} + +func optionsToFlags(opts []string, source map[string]mapping) uint32 { + var rv uint32 + for _, opt := range opts { + if m, ok := source[opt]; ok { + if m.set { + rv |= m.val + } else { + rv ^= m.val + } + } + } + return rv +} + +// validateMount validates that spec mounts are correct. 
+func validateMount(mnt *specs.Mount) error { + if !path.IsAbs(mnt.Destination) { + return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt) + } + + if mnt.Type == "bind" { + for _, o := range mnt.Options { + if ContainsStr(invalidOptions, o) { + return fmt.Errorf("mount option %q is not supported: %v", o, mnt) + } + _, ok1 := optionsMap[o] + _, ok2 := propOptionsMap[o] + if !ok1 && !ok2 { + log.Warningf("Ignoring unknown mount option %q", o) + } + } + } + return nil +} + +// validateRootfsPropagation validates that rootfs propagation options are +// correct. +func validateRootfsPropagation(opt string) error { + flags := PropOptionsToFlags([]string{opt}) + if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 { + return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt) + } + return nil +} diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 73fab13e1..35da789f4 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -105,9 +105,9 @@ func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNam return out } -// SetNS sets the namespace of the given type. It must be called with +// setNS sets the namespace of the given type. It must be called with // OSThreadLocked. -func SetNS(fd, nsType uintptr) error { +func setNS(fd, nsType uintptr) error { if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { return err } @@ -119,30 +119,30 @@ func SetNS(fd, nsType uintptr) error { // // Preconditions: Must be called with os thread locked. func ApplyNS(ns specs.LinuxNamespace) (func(), error) { - log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) + log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path) newNS, err := os.Open(ns.Path) if err != nil { return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) } defer newNS.Close() - // Store current netns to restore back after child is started. 
+ // Store current namespace to restore back. curPath := nsPath(ns.Type) oldNS, err := os.Open(curPath) if err != nil { return nil, fmt.Errorf("error opening %q: %v", curPath, err) } - // Set netns to the one requested and setup function to restore it back. + // Set namespace to the one requested and setup function to restore it back. flag := nsCloneFlag(ns.Type) - if err := SetNS(newNS.Fd(), flag); err != nil { + if err := setNS(newNS.Fd(), flag); err != nil { oldNS.Close() return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) } return func() { - log.Infof("restoring namespace %v", ns.Type) + log.Infof("Restoring namespace %v", ns.Type) defer oldNS.Close() - if err := SetNS(oldNS.Fd(), flag); err != nil { + if err := setNS(oldNS.Fd(), flag); err != nil { panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) } }, nil diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 4e7893ab4..cbf099c64 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -92,9 +92,14 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("Seccomp spec is being ignored") } - for i, m := range spec.Mounts { - if !path.IsAbs(m.Destination) { - return fmt.Errorf("Spec.Mounts[%d] Mount.Destination must be an absolute path: %v", i, m) + if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { + if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { + return err + } + } + for _, m := range spec.Mounts { + if err := validateMount(&m); err != nil { + return err } } @@ -129,15 +134,19 @@ func absPath(base, rel string) string { return filepath.Join(base, rel) } +// OpenSpec opens an OCI runtime spec from the given bundle directory. +func OpenSpec(bundleDir string) (*os.File, error) { + // The spec file must be named "config.json" inside the bundle directory. 
+ return os.Open(filepath.Join(bundleDir, "config.json")) +} + // ReadSpec reads an OCI runtime spec from the given bundle directory. // ReadSpec also normalizes all potential relative paths into absolute // path, e.g. spec.Root.Path, mount.Source. func ReadSpec(bundleDir string) (*specs.Spec, error) { - // The spec file must be in "config.json" inside the bundle directory. - specPath := filepath.Join(bundleDir, "config.json") - specFile, err := os.Open(specPath) + specFile, err := OpenSpec(bundleDir) if err != nil { - return nil, fmt.Errorf("error opening spec file %q: %v", specPath, err) + return nil, fmt.Errorf("error opening spec file %q: %v", specFile.Name(), err) } defer specFile.Close() return ReadSpecFromFile(bundleDir, specFile) @@ -171,27 +180,17 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) return &spec, nil } -// OpenCleanSpec opens spec file that has destination mount paths resolved to -// their absolute location. -func OpenCleanSpec(bundleDir string) (*os.File, error) { - f, err := os.Open(filepath.Join(bundleDir, "config.clean.json")) +// ReadMounts reads mount list from a file. +func ReadMounts(f *os.File) ([]specs.Mount, error) { + bytes, err := ioutil.ReadAll(f) if err != nil { - return nil, err + return nil, fmt.Errorf("error reading mounts: %v", err) } - if _, err := f.Seek(0, os.SEEK_SET); err != nil { - f.Close() - return nil, fmt.Errorf("error seeking to beginning of file %q: %v", f.Name(), err) - } - return f, nil -} - -// WriteCleanSpec writes a spec file that has destination mount paths resolved. 
-func WriteCleanSpec(bundleDir string, spec *specs.Spec) error { - bytes, err := json.Marshal(spec) - if err != nil { - return err + var mounts []specs.Mount + if err := json.Unmarshal(bytes, &mounts); err != nil { + return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(bytes)) } - return ioutil.WriteFile(filepath.Join(bundleDir, "config.clean.json"), bytes, 0755) + return mounts, nil } // Capabilities takes in spec and returns a TaskCapabilities corresponding to @@ -407,8 +406,7 @@ func Mount(src, dst, typ string, flags uint32) error { // source (file or directory). var isDir bool if typ == "proc" { - // Special case, as there is no source directory for proc - // mounts. + // Special case, as there is no source directory for proc mounts. isDir = true } else if fi, err := os.Stat(src); err != nil { return fmt.Errorf("Stat(%q) failed: %v", src, err) diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go index b61f1ca62..02af6e6ad 100644 --- a/runsc/specutils/specutils_test.go +++ b/runsc/specutils/specutils_test.go @@ -219,6 +219,37 @@ func TestSpecInvalid(t *testing.T) { }, error: "must be an absolute path", }, + { + name: "invalid mount option", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Mounts: []specs.Mount{ + { + Source: "/src", + Destination: "/dst", + Type: "bind", + Options: []string{"shared"}, + }, + }, + }, + error: "is not supported", + }, + { + name: "invalid rootfs propagation", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Linux: &specs.Linux{ + RootfsPropagation: "foo", + }, + }, + error: "root mount propagation option must specify private or slave", + }, } { err := ValidateSpec(&test.spec) if len(test.error) == 0 { |