6 files changed, 155 insertions, 105 deletions
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e2c5f5fb1..ddf288456 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -254,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
 
 // addSubmountOverlay overlays the inode over a ramfs tree containing the given
 // paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
 	// Construct a ramfs tree of mount points. The contents never
 	// change, so this can be fully caching. There's no real
 	// filesystem backing this tree, so we set the filesystem to
@@ -264,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
 	if err != nil {
 		return nil, fmt.Errorf("creating mount tree: %v", err)
 	}
-	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding mount overlay: %v", err)
 	}
@@ -741,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con
 	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
 	// mounted even if they are not in the spec.
 	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
 	if err != nil {
 		return nil, fmt.Errorf("adding submount overlay: %v", err)
 	}
@@ -851,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi
 	submounts := subtargets(m.Destination, c.mounts)
 	if len(submounts) > 0 {
 		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
 		if err != nil {
 			return fmt.Errorf("adding submount overlay: %v", err)
 		}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index dc9861389..bf9ec5d38 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -491,9 +491,9 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			}
 
 			ctx := l.k.SupervisorContext()
-			mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
+			mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
 			if err != nil {
-				t.Fatalf("failed to setupVFS2: %v", err)
+				t.Fatalf("mountAll: %v", err)
 			}
 
 			root := mns.Root()
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 66b6cf19b..7844ea28c 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -134,7 +134,7 @@ func registerFilesystems(k *kernel.Kernel) error {
 }
 
 func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
-	mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+	mns, err := mntr.mountAll(conf, procArgs)
 	if err != nil {
 		return fmt.Errorf("failed to setupFS: %w", err)
 	}
@@ -149,7 +149,7 @@ func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containe
 	return nil
 }
 
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
 	log.Infof("Configuring container's file system with VFS2")
 
 	// Create context with root credentials to mount the filesystem (the current
@@ -172,24 +172,44 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, p
 	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
 		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
 	}
+
+	if c.root.Readonly || conf.Overlay {
+		// Switch to ReadOnly after all submounts are set up.
+		root := mns.Root()
+		defer root.DecRef(rootCtx)
+		if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+			return nil, fmt.Errorf(`failed to set mount at "/" readonly: %v`, err)
+		}
+	}
+
 	return mns, nil
 }
 
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+// The mount is created ReadWrite so that mount points for submounts can be
+// created. ** The caller is responsible for switching to ReadOnly if needed **
 func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+	data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
 
 	if conf.OverlayfsStaleRead {
 		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
 		// can only send mount options for specs.Mounts (specs.Root is missing
 		// Options field). So assume root is always on top of overlayfs.
-		opts = append(opts, "overlayfs_stale_read")
+		data = append(data, "overlayfs_stale_read")
 	}
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
-		Data: strings.Join(opts, ","),
-	})
+	opts := &vfs.MountOptions{
+		// Always mount as ReadWrite to allow other mounts on top of it. It'll be
+		// made ReadOnly in the caller (if needed).
+		ReadOnly: false,
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			Data: strings.Join(data, ","),
+		},
+		InternalMount: true,
+	}
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, opts)
 	if err != nil {
 		return nil, fmt.Errorf("setting up mount namespace: %w", err)
 	}
@@ -227,6 +247,7 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.
 			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
 				return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err)
 			}
+			// Restore back to ReadOnly at the end.
 			defer func() {
 				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
 					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 6082068c7..33ada5bb9 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -41,6 +41,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
 	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/specutils"
@@ -1490,6 +1491,8 @@ func TestMountNewDir(t *testing.T) {
 				Source:      srcDir,
 				Type:        "bind",
 			})
+			// Extra points for creating the mount with a readonly root.
+			spec.Root.Readonly = true
 
 			if err := run(spec, conf); err != nil {
 				t.Fatalf("error running sandbox: %v", err)
@@ -1499,17 +1502,17 @@ func TestMountNewDir(t *testing.T) {
 }
 
 func TestReadonlyRoot(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
-			spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+			spec := testutil.NewSpecWithArgs("sleep", "100")
 			spec.Root.Readonly = true
+
 			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
 			if err != nil {
 				t.Fatalf("error setting up container: %v", err)
 			}
 			defer cleanup()
 
-			// Create, start and wait for the container.
 			args := Args{
 				ID:        testutil.RandomContainerID(),
 				Spec:      spec,
@@ -1524,12 +1527,82 @@ func TestReadonlyRoot(t *testing.T) {
 				t.Fatalf("error starting container: %v", err)
 			}
 
-			ws, err := c.Wait()
+			// Read mounts to check that root is readonly.
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '")
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("root mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("root not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", "/foo")
 			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
+				t.Fatalf("touch file in ro mount: %v", err)
 			}
 			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
+				t.Fatalf("wrong waitStatus: %v", ws)
+			}
+		})
+	}
+}
+
+func TestReadonlyMount(t *testing.T) {
+	for name, conf := range configsWithVFS2(t, all...) {
+		t.Run(name, func(t *testing.T) {
+			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+			if err != nil {
+				t.Fatalf("ioutil.TempDir() failed: %v", err)
+			}
+			spec := testutil.NewSpecWithArgs("sleep", "100")
+			spec.Mounts = append(spec.Mounts, specs.Mount{
+				Destination: dir,
+				Source:      dir,
+				Type:        "bind",
+				Options:     []string{"ro"},
+			})
+			spec.Root.Readonly = false
+
+			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer cleanup()
+
+			args := Args{
+				ID:        testutil.RandomContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Read mounts to check that volume is readonly.
+			cmd := fmt.Sprintf("mount | grep ' %s '", dir)
+			out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd)
+			if err != nil || ws != 0 {
+				t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+			}
+			t.Logf("mount: %q", out)
+			if !strings.Contains(string(out), "(ro)") {
+				t.Errorf("volume not mounted readonly: %q", out)
+			}
+
+			// Check that file cannot be created.
+			ws, err = execute(c, "/bin/touch", path.Join(dir, "file"))
+			if err != nil {
+				t.Fatalf("touch file in ro mount: %v", err)
+			}
+			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+				t.Fatalf("wrong WaitStatus: %v", ws)
 			}
 		})
 	}
@@ -1616,54 +1689,6 @@ func TestUIDMap(t *testing.T) {
 	}
 }
 
-func TestReadonlyMount(t *testing.T) {
-	for name, conf := range configsWithVFS2(t, overlay) {
-		t.Run(name, func(t *testing.T) {
-			dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
-			spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
-			if err != nil {
-				t.Fatalf("ioutil.TempDir() failed: %v", err)
-			}
-			spec.Mounts = append(spec.Mounts, specs.Mount{
-				Destination: dir,
-				Source:      dir,
-				Type:        "bind",
-				Options:     []string{"ro"},
-			})
-			spec.Root.Readonly = false
-
-			_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
-			if err != nil {
-				t.Fatalf("error setting up container: %v", err)
-			}
-			defer cleanup()
-
-			// Create, start and wait for the container.
-			args := Args{
-				ID:        testutil.RandomContainerID(),
-				Spec:      spec,
-				BundleDir: bundleDir,
-			}
-			c, err := New(conf, args)
-			if err != nil {
-				t.Fatalf("error creating container: %v", err)
-			}
-			defer c.Destroy()
-			if err := c.Start(conf); err != nil {
-				t.Fatalf("error starting container: %v", err)
-			}
-
-			ws, err := c.Wait()
-			if err != nil {
-				t.Fatalf("error waiting on container: %v", err)
-			}
-			if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
-				t.Fatalf("container failed, waitStatus: %v", ws)
-			}
-		})
-	}
-}
-
 // TestAbbreviatedIDs checks that runsc supports using abbreviated container
 // IDs in place of full IDs.
 func TestAbbreviatedIDs(t *testing.T) {
@@ -2116,21 +2141,13 @@ func TestMountPropagation(t *testing.T) {
 
 	// Check that mount didn't propagate to private mount.
 	privFile := filepath.Join(priv, "mnt", "file")
-	execArgs := &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "!", "-f", privFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 
 	// Check that mount propagated to slave mount.
 	slaveFile := filepath.Join(slave, "mnt", "file")
-	execArgs = &control.ExecArgs{
-		Filename: "/usr/bin/test",
-		Argv:     []string{"test", "-f", slaveFile},
-	}
-	if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+	if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 {
 		t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err)
 	}
 }
@@ -2196,11 +2213,7 @@ func TestMountSymlink(t *testing.T) {
 			// Check that symlink was resolved and mount was created where the symlink
 			// is pointing to.
 			file := path.Join(target, "file")
-			execArgs := &control.ExecArgs{
-				Filename: "/usr/bin/test",
-				Argv:     []string{"test", "-f", file},
-			}
-			if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+			if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 {
 				t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
 			}
 		})
@@ -2326,6 +2339,35 @@ func TestTTYField(t *testing.T) {
 	}
 }
 
+func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) { // Test helper: exec name with arg inside cont via executeSync.
+	args := &control.ExecArgs{
+		Filename: name,
+		Argv:     append([]string{name}, arg...), // argv[0] is the filename itself, followed by arg.
+	}
+	return cont.executeSync(args) // Hands back executeSync's wait status and error unchanged.
+}
+
+func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) { // Like execute, but also returns the bytes the process wrote to the pipe.
+	r, w, err := os.Pipe()
+	if err != nil {
+		return nil, 0, err
+	}
+	defer r.Close()
+
+	args := &control.ExecArgs{
+		Filename:    name,
+		Argv:        append([]string{name}, arg...),
+		FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}}, // Presumably stdin, stdout, stderr in that order, with both outputs sharing w — confirm.
+	}
+	ws, err := cont.executeSync(args)
+	w.Close() // Close our copy of the write end on every path; the read below needs all writers closed to see EOF.
+	if err != nil {
+		return nil, 0, err
+	}
+	out, err := ioutil.ReadAll(r) // NOTE(review): drained only after executeSync returns; output larger than the pipe capacity might stall the process — confirm.
+	return out, ws, err
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 5b790c6c8..952215ec1 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -1517,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	}
 
 	// Check that container isn't running anymore.
-	args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-	if _, err := c.executeSync(args); err == nil {
+	if _, err := execute(c, "/bin/true"); err == nil {
 		t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 	}
 
@@ -1533,8 +1532,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
 		}
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err != nil {
+		if _, err := execute(c, "/bin/true"); err != nil {
 			t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
 		}
 	}
@@ -1556,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 
 	// Check that entire sandbox isn't running anymore.
 	for _, c := range containers {
-		args := &control.ExecArgs{Argv: []string{"/bin/true"}}
-		if _, err := c.executeSync(args); err == nil {
+		if _, err := execute(c, "/bin/true"); err == nil {
 			t.Fatalf("Container %q was not stopped after gofer death", c.ID)
 		}
 	}
@@ -1719,12 +1716,11 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 				homeDirs[name] = homeFile
 			}
 
-			// We will sleep in the root container in order to ensure that
-			// the root container doesn't terminate before sub containers can be
-			// created.
+			// We will sleep in the root container in order to ensure that the root
+			// container doesn't terminate before sub containers can be created.
 			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
 			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
-			execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())}
+			execCmd := fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())
 
 			// Setup the containers, a root container and sub container.
 			specConfig, ids := createSpecs(rootCmd, subCmd)
@@ -1735,9 +1731,8 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 			defer cleanup()
 
 			// Exec into the root container synchronously.
-			args := &control.ExecArgs{Argv: execCmd}
-			if _, err := containers[0].executeSync(args); err != nil {
-				t.Errorf("error executing %+v: %v", args, err)
+			if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil {
+				t.Errorf("error executing %+v: %v", execCmd, err)
 			}
 
 			// Wait for the subcontainer to finish.
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index 4ea8fefee..cb5bffb89 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) {
 
 func checkFile(c *Container, filename string, want []byte) error {
 	cpy := filename + ".copy"
-	argsCp := &control.ExecArgs{
-		Filename: "/bin/cp",
-		Argv:     []string{"cp", "-f", filename, cpy},
-	}
-	if _, err := c.executeSync(argsCp); err != nil {
+	if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil {
 		return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err)
 	}
 	got, err := ioutil.ReadFile(cpy)
@@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) {
 	}
 
 	// Append to file inside the container and check that content is not lost.
-	argsAppend := &control.ExecArgs{
-		Filename: "/bin/bash",
-		Argv:     []string{"bash", "-c", "echo -n sandbox- >> " + filename},
-	}
-	if _, err := c.executeSync(argsAppend); err != nil {
+	if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil {
 		t.Fatalf("unexpected error appending file %q: %v", filename, err)
 	}
 	want = []byte("host-sandbox-")