summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
Diffstat (limited to 'runsc')
-rw-r--r--runsc/boot/BUILD11
-rw-r--r--runsc/boot/config.go5
-rw-r--r--runsc/boot/fds.go33
-rw-r--r--runsc/boot/fs.go9
-rw-r--r--runsc/boot/loader.go31
-rw-r--r--runsc/boot/loader_amd64.go5
-rw-r--r--runsc/boot/loader_arm64.go5
-rw-r--r--runsc/boot/loader_test.go37
-rw-r--r--runsc/boot/user.go64
-rw-r--r--runsc/boot/vfs.go310
-rw-r--r--runsc/container/container_test.go14
-rw-r--r--runsc/main.go3
12 files changed, 506 insertions, 21 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 23f42382f..5451f1eba 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -21,6 +21,7 @@ go_library(
"network.go",
"strace.go",
"user.go",
+ "vfs.go",
],
visibility = [
"//runsc:__subpackages__",
@@ -33,6 +34,7 @@ go_library(
"//pkg/control/server",
"//pkg/cpuid",
"//pkg/eventchannel",
+ "//pkg/fspath",
"//pkg/log",
"//pkg/memutil",
"//pkg/rand",
@@ -40,6 +42,7 @@ go_library(
"//pkg/sentry/arch",
"//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/control",
+ "//pkg/sentry/devices/memdev",
"//pkg/sentry/fs",
"//pkg/sentry/fs/dev",
"//pkg/sentry/fs/gofer",
@@ -49,6 +52,12 @@ go_library(
"//pkg/sentry/fs/sys",
"//pkg/sentry/fs/tmpfs",
"//pkg/sentry/fs/tty",
+ "//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/gofer",
+ "//pkg/sentry/fsimpl/host",
+ "//pkg/sentry/fsimpl/proc",
+ "//pkg/sentry/fsimpl/sys",
+ "//pkg/sentry/fsimpl/tmpfs",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel:uncaught_signal_go_proto",
@@ -71,6 +80,7 @@ go_library(
"//pkg/sentry/time",
"//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
"//pkg/sentry/usage",
+ "//pkg/sentry/vfs",
"//pkg/sentry/watchdog",
"//pkg/sync",
"//pkg/syserror",
@@ -114,6 +124,7 @@ go_test(
"//pkg/p9",
"//pkg/sentry/contexttest",
"//pkg/sentry/fs",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sync",
"//pkg/unet",
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 7ea5bfade..715a19112 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -305,5 +305,10 @@ func (c *Config) ToFlags() []string {
if len(c.TestOnlyTestNameEnv) != 0 {
f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
}
+
+ if c.VFS2 {
+ f = append(f, "--vfs2=true")
+ }
+
return f
}
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 5314b0f2a..7e49f6f9f 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
+ vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
"gvisor.dev/gvisor/pkg/sentry/kernel"
)
@@ -31,6 +32,10 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
+ if kernel.VFS2Enabled {
+ return createFDTableVFS2(ctx, console, stdioFDs)
+ }
+
k := kernel.KernelFromContext(ctx)
fdTable := k.NewFDTable()
defer fdTable.DecRef()
@@ -78,3 +83,31 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
fdTable.IncRef()
return fdTable, nil
}
+
+func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
+ k := kernel.KernelFromContext(ctx)
+ fdTable := k.NewFDTable()
+ defer fdTable.DecRef()
+
+ hostMount, err := vfshost.NewMount(k.VFS())
+ if err != nil {
+ return nil, fmt.Errorf("creating host mount: %w", err)
+ }
+
+ for appFD, hostFD := range stdioFDs {
+ // TODO(gvisor.dev/issue/1482): Add TTY support.
+ appFile, err := vfshost.ImportFD(hostMount, hostFD, false)
+ if err != nil {
+ return nil, err
+ }
+
+ if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
+ appFile.DecRef()
+ return nil, err
+ }
+ appFile.DecRef()
+ }
+
+ fdTable.IncRef()
+ return fdTable, nil
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 82cc612d2..98cce60af 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -278,6 +278,9 @@ func subtargets(root string, mnts []specs.Mount) []string {
}
func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+ if conf.VFS2 {
+ return setupContainerVFS2(ctx, conf, mntr, procArgs)
+ }
mns, err := mntr.setupFS(conf, procArgs)
if err != nil {
return err
@@ -573,6 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
// should be mounted (e.g. a volume shared between containers). It must be
// called for the root container only.
func (c *containerMounter) processHints(conf *Config) error {
+ if conf.VFS2 {
+ return nil
+ }
ctx := c.k.SupervisorContext()
for _, hint := range c.hints.mounts {
// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
@@ -781,9 +787,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
default:
- // TODO(nlacasse): Support all the mount types and make this a fatal error.
- // Most applications will "just work" without them, so this is a warning
- // for now.
log.Warningf("ignoring unknown filesystem type %q", m.Type)
}
return fsName, opts, useOverlay, nil
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 654441f65..cf1f47bc7 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -26,7 +26,6 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
- "gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
@@ -73,6 +72,8 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
+var syscallTable *kernel.SyscallTable
+
// Loader keeps state needed to start the kernel and run the container..
type Loader struct {
// k is the kernel.
@@ -195,13 +196,14 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("setting up memory usage: %v", err)
}
- if args.Conf.VFS2 {
- st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host)
- if ok {
- vfs2.Override(st.Table)
- }
+ // Patch the syscall table.
+ kernel.VFS2Enabled = args.Conf.VFS2
+ if kernel.VFS2Enabled {
+ vfs2.Override(syscallTable.Table)
}
+ kernel.RegisterSyscallTable(syscallTable)
+
// Create kernel and platform.
p, err := createPlatform(args.Conf, args.Device)
if err != nil {
@@ -392,11 +394,16 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.
return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
}
+ wd := spec.Process.Cwd
+ if wd == "" {
+ wd = "/"
+ }
+
// Create the process arguments.
procArgs := kernel.CreateProcessArgs{
Argv: spec.Process.Args,
Envv: spec.Process.Env,
- WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty.
+ WorkingDirectory: wd,
Credentials: creds,
Umask: 0022,
Limits: ls,
@@ -541,7 +548,15 @@ func (l *Loader) run() error {
}
// Add the HOME enviroment variable if it is not already set.
- envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+ var envv []string
+ if kernel.VFS2Enabled {
+ envv, err = maybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
+ l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+
+ } else {
+ envv, err = maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
+ l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+ }
if err != nil {
return err
}
diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go
index b9669f2ac..78df86611 100644
--- a/runsc/boot/loader_amd64.go
+++ b/runsc/boot/loader_amd64.go
@@ -17,11 +17,10 @@
package boot
import (
- "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
)
func init() {
- // Register the global syscall table.
- kernel.RegisterSyscallTable(linux.AMD64)
+ // Set the global syscall table.
+ syscallTable = linux.AMD64
}
diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go
index cf64d28c8..250785010 100644
--- a/runsc/boot/loader_arm64.go
+++ b/runsc/boot/loader_arm64.go
@@ -17,11 +17,10 @@
package boot
import (
- "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
)
func init() {
- // Register the global syscall table.
- kernel.RegisterSyscallTable(linux.ARM64)
+ // Set the global syscall table.
+ syscallTable = linux.ARM64
}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index c9a75b76d..e7c71734f 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -30,6 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/runsc/fsgofer"
@@ -66,6 +67,11 @@ func testSpec() *specs.Spec {
}
}
+func resetSyscallTable() {
+ kernel.VFS2Enabled = false
+ kernel.FlushSyscallTablesTestOnly()
+}
+
// startGofer starts a new gofer routine serving 'root' path. It returns the
// sandbox side of the connection, and a function that when called will stop the
// gofer.
@@ -101,7 +107,7 @@ func startGofer(root string) (int, func(), error) {
return sandboxEnd, cleanup, nil
}
-func createLoader() (*Loader, func(), error) {
+func createLoader(vfsEnabled bool) (*Loader, func(), error) {
fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10]))
if err != nil {
return nil, nil, err
@@ -109,6 +115,8 @@ func createLoader() (*Loader, func(), error) {
conf := testConfig()
spec := testSpec()
+ conf.VFS2 = vfsEnabled
+
sandEnd, cleanup, err := startGofer(spec.Root.Path)
if err != nil {
return nil, nil, err
@@ -142,10 +150,22 @@ func createLoader() (*Loader, func(), error) {
// TestRun runs a simple application in a sandbox and checks that it succeeds.
func TestRun(t *testing.T) {
- l, cleanup, err := createLoader()
+ defer resetSyscallTable()
+ doRun(t, false)
+}
+
+// TestRunVFS2 runs TestRun in VFSv2.
+func TestRunVFS2(t *testing.T) {
+ defer resetSyscallTable()
+ doRun(t, true)
+}
+
+func doRun(t *testing.T, vfsEnabled bool) {
+ l, cleanup, err := createLoader(vfsEnabled)
if err != nil {
t.Fatalf("error creating loader: %v", err)
}
+
defer l.Destroy()
defer cleanup()
@@ -179,7 +199,18 @@ func TestRun(t *testing.T) {
// TestStartSignal tests that the controller Start message will cause
// WaitForStartSignal to return.
func TestStartSignal(t *testing.T) {
- l, cleanup, err := createLoader()
+ defer resetSyscallTable()
+ doStartSignal(t, false)
+}
+
+// TestStartSignalVFS2 does TestStartSignal with VFS2.
+func TestStartSignalVFS2(t *testing.T) {
+ defer resetSyscallTable()
+ doStartSignal(t, true)
+}
+
+func doStartSignal(t *testing.T, vfsEnabled bool) {
+ l, cleanup, err := createLoader(vfsEnabled)
if err != nil {
t.Fatalf("error creating loader: %v", err)
}
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
index f0aa52135..332e4fce5 100644
--- a/runsc/boot/user.go
+++ b/runsc/boot/user.go
@@ -23,8 +23,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -84,6 +86,48 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K
File: f,
}
+ return findHomeInPasswd(uint32(uid), r, defaultHome)
+}
+
+type fileReaderVFS2 struct {
+ ctx context.Context
+ fd *vfs.FileDescription
+}
+
+func (r *fileReaderVFS2) Read(buf []byte) (int, error) {
+ n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+ return int(n), err
+}
+
+func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) {
+ const defaultHome = "/"
+
+ root := mns.Root()
+ defer root.DecRef()
+
+ creds := auth.CredentialsFromContext(ctx)
+
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse("/etc/passwd"),
+ }
+
+ opts := &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ }
+
+ fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts)
+ if err != nil {
+ return defaultHome, nil
+ }
+ defer fd.DecRef()
+
+ r := &fileReaderVFS2{
+ ctx: ctx,
+ fd: fd,
+ }
+
homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
if err != nil {
return "", err
@@ -111,6 +155,26 @@ func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.
if err != nil {
return nil, fmt.Errorf("error reading exec user: %v", err)
}
+
+ return append(envv, "HOME="+homeDir), nil
+}
+
+func maybeAddExecUserHomeVFS2(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+ // Check if the envv already contains HOME.
+ for _, env := range envv {
+ if strings.HasPrefix(env, "HOME=") {
+ // We have it. Return the original slice unmodified.
+ return envv, nil
+ }
+ }
+
+ // Read /etc/passwd for the user's HOME directory and set the HOME
+ // environment variable as required by POSIX if it is not overridden by
+ // the user.
+ homeDir, err := getExecUserHomeVFS2(ctx, vmns, uid)
+ if err != nil {
+ return nil, fmt.Errorf("error reading exec user: %v", err)
+ }
return append(envv, "HOME="+homeDir), nil
}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
new file mode 100644
index 000000000..82083c57d
--- /dev/null
+++ b/runsc/boot/vfs.go
@@ -0,0 +1,310 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "path"
+ "strconv"
+ "strings"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+ procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
+ sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+ tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+
+ vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
+ })
+
+ vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
+ })
+
+ vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
+
+ vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
+ vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
+ vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
+ vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
+ vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
+
+ // Setup files in devtmpfs.
+ if err := memdev.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering memdev: %w", err)
+ }
+ a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name)
+ if err != nil {
+ return fmt.Errorf("creating devtmpfs accessor: %w", err)
+ }
+ defer a.Release()
+
+ if err := a.UserspaceInit(ctx); err != nil {
+ return fmt.Errorf("initializing userspace: %w", err)
+ }
+ if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
+ return fmt.Errorf("creating devtmpfs files: %w", err)
+ }
+ return nil
+}
+
+func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+ if err := mntr.k.VFS().Init(); err != nil {
+ return fmt.Errorf("failed to initialize VFS: %w", err)
+ }
+ mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+ if err != nil {
+ return fmt.Errorf("failed to setupFS: %w", err)
+ }
+ procArgs.MountNamespaceVFS2 = mns
+ return setExecutablePathVFS2(ctx, procArgs)
+}
+
+func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
+
+ exe := procArgs.Argv[0]
+
+ // Absolute paths can be used directly.
+ if path.IsAbs(exe) {
+ procArgs.Filename = exe
+ return nil
+ }
+
+ // Paths with '/' in them should be joined to the working directory, or
+ // to the root if working directory is not set.
+ if strings.IndexByte(exe, '/') > 0 {
+
+ if !path.IsAbs(procArgs.WorkingDirectory) {
+ return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory)
+ }
+
+ procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
+ return nil
+ }
+
+ // Paths with a '/' are relative to the CWD.
+ if strings.IndexByte(exe, '/') > 0 {
+ procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
+ return nil
+ }
+
+ // Otherwise, We must lookup the name in the paths, starting from the
+ // root directory.
+ root := procArgs.MountNamespaceVFS2.Root()
+ defer root.DecRef()
+
+ paths := fs.GetPath(procArgs.Envv)
+ creds := procArgs.Credentials
+
+ for _, p := range paths {
+
+ binPath := path.Join(p, exe)
+
+ pop := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(binPath),
+ FollowFinalSymlink: true,
+ }
+
+ opts := &vfs.OpenOptions{
+ FileExec: true,
+ Flags: linux.O_RDONLY,
+ }
+
+ dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
+ if err == syserror.ENOENT || err == syserror.EACCES {
+ // Didn't find it here.
+ continue
+ }
+ if err != nil {
+ return err
+ }
+ dentry.DecRef()
+
+ procArgs.Filename = binPath
+ return nil
+ }
+
+ return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":"))
+}
+
+func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+ log.Infof("Configuring container's file system with VFS2")
+
+ // Create context with root credentials to mount the filesystem (the current
+ // user may not be privileged enough).
+ rootProcArgs := *procArgs
+ rootProcArgs.WorkingDirectory = "/"
+ rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+ rootProcArgs.Umask = 0022
+ rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+ rootCtx := procArgs.NewContext(c.k)
+
+ creds := procArgs.Credentials
+ if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil {
+ return nil, fmt.Errorf("register filesystems: %w", err)
+ }
+
+ fd := c.fds.remove()
+
+ opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",")
+
+ log.Infof("Mounting root over 9P, ioFD: %d", fd)
+ mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts})
+ if err != nil {
+ return nil, fmt.Errorf("setting up mountnamespace: %w", err)
+ }
+
+ rootProcArgs.MountNamespaceVFS2 = mns
+
+ // Mount submounts.
+ if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil {
+ return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
+ }
+
+ return mns, nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+
+ for _, submount := range c.mounts {
+ log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
+ if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil {
+ return err
+ }
+ }
+
+ // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go.
+
+ return c.checkDispenser()
+}
+
+// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version.
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error {
+ root := mns.Root()
+ defer root.DecRef()
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(submount.Destination),
+ }
+
+ _, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount)
+ if err != nil {
+ return fmt.Errorf("mountOptions failed: %w", err)
+ }
+
+ opts := &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ Data: strings.Join(options, ","),
+ },
+ InternalMount: true,
+ }
+
+ // All writes go to upper, be paranoid and make lower readonly.
+ opts.ReadOnly = useOverlay
+
+ if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil {
+ return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+ }
+ log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts)
+ return nil
+}
+
+// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) {
+ var (
+ fsName string
+ opts []string
+ useOverlay bool
+ )
+
+ switch m.Type {
+ case devpts, devtmpfs, proc, sysfs:
+ fsName = m.Type
+ case nonefs:
+ fsName = sysfs
+ case tmpfs:
+ fsName = m.Type
+
+ var err error
+ opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+ if err != nil {
+ return "", nil, false, err
+ }
+
+ case bind:
+ fd := c.fds.remove()
+ fsName = "9p"
+ opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m))
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+ default:
+ log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ }
+ return fsName, opts, useOverlay, nil
+}
+
+// p9MountOptions creates a slice of options for a p9 mount.
+// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in
+// fs.go when privateunixsocket lands.
+func p9MountOptionsVFS2(fd int, fa FileAccessType) []string {
+ opts := []string{
+ "trans=fd",
+ "rfdno=" + strconv.Itoa(fd),
+ "wfdno=" + strconv.Itoa(fd),
+ }
+ if fa == FileAccessShared {
+ opts = append(opts, "cache=remote_revalidating")
+ }
+ return opts
+}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 442e80ac0..24f9ecc35 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -521,9 +521,21 @@ func TestExePath(t *testing.T) {
// Test the we can retrieve the application exit status from the container.
func TestAppExitStatus(t *testing.T) {
+ conf := testutil.TestConfig()
+ conf.VFS2 = false
+ doAppExitStatus(t, conf)
+}
+
+// This is TestAppExitStatus for VFSv2.
+func TestAppExitStatusVFS2(t *testing.T) {
+ conf := testutil.TestConfig()
+ conf.VFS2 = true
+ doAppExitStatus(t, conf)
+}
+
+func doAppExitStatus(t *testing.T, conf *boot.Config) {
// First container will succeed.
succSpec := testutil.NewSpecWithArgs("true")
- conf := testutil.TestConfig()
rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf)
if err != nil {
t.Fatalf("error setting up container: %v", err)
diff --git a/runsc/main.go b/runsc/main.go
index c1c78529c..9d52f3006 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -84,6 +84,7 @@ var (
rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+ vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
// Test flags, not to be used outside tests, ever.
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -230,6 +231,7 @@ func main() {
ReferenceLeakMode: refsLeakMode,
OverlayfsStaleRead: *overlayfsStaleRead,
CPUNumFromQuota: *cpuNumFromQuota,
+ VFS2: *vfs2Enabled,
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
TestOnlyTestNameEnv: *testOnlyTestNameEnv,
@@ -313,6 +315,7 @@ func main() {
log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay)
log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets)
log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls)
+ log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
log.Infof("***************************")
if *testOnlyAllowRunAsCurrentUserWithoutChroot {