diff options
Diffstat (limited to 'runsc/boot')
-rw-r--r-- | runsc/boot/BUILD | 18 | ||||
-rw-r--r-- | runsc/boot/compat.go | 2 | ||||
-rw-r--r-- | runsc/boot/config.go | 5 | ||||
-rw-r--r-- | runsc/boot/fds.go | 33 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 11 | ||||
-rw-r--r-- | runsc/boot/fs.go | 24 | ||||
-rw-r--r-- | runsc/boot/loader.go | 92 | ||||
-rw-r--r-- | runsc/boot/loader_amd64.go | 5 | ||||
-rw-r--r-- | runsc/boot/loader_arm64.go | 5 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 49 | ||||
-rw-r--r-- | runsc/boot/user.go | 170 | ||||
-rw-r--r-- | runsc/boot/user_test.go | 254 | ||||
-rw-r--r-- | runsc/boot/vfs.go | 316 |
13 files changed, 523 insertions, 461 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 26f68fe3d..69dcc74f2 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -20,9 +20,10 @@ go_library( "loader_arm64.go", "network.go", "strace.go", - "user.go", + "vfs.go", ], visibility = [ + "//pkg/test:__subpackages__", "//runsc:__subpackages__", "//test:__subpackages__", ], @@ -33,6 +34,7 @@ go_library( "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fspath", "//pkg/log", "//pkg/memutil", "//pkg/rand", @@ -40,6 +42,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", + "//pkg/sentry/devices/memdev", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", "//pkg/sentry/fs/gofer", @@ -49,6 +52,13 @@ go_library( "//pkg/sentry/fs/sys", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/fs/tty", + "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/gofer", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/proc", + "//pkg/sentry/fsimpl/sys", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel:uncaught_signal_go_proto", @@ -71,6 +81,7 @@ go_library( "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/sync", "//pkg/syserror", @@ -87,7 +98,6 @@ go_library( "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", - "//pkg/usermem", "//runsc/boot/filter", "//runsc/boot/platforms", "//runsc/boot/pprof", @@ -105,7 +115,6 @@ go_test( "compat_test.go", "fs_test.go", "loader_test.go", - "user_test.go", ], library = ":boot", deps = [ @@ -114,10 +123,11 @@ go_test( "//pkg/p9", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel", "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 8995d678e..b7cfb35bf 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -65,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) { if logFD > 0 { f := os.NewFile(uintptr(logFD), "user log file") - target := &log.MultiEmitter{c.sink, &log.K8sJSONEmitter{log.Writer{Next: f}}} + target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}} c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} } return c, nil diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 7ea5bfade..715a19112 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -305,5 +305,10 @@ func (c *Config) ToFlags() []string { if len(c.TestOnlyTestNameEnv) != 0 { f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) } + + if c.VFS2 { + f = append(f, "--vfs2=true") + } + return f } diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 417d2d5fb..7e7a31fbd 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" ) @@ -31,10 +32,13 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } + if kernel.VFS2Enabled { + return createFDTableVFS2(ctx, console, stdioFDs) + } + k := kernel.KernelFromContext(ctx) fdTable := k.NewFDTable() defer fdTable.DecRef() - mounter := fs.FileOwnerFromContext(ctx) var ttyFile *fs.File for appFD, hostFD := range stdioFDs { @@ -44,7 +48,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) if err != nil { return nil, err } @@ -63,7 +67,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F } else { // Import the file as a regular host file. var err error - appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) if err != nil { return nil, err } @@ -79,3 +83,26 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable.IncRef() return fdTable, nil } + +func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { + k := kernel.KernelFromContext(ctx) + fdTable := k.NewFDTable() + defer fdTable.DecRef() + + for appFD, hostFD := range stdioFDs { + // TODO(gvisor.dev/issue/1482): Add TTY support. + appFile, err := vfshost.ImportFD(ctx, k.HostMount(), hostFD, false) + if err != nil { + return nil, err + } + + if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { + appFile.DecRef() + return nil, err + } + appFile.DecRef() + } + + fdTable.IncRef() + return fdTable, nil +} diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a4627905e..1828d116a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -44,7 +44,7 @@ var allowedSyscalls = seccomp.SyscallRules{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.AllowValue(syscall.O_CLOEXEC), }, }, syscall.SYS_EPOLL_CREATE1: {}, @@ -284,12 +284,21 @@ var allowedSyscalls = seccomp.SyscallRules{ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, + unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { seccomp.AllowValue(uint64(os.Getpid())), }, }, + syscall.SYS_UTIMENSAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* null pathname */ + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* flags */ + }, + }, syscall.SYS_WRITE: {}, // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 0f62842ea..98cce60af 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -278,6 +278,9 @@ func subtargets(root string, mnts []specs.Mount) []string { } func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if conf.VFS2 { + return setupContainerVFS2(ctx, conf, mntr, procArgs) + } mns, err := mntr.setupFS(conf, procArgs) if err != nil { return err @@ -573,6 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. func (c *containerMounter) processHints(conf *Config) error { + if conf.VFS2 { + return nil + } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a @@ -781,9 +787,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: - // TODO(nlacasse): Support all the mount types and make this a fatal error. - // Most applications will "just work" without them, so this is a warning - // for now. log.Warningf("ignoring unknown filesystem type %q", m.Type) } return fsName, opts, useOverlay, nil @@ -824,7 +827,20 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) if err != nil { - return fmt.Errorf("creating mount with source %q: %v", m.Source, err) + err := fmt.Errorf("creating mount with source %q: %v", m.Source, err) + // Check to see if this is a common error due to a Linux bug. + // This error is generated here in order to cause it to be + // printed to the user using Docker via 'runsc create' etc. rather + // than simply printed to the logs for the 'runsc boot' command. + // + // We check the error message string rather than type because the + // actual error types (syscall.EIO, syscall.EPIPE) are lost by file system + // implementation (e.g. p9). + // TODO(gvisor.dev/issue/1765): Remove message when bug is resolved. + if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) { + return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug")) + } + return err } // If there are submounts, we need to overlay the mount on top of a ramfs diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index e7ca98134..3f41d8357 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -26,7 +26,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" @@ -36,6 +35,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -46,6 +47,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -73,6 +75,8 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) +var syscallTable *kernel.SyscallTable + // Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. @@ -156,13 +160,17 @@ type Args struct { Spec *specs.Spec // Conf is the system configuration. Conf *Config - // ControllerFD is the FD to the URPC controller. + // ControllerFD is the FD to the URPC controller. The Loader takes ownership + // of this FD and may close it at any time. ControllerFD int - // Device is an optional argument that is passed to the platform. + // Device is an optional argument that is passed to the platform. The Loader + // takes ownership of this file and may close it at any time. Device *os.File - // GoferFDs is an array of FDs used to connect with the Gofer. + // GoferFDs is an array of FDs used to connect with the Gofer. The Loader + // takes ownership of these FDs and may close them at any time. GoferFDs []int - // StdioFDs is the stdio for the application. + // StdioFDs is the stdio for the application. The Loader takes ownership of + // these FDs and may close them at any time. StdioFDs []int // Console is set to true if using TTY. Console bool @@ -175,6 +183,9 @@ type Args struct { UserLogFD int } +// make sure stdioFDs are always the same on initial start and on restore +const startingStdioFD = 64 + // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(args Args) (*Loader, error) { @@ -188,13 +199,14 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } - if args.Conf.VFS2 { - st, ok := kernel.LookupSyscallTable(abi.Linux, arch.Host) - if ok { - vfs2.Override(st.Table) - } + // Patch the syscall table. + kernel.VFS2Enabled = args.Conf.VFS2 + if kernel.VFS2Enabled { + vfs2.Override(syscallTable.Table) } + kernel.RegisterSyscallTable(syscallTable) + // Create kernel and platform. p, err := createPlatform(args.Conf, args.Device) if err != nil { @@ -319,6 +331,35 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("creating pod mount hints: %v", err) } + if kernel.VFS2Enabled { + // Set up host mount that will be used for imported fds. + hostFilesystem := vfs2host.NewFilesystem(k.VFS()) + defer hostFilesystem.DecRef() + hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs mount: %v", err) + } + k.SetHostMount(hostMount) + } + + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + var stdioFDs []int + newfd := startingStdioFD + for _, fd := range args.StdioFDs { + err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + } + stdioFDs = append(stdioFDs, newfd) + err = syscall.Close(fd) + if err != nil { + return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + } + newfd++ + } + eid := execID{cid: args.ID} l := &Loader{ k: k, @@ -327,7 +368,7 @@ func New(args Args) (*Loader, error) { watchdog: dog, spec: args.Spec, goferFDs: args.GoferFDs, - stdioFDs: args.StdioFDs, + stdioFDs: stdioFDs, rootProcArgs: procArgs, sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, @@ -367,11 +408,16 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel. return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) } + wd := spec.Process.Cwd + if wd == "" { + wd = "/" + } + // Create the process arguments. procArgs := kernel.CreateProcessArgs{ Argv: spec.Process.Args, Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. + WorkingDirectory: wd, Credentials: creds, Umask: 0022, Limits: ls, @@ -516,7 +562,15 @@ func (l *Loader) run() error { } // Add the HOME enviroment variable if it is not already set. - envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + + } else { + envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + } if err != nil { return err } @@ -569,6 +623,16 @@ func (l *Loader) run() error { } }) + // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again + // either in createFDTable() during initial start or in descriptor.initAfterLoad() + // during restore, we can release l.stdioFDs now. + for _, fd := range l.stdioFDs { + err := syscall.Close(fd) + if err != nil { + return fmt.Errorf("close dup()ed stdioFDs: %v", err) + } + } + log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() @@ -810,7 +874,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { root := args.MountNamespace.Root() defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) - envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err } diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go index b9669f2ac..78df86611 100644 --- a/runsc/boot/loader_amd64.go +++ b/runsc/boot/loader_amd64.go @@ -17,11 +17,10 @@ package boot import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" ) func init() { - // Register the global syscall table. - kernel.RegisterSyscallTable(linux.AMD64) + // Set the global syscall table. + syscallTable = linux.AMD64 } diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go index cf64d28c8..250785010 100644 --- a/runsc/boot/loader_arm64.go +++ b/runsc/boot/loader_arm64.go @@ -17,11 +17,10 @@ package boot import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" ) func init() { - // Register the global syscall table. - kernel.RegisterSyscallTable(linux.ARM64) + // Set the global syscall table. + syscallTable = linux.ARM64 } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 44aa63196..e7c71734f 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -24,11 +24,13 @@ import ( "time" specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/fsgofer" @@ -65,6 +67,11 @@ func testSpec() *specs.Spec { } } +func resetSyscallTable() { + kernel.VFS2Enabled = false + kernel.FlushSyscallTablesTestOnly() +} + // startGofer starts a new gofer routine serving 'root' path. It returns the // sandbox side of the connection, and a function that when called will stop the // gofer. @@ -100,7 +107,7 @@ func startGofer(root string) (int, func(), error) { return sandboxEnd, cleanup, nil } -func createLoader() (*Loader, func(), error) { +func createLoader(vfsEnabled bool) (*Loader, func(), error) { fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) if err != nil { return nil, nil, err @@ -108,12 +115,23 @@ func createLoader() (*Loader, func(), error) { conf := testConfig() spec := testSpec() + conf.VFS2 = vfsEnabled + sandEnd, cleanup, err := startGofer(spec.Root.Path) if err != nil { return nil, nil, err } - stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())} + // Loader takes ownership of stdio. + var stdio []int + for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { + newFd, err := unix.Dup(int(f.Fd())) + if err != nil { + return nil, nil, err + } + stdio = append(stdio, newFd) + } + args := Args{ ID: "foo", Spec: spec, @@ -132,10 +150,22 @@ func createLoader() (*Loader, func(), error) { // TestRun runs a simple application in a sandbox and checks that it succeeds. func TestRun(t *testing.T) { - l, cleanup, err := createLoader() + defer resetSyscallTable() + doRun(t, false) +} + +// TestRunVFS2 runs TestRun in VFSv2. +func TestRunVFS2(t *testing.T) { + defer resetSyscallTable() + doRun(t, true) +} + +func doRun(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled) if err != nil { t.Fatalf("error creating loader: %v", err) } + defer l.Destroy() defer cleanup() @@ -169,7 +199,18 @@ func TestRun(t *testing.T) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. func TestStartSignal(t *testing.T) { - l, cleanup, err := createLoader() + defer resetSyscallTable() + doStartSignal(t, false) +} + +// TestStartSignalVFS2 does TestStartSignal with VFS2. +func TestStartSignalVFS2(t *testing.T) { + defer resetSyscallTable() + doStartSignal(t, true) +} + +func doStartSignal(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled) if err != nil { t.Fatalf("error creating loader: %v", err) } diff --git a/runsc/boot/user.go b/runsc/boot/user.go deleted file mode 100644 index f0aa52135..000000000 --- a/runsc/boot/user.go +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "bufio" - "fmt" - "io" - "strconv" - "strings" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/usermem" -) - -type fileReader struct { - // Ctx is the context for the file reader. - Ctx context.Context - - // File is the file to read from. - File *fs.File -} - -// Read implements io.Reader.Read. -func (r *fileReader) Read(buf []byte) (int, error) { - n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf)) - return int(n), err -} - -// getExecUserHome returns the home directory of the executing user read from -// /etc/passwd as read from the container filesystem. -func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) { - // The default user home directory to return if no user matching the user - // if found in the /etc/passwd found in the image. - const defaultHome = "/" - - // Open the /etc/passwd file from the dirent via the root mount namespace. - mnsRoot := rootMns.Root() - maxTraversals := uint(linux.MaxSymlinkTraversals) - dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals) - if err != nil { - // NOTE: Ignore errors opening the passwd file. If the passwd file - // doesn't exist we will return the default home directory. - return defaultHome, nil - } - defer dirent.DecRef() - - // Check read permissions on the file. - if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { - // NOTE: Ignore permissions errors here and return default root dir. - return defaultHome, nil - } - - // Only open regular files. We don't open other files like named pipes as - // they may block and might present some attack surface to the container. - // Note that runc does not seem to do this kind of checking. - if !fs.IsRegular(dirent.Inode.StableAttr) { - return defaultHome, nil - } - - f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false}) - if err != nil { - return "", err - } - defer f.DecRef() - - r := &fileReader{ - Ctx: ctx, - File: f, - } - - homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome) - if err != nil { - return "", err - } - - return homeDir, nil -} - -// maybeAddExecUserHome returns a new slice with the HOME enviroment variable -// set if the slice does not already contain it, otherwise it returns the -// original slice unmodified. -func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { - // Check if the envv already contains HOME. - for _, env := range envv { - if strings.HasPrefix(env, "HOME=") { - // We have it. Return the original slice unmodified. - return envv, nil - } - } - - // Read /etc/passwd for the user's HOME directory and set the HOME - // environment variable as required by POSIX if it is not overridden by - // the user. - homeDir, err := getExecUserHome(ctx, mns, uid) - if err != nil { - return nil, fmt.Errorf("error reading exec user: %v", err) - } - return append(envv, "HOME="+homeDir), nil -} - -// findHomeInPasswd parses a passwd file and returns the given user's home -// directory. This function does it's best to replicate the runc's behavior. -func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) { - s := bufio.NewScanner(passwd) - - for s.Scan() { - if err := s.Err(); err != nil { - return "", err - } - - line := strings.TrimSpace(s.Text()) - if line == "" { - continue - } - - // Pull out part of passwd entry. Loosely parse the passwd entry as some - // passwd files could be poorly written and for compatibility with runc. - // - // Per 'man 5 passwd' - // /etc/passwd contains one line for each user account, with seven - // fields delimited by colons (“:”). These fields are: - // - // - login name - // - optional encrypted password - // - numerical user ID - // - numerical group ID - // - user name or comment field - // - user home directory - // - optional user command interpreter - parts := strings.Split(line, ":") - - found := false - homeDir := "" - for i, p := range parts { - switch i { - case 2: - parsedUID, err := strconv.ParseUint(p, 10, 32) - if err == nil && parsedUID == uint64(uid) { - found = true - } - case 5: - homeDir = p - } - } - if found { - // NOTE: If the uid is present but the home directory is not - // present in the /etc/passwd entry we return an empty string. This - // is, for better or worse, what runc does. - return homeDir, nil - } - } - - return defaultHome, nil -} diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go deleted file mode 100644 index fb4e13dfb..000000000 --- a/runsc/boot/user_test.go +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "io/ioutil" - "os" - "path/filepath" - "strings" - "syscall" - "testing" - - specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" -) - -func setupTempDir() (string, error) { - tmpDir, err := ioutil.TempDir(os.TempDir(), "exec-user-test") - if err != nil { - return "", err - } - return tmpDir, nil -} - -func setupPasswd(contents string, perms os.FileMode) func() (string, error) { - return func() (string, error) { - tmpDir, err := setupTempDir() - if err != nil { - return "", err - } - - if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { - return "", err - } - - f, err := os.Create(filepath.Join(tmpDir, "etc", "passwd")) - if err != nil { - return "", err - } - defer f.Close() - - _, err = f.WriteString(contents) - if err != nil { - return "", err - } - - err = f.Chmod(perms) - if err != nil { - return "", err - } - return tmpDir, nil - } -} - -// TestGetExecUserHome tests the getExecUserHome function. -func TestGetExecUserHome(t *testing.T) { - tests := map[string]struct { - uid auth.KUID - createRoot func() (string, error) - expected string - }{ - "success": { - uid: 1000, - createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0666), - expected: "/home/adin", - }, - "no_passwd": { - uid: 1000, - createRoot: setupTempDir, - expected: "/", - }, - "no_perms": { - uid: 1000, - createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0000), - expected: "/", - }, - "directory": { - uid: 1000, - createRoot: func() (string, error) { - tmpDir, err := setupTempDir() - if err != nil { - return "", err - } - - if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { - return "", err - } - - if err := syscall.Mkdir(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil { - return "", err - } - - return tmpDir, nil - }, - expected: "/", - }, - // Currently we don't allow named pipes. - "named_pipe": { - uid: 1000, - createRoot: func() (string, error) { - tmpDir, err := setupTempDir() - if err != nil { - return "", err - } - - if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { - return "", err - } - - if err := syscall.Mkfifo(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil { - return "", err - } - - return tmpDir, nil - }, - expected: "/", - }, - } - - for name, tc := range tests { - t.Run(name, func(t *testing.T) { - tmpDir, err := tc.createRoot() - if err != nil { - t.Fatalf("failed to create root dir: %v", err) - } - - sandEnd, cleanup, err := startGofer(tmpDir) - if err != nil { - t.Fatalf("failed to create gofer: %v", err) - } - defer cleanup() - - ctx := contexttest.Context(t) - conf := &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - DisableSeccomp: true, - } - - spec := &specs.Spec{ - Root: &specs.Root{ - Path: tmpDir, - Readonly: true, - }, - // Add /proc mount as tmpfs to avoid needing a kernel. - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", - }, - }, - } - - mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{}) - mns, err := mntr.createMountNamespace(ctx, conf) - if err != nil { - t.Fatalf("failed to create mount namespace: %v", err) - } - ctx = fs.WithRoot(ctx, mns.Root()) - if err := mntr.mountSubmounts(ctx, conf, mns); err != nil { - t.Fatalf("failed to create mount namespace: %v", err) - } - - got, err := getExecUserHome(ctx, mns, tc.uid) - if err != nil { - t.Fatalf("failed to get user home: %v", err) - } - - if got != tc.expected { - t.Fatalf("expected %v, got: %v", tc.expected, got) - } - }) - } -} - -// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing. -func TestFindHomeInPasswd(t *testing.T) { - tests := map[string]struct { - uid uint32 - passwd string - expected string - def string - }{ - "empty": { - uid: 1000, - passwd: "", - expected: "/", - def: "/", - }, - "whitespace": { - uid: 1000, - passwd: " ", - expected: "/", - def: "/", - }, - "full": { - uid: 1000, - passwd: "adin::1000:1111::/home/adin:/bin/sh", - expected: "/home/adin", - def: "/", - }, - // For better or worse, this is how runc works. - "partial": { - uid: 1000, - passwd: "adin::1000:1111:", - expected: "", - def: "/", - }, - "multiple": { - uid: 1001, - passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh", - expected: "/home/ian", - def: "/", - }, - "duplicate": { - uid: 1000, - passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh", - expected: "/home/adin", - def: "/", - }, - "empty_lines": { - uid: 1001, - passwd: "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh", - expected: "/home/ian", - def: "/", - }, - } - - for name, tc := range tests { - t.Run(name, func(t *testing.T) { - got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def) - if err != nil { - t.Fatalf("error parsing passwd: %v", err) - } - if tc.expected != got { - t.Fatalf("expected %v, got: %v", tc.expected, got) - } - }) - } -} diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go new file mode 100644 index 000000000..bce3a3593 --- /dev/null +++ b/runsc/boot/vfs.go @@ -0,0 +1,316 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path" + "strconv" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/fs" + devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/syserror" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { + + vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + // Setup files in devtmpfs. + if err := memdev.Register(vfsObj); err != nil { + return fmt.Errorf("registering memdev: %w", err) + } + a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name) + if err != nil { + return fmt.Errorf("creating devtmpfs accessor: %w", err) + } + defer a.Release() + + if err := a.UserspaceInit(ctx); err != nil { + return fmt.Errorf("initializing userspace: %w", err) + } + if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating devtmpfs files: %w", err) + } + return nil +} + +func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if err := mntr.k.VFS().Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %w", err) + } + mns, err := mntr.setupVFS2(ctx, conf, procArgs) + if err != nil { + return fmt.Errorf("failed to setupFS: %w", err) + } + procArgs.MountNamespaceVFS2 = mns + return setExecutablePathVFS2(ctx, procArgs) +} + +func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { + + exe := procArgs.Argv[0] + + // Absolute paths can be used directly. + if path.IsAbs(exe) { + procArgs.Filename = exe + return nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(exe, '/') > 0 { + + if !path.IsAbs(procArgs.WorkingDirectory) { + return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory) + } + + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Paths with a '/' are relative to the CWD. + if strings.IndexByte(exe, '/') > 0 { + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // root directory. + root := procArgs.MountNamespaceVFS2.Root() + defer root.DecRef() + + paths := fs.GetPath(procArgs.Envv) + creds := procArgs.Credentials + + for _, p := range paths { + + binPath := path.Join(p, exe) + + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(binPath), + FollowFinalSymlink: true, + } + + opts := &vfs.OpenOptions{ + FileExec: true, + Flags: linux.O_RDONLY, + } + + dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return err + } + dentry.DecRef() + + procArgs.Filename = binPath + return nil + } + + return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":")) +} + +func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { + log.Infof("Configuring container's file system with VFS2") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := procArgs.NewContext(c.k) + + creds := procArgs.Credentials + if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil { + return nil, fmt.Errorf("register filesystems: %w", err) + } + + fd := c.fds.remove() + + opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mountnamespace: %w", err) + } + + rootProcArgs.MountNamespaceVFS2 = mns + + // Mount submounts. + if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil { + return nil, fmt.Errorf("mounting submounts vfs2: %w", err) + } + + return mns, nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + + for _, submount := range c.mounts { + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil { + return err + } + } + + // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go. + + return c.checkDispenser() +} + +// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version. +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error { + root := mns.Root() + defer root.DecRef() + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + + _, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) + if err != nil { + return fmt.Errorf("mountOptions failed: %w", err) + } + + opts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(options, ","), + }, + InternalMount: true, + } + + // All writes go to upper, be paranoid and make lower readonly. + opts.ReadOnly = useOverlay + + if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{ + ForSyntheticMountpoint: true, + }); err != nil && err != syserror.EEXIST { + // Log a warning, but attempt the mount anyway. + log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err) + } + if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { + return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + } + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts) + return nil +} + +// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + ) + + switch m.Type { + case devpts, devtmpfs, proc, sysfs: + fsName = m.Type + case nonefs: + fsName = sysfs + case tmpfs: + fsName = m.Type + + var err error + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + if err != nil { + return "", nil, false, err + } + + case bind: + fd := c.fds.remove() + fsName = "9p" + opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m)) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, nil +} + +// p9MountOptions creates a slice of options for a p9 mount. +// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in +// fs.go when privateunixsocket lands. +func p9MountOptionsVFS2(fd int, fa FileAccessType) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + } + if fa == FileAccessShared { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} |