diff options
author | Kevin Krakauer <krakauer@google.com> | 2018-08-15 16:24:07 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-08-15 16:25:22 -0700 |
commit | 635b0c45933cd841298b0c21a513a9169e849594 (patch) | |
tree | 058bae2ead9f7f182baaf3491580b5a419cb6c94 | |
parent | 2033f61aae6ff1b3e613d7bb9e9da273791a5176 (diff) |
runsc fsgofer: Support dynamic serving of filesystems.
When multiple containers run inside a sentry, each container has its own root
filesystem and set of mounts. Containers are also added after the sentry boots,
rather than all being configured and known at boot time.
The fsgofer needs to be able to serve the root filesystem of each container.
Thus, it must be possible to add filesystems after the fsgofer has already
started.
This change:
* Creates a URPC endpoint within the gofer process that listens for requests to
serve new content.
* Enables the sentry, when starting a new container, to add the new container's
filesystem.
* Mounts those new filesystems at separate roots within the sentry.
PiperOrigin-RevId: 208903248
Change-Id: Ifa91ec9c8caf5f2f0a9eead83c4a57090ce92068
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 14 | ||||
-rw-r--r-- | pkg/urpc/urpc.go | 10 | ||||
-rw-r--r-- | runsc/boot/controller.go | 19 | ||||
-rw-r--r-- | runsc/boot/fs.go | 159 | ||||
-rw-r--r-- | runsc/boot/loader.go | 36 | ||||
-rw-r--r-- | runsc/cmd/BUILD | 1 | ||||
-rw-r--r-- | runsc/cmd/gofer.go | 36 | ||||
-rw-r--r-- | runsc/container/container.go | 7 | ||||
-rw-r--r-- | runsc/fsgofer/BUILD | 4 | ||||
-rw-r--r-- | runsc/fsgofer/control.go | 203 | ||||
-rw-r--r-- | runsc/fsgofer/fsgofer.go | 5 | ||||
-rw-r--r-- | runsc/sandbox/BUILD | 1 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 139 |
13 files changed, 515 insertions, 119 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 419a1d473..cb43fdcdc 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -504,6 +504,14 @@ type CreateProcessArgs struct { // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace + + // Root optionally contains the dirent that serves as the root for the + // process. If nil, the mount namespace's root is used as the process' + // root. + // + // Anyone setting Root must donate a reference (i.e. increment it) to + // keep it alive until it is decremented by CreateProcess. + Root *fs.Dirent } // NewContext returns a context.Context that represents the task that will be @@ -581,8 +589,12 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { ctx := args.NewContext(k) // Grab the root directory. - root := fs.RootFromContext(ctx) + root := args.Root + if root == nil { + root = fs.RootFromContext(ctx) + } defer root.DecRef() + args.Root = nil // Grab the working directory. wd := root // Default. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index af620b704..1ec06dd4c 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -63,6 +63,10 @@ func (r RemoteError) Error() string { // file as a result of an RPC. These are not actually serialized, rather they // are sent via an accompanying SCM_RIGHTS message (plumbed through the unet // package). +// +// When embedding a FilePayload in an argument struct, the argument type _must_ +// be a pointer to the struct rather than the struct type itself. This is +// because the urpc package defines pointer methods on FilePayload. type FilePayload struct { Files []*os.File `json:"-"` } @@ -552,6 +556,12 @@ func (c *Client) Call(method string, arg interface{}, result interface{}) error c.mu.Lock() defer c.mu.Unlock() + // If arg is a FilePayload, not a *FilePayload, files won't actually be + // sent, so error out. 
+ if _, ok := arg.(FilePayload); ok { + return fmt.Errorf("argument is a FilePayload, but should be a *FilePayload") + } + // Are there files to send? var fs []*os.File if fp, ok := arg.(filePayloader); ok { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fc6ea326a..69e88d8e0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -17,6 +17,7 @@ package boot import ( "errors" "fmt" + "path" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" @@ -181,11 +182,15 @@ type StartArgs struct { // CID is the ID of the container to start. CID string + + // FilePayload contains the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload } // Start runs a created container within a sandbox. func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { - log.Debugf("containerManager.Start") + log.Debugf("containerManager.Start: %+v", args) // Validate arguments. if args == nil { @@ -200,8 +205,18 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/<cid> directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. 
+ if path.Clean(args.CID) != args.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) + } + if len(args.FilePayload.Files) != 1 { + return fmt.Errorf("start arguments must contain one file for the container root") + } - tgid, err := cm.l.startContainer(args, cm.l.k) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index eea2ec1f5..8996b1398 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -48,6 +48,19 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" + + // childContainersDir is the directory where child container root + // filesystems are mounted. + childContainersDir = "/__runsc_containers__" + + // Filesystems that runsc supports. + bind = "bind" + devpts = "devpts" + devtmpfs = "devtmpfs" + proc = "proc" + sysfs = "sysfs" + tmpfs = "tmpfs" + nonefs = "none" ) type fdDispenser struct { @@ -70,8 +83,15 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + mounts := compileMounts(spec) + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. 
+ mounts = append(mounts, specs.Mount{ + Type: tmpfs, + Destination: childContainersDir, + }) fds := &fdDispenser{fds: ioFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds) + rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) } @@ -79,7 +99,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - mounts := compileMounts(spec) + if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } @@ -98,12 +118,12 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // Always mount /dev. mounts = append(mounts, specs.Mount{ - Type: "devtmpfs", + Type: devtmpfs, Destination: "/dev", }) mounts = append(mounts, specs.Mount{ - Type: "devpts", + Type: devpts, Destination: "/dev/pts", }) @@ -129,13 +149,13 @@ func compileMounts(spec *specs.Spec) []specs.Mount { var mandatoryMounts []specs.Mount if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "proc", + Type: proc, Destination: "/proc", }) } if !sysMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "sysfs", + Type: sysfs, Destination: "/sys", }) } @@ -149,7 +169,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // that. Until then, the /tmp mount will always appear empty at // container creation. mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "tmpfs", + Type: tmpfs, Destination: "/tmp", }) } @@ -165,7 +185,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // mount namespace. 
func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { + if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil { return err } } @@ -173,7 +193,7 @@ func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *f } // createRootMount creates the root filesystem. -func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { +func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} @@ -207,7 +227,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f // We need to overlay the root on top of a ramfs with stub directories // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. - submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) if err != nil { return nil, fmt.Errorf("error adding submount overlay: %v", err) @@ -256,17 +276,17 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri ) switch m.Type { - case "devpts", "devtmpfs", "proc", "sysfs": + case devpts, devtmpfs, proc, sysfs: fsName = m.Type - case "none": - fsName = "sysfs" - case "tmpfs": + case nonefs: + fsName = sysfs + case tmpfs: fsName = m.Type // tmpfs has some extra supported options that we must pass through. 
opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - case "bind": + case bind: switch conf.FileAccess { case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() @@ -291,7 +311,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, opts, useOverlay, err } -func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -342,51 +362,52 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd // in the right location, e.g. // mount: /var/run/secrets, may be created in '/run/secrets' if // '/var/run' => '/var'. 
- if err := mkdirAll(ctx, mns, m.Destination); err != nil { + if err := mkdirAll(ctx, mns, dest); err != nil { return err } root := mns.Root() defer root.DecRef() - dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals) if err != nil { - return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to find mount destination %q: %v", dest, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to mount at destination %q: %v", dest, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type) return nil } func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + log.Infof("mkdirAll called with path %s", path) root := mns.Root() defer root.DecRef() // Starting at the root, walk the path. parent := root ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) - for i := 0; i < len(ps); i++ { - if ps[i] == "" { + for _, pathElem := range ps { + if pathElem == "" { // This will be case for the first and last element, if the path // begins or ends with '/'. Note that we always treat the path as // absolute, regardless of what the first character contains. continue } - d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit) if err == syserror.ENOENT { // If we encounter a path that does not exist, then // create it. 
- if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", pathElem, err) } - if d, err = parent.Walk(ctx, root, ps[i]); err != nil { - return fmt.Errorf("walk to %q failed: %v", ps[i], err) + if d, err = parent.Walk(ctx, root, pathElem); err != nil { + return fmt.Errorf("walk to %q failed: %v", pathElem, err) } } else if err != nil { - return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + return fmt.Errorf("failed to find inode %q: %v", pathElem, err) } parent = d } @@ -444,7 +465,7 @@ func destinations(mounts []specs.Mount, extra ...string) []string { // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { - if m.Type == "bind" { + if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. return "9pfs-" + m.Destination @@ -589,7 +610,7 @@ func subtargets(root string, mnts []specs.Mount) []string { // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel) error { +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. 
If @@ -604,27 +625,79 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // won't need ours either way. procArgs.FDMap = fdm + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + // If this is the root container, we also need to setup the root mount // namespace. - if k.RootMountNamespace() == nil { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - + mns := k.RootMountNamespace() + if mns == nil { // Create the virtual filesystem. mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } - k.SetRootMountNamespace(mns) + return nil + } + + // Setup a child container. + + // Create the container's root filesystem mount. + log.Infof("Creating new process in child container.") + fds := &fdDispenser{fds: append([]int{}, ioFDs...)} + rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + if err != nil { + return fmt.Errorf("error creating filesystem for container: %v", err) + } + + // Make directories for submounts within the container. + rootDir := mns.Root() + defer rootDir.DecRef() + containerRoot := filepath.Join(childContainersDir, cid) + mkdirAll(ctx, mns, containerRoot) + + // Mount the container's root filesystem to the newly created + // mount point. 
+ containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination: %q: %v", containerRoot, err) + } + if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err) + } + containerRootDirent.DecRef() + + // We have to re-walk to the dirent to find the mounted + // directory. The old dirent is invalid at this point. + containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination2: %q: %v", containerRoot, err) + } + log.Infof("Mounted child's root fs to %q", containerRoot) + + // Mount all submounts. + mounts := compileMounts(spec) + for _, m := range mounts { + // TODO: Enable bind mounts in child containers. + if m.Type == bind { + log.Infof("Bind mounts in child containers are not yet supported: %+v", m) + continue + } + dest := filepath.Join(containerRoot, m.Destination) + if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil { + return fmt.Errorf("error mounting filesystem for container: %v", err) + } } + // Set the procArgs root directory. + procArgs.Root = containerRootDirent return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f6c7bf223..7debf0ac2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "math/rand" + "os" "runtime" "sync" "sync/atomic" @@ -229,7 +230,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // Ensure that signals received are forwarded to the emulated kernel. 
stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + procArgs, err := newProcess(spec, creds, utsns, ipcns, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -250,7 +251,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -277,7 +278,6 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds UTSNamespace: utsns, IPCNamespace: ipcns, } - return procArgs, nil } @@ -356,7 +356,8 @@ func (l *Loader) run() error { l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, - l.k) + l.k, + "" /* CID, which isn't needed for the root container */) if err != nil { return err } @@ -376,8 +377,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.ThreadID, error) { - spec := args.Spec +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -406,26 +406,24 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa // when indicated by the spec. 
procArgs, err := newProcess( - args.Spec, - args.Conf, - nil, // ioFDs - false, // console + spec, creds, - k.RootUTSNamespace(), - k.RootIPCNamespace(), - k) + l.k.RootUTSNamespace(), + l.k.RootIPCNamespace(), + l.k) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } err = setFileSystemForProcess( &procArgs, - args.Spec, - args.Conf, - nil, + spec, + conf, + []int{int(file.Fd())}, // ioFDs false, creds, procArgs.Limits, - k) + k, + cid) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } @@ -435,7 +433,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa return 0, fmt.Errorf("failed to create process in sentry: %v", err) } - ts := k.TaskSet() + ts := l.k.TaskSet() tgid := ts.Root.IDOfThreadGroup(tg) if tgid == 0 { return 0, errors.New("failed to get thread group ID of new process") @@ -446,7 +444,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGIDs[args.CID] = tgid + l.containerRootTGIDs[cid] = tgid return tgid, nil } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 18e95284b..c45784749 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,7 +36,6 @@ go_library( "//pkg/p9", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", - "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/container", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 55315c0e8..ed4b1d29c 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,7 +16,6 @@ package cmd import ( "os" - "sync" "syscall" "context" @@ -25,7 +24,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -36,6 +34,10 @@ type Gofer struct { bundleDir string ioFDs intFlags applyCaps bool 
+ + // controllerFD is the file descriptor of a stream socket for the + // control server that is donated to this process. + controllerFD int } // Name implements subcommands.Command. @@ -58,11 +60,12 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") + f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 || g.controllerFD == -1 { f.Usage() return subcommands.ExitUsageError } @@ -134,29 +137,14 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - runServers(ats, g.ioFDs) - return subcommands.ExitSuccess -} + ctrl, err := fsgofer.NewController(g.controllerFD, g.bundleDir) -func runServers(ats []p9.Attacher, ioFDs []int) { - // Run the loops and wait for all to exit. - var wg sync.WaitGroup - for i, ioFD := range ioFDs { - wg.Add(1) - go func(ioFD int, at p9.Attacher) { - socket, err := unet.NewSocket(ioFD) - if err != nil { - Fatalf("err creating server on FD %d: %v", ioFD, err) - } - s := p9.NewServer(at) - if err := s.Handle(socket); err != nil { - Fatalf("P9 server returned error. Gofer is shutting down. 
FD: %d, err: %v", ioFD, err) - } - wg.Done() - }(ioFD, ats[i]) + if err := ctrl.Serve(ats, g.ioFDs); err != nil { + Fatalf("Failed to serve via P9: %v", err) } - wg.Wait() - log.Infof("All 9P servers exited.") + ctrl.Wait() + + return subcommands.ExitSuccess } func isReadonlyMount(opts []string) bool { diff --git a/runsc/container/container.go b/runsc/container/container.go index 574075b00..da2ce0d25 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -249,6 +249,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox + + // Prepare the gofer to serve the container's filesystem. + err = sb.Sandbox.CreateChild(c.ID, bundleDir) + if err != nil { + c.Destroy() + return nil, err + } } c.Status = Created diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 24e172f48..0bc682b5f 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "fsgofer", srcs = [ + "control.go", "fsgofer.go", "fsgofer_unsafe.go", ], @@ -14,9 +15,12 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/control/server", "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/unet", + "//pkg/urpc", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go new file mode 100644 index 000000000..8ce8ee8a0 --- /dev/null +++ b/runsc/fsgofer/control.go @@ -0,0 +1,203 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "fmt" + "path/filepath" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// Controller manages the fsgofer's control server. +type Controller struct { + // api holds the control server's URPC endpoints. + api api + + // srv is the control server. + srv *server.Server +} + +// NewController creates a new Controller and starts it listenting +func NewController(fd int, rootBundleDir string) (*Controller, error) { + if !filepath.IsAbs(rootBundleDir) { + return nil, fmt.Errorf("NewController should receive an absolute bundle dir path, but got %q", rootBundleDir) + } + + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + cr := &Controller{srv: srv} + cr.api.rootBundleDir = rootBundleDir + cr.api.bundleDirs = make(map[string]string) + srv.Register(&cr.api) + + if err := srv.StartServing(); err != nil { + return nil, err + } + + return cr, nil +} + +// Wait waits for all the p9 servers to finish, then shuts down the control +// server. +func (cr *Controller) Wait() { + cr.api.p9wg.Wait() + cr.srv.Stop() + log.Infof("All 9P servers exited.") +} + +// Serve starts serving each Attacher in ats via its corresponding file +// descriptor in ioFDs. +func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { + if len(ats) != len(ioFDs) { + return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) + } + for i, _ := range ats { + cr.api.serve(ats[i], ioFDs[i]) + } + return nil +} + +// api URPC methods. +const ( + // AddBundleDirs readies the gofer to serve from a new bundle + // directory. It should be called during runsc create. 
+ AddBundleDirs = "api.AddBundleDirs" + + // ServeDirectory serves a new directory via the fsgofer. It should be + // called during runsc start. + ServeDirectory = "api.ServeDirectory" +) + +// API defines and implements the URPC endpoints for the gofer. +type api struct { + // p9wg waits for all the goroutines serving the sentry via p9. When its + // counter is 0, the gofer is out of work and exits. + p9wg sync.WaitGroup + + // bundleDirs maps from container ID to bundle directory for each + // container. + bundleDirs map[string]string + + // rootBundleDir is the bundle directory of the root container. + rootBundleDir string +} + +// AddBundleDirsRequest is the URPC argument to AddBundleDirs. +type AddBundleDirsRequest struct { + // BundleDirs is a map of container IDs to bundle directories to add to + // the gofer. + BundleDirs map[string]string +} + +// AddBundleDirsRequest adds bundle directories that for the gofer to serve. +func (api *api) AddBundleDirs(req *AddBundleDirsRequest, _ *struct{}) error { + log.Debugf("fsgofer.AddBundleDirs") + for cid, bd := range req.BundleDirs { + if _, ok := api.bundleDirs[cid]; ok { + return fmt.Errorf("fsgofer already has a bundleDir for container %q", cid) + } + api.bundleDirs[cid] = bd + } + return nil +} + +// ServeDirectoryRequest is the URPC argument to ServeDirectory. +type ServeDirectoryRequest struct { + // Dir is the absolute path to a directory to be served to the sentry. + Dir string + + // IsReadOnly specifies whether the directory should be served in + // read-only mode. + IsReadOnly bool + + // CID is the container ID of the container that needs to serve a + // directory. + CID string + + // FilePayload contains the socket over which the sentry will request + // files from Dir. + urpc.FilePayload +} + +// ServeDirectory begins serving a directory via a file descriptor for the +// sentry. Directories must be added via AddBundleDirsRequest before +// ServeDirectory is called. 
+func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { + log.Debugf("fsgofer.ServeDirectory: %+v", req) + + if req.Dir == "" { + return fmt.Errorf("ServeDirectory should receive a directory argument, but was empty") + } + if req.CID == "" { + return fmt.Errorf("ServeDirectory should receive a CID argument, but was empty") + } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/<cid> directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. + if filepath.Clean(req.CID) != req.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", req.CID) + } + if nFiles := len(req.FilePayload.Files); nFiles != 1 { + return fmt.Errorf("ServeDirectory should receive 1 file descriptor, but got %d", nFiles) + } + + bd, ok := api.bundleDirs[req.CID] + if !ok { + // If there's no entry in bundleDirs for the container ID, this + // is the root container. + bd = api.rootBundleDir + } + + // Relative paths are served relative to the bundle directory. + absDir := req.Dir + if !filepath.IsAbs(absDir) { + absDir = filepath.Join(bd, req.Dir) + } + + // Create the attach point and start serving. + at := NewAttachPoint(absDir, Config{ + ROMount: req.IsReadOnly, + LazyOpenForWrite: true, + }) + api.serve(at, int(req.FilePayload.Files[0].Fd())) + + return nil +} + +// serve begins serving a directory via a file descriptor. +func (api *api) serve(at p9.Attacher, ioFD int) { + api.p9wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + panic(fmt.Sprintf("err creating server on FD %d: %v", ioFD, err)) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. 
FD: %d, err: %v", ioFD, err)) + } + api.p9wg.Done() + }(ioFD, at) +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 52cdc91a2..38263896a 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -66,6 +66,11 @@ func (f fileType) String() string { return "unknown" } +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-gofer.%s", id) +} + // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index a961c3cc7..cdacc5e22 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/control", "//pkg/urpc", "//runsc/boot", + "//runsc/fsgofer", "//runsc/specutils", "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2b043d412..83cc94dc4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -84,7 +85,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // StartRoot starts running the root container process inside the sandbox. func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -104,21 +105,67 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } +// CreateChild creates a non-root container inside the sandbox. 
+func (s *Sandbox) CreateChild(cid, bundleDir string) error { + log.Debugf("Create non-root container sandbox %q, pid: %d for container %q with bundle directory %q", s.ID, s.Pid, cid, bundleDir) + + // Connect to the gofer and prepare it to serve from bundleDir for this + // container. + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + goferReq := fsgofer.AddBundleDirsRequest{BundleDirs: map[string]string{cid: bundleDir}} + if err := goferConn.Call(fsgofer.AddBundleDirs, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + + return nil +} + // Start starts running a non-root container inside the sandbox. func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + + // Create socket that connects the sandbox and gofer. + sandEnd, goferEnd, err := createSocketPair() if err != nil { return err } - defer conn.Close() + defer sandEnd.Close() + defer goferEnd.Close() + + // Tell the Gofer about the new filesystem it needs to serve. + goferReq := fsgofer.ServeDirectoryRequest{ + Dir: spec.Root.Path, + IsReadOnly: spec.Root.Readonly, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{goferEnd}}, + } + if err := goferConn.Call(fsgofer.ServeDirectory, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + // Start running the container. 
args := boot.StartArgs{ - Spec: spec, - Conf: conf, - CID: cid, + Spec: spec, + Conf: conf, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}}, } - if err := conn.Call(boot.ContainerStart, args, nil); err != nil { + if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) } @@ -142,7 +189,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -165,7 +212,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -183,7 +230,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { // Execute runs the specified command in the container. func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return 0, s.connError(err) } @@ -203,7 +250,7 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, // Event retrieves stats about the sandbox such as memory and CPU utilization. 
func (s *Sandbox) Event(cid string) (*boot.Event, error) { log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -219,7 +266,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { return &e, nil } -func (s *Sandbox) connect() (*urpc.Client, error) { +func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { log.Debugf("Connecting to sandbox %q", s.ID) conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) if err != nil { @@ -228,6 +275,15 @@ func (s *Sandbox) connect() (*urpc.Client, error) { return conn, nil } +func (s *Sandbox) goferConnect() (*urpc.Client, error) { + log.Debugf("Connecting to gofer for sandbox %q", s.ID) + conn, err := client.ConnectTo(fsgofer.ControlSocketAddr(s.ID)) + if err != nil { + return nil, s.connError(err) + } + return conn, nil +} + func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } @@ -244,31 +300,45 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle // Add root mount and then add any other additional mounts. mountCount := 1 + + // Add additional mounts. for _, m := range spec.Mounts { if specutils.Is9PMount(m) { mountCount++ } } - sandEnds := make([]*os.File, 0, mountCount) goferEnds := make([]*os.File, 0, mountCount) - for i := 0; i < mountCount; i++ { - // Create socket that connects the sandbox and gofer. - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. 
+ var nextFD int + for nextFD = 3; nextFD-3 < mountCount; nextFD++ { + sandEnd, goferEnd, err := createSocketPair() if err != nil { return nil, err } - sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) - - goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") defer goferEnd.Close() + sandEnds = append(sandEnds, sandEnd) goferEnds = append(goferEnds, goferEnd) + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + } - args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + // Create and donate a file descriptor for the control server. + addr := fsgofer.ControlSocketAddr(s.ID) + serverFD, err := server.CreateSocket(addr) + if err != nil { + return nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } + // Add the control server fd. + args = append(args, "--controller-fd="+strconv.Itoa(nextFD)) + nextFD++ + controllerFile := os.NewFile(uintptr(serverFD), "gofer_control_socket_server") + defer controllerFile.Close() + cmd := exec.Command(binPath, args...) cmd.ExtraFiles = goferEnds + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the @@ -286,6 +356,15 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle return sandEnds, nil } +// createSocketPair creates a pair of files wrapping a socket pair. +func createSocketPair() (*os.File, *os.File, error) { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil +} + // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. 
func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { @@ -296,7 +375,9 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Create control server socket here and donate FD to child process because // it may be in a different network namespace and won't be reachable from // outside. - fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + addr := boot.ControlSocketAddr(s.ID) + fd, err := server.CreateSocket(addr) + log.Infof("creating sandbox process with addr: %s", addr) if err != nil { return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } @@ -438,7 +519,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { if err := specutils.WaitForReady(s.Pid, timeout, ready); err != nil { return fmt.Errorf("unexpected error waiting for sandbox %q, err: %v", s.ID, err) } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -454,7 +535,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -471,7 +552,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -536,7 +617,7 @@ func (s *Sandbox) Destroy() error { // Signal sends the signal to a container in the sandbox. 
func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { log.Debugf("Signal sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -556,7 +637,7 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { // The statefile will be written to f. func (s *Sandbox) Checkpoint(cid string, f *os.File) error { log.Debugf("Checkpoint sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -577,7 +658,7 @@ func (s *Sandbox) Checkpoint(cid string, f *os.File) error { // Pause sends the pause call for a container in the sandbox. func (s *Sandbox) Pause(cid string) error { log.Debugf("Pause sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -592,7 +673,7 @@ func (s *Sandbox) Pause(cid string) error { // Resume sends the resume call for a container in the sandbox. func (s *Sandbox) Resume(cid string) error { log.Debugf("Resume sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -630,7 +711,7 @@ func (s *Sandbox) IsRunning() bool { // Stacks collects and returns all stacks for the sandbox. func (s *Sandbox) Stacks() (string, error) { log.Debugf("Stacks sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return "", err } |