-rw-r--r-- | pkg/sentry/kernel/kernel.go | 14
-rw-r--r-- | pkg/urpc/urpc.go | 10
-rw-r--r-- | runsc/boot/controller.go | 19
-rw-r--r-- | runsc/boot/fs.go | 159
-rw-r--r-- | runsc/boot/loader.go | 36
-rw-r--r-- | runsc/cmd/BUILD | 1
-rw-r--r-- | runsc/cmd/gofer.go | 36
-rw-r--r-- | runsc/container/container.go | 7
-rw-r--r-- | runsc/fsgofer/BUILD | 4
-rw-r--r-- | runsc/fsgofer/control.go | 203
-rw-r--r-- | runsc/fsgofer/fsgofer.go | 5
-rw-r--r-- | runsc/sandbox/BUILD | 1
-rw-r--r-- | runsc/sandbox/sandbox.go | 139
13 files changed, 515 insertions, 119 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 419a1d473..cb43fdcdc 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -504,6 +504,14 @@ type CreateProcessArgs struct { // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace + + // Root optionally contains the dirent that serves as the root for the + // process. If nil, the mount namespace's root is used as the process' + // root. + // + // Anyone setting Root must donate a reference (i.e. increment it) to + // keep it alive until it is decremented by CreateProcess. + Root *fs.Dirent } // NewContext returns a context.Context that represents the task that will be @@ -581,8 +589,12 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error) { ctx := args.NewContext(k) // Grab the root directory. - root := fs.RootFromContext(ctx) + root := args.Root + if root == nil { + root = fs.RootFromContext(ctx) + } defer root.DecRef() + args.Root = nil // Grab the working directory. wd := root // Default. diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index af620b704..1ec06dd4c 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -63,6 +63,10 @@ func (r RemoteError) Error() string { // file as a result of an RPC. These are not actually serialized, rather they // are sent via an accompanying SCM_RIGHTS message (plumbed through the unet // package). +// +// When embedding a FilePayload in an argument struct, the argument type _must_ +// be a pointer to the struct rather than the struct type itself. This is +// because the urpc package defines pointer methods on FilePayload. type FilePayload struct { Files []*os.File `json:"-"` } @@ -552,6 +556,12 @@ func (c *Client) Call(method string, arg interface{}, result interface{}) error c.mu.Lock() defer c.mu.Unlock() + // If arg is a FilePayload, not a *FilePayload, files won't actually be + // sent, so error out. + if _, ok := arg.(FilePayload); ok { + return fmt.Errorf("argument is a FilePayload, but should be a *FilePayload") + } + // Are there files to send? var fs []*os.File if fp, ok := arg.(filePayloader); ok { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fc6ea326a..69e88d8e0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -17,6 +17,7 @@ package boot import ( "errors" "fmt" + "path" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" @@ -181,11 +182,15 @@ type StartArgs struct { // CID is the ID of the container to start. CID string + + // FilePayload contains the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload } // Start runs a created container within a sandbox. func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { - log.Debugf("containerManager.Start") + log.Debugf("containerManager.Start: %+v", args) // Validate arguments. if args == nil { @@ -200,8 +205,18 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/<cid> directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. 
+ if path.Clean(args.CID) != args.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) + } + if len(args.FilePayload.Files) != 1 { + return fmt.Errorf("start arguments must contain one file for the container root") + } - tgid, err := cm.l.startContainer(args, cm.l.k) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index eea2ec1f5..8996b1398 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -48,6 +48,19 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" + + // childContainersDir is the directory where child container root + // filesystems are mounted. + childContainersDir = "/__runsc_containers__" + + // Filesystems that runsc supports. + bind = "bind" + devpts = "devpts" + devtmpfs = "devtmpfs" + proc = "proc" + sysfs = "sysfs" + tmpfs = "tmpfs" + nonefs = "none" ) type fdDispenser struct { @@ -70,8 +83,15 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + mounts := compileMounts(spec) + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. + mounts = append(mounts, specs.Mount{ + Type: tmpfs, + Destination: childContainersDir, + }) fds := &fdDispenser{fds: ioFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds) + rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) } @@ -79,7 +99,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - mounts := compileMounts(spec) + if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } @@ -98,12 +118,12 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // Always mount /dev. mounts = append(mounts, specs.Mount{ - Type: "devtmpfs", + Type: devtmpfs, Destination: "/dev", }) mounts = append(mounts, specs.Mount{ - Type: "devpts", + Type: devpts, Destination: "/dev/pts", }) @@ -129,13 +149,13 @@ func compileMounts(spec *specs.Spec) []specs.Mount { var mandatoryMounts []specs.Mount if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "proc", + Type: proc, Destination: "/proc", }) } if !sysMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "sysfs", + Type: sysfs, Destination: "/sys", }) } @@ -149,7 +169,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // that. Until then, the /tmp mount will always appear empty at // container creation. mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "tmpfs", + Type: tmpfs, Destination: "/tmp", }) } @@ -165,7 +185,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // mount namespace. 
func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { + if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil { return err } } @@ -173,7 +193,7 @@ func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *f } // createRootMount creates the root filesystem. -func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { +func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} @@ -207,7 +227,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f // We need to overlay the root on top of a ramfs with stub directories // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. - submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) if err != nil { return nil, fmt.Errorf("error adding submount overlay: %v", err) @@ -256,17 +276,17 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri ) switch m.Type { - case "devpts", "devtmpfs", "proc", "sysfs": + case devpts, devtmpfs, proc, sysfs: fsName = m.Type - case "none": - fsName = "sysfs" - case "tmpfs": + case nonefs: + fsName = sysfs + case tmpfs: fsName = m.Type // tmpfs has some extra supported options that we must pass through. opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - case "bind": + case bind: switch conf.FileAccess { case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() @@ -291,7 +311,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, opts, useOverlay, err } -func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -342,51 +362,52 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd // in the right location, e.g. // mount: /var/run/secrets, may be created in '/run/secrets' if // '/var/run' => '/var'. 
- if err := mkdirAll(ctx, mns, m.Destination); err != nil { + if err := mkdirAll(ctx, mns, dest); err != nil { return err } root := mns.Root() defer root.DecRef() - dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals) if err != nil { - return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to find mount destination %q: %v", dest, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to mount at destination %q: %v", dest, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type) return nil } func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + log.Infof("mkdirAll called with path %s", path) root := mns.Root() defer root.DecRef() // Starting at the root, walk the path. parent := root ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) - for i := 0; i < len(ps); i++ { - if ps[i] == "" { + for _, pathElem := range ps { + if pathElem == "" { // This will be case for the first and last element, if the path // begins or ends with '/'. Note that we always treat the path as // absolute, regardless of what the first character contains. continue } - d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit) if err == syserror.ENOENT { // If we encounter a path that does not exist, then // create it. - if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", pathElem, err) } - if d, err = parent.Walk(ctx, root, ps[i]); err != nil { - return fmt.Errorf("walk to %q failed: %v", ps[i], err) + if d, err = parent.Walk(ctx, root, pathElem); err != nil { + return fmt.Errorf("walk to %q failed: %v", pathElem, err) } } else if err != nil { - return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + return fmt.Errorf("failed to find inode %q: %v", pathElem, err) } parent = d } @@ -444,7 +465,7 @@ func destinations(mounts []specs.Mount, extra ...string) []string { // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { - if m.Type == "bind" { + if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. return "9pfs-" + m.Destination @@ -589,7 +610,7 @@ func subtargets(root string, mnts []specs.Mount) []string { // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. 
-func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel) error { +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If @@ -604,27 +625,79 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // won't need ours either way. procArgs.FDMap = fdm + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + // If this is the root container, we also need to setup the root mount // namespace. - if k.RootMountNamespace() == nil { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - + mns := k.RootMountNamespace() + if mns == nil { // Create the virtual filesystem. mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } - k.SetRootMountNamespace(mns) + return nil + } + + // Setup a child container. + + // Create the container's root filesystem mount. + log.Infof("Creating new process in child container.") + fds := &fdDispenser{fds: append([]int{}, ioFDs...)} + rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + if err != nil { + return fmt.Errorf("error creating filesystem for container: %v", err) + } + + // Make directories for submounts within the container. + rootDir := mns.Root() + defer rootDir.DecRef() + containerRoot := filepath.Join(childContainersDir, cid) + mkdirAll(ctx, mns, containerRoot) + + // Mount the container's root filesystem to the newly created + // mount point. + containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination: %q: %v", containerRoot, err) + } + if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err) + } + containerRootDirent.DecRef() + + // We have to re-walk to the dirent to find the mounted + // directory. The old dirent is invalid at this point. + containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination2: %q: %v", containerRoot, err) + } + log.Infof("Mounted child's root fs to %q", containerRoot) + + // Mount all submounts. + mounts := compileMounts(spec) + for _, m := range mounts { + // TODO: Enable bind mounts in child containers. 
+ if m.Type == bind { + log.Infof("Bind mounts in child containers are not yet supported: %+v", m) + continue + } + dest := filepath.Join(containerRoot, m.Destination) + if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil { + return fmt.Errorf("error mounting filesystem for container: %v", err) + } } + // Set the procArgs root directory. + procArgs.Root = containerRootDirent return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f6c7bf223..7debf0ac2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "math/rand" + "os" "runtime" "sync" "sync/atomic" @@ -229,7 +230,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // Ensure that signals received are forwarded to the emulated kernel. stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + procArgs, err := newProcess(spec, creds, utsns, ipcns, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -250,7 +251,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -277,7 +278,6 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds UTSNamespace: utsns, IPCNamespace: ipcns, } - return procArgs, nil } @@ -356,7 +356,8 @@ func (l *Loader) run() error { l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, - l.k) + l.k, + "" /* CID, which isn't needed for the root container */) if err != nil { return err } @@ -376,8 +377,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.ThreadID, error) { - spec := args.Spec +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -406,26 +406,24 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa // when indicated by the spec. 
procArgs, err := newProcess( - args.Spec, - args.Conf, - nil, // ioFDs - false, // console + spec, creds, - k.RootUTSNamespace(), - k.RootIPCNamespace(), - k) + l.k.RootUTSNamespace(), + l.k.RootIPCNamespace(), + l.k) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } err = setFileSystemForProcess( &procArgs, - args.Spec, - args.Conf, - nil, + spec, + conf, + []int{int(file.Fd())}, // ioFDs false, creds, procArgs.Limits, - k) + k, + cid) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } @@ -435,7 +433,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa return 0, fmt.Errorf("failed to create process in sentry: %v", err) } - ts := k.TaskSet() + ts := l.k.TaskSet() tgid := ts.Root.IDOfThreadGroup(tg) if tgid == 0 { return 0, errors.New("failed to get thread group ID of new process") @@ -446,7 +444,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGIDs[args.CID] = tgid + l.containerRootTGIDs[cid] = tgid return tgid, nil } diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 18e95284b..c45784749 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,7 +36,6 @@ go_library( "//pkg/p9", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", - "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/container", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 55315c0e8..ed4b1d29c 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,7 +16,6 @@ package cmd import ( "os" - "sync" "syscall" "context" @@ -25,7 +24,6 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -36,6 +34,10 @@ type Gofer struct { bundleDir string ioFDs intFlags applyCaps bool + + // controllerFD is the file descriptor of a stream socket for the + // control server that is donated to this process. + controllerFD int } // Name implements subcommands.Command. @@ -58,11 +60,12 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") + f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 || g.controllerFD == -1 { f.Usage() return subcommands.ExitUsageError } @@ -134,29 +137,14 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - runServers(ats, g.ioFDs) - return subcommands.ExitSuccess -} + ctrl, err := fsgofer.NewController(g.controllerFD, g.bundleDir) -func runServers(ats []p9.Attacher, ioFDs []int) { - // Run the loops and wait for all to exit. 
- var wg sync.WaitGroup - for i, ioFD := range ioFDs { - wg.Add(1) - go func(ioFD int, at p9.Attacher) { - socket, err := unet.NewSocket(ioFD) - if err != nil { - Fatalf("err creating server on FD %d: %v", ioFD, err) - } - s := p9.NewServer(at) - if err := s.Handle(socket); err != nil { - Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err) - } - wg.Done() - }(ioFD, ats[i]) + if err := ctrl.Serve(ats, g.ioFDs); err != nil { + Fatalf("Failed to serve via P9: %v", err) } - wg.Wait() - log.Infof("All 9P servers exited.") + ctrl.Wait() + + return subcommands.ExitSuccess } func isReadonlyMount(opts []string) bool { diff --git a/runsc/container/container.go b/runsc/container/container.go index 574075b00..da2ce0d25 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -249,6 +249,13 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox + + // Prepare the gofer to serve the container's filesystem. + err = sb.Sandbox.CreateChild(c.ID, bundleDir) + if err != nil { + c.Destroy() + return nil, err + } } c.Status = Created diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 24e172f48..0bc682b5f 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "fsgofer", srcs = [ + "control.go", "fsgofer.go", "fsgofer_unsafe.go", ], @@ -14,9 +15,12 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/control/server", "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/unet", + "//pkg/urpc", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go new file mode 100644 index 000000000..8ce8ee8a0 --- /dev/null +++ b/runsc/fsgofer/control.go @@ -0,0 +1,203 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "fmt" + "path/filepath" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +// Controller manages the fsgofer's control server. +type Controller struct { + // api holds the control server's URPC endpoints. + api api + + // srv is the control server. 
+ srv *server.Server +} + +// NewController creates a new Controller and starts it listening. +func NewController(fd int, rootBundleDir string) (*Controller, error) { + if !filepath.IsAbs(rootBundleDir) { + return nil, fmt.Errorf("NewController should receive an absolute bundle dir path, but got %q", rootBundleDir) + } + + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + cr := &Controller{srv: srv} + cr.api.rootBundleDir = rootBundleDir + cr.api.bundleDirs = make(map[string]string) + srv.Register(&cr.api) + + if err := srv.StartServing(); err != nil { + return nil, err + } + + return cr, nil +} + +// Wait waits for all the p9 servers to finish, then shuts down the control +// server. +func (cr *Controller) Wait() { + cr.api.p9wg.Wait() + cr.srv.Stop() + log.Infof("All 9P servers exited.") +} + +// Serve starts serving each Attacher in ats via its corresponding file +// descriptor in ioFDs. +func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { + if len(ats) != len(ioFDs) { + return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) + } + for i := range ats { + cr.api.serve(ats[i], ioFDs[i]) + } + return nil +} + +// api URPC methods. +const ( + // AddBundleDirs readies the gofer to serve from a new bundle + // directory. It should be called during runsc create. + AddBundleDirs = "api.AddBundleDirs" + + // ServeDirectory serves a new directory via the fsgofer. It should be + // called during runsc start. + ServeDirectory = "api.ServeDirectory" +) + +// API defines and implements the URPC endpoints for the gofer. +type api struct { + // p9wg waits for all the goroutines serving the sentry via p9. When its + // counter is 0, the gofer is out of work and exits. + p9wg sync.WaitGroup + + // bundleDirs maps from container ID to bundle directory for each + // container. + bundleDirs map[string]string + + // rootBundleDir is the bundle directory of the root container. + rootBundleDir string +} + +// AddBundleDirsRequest is the URPC argument to AddBundleDirs. +type AddBundleDirsRequest struct { + // BundleDirs is a map of container IDs to bundle directories to add to + // the gofer. + BundleDirs map[string]string +} + +// AddBundleDirs adds bundle directories for the gofer to serve. +func (api *api) AddBundleDirs(req *AddBundleDirsRequest, _ *struct{}) error { + log.Debugf("fsgofer.AddBundleDirs") + for cid, bd := range req.BundleDirs { + if _, ok := api.bundleDirs[cid]; ok { + return fmt.Errorf("fsgofer already has a bundleDir for container %q", cid) + } + api.bundleDirs[cid] = bd + } + return nil +} + +// ServeDirectoryRequest is the URPC argument to ServeDirectory. +type ServeDirectoryRequest struct { + // Dir is the absolute path to a directory to be served to the sentry. + Dir string + + // IsReadOnly specifies whether the directory should be served in + // read-only mode. + IsReadOnly bool + + // CID is the container ID of the container that needs to serve a + // directory. + CID string + + // FilePayload contains the socket over which the sentry will request + // files from Dir. + urpc.FilePayload +} + +// ServeDirectory begins serving a directory via a file descriptor for the +// sentry. Directories must be added via AddBundleDirsRequest before +// ServeDirectory is called.
+func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { + log.Debugf("fsgofer.ServeDirectory: %+v", req) + + if req.Dir == "" { + return fmt.Errorf("ServeDirectory should receive a directory argument, but was empty") + } + if req.CID == "" { + return fmt.Errorf("ServeDirectory should receive a CID argument, but was empty") + } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/<cid> directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. + if filepath.Clean(req.CID) != req.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", req.CID) + } + if nFiles := len(req.FilePayload.Files); nFiles != 1 { + return fmt.Errorf("ServeDirectory should receive 1 file descriptor, but got %d", nFiles) + } + + bd, ok := api.bundleDirs[req.CID] + if !ok { + // If there's no entry in bundleDirs for the container ID, this + // is the root container. + bd = api.rootBundleDir + } + + // Relative paths are served relative to the bundle directory. + absDir := req.Dir + if !filepath.IsAbs(absDir) { + absDir = filepath.Join(bd, req.Dir) + } + + // Create the attach point and start serving. + at := NewAttachPoint(absDir, Config{ + ROMount: req.IsReadOnly, + LazyOpenForWrite: true, + }) + api.serve(at, int(req.FilePayload.Files[0].Fd())) + + return nil +} + +// serve begins serving a directory via a file descriptor. +func (api *api) serve(at p9.Attacher, ioFD int) { + api.p9wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + panic(fmt.Sprintf("err creating server on FD %d: %v", ioFD, err)) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err)) + } + api.p9wg.Done() + }(ioFD, at) +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 52cdc91a2..38263896a 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -66,6 +66,11 @@ func (f fileType) String() string { return "unknown" } +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-gofer.%s", id) +} + // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index a961c3cc7..cdacc5e22 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/control", "//pkg/urpc", "//runsc/boot", + "//runsc/fsgofer", "//runsc/specutils", "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 2b043d412..83cc94dc4 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -31,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -84,7 +85,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // StartRoot starts running the root container process inside the sandbox. 
func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -104,21 +105,67 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } +// CreateChild creates a non-root container inside the sandbox. +func (s *Sandbox) CreateChild(cid, bundleDir string) error { + log.Debugf("Create non-root container sandbox %q, pid: %d for container %q with bundle directory %q", s.ID, s.Pid, cid, bundleDir) + + // Connect to the gofer and prepare it to serve from bundleDir for this + // container. + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + goferReq := fsgofer.AddBundleDirsRequest{BundleDirs: map[string]string{cid: bundleDir}} + if err := goferConn.Call(fsgofer.AddBundleDirs, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + + return nil +} + // Start starts running a non-root container inside the sandbox. func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) - conn, err := s.connect() + + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() + goferConn, err := s.goferConnect() + if err != nil { + return fmt.Errorf("couldn't connect to gofer: %v", err) + } + defer goferConn.Close() + + // Create socket that connects the sandbox and gofer. + sandEnd, goferEnd, err := createSocketPair() if err != nil { return err } - defer conn.Close() + defer sandEnd.Close() + defer goferEnd.Close() + + // Tell the Gofer about the new filesystem it needs to serve. + goferReq := fsgofer.ServeDirectoryRequest{ + Dir: spec.Root.Path, + IsReadOnly: spec.Root.Readonly, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{goferEnd}}, + } + if err := goferConn.Call(fsgofer.ServeDirectory, &goferReq, nil); err != nil { + return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) + } + // Start running the container. args := boot.StartArgs{ - Spec: spec, - Conf: conf, - CID: cid, + Spec: spec, + Conf: conf, + CID: cid, + FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}}, } - if err := conn.Call(boot.ContainerStart, args, nil); err != nil { + if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) } @@ -142,7 +189,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str SandboxID: s.ID, } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -165,7 +212,7 @@ func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, f str // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -183,7 +230,7 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { // Execute runs the specified command in the container. 
func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, error) { log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return 0, s.connError(err) } @@ -203,7 +250,7 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus, // Event retrieves stats about the sandbox such as memory and CPU utilization. func (s *Sandbox) Event(cid string) (*boot.Event, error) { log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return nil, err } @@ -219,7 +266,7 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) { return &e, nil } -func (s *Sandbox) connect() (*urpc.Client, error) { +func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { log.Debugf("Connecting to sandbox %q", s.ID) conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) if err != nil { @@ -228,6 +275,15 @@ func (s *Sandbox) connect() (*urpc.Client, error) { return conn, nil } +func (s *Sandbox) goferConnect() (*urpc.Client, error) { + log.Debugf("Connecting to gofer for sandbox %q", s.ID) + conn, err := client.ConnectTo(fsgofer.ControlSocketAddr(s.ID)) + if err != nil { + return nil, s.connError(err) + } + return conn, nil +} + func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } @@ -244,31 +300,45 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle // Add root mount and then add any other additional mounts. mountCount := 1 + + // Add additional mounts. for _, m := range spec.Mounts { if specutils.Is9PMount(m) { mountCount++ } } - sandEnds := make([]*os.File, 0, mountCount) goferEnds := make([]*os.File, 0, mountCount) - for i := 0; i < mountCount; i++ { - // Create socket that connects the sandbox and gofer. - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. + var nextFD int + for nextFD = 3; nextFD-3 < mountCount; nextFD++ { + sandEnd, goferEnd, err := createSocketPair() if err != nil { return nil, err } - sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) - - goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") defer goferEnd.Close() + sandEnds = append(sandEnds, sandEnd) goferEnds = append(goferEnds, goferEnd) + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + } - args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + // Create and donate a file descriptor for the control server. + addr := fsgofer.ControlSocketAddr(s.ID) + serverFD, err := server.CreateSocket(addr) + if err != nil { + return nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } + // Add the control server fd. + args = append(args, "--controller-fd="+strconv.Itoa(nextFD)) + nextFD++ + controllerFile := os.NewFile(uintptr(serverFD), "gofer_control_socket_server") + defer controllerFile.Close() + cmd := exec.Command(binPath, args...) 
cmd.ExtraFiles = goferEnds + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) // Setup any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the @@ -286,6 +356,15 @@ func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundle return sandEnds, nil } +// createSocketPair creates a pair of files wrapping a socket pair. +func createSocketPair() (*os.File, *os.File, error) { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil +} + // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { @@ -296,7 +375,9 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Create control server socket here and donate FD to child process because // it may be in a different network namespace and won't be reachable from // outside. - fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + addr := boot.ControlSocketAddr(s.ID) + fd, err := server.CreateSocket(addr) + log.Infof("creating sandbox process with addr: %s", addr) if err != nil { return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) } @@ -438,7 +519,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { if err := specutils.WaitForReady(s.Pid, timeout, ready); err != nil { return fmt.Errorf("unexpected error waiting for sandbox %q, err: %v", s.ID, err) } - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -454,7 +535,7 @@ func (s *Sandbox) waitForCreated(timeout time.Duration) error { func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -471,7 +552,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { func (s *Sandbox) WaitPID(pid int32, cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return ws, err } @@ -536,7 +617,7 @@ func (s *Sandbox) Destroy() error { // Signal sends the signal to a container in the sandbox. func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { log.Debugf("Signal sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -556,7 +637,7 @@ func (s *Sandbox) Signal(cid string, sig syscall.Signal) error { // The statefile will be written to f. func (s *Sandbox) Checkpoint(cid string, f *os.File) error { log.Debugf("Checkpoint sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -577,7 +658,7 @@ func (s *Sandbox) Checkpoint(cid string, f *os.File) error { // Pause sends the pause call for a container in the sandbox. 
func (s *Sandbox) Pause(cid string) error { log.Debugf("Pause sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -592,7 +673,7 @@ func (s *Sandbox) Resume(cid string) error { log.Debugf("Resume sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return err } @@ -630,7 +711,7 @@ func (s *Sandbox) IsRunning() bool { // Stacks collects and returns all stacks for the sandbox. func (s *Sandbox) Stacks() (string, error) { log.Debugf("Stacks sandbox %q", s.ID) - conn, err := s.connect() + conn, err := s.sandboxConnect() if err != nil { return "", err }
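
Two notes for readers of this change follow. First, the urpc and controller changes above hinge on arguments that embed FilePayload being passed by pointer; otherwise the donated file descriptors are silently dropped. Below is a minimal sketch of a conforming caller, modeled on the Start call site in runsc/sandbox/sandbox.go. The startChild wrapper and its parameters are illustrative only and are not part of this change.

package sandbox

import (
	"fmt"
	"os"

	specs "github.com/opencontainers/runtime-spec/specs-go"

	"gvisor.googlesource.com/gvisor/pkg/urpc"
	"gvisor.googlesource.com/gvisor/runsc/boot"
)

// startChild mirrors Sandbox.Start above: StartArgs embeds urpc.FilePayload,
// so the argument must be handed to Call as a pointer.
func startChild(conn *urpc.Client, spec *specs.Spec, conf *boot.Config, cid string, sandEnd *os.File) error {
	args := boot.StartArgs{
		Spec:        spec,
		Conf:        conf,
		CID:         cid,
		FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}},
	}
	// Note &args: passing args by value trips the new check in Client.Call,
	// and the SCM_RIGHTS message carrying sandEnd would never be sent.
	if err := conn.Call(boot.ContainerStart, &args, nil); err != nil {
		return fmt.Errorf("error starting container %q: %v", cid, err)
	}
	return nil
}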
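
Second, the gofer plumbing above relies on how exec.Cmd.ExtraFiles numbers inherited descriptors: entries appear in the child process as FDs 3, 4, ... in order, which is why nextFD starts at 3 in createGoferProcess and why the matching numbers are passed on the gofer's command line via --io-fds and --controller-fd. A standalone sketch of that pattern follows, reusing the createSocketPair helper added in runsc/sandbox/sandbox.go; /bin/true stands in for the real gofer binary.

package main

import (
	"fmt"
	"os"
	"os/exec"
	"syscall"
)

// createSocketPair matches the helper added in runsc/sandbox/sandbox.go: it
// wraps the two ends of a connected Unix stream socket pair in *os.File.
func createSocketPair() (*os.File, *os.File, error) {
	fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, nil, err
	}
	return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil
}

func main() {
	sandEnd, goferEnd, err := createSocketPair()
	if err != nil {
		panic(err)
	}
	defer sandEnd.Close()
	defer goferEnd.Close()

	// The first ExtraFiles entry becomes FD 3 in the child, the second FD 4,
	// and so on; FDs 0-2 remain stdin/stdout/stderr. The child learns which
	// inherited FD is which through flags such as --io-fds and --controller-fd.
	cmd := exec.Command("/bin/true", fmt.Sprintf("--io-fds=%d", 3))
	cmd.ExtraFiles = []*os.File{goferEnd}
	if err := cmd.Run(); err != nil {
		panic(err)
	}
}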