diff options
-rw-r--r-- | runsc/boot/controller.go | 6 | ||||
-rw-r--r-- | runsc/boot/fs.go | 6 | ||||
-rw-r--r-- | runsc/boot/loader.go | 20 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 106 | ||||
-rw-r--r-- | runsc/cmd/BUILD | 1 | ||||
-rw-r--r-- | runsc/cmd/gofer.go | 84 | ||||
-rw-r--r-- | runsc/cmd/state.go | 5 | ||||
-rw-r--r-- | runsc/container/BUILD | 7 | ||||
-rw-r--r-- | runsc/container/container.go | 96 | ||||
-rw-r--r-- | runsc/container/container_test.go | 7 | ||||
-rw-r--r-- | runsc/container/fs.go | 198 | ||||
-rw-r--r-- | runsc/container/fs_test.go | 158 | ||||
-rw-r--r-- | runsc/fsgofer/BUILD | 4 | ||||
-rw-r--r-- | runsc/fsgofer/control.go | 204 | ||||
-rw-r--r-- | runsc/sandbox/BUILD | 2 | ||||
-rw-r--r-- | runsc/sandbox/network.go | 3 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 176 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 6 | ||||
-rw-r--r-- | runsc/specutils/namespace.go (renamed from runsc/sandbox/namespace.go) | 34 | ||||
-rw-r--r-- | runsc/test/testutil/BUILD | 1 | ||||
-rw-r--r-- | runsc/test/testutil/docker.go | 6 | ||||
-rw-r--r-- | runsc/test/testutil/testutil.go | 57 |
22 files changed, 671 insertions, 516 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 2d6b507b3..fdb6be5b1 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -212,11 +212,11 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) } - if len(args.FilePayload.Files) != 1 { - return fmt.Errorf("start arguments must contain one file for the container root") + if len(args.FilePayload.Files) == 0 { + return fmt.Errorf("start arguments must contain at least one file for the container root") } - tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 6f5379a6d..20d0e42ef 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -510,8 +510,6 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) MountSources: make(map[string][]fs.MountArgs), } - mounts := compileMounts(spec) - // Add root mount. fd := fds.remove() opts := p9MountOptions(conf, fd) @@ -528,8 +526,8 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) } renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) - // Add submounts - for _, m := range mounts { + // Add submounts. + for _, m := range compileMounts(spec) { if err := addRestoreMount(conf, renv, m, fds); err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0e94cf215..3963ed55d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -23,6 +23,7 @@ import ( "runtime" "sync" "sync/atomic" + "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -377,7 +378,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -414,11 +415,23 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } + + // Can't take ownership away from os.File. dup them to get a new FDs. + var ioFDs []int + for _, f := range files { + fd, err := syscall.Dup(int(f.Fd())) + if err != nil { + return 0, fmt.Errorf("failed to dup file: %v", err) + } + f.Close() + ioFDs = append(ioFDs, fd) + } + err = setFileSystemForProcess( &procArgs, spec, conf, - []int{int(file.Fd())}, // ioFDs + ioFDs, false, creds, procArgs.Limits, @@ -453,8 +466,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return tgid, nil } -// TODO: Per-container namespaces must be supported -// for -pid. +// TODO: Per-container namespaces must be supported for -pid. // waitContainer waits for the root process of a container to exit. func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index f2f690b5d..2396d52c8 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "io/ioutil" "math/rand" "os" "reflect" @@ -36,6 +35,15 @@ func init() { rand.Seed(time.Now().UnixNano()) } +func testConfig() *Config { + return &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessDirect, + DisableSeccomp: true, + } +} + // testSpec returns a simple spec that can be used in tests. func testSpec() *specs.Spec { return &specs.Spec{ @@ -55,12 +63,7 @@ func createLoader() (*Loader, error) { if err != nil { return nil, err } - conf := &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - } + conf := testConfig() spec := testSpec() return New(spec, conf, fd, nil, false) } @@ -152,18 +155,6 @@ func TestStartSignal(t *testing.T) { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespace(t *testing.T) { - conf := &Config{ - RootDir: "unused_root_dir", - FileAccess: FileAccessDirect, - DisableSeccomp: true, - } - - testFile, err := ioutil.TempFile(os.TempDir(), "create-mount-namespace-") - if err != nil { - t.Fatalf("ioutil.TempFile() failed, err: %v", err) - } - defer os.RemoveAll(testFile.Name()) - testCases := []struct { name string // Spec that will be used to create the mount manager. Note @@ -234,8 +225,7 @@ func TestCreateMountNamespace(t *testing.T) { }, { Destination: "/foo/qux", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, { // File mounts with the same prefix. @@ -284,8 +274,7 @@ func TestCreateMountNamespace(t *testing.T) { { // Mount with the same prefix. Destination: "/dev/fd-foo", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, { // Unsupported fs type. @@ -298,8 +287,7 @@ func TestCreateMountNamespace(t *testing.T) { }, { Destination: "/dev/bar", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, }, }, @@ -339,19 +327,22 @@ func TestCreateMountNamespace(t *testing.T) { } for _, tc := range testCases { - ctx := contexttest.Context(t) - mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) - if err != nil { - t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) - } - defer mm.DecRef() - root := mm.Root() - defer root.DecRef() - for _, p := range tc.expectedPaths { - if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { - t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + ctx := contexttest.Context(t) + mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) + if err != nil { + t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } - } + defer mm.DecRef() + root := mm.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } + } + }) } } @@ -361,7 +352,7 @@ func TestRestoreEnvironment(t *testing.T) { testCases := []struct { name string spec *specs.Spec - conf *Config + fileAccess FileAccessType ioFDs []int errorExpected bool expectedRenv fs.RestoreEnvironment @@ -384,12 +375,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -444,12 +430,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0, 1}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -508,12 +489,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -572,12 +548,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - }, + fileAccess: FileAccessDirect, ioFDs: []int{0, 1}, errorExpected: true, }, @@ -596,20 +567,17 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - }, + fileAccess: FileAccessDirect, ioFDs: []int{0}, errorExpected: true, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + conf.FileAccess = tc.fileAccess fds := &fdDispenser{fds: tc.ioFDs} - actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) + actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) } else if tc.errorExpected { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index b9ef4022f..5dee26a5c 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/p9", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", + "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/console", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index e23f64d12..ab76734fc 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,6 +16,8 @@ package cmd import ( "os" + "path" + "sync" "syscall" "context" @@ -24,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -35,10 +38,6 @@ type Gofer struct { ioFDs intFlags applyCaps bool - // controllerFD is the file descriptor of a stream socket for the - // control server that is donated to this process. - controllerFD int - panicOnWrite bool } @@ -62,26 +61,16 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") - f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 || g.controllerFD == -1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 { f.Usage() return subcommands.ExitUsageError } - // fsgofer should run with a umask of 0, because we want to preserve file - // modes exactly as sent by the sandbox, which will have applied its own umask. - syscall.Umask(0) - - spec, err := specutils.ReadSpec(g.bundleDir) - if err != nil { - Fatalf("error reading spec: %v", err) - } - if g.applyCaps { // Minimal set of capabilities needed by the Gofer to operate on files. caps := []string{ @@ -107,49 +96,84 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("unreachable") } + spec, err := specutils.ReadSpec(g.bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } specutils.LogSpec(spec) - // Start with root mount, then add any other addition mount as needed. + // fsgofer should run with a umask of 0, because we want to preserve file + // modes exactly as sent by the sandbox, which will have applied its own umask. + syscall.Umask(0) + + // Find what path is going to be served by this gofer. + root := absPath(g.bundleDir, spec.Root.Path) + if err := syscall.Chroot(root); err != nil { + Fatalf("failed to chroot to %q: %v", root, err) + } + if err := syscall.Chdir("/"); err != nil { + Fatalf("failed to change working dir: %v", err) + } + log.Infof("Process chroot'd to %q", root) + + // Start with root mount, then add any other additional mount as needed. ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) - p := absPath(g.bundleDir, spec.Root.Path) - ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + ats = append(ats, fsgofer.NewAttachPoint("/", fsgofer.Config{ ROMount: spec.Root.Readonly, PanicOnWrite: g.panicOnWrite, // Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when // each file is opened as writable. Thus, we open files lazily to avoid copy-up. LazyOpenForWrite: true, })) - log.Infof("Serving %q mapped to %q on FD %d", "/", p, g.ioFDs[0]) + log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly) mountIdx := 1 // first one is the root for _, m := range spec.Mounts { if specutils.Is9PMount(m) { - p = absPath(g.bundleDir, m.Source) - ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + if !path.IsAbs(m.Destination) { + Fatalf("destination must be absolute path: %v", m.Destination) + } + cfg := fsgofer.Config{ ROMount: isReadonlyMount(m.Options), PanicOnWrite: g.panicOnWrite, LazyOpenForWrite: false, - })) + } + ats = append(ats, fsgofer.NewAttachPoint(m.Destination, cfg)) if mountIdx >= len(g.ioFDs) { Fatalf("No FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) } - log.Infof("Serving %q mapped to %q on FD %d", m.Destination, p, g.ioFDs[mountIdx]) + log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount) mountIdx++ } } if mountIdx != len(g.ioFDs) { - Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) + Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - ctrl, err := fsgofer.NewController(g.controllerFD, g.bundleDir) + runServers(ats, g.ioFDs) + return subcommands.ExitSuccess +} - if err := ctrl.Serve(ats, g.ioFDs); err != nil { - Fatalf("Failed to serve via P9: %v", err) +func runServers(ats []p9.Attacher, ioFDs []int) { + // Run the loops and wait for all to exit. + var wg sync.WaitGroup + for i, ioFD := range ioFDs { + wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + Fatalf("err creating server on FD %d: %v", ioFD, err) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err) + } + wg.Done() + }(ioFD, ats[i]) } - ctrl.Wait() - - return subcommands.ExitSuccess + wg.Wait() + log.Infof("All 9P servers exited.") } func isReadonlyMount(opts []string) bool { diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 28752d95e..265014e1b 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -63,8 +63,11 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } log.Debugf("Returning state for container %+v", c) + state := c.State() + log.Debugf("State: %+v", state) + // Write json-encoded state directly to stdout. - b, err := json.MarshalIndent(c.State(), "", " ") + b, err := json.MarshalIndent(state, "", " ") if err != nil { Fatalf("error marshaling container state: %v", err) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index e40ca4709..cba418d0c 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -13,6 +13,7 @@ go_library( name = "container", srcs = [ "container.go", + "fs.go", "hook.go", "status.go", ], @@ -28,13 +29,17 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) go_test( name = "container_test", size = "medium", - srcs = ["container_test.go"], + srcs = [ + "container_test.go", + "fs_test.go", + ], data = [ ":uds_test_app", "//runsc", diff --git a/runsc/container/container.go b/runsc/container/container.go index 8bd47aac1..16af66d3e 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -21,6 +21,7 @@ import ( "fmt" "io/ioutil" "os" + "os/exec" "path/filepath" "regexp" "strconv" @@ -223,15 +224,19 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // init container in the sandbox. if specutils.ShouldCreateSandbox(spec) || !conf.MultiContainer { log.Debugf("Creating new sandbox for container %q", id) + ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) + if err != nil { + return nil, err + } + // Start a new sandbox for this container. Any errors after this point // must destroy the container. - s, goferPid, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) + s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) if err != nil { c.Destroy() return nil, err } c.Sandbox = s - c.GoferPid = goferPid } else { // This is sort of confusing. For a sandbox with a root // container and a child container in it, runsc sees: @@ -254,13 +259,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox - - // Prepare the gofer to serve the container's filesystem. - err = sb.Sandbox.CreateChild(c.ID, bundleDir) - if err != nil { - c.Destroy() - return nil, err - } } c.Status = Created @@ -304,7 +302,12 @@ func (c *Container) Start(conf *boot.Config) error { return err } } else { - if err := c.Sandbox.Start(c.Spec, conf, c.ID); err != nil { + // Create the gofer process. + ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + if err != nil { + return err + } + if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { c.Destroy() return err } @@ -518,6 +521,8 @@ func (c *Container) Destroy() error { log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) } } + c.Sandbox = nil + if c.GoferPid != 0 { log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { @@ -527,9 +532,7 @@ func (c *Container) Destroy() error { } } - c.Sandbox = nil c.Status = Stopped - return nil } @@ -596,3 +599,72 @@ func (c *Container) waitForStopped() error { } return backoff.Retry(op, b) } + +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { + if conf.FileAccess == boot.FileAccessDirect { + // Don't start a gofer. The sandbox will access host FS directly. + return nil, nil + } + + if err := setupFS(spec, conf, bundleDir); err != nil { + return nil, fmt.Errorf("failed to setup mounts: %v", err) + } + + // Start with the general config flags. + args := conf.ToFlags() + args = append(args, "gofer", "--bundle", bundleDir) + if conf.Overlay { + args = append(args, "--panic-on-write=true") + } + + // Add root mount and then add any other additional mounts. + mountCount := 1 + + // Add additional mounts. + for _, m := range spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + sandEnds := make([]*os.File, 0, mountCount) + goferEnds := make([]*os.File, 0, mountCount) + + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. + nextFD := 3 + for ; nextFD-3 < mountCount; nextFD++ { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + } + + binPath, err := specutils.BinPath() + if err != nil { + return nil, err + } + cmd := exec.Command(binPath, args...) + cmd.ExtraFiles = goferEnds + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + specutils.SetUIDGIDMappings(cmd, spec) + nss := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := specutils.StartInNS(cmd, nss); err != nil { + return nil, err + } + log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + c.GoferPid = cmd.Process.Pid + return sandEnds, nil +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 6d84700ce..25aaf3f86 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1211,9 +1211,6 @@ func TestMountNewDir(t *testing.T) { if err != nil { t.Fatal("ioutil.TempDir() failed:", err) } - if err := os.Chmod(root, 0755); err != nil { - t.Fatalf("os.Chmod(%q) failed: %v", root, err) - } srcDir := path.Join(root, "src", "dir", "anotherdir") if err := os.MkdirAll(srcDir, 0755); err != nil { @@ -1747,3 +1744,7 @@ func TestGoferExits(t *testing.T) { t.Errorf("container shouldn't be running, container: %+v", c) } } + +func TestMain(m *testing.M) { + testutil.RunAsRoot(m) +} diff --git a/runsc/container/fs.go b/runsc/container/fs.go new file mode 100644 index 000000000..652f81bbf --- /dev/null +++ b/runsc/container/fs.go @@ -0,0 +1,198 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +type mapping struct { + set bool + val uint32 +} + +var optionsMap = map[string]mapping{ + "acl": {set: true, val: syscall.MS_POSIXACL}, + "async": {set: false, val: syscall.MS_SYNCHRONOUS}, + "atime": {set: false, val: syscall.MS_NOATIME}, + "bind": {set: true, val: syscall.MS_BIND}, + "defaults": {set: true, val: 0}, + "dev": {set: false, val: syscall.MS_NODEV}, + "diratime": {set: false, val: syscall.MS_NODIRATIME}, + "dirsync": {set: true, val: syscall.MS_DIRSYNC}, + "exec": {set: false, val: syscall.MS_NOEXEC}, + "iversion": {set: true, val: syscall.MS_I_VERSION}, + "loud": {set: false, val: syscall.MS_SILENT}, + "mand": {set: true, val: syscall.MS_MANDLOCK}, + "noacl": {set: false, val: syscall.MS_POSIXACL}, + "noatime": {set: true, val: syscall.MS_NOATIME}, + "nodev": {set: true, val: syscall.MS_NODEV}, + "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, + "noexec": {set: true, val: syscall.MS_NOEXEC}, + "noiversion": {set: false, val: syscall.MS_I_VERSION}, + "nomand": {set: false, val: syscall.MS_MANDLOCK}, + "norelatime": {set: false, val: syscall.MS_RELATIME}, + "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, + "nosuid": {set: true, val: syscall.MS_NOSUID}, + "private": {set: true, val: syscall.MS_PRIVATE}, + "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, + "relatime": {set: true, val: syscall.MS_RELATIME}, + "remount": {set: true, val: syscall.MS_REMOUNT}, + "ro": {set: true, val: syscall.MS_RDONLY}, + "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, + "rw": {set: false, val: syscall.MS_RDONLY}, + "silent": {set: true, val: syscall.MS_SILENT}, + "strictatime": {set: true, val: syscall.MS_STRICTATIME}, + "suid": {set: false, val: syscall.MS_NOSUID}, + "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, +} + +// setupFS creates the container directory structure under 'spec.Root.Path'. +// This allows the gofer serving the containers to be chroot under this +// directory to create an extra layer to security in case the gofer gets +// compromised. +func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { + for _, m := range spec.Mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + continue + } + src := m.Source + if !filepath.IsAbs(src) { + src = filepath.Join(bundleDir, src) + } + srcfi, err := os.Stat(src) + if err != nil { + return err + } + + // It's possible that 'm.Destination' follows symlinks inside the + // container. + dst, err := resolveSymlinks(spec.Root.Path, m.Destination) + if err != nil { + return err + } + + // Create mount point if it doesn't exits + if _, err := os.Stat(dst); os.IsNotExist(err) { + if srcfi.IsDir() { + if err := os.MkdirAll(dst, 0755); err != nil { + return err + } + } else { + if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil { + return err + } + f, err := os.OpenFile(dst, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + + flags := optionsToFlags(m.Options) + flags |= syscall.MS_BIND + log.Infof("Mounting src: %q, dst: %q, flags: %#x", src, dst, flags) + if err := syscall.Mount(src, dst, m.Type, uintptr(flags), ""); err != nil { + return err + } + } + + // Remount root as readonly after setup is done, if requested. + if spec.Root.Readonly { + log.Infof("Remounting root as readonly: %q", spec.Root.Path) + flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) + return unix.Mount(spec.Root.Path, spec.Root.Path, "bind", flags, "") + } + return nil +} + +// resolveSymlinks walks 'rel' having 'root' as the root directory. If there are +// symlinks, they are evaluated relative to 'root' to ensure the end result is +// the same as if the process was running inside the container. +func resolveSymlinks(root, rel string) (string, error) { + return resolveSymlinksImpl(root, root, rel, 255) +} + +func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { + if followCount == 0 { + return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) + } + + rel = filepath.Clean(rel) + for _, name := range strings.Split(rel, string(filepath.Separator)) { + if name == "" { + continue + } + // Note that Join() resolves things like ".." and returns a clean path. + path := filepath.Join(base, name) + if !strings.HasPrefix(path, root) { + // One cannot '..' their way out of root. + path = root + continue + } + fi, err := os.Lstat(path) + if err != nil { + if !os.IsNotExist(err) { + return "", err + } + // Not found means there is no symlink to check. Just keep walking dirs. + base = path + continue + } + if fi.Mode()&os.ModeSymlink != 0 { + link, err := os.Readlink(path) + if err != nil { + return "", err + } + if filepath.IsAbs(link) { + base = root + } + base, err = resolveSymlinksImpl(root, base, link, followCount-1) + if err != nil { + return "", err + } + continue + } + base = path + } + return base, nil +} + +func optionsToFlags(opts []string) uint32 { + var rv uint32 + for _, opt := range opts { + if m, ok := optionsMap[opt]; ok { + if m.set { + rv |= m.val + } else { + rv ^= m.val + } + } else { + log.Warningf("Ignoring mount option %q", opt) + } + } + return rv +} diff --git a/runsc/container/fs_test.go b/runsc/container/fs_test.go new file mode 100644 index 000000000..84bde18fb --- /dev/null +++ b/runsc/container/fs_test.go @@ -0,0 +1,158 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "testing" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +type dir struct { + rel string + link string +} + +func construct(root string, dirs []dir) error { + for _, d := range dirs { + p := path.Join(root, d.rel) + if d.link == "" { + if err := os.MkdirAll(p, 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + } else { + if err := os.MkdirAll(path.Dir(p), 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + if err := os.Symlink(d.link, p); err != nil { + return fmt.Errorf("error creating symlink: %v", err) + } + } + } + return nil +} + +func TestResolveSymlinks(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"dir1/dir11/dir111/dir1111", ""}, // Just a boring dir + {"dir1/lnk12", "dir11"}, // Link to sibling + {"dir1/lnk13", "./dir11"}, // Link to sibling through self + {"dir1/lnk14", "../dir1/dir11"}, // Link to sibling through parent + {"dir1/dir15/lnk151", ".."}, // Link to parent + {"dir1/lnk16", "dir11/dir111"}, // Link to child + {"dir1/lnk17", "."}, // Link to self + {"dir1/lnk18", "lnk13"}, // Link to link + {"lnk2", "dir1/lnk13"}, // Link to link to link + {"dir3/dir21/lnk211", "../.."}, // Link to root relative + {"dir3/lnk22", "/"}, // Link to root absolute + {"dir3/lnk23", "/dir1"}, // Link to dir absolute + {"dir3/lnk24", "/dir1/lnk12"}, // Link to link absolute + {"lnk5", "../../.."}, // Link outside root + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + + tests := []struct { + name string + rel string + want string + compareHost bool + }{ + {name: "root", rel: "/", want: "/", compareHost: true}, + {name: "basic dir", rel: "/dir1/dir11/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 1", rel: "/dir1/dir11/./dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 2", rel: "/dir1/././dir11/./././././dir111/.", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dotdot 1", rel: "/dir1/dir11/../dir15", want: "/dir1/dir15", compareHost: true}, + {name: "dotdot 2", rel: "/dir1/dir11/dir1111/../..", want: "/dir1", compareHost: true}, + + {name: "link sibling", rel: "/dir1/lnk12", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling + dir", rel: "/dir1/lnk12/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link sibling through self", rel: "/dir1/lnk13", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling through parent", rel: "/dir1/lnk14", want: "/dir1/dir11", compareHost: true}, + + {name: "link parent", rel: "/dir1/dir15/lnk151", want: "/dir1", compareHost: true}, + {name: "link parent + dir", rel: "/dir1/dir15/lnk151/dir11", want: "/dir1/dir11", compareHost: true}, + {name: "link child", rel: "/dir1/lnk16", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link child + dir", rel: "/dir1/lnk16/dir1111", want: "/dir1/dir11/dir111/dir1111", compareHost: true}, + {name: "link self", rel: "/dir1/lnk17", want: "/dir1", compareHost: true}, + {name: "link self + dir", rel: "/dir1/lnk17/dir11", want: "/dir1/dir11", compareHost: true}, + + {name: "link^2", rel: "/dir1/lnk18", want: "/dir1/dir11", compareHost: true}, + {name: "link^2 + dir", rel: "/dir1/lnk18/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link^3", rel: "/lnk2", want: "/dir1/dir11", compareHost: true}, + {name: "link^3 + dir", rel: "/lnk2/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + + {name: "link abs", rel: "/dir3/lnk23", want: "/dir1"}, + {name: "link abs + dir", rel: "/dir3/lnk23/dir11", want: "/dir1/dir11"}, + {name: "link^2 abs", rel: "/dir3/lnk24", want: "/dir1/dir11"}, + {name: "link^2 abs + dir", rel: "/dir3/lnk24/dir111", want: "/dir1/dir11/dir111"}, + + {name: "root link rel", rel: "/dir3/dir21/lnk211", want: "/", compareHost: true}, + {name: "root link abs", rel: "/dir3/lnk22", want: "/"}, + {name: "root contain link", rel: "/lnk5/dir1", want: "/dir1"}, + {name: "root contain dotdot", rel: "/dir1/dir11/../../../../../../../..", want: "/"}, + + {name: "crazy", rel: "/dir3/dir21/lnk211/dir3/lnk22/dir1/dir11/../../lnk5/dir3/../dir3/lnk24/dir111/dir1111/..", want: "/dir1/dir11/dir111"}, + } + for _, tst := range tests { + t.Run(tst.name, func(t *testing.T) { + got, err := resolveSymlinks(root, tst.rel) + if err != nil { + t.Errorf("resolveSymlinks(root, %q) failed: %v", tst.rel, err) + } + want := path.Join(root, tst.want) + if got != want { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, got, want) + } + if tst.compareHost { + // Check that host got to the same end result. + host, err := filepath.EvalSymlinks(path.Join(root, tst.rel)) + if err != nil { + t.Errorf("path.EvalSymlinks(root, %q) failed: %v", tst.rel, err) + } + if host != got { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, host, got) + } + } + }) + } +} + +func TestResolveSymlinksLoop(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"loop1", "loop2"}, + {"loop2", "loop1"}, + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + if _, err := resolveSymlinks(root, "loop1"); err == nil { + t.Errorf("resolveSymlinks() should have failed") + } +} diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 0bc682b5f..24e172f48 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "fsgofer", srcs = [ - "control.go", "fsgofer.go", "fsgofer_unsafe.go", ], @@ -15,12 +14,9 @@ go_library( ], deps = [ "//pkg/abi/linux", - "//pkg/control/server", "//pkg/fd", "//pkg/log", "//pkg/p9", - "//pkg/unet", - "//pkg/urpc", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go deleted file mode 100644 index 8cb2f67ac..000000000 --- a/runsc/fsgofer/control.go +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fsgofer - -import ( - "fmt" - "os" - "path/filepath" - "sync" - - "gvisor.googlesource.com/gvisor/pkg/control/server" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/pkg/urpc" -) - -// Controller manages the fsgofer's control server. -type Controller struct { - // api holds the control server's URPC endpoints. - api api - - // srv is the control server. - srv *server.Server -} - -// NewController creates a new Controller and starts it listenting -func NewController(fd int, rootBundleDir string) (*Controller, error) { - if !filepath.IsAbs(rootBundleDir) { - return nil, fmt.Errorf("NewController should receive an absolute bundle dir path, but got %q", rootBundleDir) - } - - srv, err := server.CreateFromFD(fd) - if err != nil { - return nil, err - } - - cr := &Controller{srv: srv} - cr.api.rootBundleDir = rootBundleDir - cr.api.bundleDirs = make(map[string]string) - srv.Register(&cr.api) - - if err := srv.StartServing(); err != nil { - return nil, err - } - - return cr, nil -} - -// Wait waits for all the p9 servers to finish, then shuts down the control -// server. -func (cr *Controller) Wait() { - cr.api.p9wg.Wait() - cr.srv.Stop() - log.Infof("All 9P servers exited.") -} - -// Serve starts serving each Attacher in ats via its corresponding file -// descriptor in ioFDs. This takes ownership of the FDs in ioFDs. -func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { - if len(ats) != len(ioFDs) { - return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) - } - for i, _ := range ats { - cr.api.serve(ats[i], os.NewFile(uintptr(ioFDs[i]), "io fd")) - } - return nil -} - -// api URPC methods. -const ( - // AddBundleDirs readies the gofer to serve from a new bundle - // directory. It should be called during runsc create. - AddBundleDirs = "api.AddBundleDirs" - - // ServeDirectory serves a new directory via the fsgofer. It should be - // called during runsc start. - ServeDirectory = "api.ServeDirectory" -) - -// API defines and implements the URPC endpoints for the gofer. -type api struct { - // p9wg waits for all the goroutines serving the sentry via p9. When its - // counter is 0, the gofer is out of work and exits. - p9wg sync.WaitGroup - - // bundleDirs maps from container ID to bundle directory for each - // container. - bundleDirs map[string]string - - // rootBundleDir is the bundle directory of the root container. - rootBundleDir string -} - -// AddBundleDirsRequest is the URPC argument to AddBundleDirs. -type AddBundleDirsRequest struct { - // BundleDirs is a map of container IDs to bundle directories to add to - // the gofer. - BundleDirs map[string]string -} - -// AddBundleDirsRequest adds bundle directories that for the gofer to serve. -func (api *api) AddBundleDirs(req *AddBundleDirsRequest, _ *struct{}) error { - log.Debugf("fsgofer.AddBundleDirs") - for cid, bd := range req.BundleDirs { - if _, ok := api.bundleDirs[cid]; ok { - return fmt.Errorf("fsgofer already has a bundleDir for container %q", cid) - } - api.bundleDirs[cid] = bd - } - return nil -} - -// ServeDirectoryRequest is the URPC argument to ServeDirectory. -type ServeDirectoryRequest struct { - // Dir is the absolute path to a directory to be served to the sentry. - Dir string - - // IsReadOnly specifies whether the directory should be served in - // read-only mode. - IsReadOnly bool - - // CID is the container ID of the container that needs to serve a - // directory. - CID string - - // FilePayload contains the socket over which the sentry will request - // files from Dir. - urpc.FilePayload -} - -// ServeDirectory begins serving a directory via a file descriptor for the -// sentry. Directories must be added via AddBundleDirsRequest before -// ServeDirectory is called. -func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { - log.Debugf("fsgofer.ServeDirectory: %+v", req) - - if req.Dir == "" { - return fmt.Errorf("ServeDirectory should receive a directory argument, but was empty") - } - if req.CID == "" { - return fmt.Errorf("ServeDirectory should receive a CID argument, but was empty") - } - // Prevent CIDs containing ".." from confusing the sentry when creating - // /containers/<cid> directory. - // TODO: Once we have multiple independant roots, this - // check won't be necessary. - if filepath.Clean(req.CID) != req.CID { - return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", req.CID) - } - if nFiles := len(req.FilePayload.Files); nFiles != 1 { - return fmt.Errorf("ServeDirectory should receive 1 file descriptor, but got %d", nFiles) - } - - bd, ok := api.bundleDirs[req.CID] - if !ok { - // If there's no entry in bundleDirs for the container ID, this - // is the root container. - bd = api.rootBundleDir - } - - // Relative paths are served relative to the bundle directory. - absDir := req.Dir - if !filepath.IsAbs(absDir) { - absDir = filepath.Join(bd, req.Dir) - } - - // Create the attach point and start serving. - at := NewAttachPoint(absDir, Config{ - ROMount: req.IsReadOnly, - LazyOpenForWrite: true, - }) - api.serve(at, req.FilePayload.Files[0]) - - return nil -} - -// serve begins serving a directory via a file descriptor. -func (api *api) serve(at p9.Attacher, ioFile *os.File) { - api.p9wg.Add(1) - go func() { - socket, err := unet.NewSocket(int(ioFile.Fd())) - if err != nil { - panic(fmt.Sprintf("err creating server on FD %d: %v", ioFile.Fd(), err)) - } - s := p9.NewServer(at) - if err := s.Handle(socket); err != nil { - panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFile.Fd(), err)) - } - api.p9wg.Done() - }() -} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index e9a39f797..9317b1c14 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", srcs = [ - "namespace.go", "network.go", "sandbox.go", ], @@ -21,7 +20,6 @@ go_library( "//pkg/urpc", "//runsc/boot", "//runsc/console", - "//runsc/fsgofer", "//runsc/specutils", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index d0ce6228b..8694ba755 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) const ( @@ -132,7 +133,7 @@ func createDefaultLoopbackInterface(conn *urpc.Client) error { func joinNetNS(nsPath string) (func(), error) { runtime.LockOSThread() - restoreNS, err := applyNS(specs.LinuxNamespace{ + restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ Type: specs.NetworkNamespace, Path: nsPath, }) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e54ba4ba3..f14a2f8c9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -32,7 +32,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/console" - "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -55,31 +54,20 @@ type Sandbox struct { } // Create creates the sandbox process. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, int, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} - binPath, err := specutils.BinPath() - if err != nil { - return nil, 0, err - } - - // Create the gofer process. - goferPid, ioFiles, err := s.createGoferProcess(spec, conf, bundleDir, binPath) - if err != nil { - return nil, 0, err - } - // Create the sandbox process. - if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles); err != nil { - return nil, 0, err + if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, ioFiles); err != nil { + return nil, err } // Wait for the control server to come up (or timeout). if err := s.waitForCreated(10 * time.Second); err != nil { - return nil, 0, err + return nil, err } - return s, goferPid, nil + return s, nil } // StartRoot starts running the root container process inside the sandbox. @@ -105,70 +93,29 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } -// CreateChild creates a non-root container inside the sandbox. -func (s *Sandbox) CreateChild(cid, bundleDir string) error { - log.Debugf("Create non-root container sandbox %q, pid: %d for container %q with bundle directory %q", s.ID, s.Pid, cid, bundleDir) - - // Connect to the gofer and prepare it to serve from bundleDir for this - // container. - goferConn, err := s.goferConnect() - if err != nil { - return fmt.Errorf("couldn't connect to gofer: %v", err) - } - defer goferConn.Close() - goferReq := fsgofer.AddBundleDirsRequest{BundleDirs: map[string]string{cid: bundleDir}} - if err := goferConn.Call(fsgofer.AddBundleDirs, &goferReq, nil); err != nil { - return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) +// Start starts running a non-root container inside the sandbox. +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles []*os.File) error { + for _, f := range ioFiles { + defer f.Close() } - return nil -} - -// Start starts running a non-root container inside the sandbox. -func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) - sandboxConn, err := s.sandboxConnect() if err != nil { return fmt.Errorf("couldn't connect to sandbox: %v", err) } defer sandboxConn.Close() - goferConn, err := s.goferConnect() - if err != nil { - return fmt.Errorf("couldn't connect to gofer: %v", err) - } - defer goferConn.Close() - - // Create socket that connects the sandbox and gofer. - sandEnd, goferEnd, err := createSocketPair() - if err != nil { - return err - } - defer sandEnd.Close() - defer goferEnd.Close() - - // Tell the Gofer about the new filesystem it needs to serve. - goferReq := fsgofer.ServeDirectoryRequest{ - Dir: spec.Root.Path, - IsReadOnly: spec.Root.Readonly, - CID: cid, - FilePayload: urpc.FilePayload{Files: []*os.File{goferEnd}}, - } - if err := goferConn.Call(fsgofer.ServeDirectory, &goferReq, nil); err != nil { - return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) - } // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}}, + FilePayload: urpc.FilePayload{Files: ioFiles}, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) } - return nil } @@ -275,102 +222,13 @@ func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { return conn, nil } -func (s *Sandbox) goferConnect() (*urpc.Client, error) { - log.Debugf("Connecting to gofer for sandbox %q", s.ID) - conn, err := client.ConnectTo(fsgofer.ControlSocketAddr(s.ID)) - if err != nil { - return nil, s.connError(err) - } - return conn, nil -} - func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } -func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) (int, []*os.File, error) { - if conf.FileAccess == boot.FileAccessDirect { - // Don't start a gofer. The sandbox will access host FS directly. - return 0, nil, nil - } - - // Start with the general config flags. - args := conf.ToFlags() - args = append(args, "gofer", "--bundle", bundleDir) - - // Add root mount and then add any other additional mounts. - mountCount := 1 - - // Add additional mounts. - for _, m := range spec.Mounts { - if specutils.Is9PMount(m) { - mountCount++ - } - } - sandEnds := make([]*os.File, 0, mountCount) - goferEnds := make([]*os.File, 0, mountCount) - // nextFD is the next available file descriptor for the gofer process. - // It starts at 3 because 0-2 are used by stdin/stdout/stderr. - var nextFD int - for nextFD = 3; nextFD-3 < mountCount; nextFD++ { - sandEnd, goferEnd, err := createSocketPair() - if err != nil { - return 0, nil, err - } - defer goferEnd.Close() - sandEnds = append(sandEnds, sandEnd) - goferEnds = append(goferEnds, goferEnd) - args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) - } - - // Create and donate a file descriptor for the control server. - addr := fsgofer.ControlSocketAddr(s.ID) - serverFD, err := server.CreateSocket(addr) - if err != nil { - return 0, nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) - } - - // Add the control server fd. - args = append(args, "--controller-fd="+strconv.Itoa(nextFD)) - nextFD++ - controllerFile := os.NewFile(uintptr(serverFD), "gofer_control_socket_server") - defer controllerFile.Close() - - cmd := exec.Command(binPath, args...) - cmd.ExtraFiles = goferEnds - cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) - - // Setup any uid/gid mappings, and create or join the configured user - // namespace so the gofer's view of the filesystem aligns with the - // users in the sandbox. - setUIDGIDMappings(cmd, spec) - nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) - - if conf.Overlay { - args = append(args, "--panic-on-write=true") - } - - // Start the gofer in the given namespace. - log.Debugf("Starting gofer: %s %v", binPath, args) - if err := startInNS(cmd, nss); err != nil { - return 0, nil, err - } - log.Infof("Gofer started, pid: %d", cmd.Process.Pid) - return cmd.Process.Pid, sandEnds, nil -} - -// createSocketPair creates a pair of files wrapping a socket pair. -func createSocketPair() (*os.File, *os.File, error) { - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil -} - // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -387,6 +245,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund consoleEnabled := consoleSocket != "" + binPath, err := specutils.BinPath() + if err != nil { + return err + } cmd := exec.Command(binPath, conf.ToFlags()...) cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.Args = append(cmd.Args, @@ -464,7 +326,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the // namespace. - if ns, ok := getNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { + if ns, ok := specutils.GetNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) } else { @@ -478,10 +340,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // - Gofer: when using a Gofer, the sandbox process can run isolated in an // empty namespace. if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { - if userns, ok := getNS(specs.UserNamespace, spec); ok { + if userns, ok := specutils.GetNS(specs.UserNamespace, spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) nss = append(nss, userns) - setUIDGIDMappings(cmd, spec) + specutils.SetUIDGIDMappings(cmd, spec) } else { log.Infof("Sandbox will be started in the current user namespace") } @@ -496,7 +358,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) - if err := startInNS(cmd, nss); err != nil { + if err := specutils.StartInNS(cmd, nss); err != nil { return err } s.Pid = cmd.Process.Pid diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index a22ab789a..97a504b20 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -4,7 +4,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", - srcs = ["specutils.go"], + srcs = [ + "namespace.go", + "specutils.go", + ], importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", visibility = [ "//runsc:__subpackages__", @@ -15,6 +18,7 @@ go_library( "//pkg/sentry/kernel/auth", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/sandbox/namespace.go b/runsc/specutils/namespace.go index 1d3bcfbb5..80eaad965 100644 --- a/runsc/sandbox/namespace.go +++ b/runsc/specutils/namespace.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package sandbox +package specutils import ( "fmt" @@ -74,10 +74,10 @@ func nsPath(nst specs.LinuxNamespaceType) string { } } -// getNS returns true and the namespace with the given type from the slice of +// GetNS returns true and the namespace with the given type from the slice of // namespaces in the spec. It returns false if the slice does not contain a // namespace with the type. -func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { +func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { if s.Linux == nil { return specs.LinuxNamespace{}, false } @@ -89,35 +89,35 @@ func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, b return specs.LinuxNamespace{}, false } -// filterNS returns a slice of namespaces from the spec with types that match +// FilterNS returns a slice of namespaces from the spec with types that match // those in the `filter` slice. -func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { +func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { if s.Linux == nil { return nil } var out []specs.LinuxNamespace for _, nst := range filter { - if ns, ok := getNS(nst, s); ok { + if ns, ok := GetNS(nst, s); ok { out = append(out, ns) } } return out } -// setNS sets the namespace of the given type. It must be called with +// SetNS sets the namespace of the given type. It must be called with // OSThreadLocked. -func setNS(fd, nsType uintptr) error { +func SetNS(fd, nsType uintptr) error { if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { return err } return nil } -// applyNS applies the namespace on the current thread and returns a function +// ApplyNS applies the namespace on the current thread and returns a function // that will restore the namespace to the original value. // // Preconditions: Must be called with os thread locked. -func applyNS(ns specs.LinuxNamespace) (func(), error) { +func ApplyNS(ns specs.LinuxNamespace) (func(), error) { log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) newNS, err := os.Open(ns.Path) if err != nil { @@ -134,22 +134,22 @@ func applyNS(ns specs.LinuxNamespace) (func(), error) { // Set netns to the one requested and setup function to restore it back. flag := nsCloneFlag(ns.Type) - if err := setNS(newNS.Fd(), flag); err != nil { + if err := SetNS(newNS.Fd(), flag); err != nil { oldNS.Close() return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) } return func() { log.Infof("restoring namespace %v", ns.Type) defer oldNS.Close() - if err := setNS(oldNS.Fd(), flag); err != nil { + if err := SetNS(oldNS.Fd(), flag); err != nil { panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) } }, nil } -// startInNS joins or creates the given namespaces and calls cmd.Start before +// StartInNS joins or creates the given namespaces and calls cmd.Start before // restoring the namespaces to the original values. -func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { +func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { // We are about to setup namespaces, which requires the os thread being // locked so that Go doesn't change the thread out from under us. runtime.LockOSThread() @@ -167,7 +167,7 @@ func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { } // Join the given namespace, and restore the current namespace // before exiting. - restoreNS, err := applyNS(ns) + restoreNS, err := ApplyNS(ns) if err != nil { return err } @@ -177,8 +177,8 @@ func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { return cmd.Start() } -// setUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. -func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { +// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. +func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { if s.Linux == nil { return } diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 03ab3c4ac..ca91e07ff 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -18,5 +18,6 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index b7d60e712..fc67c174a 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -32,7 +32,7 @@ func init() { rand.Seed(time.Now().UnixNano()) } -func runtime() string { +func getRuntime() string { r := os.Getenv("RUNSC_RUNTIME") if r == "" { return "runsc-test" @@ -43,7 +43,7 @@ func runtime() string { // IsPauseResumeSupported returns true if Pause/Resume is supported by runtime. func IsPauseResumeSupported() bool { // Native host network stack can't be saved. - return !strings.Contains(runtime(), "hostnet") + return !strings.Contains(getRuntime(), "hostnet") } // EnsureSupportedDockerVersion checks if correct docker is installed. @@ -128,7 +128,7 @@ type Docker struct { // Names of containers will be unique. func MakeDocker(namePrefix string) Docker { suffix := fmt.Sprintf("-%06d", rand.Int())[:7] - return Docker{Name: namePrefix + suffix, Runtime: runtime()} + return Docker{Name: namePrefix + suffix, Runtime: getRuntime()} } // Create calls 'docker create' with the arguments provided. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index fc3d61e52..e90ab5ad5 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -23,11 +23,16 @@ import ( "io/ioutil" "net/http" "os" + "os/exec" "path/filepath" + "runtime" + "syscall" + "testing" "time" "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -227,3 +232,55 @@ func WaitForHTTP(port int, timeout time.Duration) error { } return Poll(cb, timeout) } + +// RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create +// a new user namespace and reexecute the test as root inside of the namespace. +func RunAsRoot(m *testing.M) { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + panic(err.Error()) + } + if err := caps.Load(); err != nil { + panic(err.Error()) + } + if caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) { + // Capability: check! Good to run. + os.Exit(m.Run()) + } + + // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run + // as root inside that namespace to get it. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + cmd := exec.Command("/proc/self/exe", os.Args...) + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, + // Set current user/group as root inside the namespace. + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + GidMappingsEnableSetgroups: false, + Credential: &syscall.Credential{ + Uid: 0, + Gid: 0, + }, + } + cmd.Env = os.Environ() + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + if exit, ok := err.(*exec.ExitError); ok { + if ws, ok := exit.Sys().(syscall.WaitStatus); ok { + os.Exit(ws.ExitStatus()) + } + os.Exit(-1) + } + panic(fmt.Sprint("error running child process:", err.Error())) + } + os.Exit(0) +} |