diff options
author | Fabricio Voznika <fvoznika@google.com> | 2018-08-27 11:09:06 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-08-27 11:10:14 -0700 |
commit | db81c0b02f2f947ae837a3e16471a148a66436eb (patch) | |
tree | d91ef12da80b0a76ef1c69db290665e31cc59860 | |
parent | 2524111fc63343fd7372f5ea0266130adea778a5 (diff) |
Put fsgofer inside chroot
Now each container gets its own dedicated gofer that is chroot'd to the
rootfs path. This is done to add an extra layer of security in case the
gofer gets compromised.
PiperOrigin-RevId: 210396476
Change-Id: Iba21360a59dfe90875d61000db103f8609157ca0
-rw-r--r-- | runsc/boot/controller.go | 6 | ||||
-rw-r--r-- | runsc/boot/fs.go | 6 | ||||
-rw-r--r-- | runsc/boot/loader.go | 20 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 106 | ||||
-rw-r--r-- | runsc/cmd/BUILD | 1 | ||||
-rw-r--r-- | runsc/cmd/gofer.go | 84 | ||||
-rw-r--r-- | runsc/cmd/state.go | 5 | ||||
-rw-r--r-- | runsc/container/BUILD | 7 | ||||
-rw-r--r-- | runsc/container/container.go | 96 | ||||
-rw-r--r-- | runsc/container/container_test.go | 7 | ||||
-rw-r--r-- | runsc/container/fs.go | 198 | ||||
-rw-r--r-- | runsc/container/fs_test.go | 158 | ||||
-rw-r--r-- | runsc/fsgofer/BUILD | 4 | ||||
-rw-r--r-- | runsc/fsgofer/control.go | 204 | ||||
-rw-r--r-- | runsc/sandbox/BUILD | 2 | ||||
-rw-r--r-- | runsc/sandbox/network.go | 3 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 176 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 6 | ||||
-rw-r--r-- | runsc/specutils/namespace.go (renamed from runsc/sandbox/namespace.go) | 34 | ||||
-rw-r--r-- | runsc/test/testutil/BUILD | 1 | ||||
-rw-r--r-- | runsc/test/testutil/docker.go | 6 | ||||
-rw-r--r-- | runsc/test/testutil/testutil.go | 57 |
22 files changed, 671 insertions, 516 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 2d6b507b3..fdb6be5b1 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -212,11 +212,11 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if path.Clean(args.CID) != args.CID { return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) } - if len(args.FilePayload.Files) != 1 { - return fmt.Errorf("start arguments must contain one file for the container root") + if len(args.FilePayload.Files) == 0 { + return fmt.Errorf("start arguments must contain at least one file for the container root") } - tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 6f5379a6d..20d0e42ef 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -510,8 +510,6 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) MountSources: make(map[string][]fs.MountArgs), } - mounts := compileMounts(spec) - // Add root mount. fd := fds.remove() opts := p9MountOptions(conf, fd) @@ -528,8 +526,8 @@ func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) } renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) - // Add submounts - for _, m := range mounts { + // Add submounts. + for _, m := range compileMounts(spec) { if err := addRestoreMount(conf, renv, m, fds); err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0e94cf215..3963ed55d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -23,6 +23,7 @@ import ( "runtime" "sync" "sync/atomic" + "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -377,7 +378,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -414,11 +415,23 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } + + // Can't take ownership away from os.File. dup them to get a new FDs. + var ioFDs []int + for _, f := range files { + fd, err := syscall.Dup(int(f.Fd())) + if err != nil { + return 0, fmt.Errorf("failed to dup file: %v", err) + } + f.Close() + ioFDs = append(ioFDs, fd) + } + err = setFileSystemForProcess( &procArgs, spec, conf, - []int{int(file.Fd())}, // ioFDs + ioFDs, false, creds, procArgs.Limits, @@ -453,8 +466,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return tgid, nil } -// TODO: Per-container namespaces must be supported -// for -pid. +// TODO: Per-container namespaces must be supported for -pid. // waitContainer waits for the root process of a container to exit. func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index f2f690b5d..2396d52c8 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "io/ioutil" "math/rand" "os" "reflect" @@ -36,6 +35,15 @@ func init() { rand.Seed(time.Now().UnixNano()) } +func testConfig() *Config { + return &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessDirect, + DisableSeccomp: true, + } +} + // testSpec returns a simple spec that can be used in tests. func testSpec() *specs.Spec { return &specs.Spec{ @@ -55,12 +63,7 @@ func createLoader() (*Loader, error) { if err != nil { return nil, err } - conf := &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - } + conf := testConfig() spec := testSpec() return New(spec, conf, fd, nil, false) } @@ -152,18 +155,6 @@ func TestStartSignal(t *testing.T) { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespace(t *testing.T) { - conf := &Config{ - RootDir: "unused_root_dir", - FileAccess: FileAccessDirect, - DisableSeccomp: true, - } - - testFile, err := ioutil.TempFile(os.TempDir(), "create-mount-namespace-") - if err != nil { - t.Fatalf("ioutil.TempFile() failed, err: %v", err) - } - defer os.RemoveAll(testFile.Name()) - testCases := []struct { name string // Spec that will be used to create the mount manager. Note @@ -234,8 +225,7 @@ func TestCreateMountNamespace(t *testing.T) { }, { Destination: "/foo/qux", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, { // File mounts with the same prefix. @@ -284,8 +274,7 @@ func TestCreateMountNamespace(t *testing.T) { { // Mount with the same prefix. Destination: "/dev/fd-foo", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, { // Unsupported fs type. @@ -298,8 +287,7 @@ func TestCreateMountNamespace(t *testing.T) { }, { Destination: "/dev/bar", - Source: testFile.Name(), - Type: "bind", + Type: "tmpfs", }, }, }, @@ -339,19 +327,22 @@ func TestCreateMountNamespace(t *testing.T) { } for _, tc := range testCases { - ctx := contexttest.Context(t) - mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) - if err != nil { - t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) - } - defer mm.DecRef() - root := mm.Root() - defer root.DecRef() - for _, p := range tc.expectedPaths { - if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { - t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + ctx := contexttest.Context(t) + mm, err := createMountNamespace(ctx, ctx, &tc.spec, conf, nil) + if err != nil { + t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } - } + defer mm.DecRef() + root := mm.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } + } + }) } } @@ -361,7 +352,7 @@ func TestRestoreEnvironment(t *testing.T) { testCases := []struct { name string spec *specs.Spec - conf *Config + fileAccess FileAccessType ioFDs []int errorExpected bool expectedRenv fs.RestoreEnvironment @@ -384,12 +375,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -444,12 +430,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0, 1}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -508,12 +489,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessProxy, - DisableSeccomp: true, - }, + fileAccess: FileAccessProxy, ioFDs: []int{0}, errorExpected: false, expectedRenv: fs.RestoreEnvironment{ @@ -572,12 +548,7 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - }, + fileAccess: FileAccessDirect, ioFDs: []int{0, 1}, errorExpected: true, }, @@ -596,20 +567,17 @@ func TestRestoreEnvironment(t *testing.T) { }, }, }, - conf: &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - FileAccess: FileAccessDirect, - DisableSeccomp: true, - }, + fileAccess: FileAccessDirect, ioFDs: []int{0}, errorExpected: true, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + conf.FileAccess = tc.fileAccess fds := &fdDispenser{fds: tc.ioFDs} - actualRenv, err := createRestoreEnvironment(tc.spec, tc.conf, fds) + actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) } else if tc.errorExpected { diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index b9ef4022f..5dee26a5c 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/p9", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", + "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/console", diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index e23f64d12..ab76734fc 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -16,6 +16,8 @@ package cmd import ( "os" + "path" + "sync" "syscall" "context" @@ -24,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -35,10 +38,6 @@ type Gofer struct { ioFDs intFlags applyCaps bool - // controllerFD is the file descriptor of a stream socket for the - // control server that is donated to this process. - controllerFD int - panicOnWrite bool } @@ -62,26 +61,16 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") - f.IntVar(&g.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if g.bundleDir == "" || len(g.ioFDs) < 1 || g.controllerFD == -1 { + if g.bundleDir == "" || len(g.ioFDs) < 1 { f.Usage() return subcommands.ExitUsageError } - // fsgofer should run with a umask of 0, because we want to preserve file - // modes exactly as sent by the sandbox, which will have applied its own umask. - syscall.Umask(0) - - spec, err := specutils.ReadSpec(g.bundleDir) - if err != nil { - Fatalf("error reading spec: %v", err) - } - if g.applyCaps { // Minimal set of capabilities needed by the Gofer to operate on files. caps := []string{ @@ -107,49 +96,84 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) panic("unreachable") } + spec, err := specutils.ReadSpec(g.bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } specutils.LogSpec(spec) - // Start with root mount, then add any other addition mount as needed. + // fsgofer should run with a umask of 0, because we want to preserve file + // modes exactly as sent by the sandbox, which will have applied its own umask. + syscall.Umask(0) + + // Find what path is going to be served by this gofer. + root := absPath(g.bundleDir, spec.Root.Path) + if err := syscall.Chroot(root); err != nil { + Fatalf("failed to chroot to %q: %v", root, err) + } + if err := syscall.Chdir("/"); err != nil { + Fatalf("failed to change working dir: %v", err) + } + log.Infof("Process chroot'd to %q", root) + + // Start with root mount, then add any other additional mount as needed. ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) - p := absPath(g.bundleDir, spec.Root.Path) - ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + ats = append(ats, fsgofer.NewAttachPoint("/", fsgofer.Config{ ROMount: spec.Root.Readonly, PanicOnWrite: g.panicOnWrite, // Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when // each file is opened as writable. Thus, we open files lazily to avoid copy-up. LazyOpenForWrite: true, })) - log.Infof("Serving %q mapped to %q on FD %d", "/", p, g.ioFDs[0]) + log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly) mountIdx := 1 // first one is the root for _, m := range spec.Mounts { if specutils.Is9PMount(m) { - p = absPath(g.bundleDir, m.Source) - ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + if !path.IsAbs(m.Destination) { + Fatalf("destination must be absolute path: %v", m.Destination) + } + cfg := fsgofer.Config{ ROMount: isReadonlyMount(m.Options), PanicOnWrite: g.panicOnWrite, LazyOpenForWrite: false, - })) + } + ats = append(ats, fsgofer.NewAttachPoint(m.Destination, cfg)) if mountIdx >= len(g.ioFDs) { Fatalf("No FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) } - log.Infof("Serving %q mapped to %q on FD %d", m.Destination, p, g.ioFDs[mountIdx]) + log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount) mountIdx++ } } if mountIdx != len(g.ioFDs) { - Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) + Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } - ctrl, err := fsgofer.NewController(g.controllerFD, g.bundleDir) + runServers(ats, g.ioFDs) + return subcommands.ExitSuccess +} - if err := ctrl.Serve(ats, g.ioFDs); err != nil { - Fatalf("Failed to serve via P9: %v", err) +func runServers(ats []p9.Attacher, ioFDs []int) { + // Run the loops and wait for all to exit. + var wg sync.WaitGroup + for i, ioFD := range ioFDs { + wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + Fatalf("err creating server on FD %d: %v", ioFD, err) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err) + } + wg.Done() + }(ioFD, ats[i]) } - ctrl.Wait() - - return subcommands.ExitSuccess + wg.Wait() + log.Infof("All 9P servers exited.") } func isReadonlyMount(opts []string) bool { diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index 28752d95e..265014e1b 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -63,8 +63,11 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } log.Debugf("Returning state for container %+v", c) + state := c.State() + log.Debugf("State: %+v", state) + // Write json-encoded state directly to stdout. - b, err := json.MarshalIndent(c.State(), "", " ") + b, err := json.MarshalIndent(state, "", " ") if err != nil { Fatalf("error marshaling container state: %v", err) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index e40ca4709..cba418d0c 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -13,6 +13,7 @@ go_library( name = "container", srcs = [ "container.go", + "fs.go", "hook.go", "status.go", ], @@ -28,13 +29,17 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) go_test( name = "container_test", size = "medium", - srcs = ["container_test.go"], + srcs = [ + "container_test.go", + "fs_test.go", + ], data = [ ":uds_test_app", "//runsc", diff --git a/runsc/container/container.go b/runsc/container/container.go index 8bd47aac1..16af66d3e 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -21,6 +21,7 @@ import ( "fmt" "io/ioutil" "os" + "os/exec" "path/filepath" "regexp" "strconv" @@ -223,15 +224,19 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo // init container in the sandbox. if specutils.ShouldCreateSandbox(spec) || !conf.MultiContainer { log.Debugf("Creating new sandbox for container %q", id) + ioFiles, err := c.createGoferProcess(spec, conf, bundleDir) + if err != nil { + return nil, err + } + // Start a new sandbox for this container. Any errors after this point // must destroy the container. - s, goferPid, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket) + s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket, ioFiles) if err != nil { c.Destroy() return nil, err } c.Sandbox = s - c.GoferPid = goferPid } else { // This is sort of confusing. For a sandbox with a root // container and a child container in it, runsc sees: @@ -254,13 +259,6 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo return nil, err } c.Sandbox = sb.Sandbox - - // Prepare the gofer to serve the container's filesystem. - err = sb.Sandbox.CreateChild(c.ID, bundleDir) - if err != nil { - c.Destroy() - return nil, err - } } c.Status = Created @@ -304,7 +302,12 @@ func (c *Container) Start(conf *boot.Config) error { return err } } else { - if err := c.Sandbox.Start(c.Spec, conf, c.ID); err != nil { + // Create the gofer process. + ioFiles, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + if err != nil { + return err + } + if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { c.Destroy() return err } @@ -518,6 +521,8 @@ func (c *Container) Destroy() error { log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) } } + c.Sandbox = nil + if c.GoferPid != 0 { log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { @@ -527,9 +532,7 @@ func (c *Container) Destroy() error { } } - c.Sandbox = nil c.Status = Stopped - return nil } @@ -596,3 +599,72 @@ func (c *Container) waitForStopped() error { } return backoff.Retry(op, b) } + +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, error) { + if conf.FileAccess == boot.FileAccessDirect { + // Don't start a gofer. The sandbox will access host FS directly. + return nil, nil + } + + if err := setupFS(spec, conf, bundleDir); err != nil { + return nil, fmt.Errorf("failed to setup mounts: %v", err) + } + + // Start with the general config flags. + args := conf.ToFlags() + args = append(args, "gofer", "--bundle", bundleDir) + if conf.Overlay { + args = append(args, "--panic-on-write=true") + } + + // Add root mount and then add any other additional mounts. + mountCount := 1 + + // Add additional mounts. + for _, m := range spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + sandEnds := make([]*os.File, 0, mountCount) + goferEnds := make([]*os.File, 0, mountCount) + + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. + nextFD := 3 + for ; nextFD-3 < mountCount; nextFD++ { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + } + + binPath, err := specutils.BinPath() + if err != nil { + return nil, err + } + cmd := exec.Command(binPath, args...) + cmd.ExtraFiles = goferEnds + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + specutils.SetUIDGIDMappings(cmd, spec) + nss := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := specutils.StartInNS(cmd, nss); err != nil { + return nil, err + } + log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + c.GoferPid = cmd.Process.Pid + return sandEnds, nil +} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 6d84700ce..25aaf3f86 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1211,9 +1211,6 @@ func TestMountNewDir(t *testing.T) { if err != nil { t.Fatal("ioutil.TempDir() failed:", err) } - if err := os.Chmod(root, 0755); err != nil { - t.Fatalf("os.Chmod(%q) failed: %v", root, err) - } srcDir := path.Join(root, "src", "dir", "anotherdir") if err := os.MkdirAll(srcDir, 0755); err != nil { @@ -1747,3 +1744,7 @@ func TestGoferExits(t *testing.T) { t.Errorf("container shouldn't be running, container: %+v", c) } } + +func TestMain(m *testing.M) { + testutil.RunAsRoot(m) +} diff --git a/runsc/container/fs.go b/runsc/container/fs.go new file mode 100644 index 000000000..652f81bbf --- /dev/null +++ b/runsc/container/fs.go @@ -0,0 +1,198 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +type mapping struct { + set bool + val uint32 +} + +var optionsMap = map[string]mapping{ + "acl": {set: true, val: syscall.MS_POSIXACL}, + "async": {set: false, val: syscall.MS_SYNCHRONOUS}, + "atime": {set: false, val: syscall.MS_NOATIME}, + "bind": {set: true, val: syscall.MS_BIND}, + "defaults": {set: true, val: 0}, + "dev": {set: false, val: syscall.MS_NODEV}, + "diratime": {set: false, val: syscall.MS_NODIRATIME}, + "dirsync": {set: true, val: syscall.MS_DIRSYNC}, + "exec": {set: false, val: syscall.MS_NOEXEC}, + "iversion": {set: true, val: syscall.MS_I_VERSION}, + "loud": {set: false, val: syscall.MS_SILENT}, + "mand": {set: true, val: syscall.MS_MANDLOCK}, + "noacl": {set: false, val: syscall.MS_POSIXACL}, + "noatime": {set: true, val: syscall.MS_NOATIME}, + "nodev": {set: true, val: syscall.MS_NODEV}, + "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, + "noexec": {set: true, val: syscall.MS_NOEXEC}, + "noiversion": {set: false, val: syscall.MS_I_VERSION}, + "nomand": {set: false, val: syscall.MS_MANDLOCK}, + "norelatime": {set: false, val: syscall.MS_RELATIME}, + "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, + "nosuid": {set: true, val: syscall.MS_NOSUID}, + "private": {set: true, val: syscall.MS_PRIVATE}, + "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, + "relatime": {set: true, val: syscall.MS_RELATIME}, + "remount": {set: true, val: syscall.MS_REMOUNT}, + "ro": {set: true, val: syscall.MS_RDONLY}, + "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, + "rw": {set: false, val: syscall.MS_RDONLY}, + "silent": {set: true, val: syscall.MS_SILENT}, + "strictatime": {set: true, val: syscall.MS_STRICTATIME}, + "suid": {set: false, val: syscall.MS_NOSUID}, + "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, +} + +// setupFS creates the container directory structure under 'spec.Root.Path'. +// This allows the gofer serving the containers to be chroot under this +// directory to create an extra layer to security in case the gofer gets +// compromised. +func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) error { + for _, m := range spec.Mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + continue + } + src := m.Source + if !filepath.IsAbs(src) { + src = filepath.Join(bundleDir, src) + } + srcfi, err := os.Stat(src) + if err != nil { + return err + } + + // It's possible that 'm.Destination' follows symlinks inside the + // container. + dst, err := resolveSymlinks(spec.Root.Path, m.Destination) + if err != nil { + return err + } + + // Create mount point if it doesn't exits + if _, err := os.Stat(dst); os.IsNotExist(err) { + if srcfi.IsDir() { + if err := os.MkdirAll(dst, 0755); err != nil { + return err + } + } else { + if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil { + return err + } + f, err := os.OpenFile(dst, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + + flags := optionsToFlags(m.Options) + flags |= syscall.MS_BIND + log.Infof("Mounting src: %q, dst: %q, flags: %#x", src, dst, flags) + if err := syscall.Mount(src, dst, m.Type, uintptr(flags), ""); err != nil { + return err + } + } + + // Remount root as readonly after setup is done, if requested. + if spec.Root.Readonly { + log.Infof("Remounting root as readonly: %q", spec.Root.Path) + flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) + return unix.Mount(spec.Root.Path, spec.Root.Path, "bind", flags, "") + } + return nil +} + +// resolveSymlinks walks 'rel' having 'root' as the root directory. If there are +// symlinks, they are evaluated relative to 'root' to ensure the end result is +// the same as if the process was running inside the container. +func resolveSymlinks(root, rel string) (string, error) { + return resolveSymlinksImpl(root, root, rel, 255) +} + +func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { + if followCount == 0 { + return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) + } + + rel = filepath.Clean(rel) + for _, name := range strings.Split(rel, string(filepath.Separator)) { + if name == "" { + continue + } + // Note that Join() resolves things like ".." and returns a clean path. + path := filepath.Join(base, name) + if !strings.HasPrefix(path, root) { + // One cannot '..' their way out of root. + path = root + continue + } + fi, err := os.Lstat(path) + if err != nil { + if !os.IsNotExist(err) { + return "", err + } + // Not found means there is no symlink to check. Just keep walking dirs. + base = path + continue + } + if fi.Mode()&os.ModeSymlink != 0 { + link, err := os.Readlink(path) + if err != nil { + return "", err + } + if filepath.IsAbs(link) { + base = root + } + base, err = resolveSymlinksImpl(root, base, link, followCount-1) + if err != nil { + return "", err + } + continue + } + base = path + } + return base, nil +} + +func optionsToFlags(opts []string) uint32 { + var rv uint32 + for _, opt := range opts { + if m, ok := optionsMap[opt]; ok { + if m.set { + rv |= m.val + } else { + rv ^= m.val + } + } else { + log.Warningf("Ignoring mount option %q", opt) + } + } + return rv +} diff --git a/runsc/container/fs_test.go b/runsc/container/fs_test.go new file mode 100644 index 000000000..84bde18fb --- /dev/null +++ b/runsc/container/fs_test.go @@ -0,0 +1,158 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "testing" + + "gvisor.googlesource.com/gvisor/runsc/test/testutil" +) + +type dir struct { + rel string + link string +} + +func construct(root string, dirs []dir) error { + for _, d := range dirs { + p := path.Join(root, d.rel) + if d.link == "" { + if err := os.MkdirAll(p, 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + } else { + if err := os.MkdirAll(path.Dir(p), 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + if err := os.Symlink(d.link, p); err != nil { + return fmt.Errorf("error creating symlink: %v", err) + } + } + } + return nil +} + +func TestResolveSymlinks(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"dir1/dir11/dir111/dir1111", ""}, // Just a boring dir + {"dir1/lnk12", "dir11"}, // Link to sibling + {"dir1/lnk13", "./dir11"}, // Link to sibling through self + {"dir1/lnk14", "../dir1/dir11"}, // Link to sibling through parent + {"dir1/dir15/lnk151", ".."}, // Link to parent + {"dir1/lnk16", "dir11/dir111"}, // Link to child + {"dir1/lnk17", "."}, // Link to self + {"dir1/lnk18", "lnk13"}, // Link to link + {"lnk2", "dir1/lnk13"}, // Link to link to link + {"dir3/dir21/lnk211", "../.."}, // Link to root relative + {"dir3/lnk22", "/"}, // Link to root absolute + {"dir3/lnk23", "/dir1"}, // Link to dir absolute + {"dir3/lnk24", "/dir1/lnk12"}, // Link to link absolute + {"lnk5", "../../.."}, // Link outside root + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + + tests := []struct { + name string + rel string + want string + compareHost bool + }{ + {name: "root", rel: "/", want: "/", compareHost: true}, + {name: "basic dir", rel: "/dir1/dir11/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 1", rel: "/dir1/dir11/./dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 2", rel: "/dir1/././dir11/./././././dir111/.", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dotdot 1", rel: "/dir1/dir11/../dir15", want: "/dir1/dir15", compareHost: true}, + {name: "dotdot 2", rel: "/dir1/dir11/dir1111/../..", want: "/dir1", compareHost: true}, + + {name: "link sibling", rel: "/dir1/lnk12", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling + dir", rel: "/dir1/lnk12/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link sibling through self", rel: "/dir1/lnk13", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling through parent", rel: "/dir1/lnk14", want: "/dir1/dir11", compareHost: true}, + + {name: "link parent", rel: "/dir1/dir15/lnk151", want: "/dir1", compareHost: true}, + {name: "link parent + dir", rel: "/dir1/dir15/lnk151/dir11", want: "/dir1/dir11", compareHost: true}, + {name: "link child", rel: "/dir1/lnk16", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link child + dir", rel: "/dir1/lnk16/dir1111", want: "/dir1/dir11/dir111/dir1111", compareHost: true}, + {name: "link self", rel: "/dir1/lnk17", want: "/dir1", compareHost: true}, + {name: "link self + dir", rel: "/dir1/lnk17/dir11", want: "/dir1/dir11", compareHost: true}, + + {name: "link^2", rel: "/dir1/lnk18", want: "/dir1/dir11", compareHost: true}, + {name: "link^2 + dir", rel: "/dir1/lnk18/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link^3", rel: "/lnk2", want: "/dir1/dir11", compareHost: true}, + {name: "link^3 + dir", rel: "/lnk2/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + + {name: "link abs", rel: "/dir3/lnk23", want: "/dir1"}, + {name: "link abs + dir", rel: "/dir3/lnk23/dir11", want: "/dir1/dir11"}, + {name: "link^2 abs", rel: "/dir3/lnk24", want: "/dir1/dir11"}, + {name: "link^2 abs + dir", rel: "/dir3/lnk24/dir111", want: "/dir1/dir11/dir111"}, + + {name: "root link rel", rel: "/dir3/dir21/lnk211", want: "/", compareHost: true}, + {name: "root link abs", rel: "/dir3/lnk22", want: "/"}, + {name: "root contain link", rel: "/lnk5/dir1", want: "/dir1"}, + {name: "root contain dotdot", rel: "/dir1/dir11/../../../../../../../..", want: "/"}, + + {name: "crazy", rel: "/dir3/dir21/lnk211/dir3/lnk22/dir1/dir11/../../lnk5/dir3/../dir3/lnk24/dir111/dir1111/..", want: "/dir1/dir11/dir111"}, + } + for _, tst := range tests { + t.Run(tst.name, func(t *testing.T) { + got, err := resolveSymlinks(root, tst.rel) + if err != nil { + t.Errorf("resolveSymlinks(root, %q) failed: %v", tst.rel, err) + } + want := path.Join(root, tst.want) + if got != want { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, got, want) + } + if tst.compareHost { + // Check that host got to the same end result. + host, err := filepath.EvalSymlinks(path.Join(root, tst.rel)) + if err != nil { + t.Errorf("path.EvalSymlinks(root, %q) failed: %v", tst.rel, err) + } + if host != got { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, host, got) + } + } + }) + } +} + +func TestResolveSymlinksLoop(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"loop1", "loop2"}, + {"loop2", "loop1"}, + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + if _, err := resolveSymlinks(root, "loop1"); err == nil { + t.Errorf("resolveSymlinks() should have failed") + } +} diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 0bc682b5f..24e172f48 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "fsgofer", srcs = [ - "control.go", "fsgofer.go", "fsgofer_unsafe.go", ], @@ -15,12 +14,9 @@ go_library( ], deps = [ "//pkg/abi/linux", - "//pkg/control/server", "//pkg/fd", "//pkg/log", "//pkg/p9", - "//pkg/unet", - "//pkg/urpc", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/control.go b/runsc/fsgofer/control.go deleted file mode 100644 index 8cb2f67ac..000000000 --- a/runsc/fsgofer/control.go +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fsgofer - -import ( - "fmt" - "os" - "path/filepath" - "sync" - - "gvisor.googlesource.com/gvisor/pkg/control/server" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/pkg/urpc" -) - -// Controller manages the fsgofer's control server. -type Controller struct { - // api holds the control server's URPC endpoints. - api api - - // srv is the control server. - srv *server.Server -} - -// NewController creates a new Controller and starts it listenting -func NewController(fd int, rootBundleDir string) (*Controller, error) { - if !filepath.IsAbs(rootBundleDir) { - return nil, fmt.Errorf("NewController should receive an absolute bundle dir path, but got %q", rootBundleDir) - } - - srv, err := server.CreateFromFD(fd) - if err != nil { - return nil, err - } - - cr := &Controller{srv: srv} - cr.api.rootBundleDir = rootBundleDir - cr.api.bundleDirs = make(map[string]string) - srv.Register(&cr.api) - - if err := srv.StartServing(); err != nil { - return nil, err - } - - return cr, nil -} - -// Wait waits for all the p9 servers to finish, then shuts down the control -// server. -func (cr *Controller) Wait() { - cr.api.p9wg.Wait() - cr.srv.Stop() - log.Infof("All 9P servers exited.") -} - -// Serve starts serving each Attacher in ats via its corresponding file -// descriptor in ioFDs. This takes ownership of the FDs in ioFDs. -func (cr *Controller) Serve(ats []p9.Attacher, ioFDs []int) error { - if len(ats) != len(ioFDs) { - return fmt.Errorf("number of attach points does not match the number of IO FDs (%d and %d)", len(ats), len(ioFDs)) - } - for i, _ := range ats { - cr.api.serve(ats[i], os.NewFile(uintptr(ioFDs[i]), "io fd")) - } - return nil -} - -// api URPC methods. -const ( - // AddBundleDirs readies the gofer to serve from a new bundle - // directory. It should be called during runsc create. - AddBundleDirs = "api.AddBundleDirs" - - // ServeDirectory serves a new directory via the fsgofer. It should be - // called during runsc start. - ServeDirectory = "api.ServeDirectory" -) - -// API defines and implements the URPC endpoints for the gofer. -type api struct { - // p9wg waits for all the goroutines serving the sentry via p9. When its - // counter is 0, the gofer is out of work and exits. - p9wg sync.WaitGroup - - // bundleDirs maps from container ID to bundle directory for each - // container. - bundleDirs map[string]string - - // rootBundleDir is the bundle directory of the root container. - rootBundleDir string -} - -// AddBundleDirsRequest is the URPC argument to AddBundleDirs. -type AddBundleDirsRequest struct { - // BundleDirs is a map of container IDs to bundle directories to add to - // the gofer. - BundleDirs map[string]string -} - -// AddBundleDirsRequest adds bundle directories that for the gofer to serve. -func (api *api) AddBundleDirs(req *AddBundleDirsRequest, _ *struct{}) error { - log.Debugf("fsgofer.AddBundleDirs") - for cid, bd := range req.BundleDirs { - if _, ok := api.bundleDirs[cid]; ok { - return fmt.Errorf("fsgofer already has a bundleDir for container %q", cid) - } - api.bundleDirs[cid] = bd - } - return nil -} - -// ServeDirectoryRequest is the URPC argument to ServeDirectory. -type ServeDirectoryRequest struct { - // Dir is the absolute path to a directory to be served to the sentry. - Dir string - - // IsReadOnly specifies whether the directory should be served in - // read-only mode. - IsReadOnly bool - - // CID is the container ID of the container that needs to serve a - // directory. - CID string - - // FilePayload contains the socket over which the sentry will request - // files from Dir. - urpc.FilePayload -} - -// ServeDirectory begins serving a directory via a file descriptor for the -// sentry. Directories must be added via AddBundleDirsRequest before -// ServeDirectory is called. -func (api *api) ServeDirectory(req *ServeDirectoryRequest, _ *struct{}) error { - log.Debugf("fsgofer.ServeDirectory: %+v", req) - - if req.Dir == "" { - return fmt.Errorf("ServeDirectory should receive a directory argument, but was empty") - } - if req.CID == "" { - return fmt.Errorf("ServeDirectory should receive a CID argument, but was empty") - } - // Prevent CIDs containing ".." from confusing the sentry when creating - // /containers/<cid> directory. - // TODO: Once we have multiple independant roots, this - // check won't be necessary. - if filepath.Clean(req.CID) != req.CID { - return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", req.CID) - } - if nFiles := len(req.FilePayload.Files); nFiles != 1 { - return fmt.Errorf("ServeDirectory should receive 1 file descriptor, but got %d", nFiles) - } - - bd, ok := api.bundleDirs[req.CID] - if !ok { - // If there's no entry in bundleDirs for the container ID, this - // is the root container. - bd = api.rootBundleDir - } - - // Relative paths are served relative to the bundle directory. - absDir := req.Dir - if !filepath.IsAbs(absDir) { - absDir = filepath.Join(bd, req.Dir) - } - - // Create the attach point and start serving. - at := NewAttachPoint(absDir, Config{ - ROMount: req.IsReadOnly, - LazyOpenForWrite: true, - }) - api.serve(at, req.FilePayload.Files[0]) - - return nil -} - -// serve begins serving a directory via a file descriptor. -func (api *api) serve(at p9.Attacher, ioFile *os.File) { - api.p9wg.Add(1) - go func() { - socket, err := unet.NewSocket(int(ioFile.Fd())) - if err != nil { - panic(fmt.Sprintf("err creating server on FD %d: %v", ioFile.Fd(), err)) - } - s := p9.NewServer(at) - if err := s.Handle(socket); err != nil { - panic(fmt.Sprintf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFile.Fd(), err)) - } - api.p9wg.Done() - }() -} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index e9a39f797..9317b1c14 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -5,7 +5,6 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "sandbox", srcs = [ - "namespace.go", "network.go", "sandbox.go", ], @@ -21,7 +20,6 @@ go_library( "//pkg/urpc", "//runsc/boot", "//runsc/console", - "//runsc/fsgofer", "//runsc/specutils", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index d0ce6228b..8694ba755 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -29,6 +29,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" ) const ( @@ -132,7 +133,7 @@ func createDefaultLoopbackInterface(conn *urpc.Client) error { func joinNetNS(nsPath string) (func(), error) { runtime.LockOSThread() - restoreNS, err := applyNS(specs.LinuxNamespace{ + restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ Type: specs.NetworkNamespace, Path: nsPath, }) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index e54ba4ba3..f14a2f8c9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -32,7 +32,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/urpc" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/console" - "gvisor.googlesource.com/gvisor/runsc/fsgofer" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -55,31 +54,20 @@ type Sandbox struct { } // Create creates the sandbox process. -func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string) (*Sandbox, int, error) { +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} - binPath, err := specutils.BinPath() - if err != nil { - return nil, 0, err - } - - // Create the gofer process. - goferPid, ioFiles, err := s.createGoferProcess(spec, conf, bundleDir, binPath) - if err != nil { - return nil, 0, err - } - // Create the sandbox process. - if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, binPath, ioFiles); err != nil { - return nil, 0, err + if err := s.createSandboxProcess(spec, conf, bundleDir, consoleSocket, ioFiles); err != nil { + return nil, err } // Wait for the control server to come up (or timeout). if err := s.waitForCreated(10 * time.Second); err != nil { - return nil, 0, err + return nil, err } - return s, goferPid, nil + return s, nil } // StartRoot starts running the root container process inside the sandbox. @@ -105,70 +93,29 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { return nil } -// CreateChild creates a non-root container inside the sandbox. -func (s *Sandbox) CreateChild(cid, bundleDir string) error { - log.Debugf("Create non-root container sandbox %q, pid: %d for container %q with bundle directory %q", s.ID, s.Pid, cid, bundleDir) - - // Connect to the gofer and prepare it to serve from bundleDir for this - // container. - goferConn, err := s.goferConnect() - if err != nil { - return fmt.Errorf("couldn't connect to gofer: %v", err) - } - defer goferConn.Close() - goferReq := fsgofer.AddBundleDirsRequest{BundleDirs: map[string]string{cid: bundleDir}} - if err := goferConn.Call(fsgofer.AddBundleDirs, &goferReq, nil); err != nil { - return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) +// Start starts running a non-root container inside the sandbox. +func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string, ioFiles []*os.File) error { + for _, f := range ioFiles { + defer f.Close() } - return nil -} - -// Start starts running a non-root container inside the sandbox. -func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config, cid string) error { log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid) - sandboxConn, err := s.sandboxConnect() if err != nil { return fmt.Errorf("couldn't connect to sandbox: %v", err) } defer sandboxConn.Close() - goferConn, err := s.goferConnect() - if err != nil { - return fmt.Errorf("couldn't connect to gofer: %v", err) - } - defer goferConn.Close() - - // Create socket that connects the sandbox and gofer. - sandEnd, goferEnd, err := createSocketPair() - if err != nil { - return err - } - defer sandEnd.Close() - defer goferEnd.Close() - - // Tell the Gofer about the new filesystem it needs to serve. - goferReq := fsgofer.ServeDirectoryRequest{ - Dir: spec.Root.Path, - IsReadOnly: spec.Root.Readonly, - CID: cid, - FilePayload: urpc.FilePayload{Files: []*os.File{goferEnd}}, - } - if err := goferConn.Call(fsgofer.ServeDirectory, &goferReq, nil); err != nil { - return fmt.Errorf("error serving new filesystem for non-root container %v: %v", goferReq, err) - } // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: []*os.File{sandEnd}}, + FilePayload: urpc.FilePayload{Files: ioFiles}, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err) } - return nil } @@ -275,102 +222,13 @@ func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { return conn, nil } -func (s *Sandbox) goferConnect() (*urpc.Client, error) { - log.Debugf("Connecting to gofer for sandbox %q", s.ID) - conn, err := client.ConnectTo(fsgofer.ControlSocketAddr(s.ID)) - if err != nil { - return nil, s.connError(err) - } - return conn, nil -} - func (s *Sandbox) connError(err error) error { return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) } -func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) (int, []*os.File, error) { - if conf.FileAccess == boot.FileAccessDirect { - // Don't start a gofer. The sandbox will access host FS directly. - return 0, nil, nil - } - - // Start with the general config flags. - args := conf.ToFlags() - args = append(args, "gofer", "--bundle", bundleDir) - - // Add root mount and then add any other additional mounts. - mountCount := 1 - - // Add additional mounts. - for _, m := range spec.Mounts { - if specutils.Is9PMount(m) { - mountCount++ - } - } - sandEnds := make([]*os.File, 0, mountCount) - goferEnds := make([]*os.File, 0, mountCount) - // nextFD is the next available file descriptor for the gofer process. - // It starts at 3 because 0-2 are used by stdin/stdout/stderr. - var nextFD int - for nextFD = 3; nextFD-3 < mountCount; nextFD++ { - sandEnd, goferEnd, err := createSocketPair() - if err != nil { - return 0, nil, err - } - defer goferEnd.Close() - sandEnds = append(sandEnds, sandEnd) - goferEnds = append(goferEnds, goferEnd) - args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) - } - - // Create and donate a file descriptor for the control server. - addr := fsgofer.ControlSocketAddr(s.ID) - serverFD, err := server.CreateSocket(addr) - if err != nil { - return 0, nil, fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) - } - - // Add the control server fd. - args = append(args, "--controller-fd="+strconv.Itoa(nextFD)) - nextFD++ - controllerFile := os.NewFile(uintptr(serverFD), "gofer_control_socket_server") - defer controllerFile.Close() - - cmd := exec.Command(binPath, args...) - cmd.ExtraFiles = goferEnds - cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) - - // Setup any uid/gid mappings, and create or join the configured user - // namespace so the gofer's view of the filesystem aligns with the - // users in the sandbox. - setUIDGIDMappings(cmd, spec) - nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) - - if conf.Overlay { - args = append(args, "--panic-on-write=true") - } - - // Start the gofer in the given namespace. - log.Debugf("Starting gofer: %s %v", binPath, args) - if err := startInNS(cmd, nss); err != nil { - return 0, nil, err - } - log.Infof("Gofer started, pid: %d", cmd.Process.Pid) - return cmd.Process.Pid, sandEnds, nil -} - -// createSocketPair creates a pair of files wrapping a socket pair. -func createSocketPair() (*os.File, *os.File, error) { - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[0]), "sandbox io fd"), os.NewFile(uintptr(fds[1]), "gofer io fd"), nil -} - // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, binPath string, ioFiles []*os.File) error { +func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -387,6 +245,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund consoleEnabled := consoleSocket != "" + binPath, err := specutils.BinPath() + if err != nil { + return err + } cmd := exec.Command(binPath, conf.ToFlags()...) cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.Args = append(cmd.Args, @@ -464,7 +326,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the // namespace. - if ns, ok := getNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { + if ns, ok := specutils.GetNS(specs.NetworkNamespace, spec); ok && conf.Network != boot.NetworkNone { log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) } else { @@ -478,10 +340,10 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund // - Gofer: when using a Gofer, the sandbox process can run isolated in an // empty namespace. if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { - if userns, ok := getNS(specs.UserNamespace, spec); ok { + if userns, ok := specutils.GetNS(specs.UserNamespace, spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) nss = append(nss, userns) - setUIDGIDMappings(cmd, spec) + specutils.SetUIDGIDMappings(cmd, spec) } else { log.Infof("Sandbox will be started in the current user namespace") } @@ -496,7 +358,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund } log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) - if err := startInNS(cmd, nss); err != nil { + if err := specutils.StartInNS(cmd, nss); err != nil { return err } s.Pid = cmd.Process.Pid diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index a22ab789a..97a504b20 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -4,7 +4,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "specutils", - srcs = ["specutils.go"], + srcs = [ + "namespace.go", + "specutils.go", + ], importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", visibility = [ "//runsc:__subpackages__", @@ -15,6 +18,7 @@ go_library( "//pkg/sentry/kernel/auth", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/sandbox/namespace.go b/runsc/specutils/namespace.go index 1d3bcfbb5..80eaad965 100644 --- a/runsc/sandbox/namespace.go +++ b/runsc/specutils/namespace.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package sandbox +package specutils import ( "fmt" @@ -74,10 +74,10 @@ func nsPath(nst specs.LinuxNamespaceType) string { } } -// getNS returns true and the namespace with the given type from the slice of +// GetNS returns true and the namespace with the given type from the slice of // namespaces in the spec. It returns false if the slice does not contain a // namespace with the type. -func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { +func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { if s.Linux == nil { return specs.LinuxNamespace{}, false } @@ -89,35 +89,35 @@ func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, b return specs.LinuxNamespace{}, false } -// filterNS returns a slice of namespaces from the spec with types that match +// FilterNS returns a slice of namespaces from the spec with types that match // those in the `filter` slice. -func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { +func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { if s.Linux == nil { return nil } var out []specs.LinuxNamespace for _, nst := range filter { - if ns, ok := getNS(nst, s); ok { + if ns, ok := GetNS(nst, s); ok { out = append(out, ns) } } return out } -// setNS sets the namespace of the given type. It must be called with +// SetNS sets the namespace of the given type. It must be called with // OSThreadLocked. -func setNS(fd, nsType uintptr) error { +func SetNS(fd, nsType uintptr) error { if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { return err } return nil } -// applyNS applies the namespace on the current thread and returns a function +// ApplyNS applies the namespace on the current thread and returns a function // that will restore the namespace to the original value. // // Preconditions: Must be called with os thread locked. -func applyNS(ns specs.LinuxNamespace) (func(), error) { +func ApplyNS(ns specs.LinuxNamespace) (func(), error) { log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) newNS, err := os.Open(ns.Path) if err != nil { @@ -134,22 +134,22 @@ func applyNS(ns specs.LinuxNamespace) (func(), error) { // Set netns to the one requested and setup function to restore it back. flag := nsCloneFlag(ns.Type) - if err := setNS(newNS.Fd(), flag); err != nil { + if err := SetNS(newNS.Fd(), flag); err != nil { oldNS.Close() return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) } return func() { log.Infof("restoring namespace %v", ns.Type) defer oldNS.Close() - if err := setNS(oldNS.Fd(), flag); err != nil { + if err := SetNS(oldNS.Fd(), flag); err != nil { panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) } }, nil } -// startInNS joins or creates the given namespaces and calls cmd.Start before +// StartInNS joins or creates the given namespaces and calls cmd.Start before // restoring the namespaces to the original values. -func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { +func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { // We are about to setup namespaces, which requires the os thread being // locked so that Go doesn't change the thread out from under us. runtime.LockOSThread() @@ -167,7 +167,7 @@ func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { } // Join the given namespace, and restore the current namespace // before exiting. - restoreNS, err := applyNS(ns) + restoreNS, err := ApplyNS(ns) if err != nil { return err } @@ -177,8 +177,8 @@ func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { return cmd.Start() } -// setUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. -func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { +// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. +func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { if s.Linux == nil { return } diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD index 03ab3c4ac..ca91e07ff 100644 --- a/runsc/test/testutil/BUILD +++ b/runsc/test/testutil/BUILD @@ -18,5 +18,6 @@ go_library( "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index b7d60e712..fc67c174a 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -32,7 +32,7 @@ func init() { rand.Seed(time.Now().UnixNano()) } -func runtime() string { +func getRuntime() string { r := os.Getenv("RUNSC_RUNTIME") if r == "" { return "runsc-test" @@ -43,7 +43,7 @@ func runtime() string { // IsPauseResumeSupported returns true if Pause/Resume is supported by runtime. func IsPauseResumeSupported() bool { // Native host network stack can't be saved. - return !strings.Contains(runtime(), "hostnet") + return !strings.Contains(getRuntime(), "hostnet") } // EnsureSupportedDockerVersion checks if correct docker is installed. @@ -128,7 +128,7 @@ type Docker struct { // Names of containers will be unique. func MakeDocker(namePrefix string) Docker { suffix := fmt.Sprintf("-%06d", rand.Int())[:7] - return Docker{Name: namePrefix + suffix, Runtime: runtime()} + return Docker{Name: namePrefix + suffix, Runtime: getRuntime()} } // Create calls 'docker create' with the arguments provided. diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index fc3d61e52..e90ab5ad5 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -23,11 +23,16 @@ import ( "io/ioutil" "net/http" "os" + "os/exec" "path/filepath" + "runtime" + "syscall" + "testing" "time" "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" "gvisor.googlesource.com/gvisor/runsc/boot" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -227,3 +232,55 @@ func WaitForHTTP(port int, timeout time.Duration) error { } return Poll(cb, timeout) } + +// RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create +// a new user namespace and reexecute the test as root inside of the namespace. +func RunAsRoot(m *testing.M) { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + panic(err.Error()) + } + if err := caps.Load(); err != nil { + panic(err.Error()) + } + if caps.Get(capability.EFFECTIVE, capability.CAP_SYS_ADMIN) { + // Capability: check! Good to run. + os.Exit(m.Run()) + } + + // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run + // as root inside that namespace to get it. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + cmd := exec.Command("/proc/self/exe", os.Args...) + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, + // Set current user/group as root inside the namespace. + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + GidMappingsEnableSetgroups: false, + Credential: &syscall.Credential{ + Uid: 0, + Gid: 0, + }, + } + cmd.Env = os.Environ() + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + if exit, ok := err.(*exec.ExitError); ok { + if ws, ok := exit.Sys().(syscall.WaitStatus); ok { + os.Exit(ws.ExitStatus()) + } + os.Exit(-1) + } + panic(fmt.Sprint("error running child process:", err.Error())) + } + os.Exit(0) +} |