diff options
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/BUILD | 7 | ||||
-rw-r--r-- | runsc/boot/BUILD | 2 | ||||
-rw-r--r-- | runsc/boot/config.go | 4 | ||||
-rw-r--r-- | runsc/boot/loader.go | 49 | ||||
-rw-r--r-- | runsc/boot/user.go | 28 | ||||
-rw-r--r-- | runsc/boot/user_test.go | 3 | ||||
-rw-r--r-- | runsc/cmd/exec.go | 53 | ||||
-rw-r--r-- | runsc/cmd/exec_test.go | 4 | ||||
-rw-r--r-- | runsc/cmd/gofer.go | 5 | ||||
-rw-r--r-- | runsc/container/BUILD | 1 | ||||
-rw-r--r-- | runsc/container/container_test.go | 25 | ||||
-rw-r--r-- | runsc/container/test_app/test_app.go | 65 | ||||
-rw-r--r-- | runsc/dockerutil/dockerutil.go | 17 | ||||
-rw-r--r-- | runsc/fsgofer/filter/config.go | 13 | ||||
-rw-r--r-- | runsc/fsgofer/filter/filter.go | 13 | ||||
-rw-r--r-- | runsc/fsgofer/fsgofer.go | 70 | ||||
-rw-r--r-- | runsc/fsgofer/fsgofer_test.go | 2 | ||||
-rw-r--r-- | runsc/main.go | 4 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 2 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 1 | ||||
-rw-r--r-- | runsc/specutils/namespace.go | 14 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 10 |
22 files changed, 302 insertions, 90 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index 5e7dacb87..e4e8e64a3 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,7 +1,7 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_binary") -load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar") +load("@rules_pkg//:pkg.bzl", "pkg_deb", "pkg_tar") go_binary( name = "runsc", @@ -91,11 +91,6 @@ pkg_deb( maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>", package = "runsc", postinst = "debian/postinst.sh", - tags = [ - # TODO(b/135475885): pkg_deb requires python2: - # https://github.com/bazelbuild/bazel/issues/8443 - "manual", - ], version_file = ":version.txt", visibility = [ "//visibility:public", diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 588bb8851..d90381c0f 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -80,6 +80,7 @@ go_library( "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/raw", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", @@ -109,6 +110,7 @@ go_test( "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", "//pkg/unet", "//runsc/fsgofer", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 31103367d..38278d0a2 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -167,6 +167,9 @@ type Config struct { // Overlay is whether to wrap the root filesystem in an overlay. Overlay bool + // FSGoferHostUDS enables the gofer to mount a host UDS. + FSGoferHostUDS bool + // Network indicates what type of network to use. Network NetworkType @@ -253,6 +256,7 @@ func (c *Config) ToFlags() []string { "--debug-log-format=" + c.DebugLogFormat, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), + "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), "--network=" + c.Network.String(), "--log-packets=" + strconv.FormatBool(c.LogPackets), "--platform=" + c.Platform, diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 823a34619..adf345490 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,7 +20,6 @@ import ( mrand "math/rand" "os" "runtime" - "strings" "sync" "sync/atomic" "syscall" @@ -55,6 +54,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" @@ -535,23 +535,12 @@ func (l *Loader) run() error { return err } - // Read /etc/passwd for the user's HOME directory and set the HOME - // environment variable as required by POSIX if it is not overridden by - // the user. - hasHomeEnvv := false - for _, envv := range l.rootProcArgs.Envv { - if strings.HasPrefix(envv, "HOME=") { - hasHomeEnvv = true - } - } - if !hasHomeEnvv { - homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID)) - if err != nil { - return fmt.Errorf("error reading exec user: %v", err) - } - - l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir) + // Add the HOME enviroment variable if it is not already set. + envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + if err != nil { + return err } + l.rootProcArgs.Envv = envv // Create the root container init task. It will begin running // when the kernel is started. @@ -815,6 +804,16 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { }) defer args.MountNamespace.DecRef() + // Add the HOME enviroment varible if it is not already set. + root := args.MountNamespace.Root() + defer root.DecRef() + ctx := fs.WithRoot(l.k.SupervisorContext(), root) + envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + // Start the process. proc := control.Proc{Kernel: l.k} args.PIDNamespace = tg.PIDNamespace() @@ -913,15 +912,17 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { case NetworkNone, NetworkSandbox: // NetworkNone sets up loopback using netstack. - netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} - protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4} - s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{ - Clock: clock, - Stats: epsocket.Metrics, - HandleLocal: true, + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := epsocket.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: epsocket.Metrics, + HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - Raw: true, + UnassociatedFactory: raw.EndpointFactory{}, })} // Enable SACK Recovery. diff --git a/runsc/boot/user.go b/runsc/boot/user.go index d1d423a5c..56cc12ee0 100644 --- a/runsc/boot/user.go +++ b/runsc/boot/user.go @@ -16,6 +16,7 @@ package boot import ( "bufio" + "fmt" "io" "strconv" "strings" @@ -23,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usermem" ) @@ -42,7 +44,7 @@ func (r *fileReader) Read(buf []byte) (int, error) { // getExecUserHome returns the home directory of the executing user read from // /etc/passwd as read from the container filesystem. -func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32) (string, error) { +func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) { // The default user home directory to return if no user matching the user // if found in the /etc/passwd found in the image. const defaultHome = "/" @@ -82,7 +84,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32 File: f, } - homeDir, err := findHomeInPasswd(uid, r, defaultHome) + homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome) if err != nil { return "", err } @@ -90,6 +92,28 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32 return homeDir, nil } +// maybeAddExecUserHome returns a new slice with the HOME enviroment variable +// set if the slice does not already contain it, otherwise it returns the +// original slice unmodified. +func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { + // Check if the envv already contains HOME. + for _, env := range envv { + if strings.HasPrefix(env, "HOME=") { + // We have it. Return the original slice unmodified. + return envv, nil + } + } + + // Read /etc/passwd for the user's HOME directory and set the HOME + // environment variable as required by POSIX if it is not overridden by + // the user. + homeDir, err := getExecUserHome(ctx, mns, uid) + if err != nil { + return nil, fmt.Errorf("error reading exec user: %v", err) + } + return append(envv, "HOME="+homeDir), nil +} + // findHomeInPasswd parses a passwd file and returns the given user's home // directory. This function does it's best to replicate the runc's behavior. func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) { diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go index 906baf3e5..9aee2ad07 100644 --- a/runsc/boot/user_test.go +++ b/runsc/boot/user_test.go @@ -25,6 +25,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) func setupTempDir() (string, error) { @@ -68,7 +69,7 @@ func setupPasswd(contents string, perms os.FileMode) func() (string, error) { // TestGetExecUserHome tests the getExecUserHome function. func TestGetExecUserHome(t *testing.T) { tests := map[string]struct { - uid uint32 + uid auth.KUID createRoot func() (string, error) expected string }{ diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index e817eff77..d1e99243b 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -105,11 +105,11 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. It starts a process in an // already created container. func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - e, id, err := ex.parseArgs(f) + conf := args[0].(*boot.Config) + e, id, err := ex.parseArgs(f, conf.EnableRaw) if err != nil { Fatalf("parsing process spec: %v", err) } - conf := args[0].(*boot.Config) waitStatus := args[1].(*syscall.WaitStatus) c, err := container.Load(conf.RootDir, id) @@ -117,6 +117,9 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("loading sandbox: %v", err) } + log.Debugf("Exec arguments: %+v", e) + log.Debugf("Exec capablities: %+v", e.Capabilities) + // Replace empty settings with defaults from container. if e.WorkingDirectory == "" { e.WorkingDirectory = c.Spec.Process.Cwd @@ -127,15 +130,13 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("getting environment variables: %v", err) } } + if e.Capabilities == nil { - // enableRaw is set to true to prevent the filtering out of - // CAP_NET_RAW. This is the opposite of Create() because exec - // requires the capability to be set explicitly, while 'docker - // run' sets it by default. - e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities) + e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities) if err != nil { Fatalf("creating capabilities: %v", err) } + log.Infof("Using exec capabilities from container: %+v", e.Capabilities) } // containerd expects an actual process to represent the container being @@ -282,14 +283,14 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi // parseArgs parses exec information from the command line or a JSON file // depending on whether the --process flag was used. Returns an ExecArgs and // the ID of the container to be used. -func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) { +func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) { if ex.processPath == "" { // Requires at least a container ID and command. if f.NArg() < 2 { f.Usage() return nil, "", fmt.Errorf("both a container-id and command are required") } - e, err := ex.argsFromCLI(f.Args()[1:]) + e, err := ex.argsFromCLI(f.Args()[1:], enableRaw) return e, f.Arg(0), err } // Requires only the container ID. @@ -297,11 +298,11 @@ func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) { f.Usage() return nil, "", fmt.Errorf("a container-id is required") } - e, err := ex.argsFromProcessFile() + e, err := ex.argsFromProcessFile(enableRaw) return e, f.Arg(0), err } -func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { +func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) { extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs)) for _, s := range ex.extraKGIDs { kgid, err := strconv.Atoi(s) @@ -314,7 +315,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { var caps *auth.TaskCapabilities if len(ex.caps) > 0 { var err error - caps, err = capabilities(ex.caps) + caps, err = capabilities(ex.caps, enableRaw) if err != nil { return nil, fmt.Errorf("capabilities error: %v", err) } @@ -332,7 +333,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { }, nil } -func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) { +func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) { f, err := os.Open(ex.processPath) if err != nil { return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err) @@ -342,21 +343,21 @@ func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) { if err := json.NewDecoder(f).Decode(&p); err != nil { return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err) } - return argsFromProcess(&p) + return argsFromProcess(&p, enableRaw) } // argsFromProcess performs all the non-IO conversion from the Process struct // to ExecArgs. -func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { +func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) { // Create capabilities. var caps *auth.TaskCapabilities if p.Capabilities != nil { var err error - // enableRaw is set to true to prevent the filtering out of - // CAP_NET_RAW. This is the opposite of Create() because exec - // requires the capability to be set explicitly, while 'docker - // run' sets it by default. - caps, err = specutils.Capabilities(true /* enableRaw */, p.Capabilities) + // Starting from Docker 19, capabilities are explicitly set for exec (instead + // of nil like before). So we can't distinguish 'exec' from + // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter + // CAP_NET_RAW in the same way as container start. + caps, err = specutils.Capabilities(enableRaw, p.Capabilities) if err != nil { return nil, fmt.Errorf("error creating capabilities: %v", err) } @@ -409,7 +410,7 @@ func resolveEnvs(envs ...[]string) ([]string, error) { // capabilities takes a list of capabilities as strings and returns an // auth.TaskCapabilities struct with those capabilities in every capability set. // This mimics runc's behavior. -func capabilities(cs []string) (*auth.TaskCapabilities, error) { +func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) { var specCaps specs.LinuxCapabilities for _, cap := range cs { specCaps.Ambient = append(specCaps.Ambient, cap) @@ -418,11 +419,11 @@ func capabilities(cs []string) (*auth.TaskCapabilities, error) { specCaps.Inheritable = append(specCaps.Inheritable, cap) specCaps.Permitted = append(specCaps.Permitted, cap) } - // enableRaw is set to true to prevent the filtering out of - // CAP_NET_RAW. This is the opposite of Create() because exec requires - // the capability to be set explicitly, while 'docker run' sets it by - // default. - return specutils.Capabilities(true /* enableRaw */, &specCaps) + // Starting from Docker 19, capabilities are explicitly set for exec (instead + // of nil like before). So we can't distinguish 'exec' from + // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter + // CAP_NET_RAW in the same way as container start. + return specutils.Capabilities(enableRaw, &specCaps) } // stringSlice allows a flag to be used multiple times, where each occurrence diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go index eb38a431f..a1e980d08 100644 --- a/runsc/cmd/exec_test.go +++ b/runsc/cmd/exec_test.go @@ -91,7 +91,7 @@ func TestCLIArgs(t *testing.T) { } for _, tc := range testCases { - e, err := tc.ex.argsFromCLI(tc.argv) + e, err := tc.ex.argsFromCLI(tc.argv, true) if err != nil { t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err) } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { @@ -144,7 +144,7 @@ func TestJSONArgs(t *testing.T) { } for _, tc := range testCases { - e, err := argsFromProcess(&tc.p) + e, err := argsFromProcess(&tc.p, true) if err != nil { t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err) } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 9faabf494..fbd579fb8 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -182,6 +182,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) cfg := fsgofer.Config{ ROMount: isReadonlyMount(m.Options), PanicOnWrite: g.panicOnWrite, + HostUDS: conf.FSGoferHostUDS, } ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) if err != nil { @@ -200,6 +201,10 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) } + if conf.FSGoferHostUDS { + filter.InstallUDSFilters() + } + if err := filter.Install(); err != nil { Fatalf("installing seccomp filters: %v", err) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index bc1fa25e3..26d1cd5ab 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -47,6 +47,7 @@ go_test( ], deps = [ "//pkg/abi/linux", + "//pkg/bits", "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/kernel", diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 2ac12e5b6..519f5ed9b 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -34,6 +34,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -2049,6 +2050,30 @@ func TestMountSymlink(t *testing.T) { } } +// Check that --net-raw disables the CAP_NET_RAW capability. +func TestNetRaw(t *testing.T) { + capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10) + app, err := testutil.FindFile("runsc/container/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + for _, enableRaw := range []bool{true, false} { + conf := testutil.TestConfig() + conf.EnableRaw = enableRaw + + test := "--enabled" + if !enableRaw { + test = "--disabled" + } + + spec := testutil.NewSpecWithArgs(app, "capability", test, capNetRaw) + if err := run(spec, conf); err != nil { + t.Fatalf("Error running container: %v", err) + } + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go index 7f735c254..913d781c6 100644 --- a/runsc/container/test_app/test_app.go +++ b/runsc/container/test_app/test_app.go @@ -19,10 +19,12 @@ package main import ( "context" "fmt" + "io/ioutil" "log" "net" "os" "os/exec" + "regexp" "strconv" sys "syscall" "time" @@ -35,6 +37,7 @@ import ( func main() { subcommands.Register(subcommands.HelpCommand(), "") subcommands.Register(subcommands.FlagsCommand(), "") + subcommands.Register(new(capability), "") subcommands.Register(new(fdReceiver), "") subcommands.Register(new(fdSender), "") subcommands.Register(new(forkBomb), "") @@ -287,3 +290,65 @@ func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interfac } return subcommands.ExitSuccess } + +type capability struct { + enabled uint64 + disabled uint64 +} + +// Name implements subcommands.Command. +func (*capability) Name() string { + return "capability" +} + +// Synopsis implements subcommands.Command. +func (*capability) Synopsis() string { + return "checks if effective capabilities are set/unset" +} + +// Usage implements subcommands.Command. +func (*capability) Usage() string { + return "capability [--enabled=number] [--disabled=number]" +} + +// SetFlags implements subcommands.Command. +func (c *capability) SetFlags(f *flag.FlagSet) { + f.Uint64Var(&c.enabled, "enabled", 0, "") + f.Uint64Var(&c.disabled, "disabled", 0, "") +} + +// Execute implements subcommands.Command. +func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if c.enabled == 0 && c.disabled == 0 { + fmt.Println("One of the flags must be set") + return subcommands.ExitUsageError + } + + status, err := ioutil.ReadFile("/proc/self/status") + if err != nil { + fmt.Printf("Error reading %q: %v\n", "proc/self/status", err) + return subcommands.ExitFailure + } + re := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n") + matches := re.FindStringSubmatch(string(status)) + if matches == nil || len(matches) != 2 { + fmt.Printf("Effective capabilities not found in\n%s\n", status) + return subcommands.ExitFailure + } + caps, err := strconv.ParseUint(matches[1], 16, 64) + if err != nil { + fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err) + return subcommands.ExitFailure + } + + if c.enabled != 0 && (caps&c.enabled) != c.enabled { + fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps) + return subcommands.ExitFailure + } + if c.disabled != 0 && (caps&c.disabled) != 0 { + fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps) + return subcommands.ExitFailure + } + + return subcommands.ExitSuccess +} diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go index c073d8f75..57f6ae8de 100644 --- a/runsc/dockerutil/dockerutil.go +++ b/runsc/dockerutil/dockerutil.go @@ -282,7 +282,22 @@ func (d *Docker) Logs() (string, error) { // Exec calls 'docker exec' with the arguments provided. func (d *Docker) Exec(args ...string) (string, error) { - a := []string{"exec", d.Name} + return d.ExecWithFlags(nil, args...) +} + +// ExecWithFlags calls 'docker exec <flags> name <args>'. +func (d *Docker) ExecWithFlags(flags []string, args ...string) (string, error) { + a := []string{"exec"} + a = append(a, flags...) + a = append(a, d.Name) + a = append(a, args...) + return do(a...) +} + +// ExecAsUser calls 'docker exec' as the given user with the arguments +// provided. +func (d *Docker) ExecAsUser(user string, args ...string) (string, error) { + a := []string{"exec", "--user", user, d.Name} a = append(a, args...) return do(a...) } diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 2f3f2039a..c7922b54f 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -214,3 +214,16 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_UTIMENSAT: {}, syscall.SYS_WRITE: {}, } + +var udsSyscalls = seccomp.SyscallRules{ + syscall.SYS_SOCKET: []seccomp.Rule{ + { + seccomp.AllowValue(syscall.AF_UNIX), + }, + }, + syscall.SYS_CONNECT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + }, + }, +} diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go index 65053415f..289886720 100644 --- a/runsc/fsgofer/filter/filter.go +++ b/runsc/fsgofer/filter/filter.go @@ -23,11 +23,16 @@ import ( // Install installs seccomp filters. func Install() error { - s := allowedSyscalls - // Set of additional filters used by -race and -msan. Returns empty // when not enabled. - s.Merge(instrumentationFilters()) + allowedSyscalls.Merge(instrumentationFilters()) + + return seccomp.Install(allowedSyscalls) +} - return seccomp.Install(s) +// InstallUDSFilters extends the allowed syscalls to include those necessary for +// connecting to a host UDS. +func InstallUDSFilters() { + // Add additional filters required for connecting to the host's sockets. + allowedSyscalls.Merge(udsSyscalls) } diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 7c4d2b94e..29a82138e 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -21,6 +21,7 @@ package fsgofer import ( + "errors" "fmt" "io" "math" @@ -54,6 +55,7 @@ const ( regular fileType = iota directory symlink + socket unknown ) @@ -66,6 +68,8 @@ func (f fileType) String() string { return "directory" case symlink: return "symlink" + case socket: + return "socket" } return "unknown" } @@ -82,6 +86,9 @@ type Config struct { // PanicOnWrite panics on attempts to write to RO mounts. PanicOnWrite bool + + // HostUDS signals whether the gofer can mount a host's UDS. + HostUDS bool } type attachPoint struct { @@ -124,24 +131,50 @@ func (a *attachPoint) Attach() (p9.File, error) { if err != nil { return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) } - mode := syscall.O_RDWR - if a.conf.ROMount || (stat.Mode&syscall.S_IFMT) == syscall.S_IFDIR { - mode = syscall.O_RDONLY - } - - // Open the root directory. - f, err := fd.Open(a.prefix, openFlags|mode, 0) - if err != nil { - return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) - } + // Acquire the attach point lock. a.attachedMu.Lock() defer a.attachedMu.Unlock() + if a.attached { - f.Close() return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } + // Hold the file descriptor we are converting into a p9.File. + var f *fd.FD + + // Apply the S_IFMT bitmask so we can detect file type appropriately. + switch fmtStat := stat.Mode & syscall.S_IFMT; fmtStat { + case syscall.S_IFSOCK: + // Check to see if the CLI option has been set to allow the UDS mount. + if !a.conf.HostUDS { + return nil, errors.New("host UDS support is disabled") + } + + // Attempt to open a connection. Bubble up the failures. + f, err = fd.DialUnix(a.prefix) + if err != nil { + return nil, err + } + + default: + // Default to Read/Write permissions. + mode := syscall.O_RDWR + + // If the configuration is Read Only or the mount point is a directory, + // set the mode to Read Only. + if a.conf.ROMount || fmtStat == syscall.S_IFDIR { + mode = syscall.O_RDONLY + } + + // Open the mount point & capture the FD. + f, err = fd.Open(a.prefix, openFlags|mode, 0) + if err != nil { + return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) + } + } + + // Return a localFile object to the caller with the UDS FD included. rv, err := newLocalFile(a, f, a.prefix, stat) if err != nil { return nil, err @@ -295,7 +328,7 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) return file, nil } -func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { +func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { var ft fileType switch stat.Mode & syscall.S_IFMT { case syscall.S_IFREG: @@ -304,6 +337,11 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { ft = directory case syscall.S_IFLNK: ft = symlink + case syscall.S_IFSOCK: + if !permitSocket { + return unknown, syscall.EPERM + } + ft = socket default: return unknown, syscall.EPERM } @@ -311,7 +349,7 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) { } func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { - ft, err := getSupportedFileType(stat) + ft, err := getSupportedFileType(stat, a.conf.HostUDS) if err != nil { return nil, err } @@ -1026,7 +1064,11 @@ func (l *localFile) Flush() error { // Connect implements p9.File. func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { - return nil, syscall.ECONNREFUSED + // Check to see if the CLI option has been set to allow the UDS mount. + if !l.attachPoint.conf.HostUDS { + return nil, syscall.ECONNREFUSED + } + return fd.DialUnix(l.hostPath) } // Close implements p9.File. diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index cbbe71019..05af7e397 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -665,7 +665,7 @@ func TestAttachInvalidType(t *testing.T) { } f, err := a.Attach() if f != nil || err == nil { - t.Fatalf("Attach should have failed, got (%v, nil)", f) + t.Fatalf("Attach should have failed, got (%v, %v)", f, err) } }) } diff --git a/runsc/main.go b/runsc/main.go index ff74c0a3d..7dce9dc00 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -68,6 +68,7 @@ var ( network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") gso = flag.Bool("gso", true, "enable generic segmenation offload") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "Allow the gofer to mount Unix Domain Sockets.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") @@ -195,6 +196,7 @@ func main() { DebugLog: *debugLog, DebugLogFormat: *debugLogFormat, FileAccess: fsAccess, + FSGoferHostUDS: *fsGoferHostUDS, Overlay: *overlay, Network: netType, GSO: *gso, @@ -239,7 +241,7 @@ func main() { // want with them. Since Docker and Containerd both eat boot's stderr, we // dup our stderr to the provided log FD so that panics will appear in the // logs, rather than just disappear. - if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil { + if err := syscall.Dup3(int(f.Fd()), int(os.Stderr.Fd()), 0); err != nil { cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 4c6c83fbd..ee9327fc8 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -352,7 +352,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF } if conf.DebugLog != "" { test := "" - if len(conf.TestOnlyTestNameEnv) == 0 { + if len(conf.TestOnlyTestNameEnv) != 0 { // Fetch test name if one is provided and the test only flag was set. if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { test = t diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index fbfb8e2f8..fa58313a0 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -13,6 +13,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/bits", "//pkg/log", "//pkg/sentry/kernel/auth", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index d441419cb..c7dd3051c 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -33,19 +33,19 @@ import ( func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { switch nst { case specs.IPCNamespace: - return syscall.CLONE_NEWIPC + return unix.CLONE_NEWIPC case specs.MountNamespace: - return syscall.CLONE_NEWNS + return unix.CLONE_NEWNS case specs.NetworkNamespace: - return syscall.CLONE_NEWNET + return unix.CLONE_NEWNET case specs.PIDNamespace: - return syscall.CLONE_NEWPID + return unix.CLONE_NEWPID case specs.UTSNamespace: - return syscall.CLONE_NEWUTS + return unix.CLONE_NEWUTS case specs.UserNamespace: - return syscall.CLONE_NEWUSER + return unix.CLONE_NEWUSER case specs.CgroupNamespace: - panic("cgroup namespace has no associated clone flag") + return unix.CLONE_NEWCGROUP default: panic(fmt.Sprintf("unknown namespace %v", nst)) } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index cb9e58dfb..591abe458 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -31,6 +31,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) @@ -241,6 +242,15 @@ func AllCapabilities() *specs.LinuxCapabilities { } } +// AllCapabilitiesUint64 returns a bitmask containing all capabilities set. +func AllCapabilitiesUint64() uint64 { + var rv uint64 + for _, cap := range capFromName { + rv |= bits.MaskOf64(int(cap)) + } + return rv +} + var capFromName = map[string]linux.Capability{ "CAP_CHOWN": linux.CAP_CHOWN, "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, |