diff options
-rw-r--r-- | pkg/syserr/netstack.go | 2 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack.go | 12 | ||||
-rw-r--r-- | pkg/tcpip/tcpip.go | 1 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/endpoint_state.go | 1 | ||||
-rw-r--r-- | runsc/boot/config.go | 6 | ||||
-rw-r--r-- | runsc/boot/loader.go | 7 | ||||
-rw-r--r-- | runsc/cmd/exec.go | 18 | ||||
-rw-r--r-- | runsc/main.go | 2 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 22 | ||||
-rw-r--r-- | runsc/test/integration/BUILD | 5 | ||||
-rw-r--r-- | runsc/test/integration/exec_test.go | 26 | ||||
-rw-r--r-- | runsc/test/testutil/docker.go | 21 |
12 files changed, 104 insertions, 19 deletions
diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go index c5a628c7d..1a23919ef 100644 --- a/pkg/syserr/netstack.go +++ b/pkg/syserr/netstack.go @@ -45,6 +45,7 @@ var ( ErrNoSuchFile = New(tcpip.ErrNoSuchFile.String(), linux.ENOENT) ErrInvalidOptionValue = New(tcpip.ErrInvalidOptionValue.String(), linux.EINVAL) ErrBroadcastDisabled = New(tcpip.ErrBroadcastDisabled.String(), linux.EACCES) + ErrNotPermittedNet = New(tcpip.ErrNotPermitted.String(), linux.EPERM) ) var netstackErrorTranslations = map[*tcpip.Error]*Error{ @@ -84,6 +85,7 @@ var netstackErrorTranslations = map[*tcpip.Error]*Error{ tcpip.ErrMessageTooLong: ErrMessageTooLong, tcpip.ErrNoBufferSpace: ErrNoBufferSpace, tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled, + tcpip.ErrNotPermitted: ErrNotPermittedNet, } // TranslateNetstackError converts an error from the tcpip package to a sentry diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index a74c0a7a0..8f7b6f781 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -291,6 +291,10 @@ type Stack struct { linkAddrCache *linkAddrCache + // raw indicates whether raw sockets may be created. It is set during + // Stack creation and is immutable. + raw bool + mu sync.RWMutex nics map[tcpip.NICID]*NIC forwarding bool @@ -327,6 +331,9 @@ type Options struct { // should be handled by the stack internally (true) or outside the // stack (false). HandleLocal bool + + // Raw indicates whether raw sockets may be created. + Raw bool } // New allocates a new networking stack with only the requested networking and @@ -352,6 +359,7 @@ func New(network []string, transport []string, opts Options) *Stack { clock: clock, stats: opts.Stats.FillIn(), handleLocal: opts.HandleLocal, + raw: opts.Raw, } // Add specified network protocols. @@ -512,6 +520,10 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp // protocol. Raw endpoints receive all traffic for a given protocol regardless // of address. func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + if !s.raw { + return nil, tcpip.ErrNotPermitted + } + t, ok := s.transportProtocols[transport] if !ok { return nil, tcpip.ErrUnknownProtocol diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index e898dcbca..80cd6b4e5 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -102,6 +102,7 @@ var ( ErrMessageTooLong = &Error{msg: "message too long"} ErrNoBufferSpace = &Error{msg: "no buffer space available"} ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"} + ErrNotPermitted = &Error{msg: "operation not permitted"} ) // Errors related to Subnet diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index a42e09b8c..7f9dabb4d 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -341,6 +341,7 @@ func loadError(s string) *tcpip.Error { tcpip.ErrMessageTooLong, tcpip.ErrNoBufferSpace, tcpip.ErrBroadcastDisabled, + tcpip.ErrNotPermitted, } messageToError = make(map[string]*tcpip.Error) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 2523077fd..ba47effc1 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -175,6 +175,11 @@ type Config struct { // Network indicates what type of network to use. Network NetworkType + // EnableRaw indicates whether raw sockets should be enabled. Raw + // sockets are disabled by stripping CAP_NET_RAW from the list of + // capabilities. + EnableRaw bool + // GSO indicates that generic segmentation offload is enabled. GSO bool @@ -235,6 +240,7 @@ func (c *Config) ToFlags() []string { "--watchdog-action=" + c.WatchdogAction.String(), "--panic-signal=" + strconv.Itoa(c.PanicSignal), "--profile=" + strconv.FormatBool(c.ProfileEnable), + "--net-raw=" + strconv.FormatBool(c.EnableRaw), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 88a834aa5..48ecb2626 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -227,7 +227,7 @@ func New(args Args) (*Loader, error) { } // Create capabilities. - caps, err := specutils.Capabilities(args.Spec.Process.Capabilities) + caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities) if err != nil { return nil, fmt.Errorf("converting capabilities: %v", err) } @@ -554,7 +554,7 @@ func (l *Loader) createContainer(cid string) error { // this method returns. func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) + caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { return fmt.Errorf("creating capabilities: %v", err) } @@ -800,6 +800,9 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { Clock: clock, Stats: epsocket.Metrics, HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. + Raw: true, })} if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { return nil, fmt.Errorf("failed to enable SACK: %v", err) diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 9e058ad97..718d01067 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -132,7 +132,11 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } if e.Capabilities == nil { - e.Capabilities, err = specutils.Capabilities(c.Spec.Process.Capabilities) + // enableRaw is set to true to prevent the filtering out of + // CAP_NET_RAW. This is the opposite of Create() because exec + // requires the capability to be set explicitly, while 'docker + // run' sets it by default. + e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities) if err != nil { Fatalf("creating capabilities: %v", err) } @@ -351,7 +355,11 @@ func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { var caps *auth.TaskCapabilities if p.Capabilities != nil { var err error - caps, err = specutils.Capabilities(p.Capabilities) + // enableRaw is set to true to prevent the filtering out of + // CAP_NET_RAW. This is the opposite of Create() because exec + // requires the capability to be set explicitly, while 'docker + // run' sets it by default. + caps, err = specutils.Capabilities(true /* enableRaw */, p.Capabilities) if err != nil { return nil, fmt.Errorf("error creating capabilities: %v", err) } @@ -413,7 +421,11 @@ func capabilities(cs []string) (*auth.TaskCapabilities, error) { specCaps.Inheritable = append(specCaps.Inheritable, cap) specCaps.Permitted = append(specCaps.Permitted, cap) } - return specutils.Capabilities(&specCaps) + // enableRaw is set to true to prevent the filtering out of + // CAP_NET_RAW. This is the opposite of Create() because exec requires + // the capability to be set explicitly, while 'docker run' sets it by + // default. + return specutils.Capabilities(true /* enableRaw */, &specCaps) } // stringSlice allows a flag to be used multiple times, where each occurrence diff --git a/runsc/main.go b/runsc/main.go index 74253a844..b35726a74 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -68,6 +68,7 @@ var ( watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") ) @@ -159,6 +160,7 @@ func main() { WatchdogAction: wa, PanicSignal: *panicSignal, ProfileEnable: *profile, + EnableRaw: *netRaw, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, } if len(*straceSyscalls) != 0 { diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index af8d34535..32f81b8d4 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -198,20 +198,26 @@ func ReadMounts(f *os.File) ([]specs.Mount, error) { // Capabilities takes in spec and returns a TaskCapabilities corresponding to // the spec. -func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { +func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { + // Strip CAP_NET_RAW from all capability sets if necessary. + skipSet := map[linux.Capability]struct{}{} + if !enableRaw { + skipSet[linux.CAP_NET_RAW] = struct{}{} + } + var caps auth.TaskCapabilities if specCaps != nil { var err error - if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding); err != nil { + if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { return nil, err } - if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective); err != nil { + if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { return nil, err } - if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable); err != nil { + if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { return nil, err } - if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted); err != nil { + if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { return nil, err } // TODO: Support ambient capabilities. @@ -275,13 +281,17 @@ var capFromName = map[string]linux.Capability{ "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, } -func capsFromNames(names []string) (auth.CapabilitySet, error) { +func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { var caps []linux.Capability for _, n := range names { c, ok := capFromName[n] if !ok { return 0, fmt.Errorf("unknown capability %q", n) } + // Should we skip this capabilty? + if _, ok := skipSet[c]; ok { + continue + } caps = append(caps, c) } return auth.CapabilitySetOfMany(caps), nil diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD index 779d30ec9..0c4e4fa80 100644 --- a/runsc/test/integration/BUILD +++ b/runsc/test/integration/BUILD @@ -15,7 +15,10 @@ go_test( "manual", "local", ], - deps = ["//runsc/test/testutil"], + deps = [ + "//pkg/abi/linux", + "//runsc/test/testutil", + ], ) go_library( diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go index fac8337f4..d87957e2d 100644 --- a/runsc/test/integration/exec_test.go +++ b/runsc/test/integration/exec_test.go @@ -27,10 +27,13 @@ package integration import ( + "fmt" + "strconv" "syscall" "testing" "time" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/runsc/test/testutil" ) @@ -46,11 +49,28 @@ func TestExecCapabilities(t *testing.T) { } defer d.CleanUp() - want, err := d.WaitForOutput("CapEff:\t[0-9a-f]+\n", 5*time.Second) + matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second) if err != nil { - t.Fatalf("WaitForOutput() timeout: %v", err) + t.Fatalf("WaitForOutputSubmatch() timeout: %v", err) } - t.Log("Root capabilities:", want) + if len(matches) != 2 { + t.Fatalf("There should be a match for the whole line and the capability bitmask") + } + capString := matches[1] + t.Log("Root capabilities:", capString) + + // CAP_NET_RAW was in the capability set for the container, but was + // removed. However, `exec` does not remove it. Verify that it's not + // set in the container, then re-add it for comparison. + caps, err := strconv.ParseUint(capString, 16, 64) + if err != nil { + t.Fatalf("failed to convert capabilities %q: %v", capString, err) + } + if caps&(1<<uint64(linux.CAP_NET_RAW)) != 0 { + t.Fatalf("CAP_NET_RAW should be filtered, but is set in the container: %x", caps) + } + caps |= 1 << uint64(linux.CAP_NET_RAW) + want := fmt.Sprintf("CapEff:\t%016x\n", caps) // Now check that exec'd process capabilities match the root. got, err := d.Exec("grep", "CapEff:", "/proc/self/status") diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go index bce609061..b651319ed 100644 --- a/runsc/test/testutil/docker.go +++ b/runsc/test/testutil/docker.go @@ -334,19 +334,32 @@ func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) { // WaitForOutput calls 'docker logs' to retrieve containers output and searches // for the given pattern. func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { + matches, err := d.WaitForOutputSubmatch(pattern, timeout) + if err != nil { + return "", err + } + if len(matches) == 0 { + return "", nil + } + return matches[0], nil +} + +// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and +// searches for the given pattern. It returns any regexp submatches as well. +func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) { re := regexp.MustCompile(pattern) var out string for exp := time.Now().Add(timeout); time.Now().Before(exp); { var err error out, err = d.Logs() if err != nil { - return "", err + return nil, err } - if match := re.FindString(out); match != "" { + if matches := re.FindStringSubmatch(out); matches != nil { // Success! - return match, nil + return matches, nil } time.Sleep(100 * time.Millisecond) } - return "", fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) + return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) } |