summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/syserr/netstack.go2
-rw-r--r--pkg/tcpip/stack/stack.go12
-rw-r--r--pkg/tcpip/tcpip.go1
-rw-r--r--pkg/tcpip/transport/tcp/endpoint_state.go1
-rw-r--r--runsc/boot/config.go6
-rw-r--r--runsc/boot/loader.go7
-rw-r--r--runsc/cmd/exec.go18
-rw-r--r--runsc/main.go2
-rw-r--r--runsc/specutils/specutils.go22
-rw-r--r--runsc/test/integration/BUILD5
-rw-r--r--runsc/test/integration/exec_test.go26
-rw-r--r--runsc/test/testutil/docker.go21
12 files changed, 104 insertions, 19 deletions
diff --git a/pkg/syserr/netstack.go b/pkg/syserr/netstack.go
index c5a628c7d..1a23919ef 100644
--- a/pkg/syserr/netstack.go
+++ b/pkg/syserr/netstack.go
@@ -45,6 +45,7 @@ var (
ErrNoSuchFile = New(tcpip.ErrNoSuchFile.String(), linux.ENOENT)
ErrInvalidOptionValue = New(tcpip.ErrInvalidOptionValue.String(), linux.EINVAL)
ErrBroadcastDisabled = New(tcpip.ErrBroadcastDisabled.String(), linux.EACCES)
+ ErrNotPermittedNet = New(tcpip.ErrNotPermitted.String(), linux.EPERM)
)
var netstackErrorTranslations = map[*tcpip.Error]*Error{
@@ -84,6 +85,7 @@ var netstackErrorTranslations = map[*tcpip.Error]*Error{
tcpip.ErrMessageTooLong: ErrMessageTooLong,
tcpip.ErrNoBufferSpace: ErrNoBufferSpace,
tcpip.ErrBroadcastDisabled: ErrBroadcastDisabled,
+ tcpip.ErrNotPermitted: ErrNotPermittedNet,
}
// TranslateNetstackError converts an error from the tcpip package to a sentry
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index a74c0a7a0..8f7b6f781 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -291,6 +291,10 @@ type Stack struct {
linkAddrCache *linkAddrCache
+ // raw indicates whether raw sockets may be created. It is set during
+ // Stack creation and is immutable.
+ raw bool
+
mu sync.RWMutex
nics map[tcpip.NICID]*NIC
forwarding bool
@@ -327,6 +331,9 @@ type Options struct {
// should be handled by the stack internally (true) or outside the
// stack (false).
HandleLocal bool
+
+ // Raw indicates whether raw sockets may be created.
+ Raw bool
}
// New allocates a new networking stack with only the requested networking and
@@ -352,6 +359,7 @@ func New(network []string, transport []string, opts Options) *Stack {
clock: clock,
stats: opts.Stats.FillIn(),
handleLocal: opts.HandleLocal,
+ raw: opts.Raw,
}
// Add specified network protocols.
@@ -512,6 +520,10 @@ func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcp
// protocol. Raw endpoints receive all traffic for a given protocol regardless
// of address.
func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+ if !s.raw {
+ return nil, tcpip.ErrNotPermitted
+ }
+
t, ok := s.transportProtocols[transport]
if !ok {
return nil, tcpip.ErrUnknownProtocol
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index e898dcbca..80cd6b4e5 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -102,6 +102,7 @@ var (
ErrMessageTooLong = &Error{msg: "message too long"}
ErrNoBufferSpace = &Error{msg: "no buffer space available"}
ErrBroadcastDisabled = &Error{msg: "broadcast socket option disabled"}
+ ErrNotPermitted = &Error{msg: "operation not permitted"}
)
// Errors related to Subnet
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index a42e09b8c..7f9dabb4d 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -341,6 +341,7 @@ func loadError(s string) *tcpip.Error {
tcpip.ErrMessageTooLong,
tcpip.ErrNoBufferSpace,
tcpip.ErrBroadcastDisabled,
+ tcpip.ErrNotPermitted,
}
messageToError = make(map[string]*tcpip.Error)
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 2523077fd..ba47effc1 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -175,6 +175,11 @@ type Config struct {
// Network indicates what type of network to use.
Network NetworkType
+ // EnableRaw indicates whether raw sockets should be enabled. Raw
+ // sockets are disabled by stripping CAP_NET_RAW from the list of
+ // capabilities.
+ EnableRaw bool
+
// GSO indicates that generic segmentation offload is enabled.
GSO bool
@@ -235,6 +240,7 @@ func (c *Config) ToFlags() []string {
"--watchdog-action=" + c.WatchdogAction.String(),
"--panic-signal=" + strconv.Itoa(c.PanicSignal),
"--profile=" + strconv.FormatBool(c.ProfileEnable),
+ "--net-raw=" + strconv.FormatBool(c.EnableRaw),
}
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// Only include if set since it is never to be used by users.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 88a834aa5..48ecb2626 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -227,7 +227,7 @@ func New(args Args) (*Loader, error) {
}
// Create capabilities.
- caps, err := specutils.Capabilities(args.Spec.Process.Capabilities)
+ caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
if err != nil {
return nil, fmt.Errorf("converting capabilities: %v", err)
}
@@ -554,7 +554,7 @@ func (l *Loader) createContainer(cid string) error {
// this method returns.
func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
// Create capabilities.
- caps, err := specutils.Capabilities(spec.Process.Capabilities)
+ caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
return fmt.Errorf("creating capabilities: %v", err)
}
@@ -800,6 +800,9 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
Clock: clock,
Stats: epsocket.Metrics,
HandleLocal: true,
+ // Enable raw sockets for users with sufficient
+ // privileges.
+ Raw: true,
})}
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
return nil, fmt.Errorf("failed to enable SACK: %v", err)
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 9e058ad97..718d01067 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -132,7 +132,11 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
}
if e.Capabilities == nil {
- e.Capabilities, err = specutils.Capabilities(c.Spec.Process.Capabilities)
+ // enableRaw is set to true to prevent the filtering out of
+ // CAP_NET_RAW. This is the opposite of Create() because exec
+ // requires the capability to be set explicitly, while 'docker
+ // run' sets it by default.
+ e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities)
if err != nil {
Fatalf("creating capabilities: %v", err)
}
@@ -351,7 +355,11 @@ func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) {
var caps *auth.TaskCapabilities
if p.Capabilities != nil {
var err error
- caps, err = specutils.Capabilities(p.Capabilities)
+ // enableRaw is set to true to prevent the filtering out of
+ // CAP_NET_RAW. This is the opposite of Create() because exec
+ // requires the capability to be set explicitly, while 'docker
+ // run' sets it by default.
+ caps, err = specutils.Capabilities(true /* enableRaw */, p.Capabilities)
if err != nil {
return nil, fmt.Errorf("error creating capabilities: %v", err)
}
@@ -413,7 +421,11 @@ func capabilities(cs []string) (*auth.TaskCapabilities, error) {
specCaps.Inheritable = append(specCaps.Inheritable, cap)
specCaps.Permitted = append(specCaps.Permitted, cap)
}
- return specutils.Capabilities(&specCaps)
+ // enableRaw is set to true to prevent the filtering out of
+ // CAP_NET_RAW. This is the opposite of Create() because exec requires
+ // the capability to be set explicitly, while 'docker run' sets it by
+ // default.
+ return specutils.Capabilities(true /* enableRaw */, &specCaps)
}
// stringSlice allows a flag to be used multiple times, where each occurrence
diff --git a/runsc/main.go b/runsc/main.go
index 74253a844..b35726a74 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -68,6 +68,7 @@ var (
watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
)
@@ -159,6 +160,7 @@ func main() {
WatchdogAction: wa,
PanicSignal: *panicSignal,
ProfileEnable: *profile,
+ EnableRaw: *netRaw,
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
}
if len(*straceSyscalls) != 0 {
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index af8d34535..32f81b8d4 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -198,20 +198,26 @@ func ReadMounts(f *os.File) ([]specs.Mount, error) {
// Capabilities takes in spec and returns a TaskCapabilities corresponding to
// the spec.
-func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
+func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
+ // Strip CAP_NET_RAW from all capability sets if necessary.
+ skipSet := map[linux.Capability]struct{}{}
+ if !enableRaw {
+ skipSet[linux.CAP_NET_RAW] = struct{}{}
+ }
+
var caps auth.TaskCapabilities
if specCaps != nil {
var err error
- if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding); err != nil {
+ if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil {
return nil, err
}
- if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective); err != nil {
+ if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil {
return nil, err
}
- if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable); err != nil {
+ if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil {
return nil, err
}
- if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted); err != nil {
+ if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil {
return nil, err
}
// TODO: Support ambient capabilities.
@@ -275,13 +281,17 @@ var capFromName = map[string]linux.Capability{
"CAP_AUDIT_READ": linux.CAP_AUDIT_READ,
}
-func capsFromNames(names []string) (auth.CapabilitySet, error) {
+func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) {
var caps []linux.Capability
for _, n := range names {
c, ok := capFromName[n]
if !ok {
return 0, fmt.Errorf("unknown capability %q", n)
}
+ // Should we skip this capabilty?
+ if _, ok := skipSet[c]; ok {
+ continue
+ }
caps = append(caps, c)
}
return auth.CapabilitySetOfMany(caps), nil
diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD
index 779d30ec9..0c4e4fa80 100644
--- a/runsc/test/integration/BUILD
+++ b/runsc/test/integration/BUILD
@@ -15,7 +15,10 @@ go_test(
"manual",
"local",
],
- deps = ["//runsc/test/testutil"],
+ deps = [
+ "//pkg/abi/linux",
+ "//runsc/test/testutil",
+ ],
)
go_library(
diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go
index fac8337f4..d87957e2d 100644
--- a/runsc/test/integration/exec_test.go
+++ b/runsc/test/integration/exec_test.go
@@ -27,10 +27,13 @@
package integration
import (
+ "fmt"
+ "strconv"
"syscall"
"testing"
"time"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)
@@ -46,11 +49,28 @@ func TestExecCapabilities(t *testing.T) {
}
defer d.CleanUp()
- want, err := d.WaitForOutput("CapEff:\t[0-9a-f]+\n", 5*time.Second)
+ matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second)
if err != nil {
- t.Fatalf("WaitForOutput() timeout: %v", err)
+ t.Fatalf("WaitForOutputSubmatch() timeout: %v", err)
}
- t.Log("Root capabilities:", want)
+ if len(matches) != 2 {
+ t.Fatalf("There should be a match for the whole line and the capability bitmask")
+ }
+ capString := matches[1]
+ t.Log("Root capabilities:", capString)
+
+ // CAP_NET_RAW was in the capability set for the container, but was
+ // removed. However, `exec` does not remove it. Verify that it's not
+ // set in the container, then re-add it for comparison.
+ caps, err := strconv.ParseUint(capString, 16, 64)
+ if err != nil {
+ t.Fatalf("failed to convert capabilities %q: %v", capString, err)
+ }
+ if caps&(1<<uint64(linux.CAP_NET_RAW)) != 0 {
+ t.Fatalf("CAP_NET_RAW should be filtered, but is set in the container: %x", caps)
+ }
+ caps |= 1 << uint64(linux.CAP_NET_RAW)
+ want := fmt.Sprintf("CapEff:\t%016x\n", caps)
// Now check that exec'd process capabilities match the root.
got, err := d.Exec("grep", "CapEff:", "/proc/self/status")
diff --git a/runsc/test/testutil/docker.go b/runsc/test/testutil/docker.go
index bce609061..b651319ed 100644
--- a/runsc/test/testutil/docker.go
+++ b/runsc/test/testutil/docker.go
@@ -334,19 +334,32 @@ func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) {
// WaitForOutput calls 'docker logs' to retrieve containers output and searches
// for the given pattern.
func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) {
+ matches, err := d.WaitForOutputSubmatch(pattern, timeout)
+ if err != nil {
+ return "", err
+ }
+ if len(matches) == 0 {
+ return "", nil
+ }
+ return matches[0], nil
+}
+
+// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and
+// searches for the given pattern. It returns any regexp submatches as well.
+func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) {
re := regexp.MustCompile(pattern)
var out string
for exp := time.Now().Add(timeout); time.Now().Before(exp); {
var err error
out, err = d.Logs()
if err != nil {
- return "", err
+ return nil, err
}
- if match := re.FindString(out); match != "" {
+ if matches := re.FindStringSubmatch(out); matches != nil {
// Success!
- return match, nil
+ return matches, nil
}
time.Sleep(100 * time.Millisecond)
}
- return "", fmt.Errorf("timeout waiting for output %q: %s", re.String(), out)
+ return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), out)
}