2 files changed, 36 insertions, 26 deletions
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index bf1225e1c..d1e99243b 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -105,11 +105,11 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute. It starts a process in an
 // already created container.
 func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
-	e, id, err := ex.parseArgs(f)
+	conf := args[0].(*boot.Config)
+	e, id, err := ex.parseArgs(f, conf.EnableRaw)
 	if err != nil {
 		Fatalf("parsing process spec: %v", err)
 	}
-	conf := args[0].(*boot.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
 	c, err := container.Load(conf.RootDir, id)
@@ -117,6 +117,9 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("loading sandbox: %v", err)
 	}
 
+	log.Debugf("Exec arguments: %+v", e)
+	log.Debugf("Exec capablities: %+v", e.Capabilities)
+
 	// Replace empty settings with defaults from container.
 	if e.WorkingDirectory == "" {
 		e.WorkingDirectory = c.Spec.Process.Cwd
@@ -129,14 +132,11 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 
 	if e.Capabilities == nil {
-		// enableRaw is set to true to prevent the filtering out of
-		// CAP_NET_RAW. This is the opposite of Create() because exec
-		// requires the capability to be set explicitly, while 'docker
-		// run' sets it by default.
-		e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities)
+		e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities)
 		if err != nil {
 			Fatalf("creating capabilities: %v", err)
 		}
+		log.Infof("Using exec capabilities from container: %+v", e.Capabilities)
 	}
 
 	// containerd expects an actual process to represent the container being
@@ -283,14 +283,14 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
 // parseArgs parses exec information from the command line or a JSON file
 // depending on whether the --process flag was used. Returns an ExecArgs and
 // the ID of the container to be used.
-func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
+func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) {
 	if ex.processPath == "" {
 		// Requires at least a container ID and command.
 		if f.NArg() < 2 {
 			f.Usage()
 			return nil, "", fmt.Errorf("both a container-id and command are required")
 		}
-		e, err := ex.argsFromCLI(f.Args()[1:])
+		e, err := ex.argsFromCLI(f.Args()[1:], enableRaw)
 		return e, f.Arg(0), err
 	}
 	// Requires only the container ID.
@@ -298,11 +298,11 @@ func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
 		f.Usage()
 		return nil, "", fmt.Errorf("a container-id is required")
 	}
-	e, err := ex.argsFromProcessFile()
+	e, err := ex.argsFromProcessFile(enableRaw)
 	return e, f.Arg(0), err
 }
 
-func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
+func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) {
 	extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
 	for _, s := range ex.extraKGIDs {
 		kgid, err := strconv.Atoi(s)
@@ -315,7 +315,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
 	var caps *auth.TaskCapabilities
 	if len(ex.caps) > 0 {
 		var err error
-		caps, err = capabilities(ex.caps)
+		caps, err = capabilities(ex.caps, enableRaw)
 		if err != nil {
 			return nil, fmt.Errorf("capabilities error: %v", err)
 		}
@@ -333,7 +333,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
 	}, nil
 }
 
-func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
+func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) {
 	f, err := os.Open(ex.processPath)
 	if err != nil {
 		return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err)
@@ -343,21 +343,21 @@ func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
 	if err := json.NewDecoder(f).Decode(&p); err != nil {
 		return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err)
 	}
-	return argsFromProcess(&p)
+	return argsFromProcess(&p, enableRaw)
 }
 
 // argsFromProcess performs all the non-IO conversion from the Process struct
 // to ExecArgs.
-func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) {
+func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) {
 	// Create capabilities.
 	var caps *auth.TaskCapabilities
 	if p.Capabilities != nil {
 		var err error
-		// enableRaw is set to true to prevent the filtering out of
-		// CAP_NET_RAW. This is the opposite of Create() because exec
-		// requires the capability to be set explicitly, while 'docker
-		// run' sets it by default.
-		caps, err = specutils.Capabilities(true /* enableRaw */, p.Capabilities)
+		// Starting from Docker 19, capabilities are explicitly set for exec (instead
+		// of nil like before). So we can't distinguish 'exec' from
+		// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+		// CAP_NET_RAW in the same way as container start.
+		caps, err = specutils.Capabilities(enableRaw, p.Capabilities)
 		if err != nil {
 			return nil, fmt.Errorf("error creating capabilities: %v", err)
 		}
@@ -410,7 +410,7 @@ func resolveEnvs(envs ...[]string) ([]string, error) {
 // capabilities takes a list of capabilities as strings and returns an
 // auth.TaskCapabilities struct with those capabilities in every capability set.
 // This mimics runc's behavior.
-func capabilities(cs []string) (*auth.TaskCapabilities, error) {
+func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) {
 	var specCaps specs.LinuxCapabilities
 	for _, cap := range cs {
 		specCaps.Ambient = append(specCaps.Ambient, cap)
@@ -419,11 +419,11 @@ func capabilities(cs []string) (*auth.TaskCapabilities, error) {
 		specCaps.Inheritable = append(specCaps.Inheritable, cap)
 		specCaps.Permitted = append(specCaps.Permitted, cap)
 	}
-	// enableRaw is set to true to prevent the filtering out of
-	// CAP_NET_RAW. This is the opposite of Create() because exec requires
-	// the capability to be set explicitly, while 'docker run' sets it by
-	// default.
-	return specutils.Capabilities(true /* enableRaw */, &specCaps)
+	// Starting from Docker 19, capabilities are explicitly set for exec (instead
+	// of nil like before). So we can't distinguish 'exec' from
+	// 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+	// CAP_NET_RAW in the same way as container start.
+	return specutils.Capabilities(enableRaw, &specCaps)
 }
 
 // stringSlice allows a flag to be used multiple times, where each occurrence
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index cb9e58dfb..591abe458 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -31,6 +31,7 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
@@ -241,6 +242,15 @@ func AllCapabilities() *specs.LinuxCapabilities {
 	}
 }
 
+// AllCapabilitiesUint64 returns a bitmask containing all capabilities set.
+func AllCapabilitiesUint64() uint64 {
+	var rv uint64
+	for _, cap := range capFromName {
+		rv |= bits.MaskOf64(int(cap))
+	}
+	return rv
+}
+
 var capFromName = map[string]linux.Capability{
 	"CAP_CHOWN":            linux.CAP_CHOWN,
 	"CAP_DAC_OVERRIDE":     linux.CAP_DAC_OVERRIDE,