From eaac94d91c28b745c51c33dd352ed9bfdd671b8c Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Tue, 20 Nov 2018 22:55:41 -0800
Subject: Use RET_KILL_PROCESS if available in kernel

RET_KILL_THREAD doesn't work well for Go because it will
kill only the offending thread and leave the process hanging.
RET_TRAP can be masked out and it's not guaranteed to kill
the process. RET_KILL_PROCESS is available since 4.14.

For older kernel, continue to use RET_TRAP as this is the
best option (likely to kill process, easy to debug).

PiperOrigin-RevId: 222357867
Change-Id: Icc1d7d731274b16c2125b7a1ba4f7883fbdb2cbd
---
 pkg/abi/linux/seccomp.go                       | 12 +++---
 pkg/seccomp/seccomp.go                         | 52 ++++++++++++++++++++++----
 pkg/seccomp/seccomp_test.go                    | 20 +++++++---
 pkg/seccomp/seccomp_test_victim.go             |  2 +-
 pkg/seccomp/seccomp_unsafe.go                  | 30 ++++++++++++---
 pkg/sentry/kernel/seccomp.go                   |  4 +-
 pkg/sentry/platform/ptrace/subprocess_linux.go |  2 +-
 runsc/boot/filter/filter.go                    |  3 +-
 runsc/fsgofer/filter/filter.go                 |  3 +-
 9 files changed, 95 insertions(+), 33 deletions(-)

diff --git a/pkg/abi/linux/seccomp.go b/pkg/abi/linux/seccomp.go
index 9963ceeba..5ec01cc4a 100644
--- a/pkg/abi/linux/seccomp.go
+++ b/pkg/abi/linux/seccomp.go
@@ -19,17 +19,19 @@ const (
 	SECCOMP_MODE_NONE   = 0
 	SECCOMP_MODE_FILTER = 2
 
-	SECCOMP_RET_KILL  = 0x00000000
-	SECCOMP_RET_TRAP  = 0x00030000
-	SECCOMP_RET_ERRNO = 0x00050000
-	SECCOMP_RET_TRACE = 0x7ff00000
-	SECCOMP_RET_ALLOW = 0x7fff0000
+	SECCOMP_RET_KILL_PROCESS = 0x80000000
+	SECCOMP_RET_KILL_THREAD  = 0x00000000
+	SECCOMP_RET_TRAP         = 0x00030000
+	SECCOMP_RET_ERRNO        = 0x00050000
+	SECCOMP_RET_TRACE        = 0x7ff00000
+	SECCOMP_RET_ALLOW        = 0x7fff0000
 
 	SECCOMP_RET_ACTION = 0x7fff0000
 	SECCOMP_RET_DATA   = 0x0000ffff
 
 	SECCOMP_SET_MODE_FILTER   = 1
 	SECCOMP_FILTER_FLAG_TSYNC = 1
+	SECCOMP_GET_ACTION_AVAIL  = 2
 )
 
 const (
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
index 1dfbf749e..9d714d02d 100644
--- a/pkg/seccomp/seccomp.go
+++ b/pkg/seccomp/seccomp.go
@@ -33,17 +33,42 @@ const (
 	defaultLabel = "default_action"
 )
 
+func actionName(a uint32) string {
+	switch a {
+	case linux.SECCOMP_RET_KILL_PROCESS:
+		return "kill process"
+	case linux.SECCOMP_RET_TRAP:
+		return "trap"
+	}
+	panic(fmt.Sprintf("invalid action: %d", a))
+}
+
 // Install generates BPF code based on the set of syscalls provided. It only
-// allows syscalls that conform to the specification and generates SIGSYS
-// trap unless kill is set.
+// allows syscalls that conform to the specification. Syscalls that violate the
+// specification will trigger RET_KILL_PROCESS, except for the cases below.
+//
+// RET_TRAP is used in violations, instead of RET_KILL_PROCESS, in the
+// following cases:
+//	 1. Kernel doesn't support RET_KILL_PROCESS: RET_KILL_THREAD only kills the
+//      offending thread and often keeps the sentry hanging.
+//   2. Debug: RET_TRAP generates a panic followed by a stack trace which is
+//      much easier to debug then RET_KILL_PROCESS which can't be caught.
 //
-// This is a convenience wrapper around BuildProgram and SetFilter.
-func Install(rules SyscallRules, kill bool) error {
-	log.Infof("Installing seccomp filters for %d syscalls (kill=%t)", len(rules), kill)
-	defaultAction := uint32(linux.SECCOMP_RET_TRAP)
-	if kill {
-		defaultAction = uint32(linux.SECCOMP_RET_KILL)
+// Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored,
+// making it possible for the process to continue running after a violation.
+// However, it will leave a SECCOMP audit event trail behind. In any case, the
+// syscall is still blocked from executing.
+func Install(rules SyscallRules) error {
+	defaultAction, err := defaultAction()
+	if err != nil {
+		return err
 	}
+
+	// Uncomment to get stack trace when there is a violation.
+	// defaultAction = uint32(linux.SECCOMP_RET_TRAP)
+
+	log.Infof("Installing seccomp filters for %d syscalls (action=%s)", len(rules), actionName(defaultAction))
+
 	instrs, err := BuildProgram([]RuleSet{
 		RuleSet{
 			Rules:  rules,
@@ -70,6 +95,17 @@ func Install(rules SyscallRules, kill bool) error {
 	return nil
 }
 
+func defaultAction() (uint32, error) {
+	available, err := isKillProcessAvailable()
+	if err != nil {
+		return 0, err
+	}
+	if available {
+		return uint32(linux.SECCOMP_RET_KILL_PROCESS), nil
+	}
+	return uint32(linux.SECCOMP_RET_TRAP), nil
+}
+
 // RuleSet is a set of rules and associated action.
 type RuleSet struct {
 	Rules  SyscallRules
diff --git a/pkg/seccomp/seccomp_test.go b/pkg/seccomp/seccomp_test.go
index 226f30b7b..f2b903e42 100644
--- a/pkg/seccomp/seccomp_test.go
+++ b/pkg/seccomp/seccomp_test.go
@@ -121,7 +121,7 @@ func TestBasic(t *testing.T) {
 					Action: linux.SECCOMP_RET_TRAP,
 				},
 			},
-			defaultAction: linux.SECCOMP_RET_KILL,
+			defaultAction: linux.SECCOMP_RET_KILL_THREAD,
 			specs: []spec{
 				{
 					desc: "Multiple rulesets allowed (1a)",
@@ -141,7 +141,7 @@ func TestBasic(t *testing.T) {
 				{
 					desc: "Multiple rulesets allowed (2)",
 					data: seccompData{nr: 0, arch: linux.AUDIT_ARCH_X86_64},
-					want: linux.SECCOMP_RET_KILL,
+					want: linux.SECCOMP_RET_KILL_THREAD,
 				},
 			},
 		},
@@ -431,15 +431,23 @@ func TestRealDeal(t *testing.T) {
 				t.Errorf("victim was not killed as expected, output: %s", out)
 				continue
 			}
+			// Depending on kernel version, either RET_TRAP or RET_KILL_PROCESS is
+			// used. RET_TRAP dumps reason for exit in output, while RET_KILL_PROCESS
+			// returns SIGSYS as exit status.
+			if !strings.Contains(string(out), test.want) &&
+				!strings.Contains(err.Error(), test.want) {
+				t.Errorf("Victim error is wrong, got: %v, err: %v, want: %v", string(out), err, test.want)
+				continue
+			}
 		} else {
 			if err != nil {
 				t.Errorf("victim failed to execute, err: %v", err)
 				continue
 			}
-		}
-		if !strings.Contains(string(out), test.want) {
-			t.Errorf("Victim output is wrong, got: %v, want: %v", err, test.want)
-			continue
+			if !strings.Contains(string(out), test.want) {
+				t.Errorf("Victim output is wrong, got: %v, want: %v", string(out), test.want)
+				continue
+			}
 		}
 	}
 }
diff --git a/pkg/seccomp/seccomp_test_victim.go b/pkg/seccomp/seccomp_test_victim.go
index 007038273..dd5ed0041 100644
--- a/pkg/seccomp/seccomp_test_victim.go
+++ b/pkg/seccomp/seccomp_test_victim.go
@@ -106,7 +106,7 @@ func main() {
 		}
 	}
 
-	if err := seccomp.Install(syscalls, false); err != nil {
+	if err := seccomp.Install(syscalls); err != nil {
 		fmt.Printf("Failed to install seccomp: %v", err)
 		os.Exit(1)
 	}
diff --git a/pkg/seccomp/seccomp_unsafe.go b/pkg/seccomp/seccomp_unsafe.go
index dd009221a..a31c6471d 100644
--- a/pkg/seccomp/seccomp_unsafe.go
+++ b/pkg/seccomp/seccomp_unsafe.go
@@ -36,22 +36,40 @@ type sockFprog struct {
 //
 //go:nosplit
 func SetFilter(instrs []linux.BPFInstruction) syscall.Errno {
-	// SYS_SECCOMP is not available in syscall package.
-	const SYS_SECCOMP = 317
-
 	// PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See seccomp(2) for details.
 	if _, _, errno := syscall.RawSyscall(syscall.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0); errno != 0 {
 		return errno
 	}
 
-	// TODO: Use SECCOMP_FILTER_FLAG_KILL_PROCESS when available.
 	sockProg := sockFprog{
 		Len:    uint16(len(instrs)),
 		Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])),
 	}
-	if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(&sockProg))); errno != 0 {
-		return errno
+	return seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg))
+}
+
+func isKillProcessAvailable() (bool, error) {
+	action := uint32(linux.SECCOMP_RET_KILL_PROCESS)
+	if errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 {
+		// EINVAL: SECCOMP_GET_ACTION_AVAIL not in this kernel yet.
+		// EOPNOTSUPP: SECCOMP_RET_KILL_PROCESS not supported.
+		if errno == syscall.EINVAL || errno == syscall.EOPNOTSUPP {
+			return false, nil
+		}
+		return false, errno
 	}
+	return true, nil
+}
 
+// seccomp calls seccomp(2). This is safe to call from an afterFork context.
+//
+//go:nosplit
+func seccomp(op, flags uint32, ptr unsafe.Pointer) syscall.Errno {
+	// SYS_SECCOMP is not available in syscall package.
+	const SYS_SECCOMP = 317
+
+	if _, _, errno := syscall.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)); errno != 0 {
+		return errno
+	}
 	return 0
 }
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index 433b900c7..d6dc45bbd 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -117,7 +117,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u
 		// "Results in the system call being executed."
 		return seccompResultAllow
 
-	case linux.SECCOMP_RET_KILL:
+	case linux.SECCOMP_RET_KILL_THREAD:
 		// "Results in the task exiting immediately without executing the
 		// system call. The exit status of the task will be SIGSYS, not
 		// SIGKILL."
@@ -155,7 +155,7 @@ func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, i
 		thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input)
 		if err != nil {
 			t.Debugf("seccomp-bpf filter %d returned error: %v", i, err)
-			thisRet = linux.SECCOMP_RET_KILL
+			thisRet = linux.SECCOMP_RET_KILL_THREAD
 		}
 		// "If multiple filters exist, the return value for the evaluation of a
 		// given system call will always use the highest precedent value." -
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 885ba4b2e..25b8e8cb7 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -115,7 +115,7 @@ func createStub() (*thread, error) {
 	var defaultAction uint32
 	if probeSeccomp() {
 		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
-		defaultAction = uint32(linux.SECCOMP_RET_KILL)
+		defaultAction = uint32(linux.SECCOMP_RET_KILL_THREAD)
 	} else {
 		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
 		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
index dc7294b1d..d69a6a2cc 100644
--- a/runsc/boot/filter/filter.go
+++ b/runsc/boot/filter/filter.go
@@ -57,8 +57,7 @@ func Install(opt Options) error {
 		return fmt.Errorf("unknown platform type %T", p)
 	}
 
-	// TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported.
-	return seccomp.Install(s, false)
+	return seccomp.Install(s)
 }
 
 // Report writes a warning message to the log.
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go
index f50b6bc87..c120d57a6 100644
--- a/runsc/fsgofer/filter/filter.go
+++ b/runsc/fsgofer/filter/filter.go
@@ -29,6 +29,5 @@ func Install() error {
 	// when not enabled.
 	s.Merge(instrumentationFilters())
 
-	// TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported.
-	return seccomp.Install(s, false)
+	return seccomp.Install(s)
 }
-- 
cgit v1.2.3