summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
Diffstat (limited to 'runsc')
-rw-r--r--runsc/BUILD28
-rw-r--r--runsc/boot/BUILD9
-rw-r--r--runsc/boot/compat.go61
-rw-r--r--runsc/boot/compat_amd64.go85
-rw-r--r--runsc/boot/compat_arm64.go91
-rw-r--r--runsc/boot/compat_test.go45
-rw-r--r--runsc/boot/config.go53
-rw-r--r--runsc/boot/controller.go30
-rw-r--r--runsc/boot/filter/BUILD2
-rw-r--r--runsc/boot/filter/config.go91
-rw-r--r--runsc/boot/filter/config_amd64.go31
-rw-r--r--runsc/boot/filter/config_arm64.go21
-rw-r--r--runsc/boot/fs.go152
-rw-r--r--runsc/boot/fs_test.go131
-rw-r--r--runsc/boot/loader.go119
-rw-r--r--runsc/boot/loader_amd64.go27
-rw-r--r--runsc/boot/loader_arm64.go27
-rw-r--r--runsc/boot/network.go47
-rw-r--r--runsc/boot/user.go28
-rw-r--r--runsc/boot/user_test.go3
-rw-r--r--runsc/cgroup/cgroup.go24
-rw-r--r--runsc/cmd/debug.go42
-rw-r--r--runsc/cmd/exec.go53
-rw-r--r--runsc/cmd/exec_test.go4
-rw-r--r--runsc/cmd/gofer.go36
-rw-r--r--runsc/container/BUILD2
-rw-r--r--runsc/container/console_test.go11
-rw-r--r--runsc/container/container.go431
-rw-r--r--runsc/container/container_test.go278
-rw-r--r--runsc/container/multi_container_test.go296
-rw-r--r--runsc/container/state_file.go185
-rw-r--r--runsc/container/test_app/BUILD1
-rw-r--r--runsc/container/test_app/test_app.go105
-rw-r--r--runsc/criutil/criutil.go67
-rw-r--r--runsc/debian/description6
-rw-r--r--runsc/dockerutil/dockerutil.go42
-rw-r--r--runsc/fsgofer/BUILD2
-rw-r--r--runsc/fsgofer/filter/BUILD3
-rw-r--r--runsc/fsgofer/filter/config.go69
-rw-r--r--runsc/fsgofer/filter/config_amd64.go33
-rw-r--r--runsc/fsgofer/filter/config_arm64.go27
-rw-r--r--runsc/fsgofer/filter/filter.go13
-rw-r--r--runsc/fsgofer/fsgofer.go121
-rw-r--r--runsc/fsgofer/fsgofer_amd64_unsafe.go49
-rw-r--r--runsc/fsgofer/fsgofer_arm64_unsafe.go49
-rw-r--r--runsc/fsgofer/fsgofer_test.go2
-rw-r--r--runsc/fsgofer/fsgofer_unsafe.go25
-rw-r--r--runsc/main.go61
-rw-r--r--runsc/sandbox/BUILD2
-rw-r--r--runsc/sandbox/network.go107
-rw-r--r--runsc/sandbox/sandbox.go47
-rw-r--r--runsc/specutils/BUILD2
-rw-r--r--runsc/specutils/cri.go110
-rw-r--r--runsc/specutils/namespace.go14
-rw-r--r--runsc/specutils/specutils.go125
-rw-r--r--runsc/testutil/BUILD1
-rw-r--r--runsc/testutil/testutil.go54
-rw-r--r--runsc/version.go2
-rwxr-xr-xrunsc/version_test.sh36
59 files changed, 2717 insertions, 901 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
index a2a465e1e..e5587421d 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,7 +1,7 @@
package(licenses = ["notice"]) # Apache 2.0
load("@io_bazel_rules_go//go:def.bzl", "go_binary")
-load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar")
+load("@rules_pkg//:pkg.bzl", "pkg_deb", "pkg_tar")
go_binary(
name = "runsc",
@@ -13,7 +13,7 @@ go_binary(
visibility = [
"//visibility:public",
],
- x_defs = {"main.version": "{VERSION}"},
+ x_defs = {"main.version": "{STABLE_VERSION}"},
deps = [
"//pkg/log",
"//pkg/refs",
@@ -46,7 +46,7 @@ go_binary(
visibility = [
"//visibility:public",
],
- x_defs = {"main.version": "{VERSION}"},
+ x_defs = {"main.version": "{STABLE_VERSION}"},
deps = [
"//pkg/log",
"//pkg/refs",
@@ -76,28 +76,38 @@ pkg_tar(
genrule(
name = "deb-version",
+ # Note that runsc must appear in the srcs parameter and not the tools
+ # parameter, otherwise it will not be stamped. This is reasonable, as tools
+ # may be encoded differently in the build graph (cached more aggressively
+ # because they are assumes to be hermetic).
+ srcs = [":runsc"],
outs = ["version.txt"],
cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
stamp = 1,
- tools = [":runsc"],
)
pkg_deb(
name = "runsc-debian",
architecture = "amd64",
data = ":debian-data",
+ # Note that the description_file will be flatten (all newlines removed),
+ # and therefore it is kept to a simple one-line description. The expected
+ # format for debian packages is "short summary\nLonger explanation of
+ # tool." and this is impossible with the flattening.
description_file = "debian/description",
homepage = "https://gvisor.dev/",
maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
package = "runsc",
postinst = "debian/postinst.sh",
- tags = [
- # TODO(b/135475885): pkg_deb requires python2:
- # https://github.com/bazelbuild/bazel/issues/8443
- "manual",
- ],
version_file = ":version.txt",
visibility = [
"//visibility:public",
],
)
+
+sh_test(
+ name = "version_test",
+ size = "small",
+ srcs = ["version_test.sh"],
+ data = [":runsc"],
+)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 588bb8851..6226b63f8 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -7,6 +7,7 @@ go_library(
srcs = [
"compat.go",
"compat_amd64.go",
+ "compat_arm64.go",
"config.go",
"controller.go",
"debug.go",
@@ -15,6 +16,8 @@ go_library(
"fs.go",
"limits.go",
"loader.go",
+ "loader_amd64.go",
+ "loader_arm64.go",
"network.go",
"pprof.go",
"strace.go",
@@ -57,10 +60,11 @@ go_library(
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
"//pkg/sentry/sighandling",
- "//pkg/sentry/socket/epsocket",
"//pkg/sentry/socket/hostinet",
"//pkg/sentry/socket/netlink",
"//pkg/sentry/socket/netlink/route",
+ "//pkg/sentry/socket/netlink/uevent",
+ "//pkg/sentry/socket/netstack",
"//pkg/sentry/socket/unix",
"//pkg/sentry/state",
"//pkg/sentry/strace",
@@ -80,6 +84,7 @@ go_library(
"//pkg/tcpip/network/ipv6",
"//pkg/tcpip/stack",
"//pkg/tcpip/transport/icmp",
+ "//pkg/tcpip/transport/raw",
"//pkg/tcpip/transport/tcp",
"//pkg/tcpip/transport/udp",
"//pkg/urpc",
@@ -106,9 +111,9 @@ go_test(
"//pkg/control/server",
"//pkg/log",
"//pkg/p9",
- "//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/context/contexttest",
"//pkg/sentry/fs",
+ "//pkg/sentry/kernel/auth",
"//pkg/unet",
"//runsc/fsgofer",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 07e35ab10..352e710d2 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -21,10 +21,8 @@ import (
"syscall"
"github.com/golang/protobuf/proto"
- "gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/eventchannel"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/sentry/arch"
rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
"gvisor.dev/gvisor/pkg/sentry/strace"
@@ -53,9 +51,9 @@ type compatEmitter struct {
}
func newCompatEmitter(logFD int) (*compatEmitter, error) {
- nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64)
+ nameMap, ok := getSyscallNameMap()
if !ok {
- return nil, fmt.Errorf("amd64 Linux syscall table not found")
+ return nil, fmt.Errorf("Linux syscall table not found")
}
c := &compatEmitter{
@@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
}
func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
- regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+ regs := us.Registers
c.mu.Lock()
defer c.mu.Unlock()
- sysnr := regs.OrigRax
+ sysnr := syscallNum(regs)
tr := c.trackers[sysnr]
if tr == nil {
switch sysnr {
- case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+ case syscall.SYS_PRCTL:
// args: cmd, ...
tr = newArgsTracker(0)
@@ -112,10 +110,14 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
tr = newArgsTracker(2)
default:
- tr = &onceTracker{}
+ tr = newArchArgsTracker(sysnr)
+ if tr == nil {
+ tr = &onceTracker{}
+ }
}
c.trackers[sysnr] = tr
}
+
if tr.shouldReport(regs) {
c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
tr.onReported(regs)
@@ -139,10 +141,10 @@ func (c *compatEmitter) Close() error {
// the syscall and arguments.
type syscallTracker interface {
// shouldReport returns true is the syscall should be reported.
- shouldReport(regs *rpb.AMD64Registers) bool
+ shouldReport(regs *rpb.Registers) bool
// onReported marks the syscall as reported.
- onReported(regs *rpb.AMD64Registers)
+ onReported(regs *rpb.Registers)
}
// onceTracker reports only a single time, used for most syscalls.
@@ -150,10 +152,45 @@ type onceTracker struct {
reported bool
}
-func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
return !o.reported
}
-func (o *onceTracker) onReported(_ *rpb.AMD64Registers) {
+func (o *onceTracker) onReported(_ *rpb.Registers) {
o.reported = true
}
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+ // argsIdx is the syscall arguments to use as unique ID.
+ argsIdx []int
+ reported map[string]struct{}
+ count int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+ return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// key returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+ var rv string
+ for _, idx := range a.argsIdx {
+ rv += fmt.Sprintf("%d|", argVal(idx, regs))
+ }
+ return rv
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+ if a.count >= reportLimit {
+ return false
+ }
+ _, ok := a.reported[a.key(regs)]
+ return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+ a.count++
+ a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 43cd0db94..42b0ca8b0 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -16,62 +16,81 @@ package boot
import (
"fmt"
+ "syscall"
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+ "gvisor.dev/gvisor/pkg/sentry/strace"
)
// reportLimit is the max number of events that should be reported per tracker.
const reportLimit = 100
-// argsTracker reports only once for each different combination of arguments.
-// It's used for generic syscalls like ioctl to report once per 'cmd'.
-type argsTracker struct {
- // argsIdx is the syscall arguments to use as unique ID.
- argsIdx []int
- reported map[string]struct{}
- count int
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+ return &rpb.Registers{
+ Arch: &rpb.Registers_Amd64{
+ Amd64: &rpb.AMD64Registers{},
+ },
+ }
}
-func newArgsTracker(argIdx ...int) *argsTracker {
- return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
-}
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+ amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
-// cmd returns the command based on the syscall argument index.
-func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
- var rv string
- for _, idx := range a.argsIdx {
- rv += fmt.Sprintf("%d|", argVal(idx, regs))
+ switch argIdx {
+ case 0:
+ return uint32(amd64Regs.Rdi)
+ case 1:
+ return uint32(amd64Regs.Rsi)
+ case 2:
+ return uint32(amd64Regs.Rdx)
+ case 3:
+ return uint32(amd64Regs.R10)
+ case 4:
+ return uint32(amd64Regs.R8)
+ case 5:
+ return uint32(amd64Regs.R9)
}
- return rv
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
}
-func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+ amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
switch argIdx {
case 0:
- return uint32(regs.Rdi)
+ amd64Regs.Rdi = argVal
case 1:
- return uint32(regs.Rsi)
+ amd64Regs.Rsi = argVal
case 2:
- return uint32(regs.Rdx)
+ amd64Regs.Rdx = argVal
case 3:
- return uint32(regs.R10)
+ amd64Regs.R10 = argVal
case 4:
- return uint32(regs.R8)
+ amd64Regs.R8 = argVal
case 5:
- return uint32(regs.R9)
+ amd64Regs.R9 = argVal
+ default:
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
}
- panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
}
-func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
- if a.count >= reportLimit {
- return false
- }
- _, ok := a.reported[a.key(regs)]
- return !ok
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+ return strace.Lookup(abi.Linux, arch.AMD64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+ amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+ return amd64Regs.OrigRax
}
-func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
- a.count++
- a.reported[a.key(regs)] = struct{}{}
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+ switch sysnr {
+ case syscall.SYS_ARCH_PRCTL:
+ // args: cmd, ...
+ return newArgsTracker(0)
+ }
+ return nil
}
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..f784cd237
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,91 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+ "gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// newRegs create a empty Registers instance.
+func newRegs() *rpb.Registers {
+ return &rpb.Registers{
+ Arch: &rpb.Registers_Arm64{
+ Arm64: &rpb.ARM64Registers{},
+ },
+ }
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+ arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+ switch argIdx {
+ case 0:
+ return uint32(arm64Regs.R0)
+ case 1:
+ return uint32(arm64Regs.R1)
+ case 2:
+ return uint32(arm64Regs.R2)
+ case 3:
+ return uint32(arm64Regs.R3)
+ case 4:
+ return uint32(arm64Regs.R4)
+ case 5:
+ return uint32(arm64Regs.R5)
+ }
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+ arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+ switch argIdx {
+ case 0:
+ arm64Regs.R0 = argVal
+ case 1:
+ arm64Regs.R1 = argVal
+ case 2:
+ arm64Regs.R2 = argVal
+ case 3:
+ arm64Regs.R3 = argVal
+ case 4:
+ arm64Regs.R4 = argVal
+ case 5:
+ arm64Regs.R5 = argVal
+ default:
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+ }
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+ return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+ arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+ return arm64Regs.R8
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+ // currently, no arch specific syscalls need to be handled here.
+ return nil
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
index 388298d8d..839c5303b 100644
--- a/runsc/boot/compat_test.go
+++ b/runsc/boot/compat_test.go
@@ -16,8 +16,6 @@ package boot
import (
"testing"
-
- rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
)
func TestOnceTracker(t *testing.T) {
@@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) {
func TestArgsTracker(t *testing.T) {
for _, tc := range []struct {
- name string
- idx []int
- rdi1 uint64
- rdi2 uint64
- rsi1 uint64
- rsi2 uint64
- want bool
+ name string
+ idx []int
+ arg1_1 uint64
+ arg1_2 uint64
+ arg2_1 uint64
+ arg2_2 uint64
+ want bool
}{
- {name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false},
- {name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false},
- {name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true},
- {name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true},
- {name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false},
- {name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false},
- {name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true},
+ {name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+ {name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+ {name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+ {name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+ {name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+ {name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+ {name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
} {
t.Run(tc.name, func(t *testing.T) {
c := newArgsTracker(tc.idx...)
- regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1}
+ regs := newRegs()
+ setArgVal(0, tc.arg1_1, regs)
+ setArgVal(1, tc.arg2_1, regs)
if !c.shouldReport(regs) {
t.Error("first call to shouldReport, got: false, want: true")
}
c.onReported(regs)
- regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2
+ setArgVal(0, tc.arg1_2, regs)
+ setArgVal(1, tc.arg2_2, regs)
if got := c.shouldReport(regs); tc.want != got {
t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
}
@@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) {
func TestArgsTrackerLimit(t *testing.T) {
c := newArgsTracker(0, 1)
for i := 0; i < reportLimit; i++ {
- regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)}
+ regs := newRegs()
+ setArgVal(0, 123, regs)
+ setArgVal(1, uint64(i), regs)
if !c.shouldReport(regs) {
t.Error("shouldReport before limit was reached, got: false, want: true")
}
@@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) {
}
// Should hit the count limit now.
- regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456}
+ regs := newRegs()
+ setArgVal(0, 123, regs)
+ setArgVal(1, 123456, regs)
if c.shouldReport(regs) {
t.Error("shouldReport after limit was reached, got: true, want: false")
}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 05b8f8761..a878bc2ce 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -167,6 +167,9 @@ type Config struct {
// Overlay is whether to wrap the root filesystem in an overlay.
Overlay bool
+ // FSGoferHostUDS enables the gofer to mount a host UDS.
+ FSGoferHostUDS bool
+
// Network indicates what type of network to use.
Network NetworkType
@@ -175,8 +178,11 @@ type Config struct {
// capabilities.
EnableRaw bool
- // GSO indicates that generic segmentation offload is enabled.
- GSO bool
+ // HardwareGSO indicates that hardware segmentation offload is enabled.
+ HardwareGSO bool
+
+ // SoftwareGSO indicates that software segmentation offload is enabled.
+ SoftwareGSO bool
// LogPackets indicates that all network packets should be logged.
LogPackets bool
@@ -211,12 +217,6 @@ type Config struct {
// RestoreFile is the path to the saved container image
RestoreFile string
- // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
- // tests. It allows runsc to start the sandbox process as the current
- // user, and without chrooting the sandbox process. This can be
- // necessary in test environments that have limited capabilities.
- TestOnlyAllowRunAsCurrentUserWithoutChroot bool
-
// NumNetworkChannels controls the number of AF_PACKET sockets that map
// to the same underlying network device. This allows netstack to better
// scale for high throughput use cases.
@@ -233,6 +233,29 @@ type Config struct {
// ReferenceLeakMode sets reference leak check mode
ReferenceLeakMode refs.LeakMode
+
+ // OverlayfsStaleRead causes cached FDs to reopen after a file is opened for
+ // write to workaround overlayfs limitation on kernels before 4.19.
+ OverlayfsStaleRead bool
+
+ // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+ // tests. It allows runsc to start the sandbox process as the current
+ // user, and without chrooting the sandbox process. This can be
+ // necessary in test environments that have limited capabilities.
+ TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+ // TestOnlyTestNameEnv should only be used in tests. It looks up for the
+ // test name in the container environment variables and adds it to the debug
+ // log file name. This is done to help identify the log with the test when
+ // multiple tests are run in parallel, since there is no way to pass
+ // parameters to the runtime from docker.
+ TestOnlyTestNameEnv string
+
+ // CPUNumFromQuota sets CPU number count to available CPU quota, using
+ // least integer value greater than or equal to quota.
+ //
+ // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+ CPUNumFromQuota bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -246,6 +269,7 @@ func (c *Config) ToFlags() []string {
"--debug-log-format=" + c.DebugLogFormat,
"--file-access=" + c.FileAccess.String(),
"--overlay=" + strconv.FormatBool(c.Overlay),
+ "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
"--network=" + c.Network.String(),
"--log-packets=" + strconv.FormatBool(c.LogPackets),
"--platform=" + c.Platform,
@@ -260,10 +284,19 @@ func (c *Config) ToFlags() []string {
"--rootless=" + strconv.FormatBool(c.Rootless),
"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
+ "--gso=" + strconv.FormatBool(c.HardwareGSO),
+ "--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
+ "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
}
+ if c.CPUNumFromQuota {
+ f = append(f, "--cpu-num-from-quota")
+ }
+ // Only include these if set since it is never to be used by users.
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
- // Only include if set since it is never to be used by users.
- f = append(f, "-TESTONLY-unsafe-nonroot=true")
+ f = append(f, "--TESTONLY-unsafe-nonroot=true")
+ }
+ if len(c.TestOnlyTestNameEnv) != 0 {
+ f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
}
return f
}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 72cbabd16..9c9e94864 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -18,7 +18,6 @@ import (
"errors"
"fmt"
"os"
- "path"
"syscall"
specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -27,12 +26,13 @@ import (
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netstack"
"gvisor.dev/gvisor/pkg/sentry/state"
"gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
+ "gvisor.dev/gvisor/runsc/specutils"
)
const (
@@ -51,7 +51,7 @@ const (
ContainerEvent = "containerManager.Event"
// ContainerExecuteAsync is the URPC endpoint for executing a command in a
- // container..
+ // container.
ContainerExecuteAsync = "containerManager.ExecuteAsync"
// ContainerPause pauses the container.
@@ -142,7 +142,7 @@ func newController(fd int, l *Loader) (*controller, error) {
}
srv.Register(manager)
- if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok {
+ if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
net := &Network{
Stack: eps.Stack,
}
@@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) {
srv.Register(&debug{})
srv.Register(&control.Logging{})
if l.conf.ProfileEnable {
- srv.Register(&control.Profile{})
+ srv.Register(&control.Profile{
+ Kernel: l.k,
+ })
}
return &controller{
@@ -161,7 +163,7 @@ func newController(fd int, l *Loader) (*controller, error) {
}, nil
}
-// containerManager manages sandboes containers.
+// containerManager manages sandbox containers.
type containerManager struct {
// startChan is used to signal when the root container process should
// be started.
@@ -234,17 +236,13 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
if args.CID == "" {
return errors.New("start argument missing container ID")
}
- // Prevent CIDs containing ".." from confusing the sentry when creating
- // /containers/<cid> directory.
- // TODO(b/129293409): Once we have multiple independent roots, this
- // check won't be necessary.
- if path.Clean(args.CID) != args.CID {
- return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID)
- }
if len(args.FilePayload.Files) < 4 {
return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
}
+ // All validation passed, logs the spec for debugging.
+ specutils.LogSpec(args.Spec)
+
err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
if err != nil {
log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
@@ -355,7 +353,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
fs.SetRestoreEnvironment(*renv)
// Prepare to load from the state file.
- if eps, ok := networkStack.(*epsocket.Stack); ok {
+ if eps, ok := networkStack.(*netstack.Stack); ok {
stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
}
info, err := specFile.Stat()
@@ -384,7 +382,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
}
// Since we have a new kernel we also must make a new watchdog.
- dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+ dogOpts := watchdog.DefaultOpts
+ dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+ dog := watchdog.New(k, dogOpts)
// Change the loader fields to reflect the changes made when restoring.
cm.l.k = k
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index f5509b6b7..3a9dcfc04 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -6,6 +6,8 @@ go_library(
name = "filter",
srcs = [
"config.go",
+ "config_amd64.go",
+ "config_arm64.go",
"extra_filters.go",
"extra_filters_msan.go",
"extra_filters_race.go",
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 7ca776b3a..4fb9adca6 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -26,10 +26,6 @@ import (
// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
var allowedSyscalls = seccomp.SyscallRules{
- syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
- {seccomp.AllowValue(linux.ARCH_GET_FS)},
- {seccomp.AllowValue(linux.ARCH_SET_FS)},
- },
syscall.SYS_CLOCK_GETTIME: {},
syscall.SYS_CLONE: []seccomp.Rule{
{
@@ -42,8 +38,15 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.CLONE_THREAD),
},
},
- syscall.SYS_CLOSE: {},
- syscall.SYS_DUP: {},
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
+ syscall.SYS_DUP3: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ },
syscall.SYS_EPOLL_CREATE1: {},
syscall.SYS_EPOLL_CTL: {},
syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
@@ -88,14 +91,24 @@ var allowedSyscalls = seccomp.SyscallRules{
seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
seccomp.AllowAny{},
seccomp.AllowAny{},
- seccomp.AllowValue(0),
},
{
seccomp.AllowAny{},
seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
seccomp.AllowAny{},
+ },
+ // Non-private variants are included for flipcall support. They are otherwise
+ // unncessary, as the sentry will use only private futexes internally.
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAIT),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAKE),
seccomp.AllowAny{},
- seccomp.AllowValue(0),
},
},
syscall.SYS_GETPID: {},
@@ -121,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{
seccomp.AllowValue(syscall.SOL_SOCKET),
seccomp.AllowValue(syscall.SO_SNDBUF),
},
- {
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
- },
},
syscall.SYS_GETTID: {},
syscall.SYS_GETTIMEOFDAY: {},
@@ -232,6 +240,15 @@ var allowedSyscalls = seccomp.SyscallRules{
seccomp.AllowValue(0),
},
},
+ unix.SYS_SENDMMSG: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.MSG_DONTWAIT),
+ seccomp.AllowValue(0),
+ },
+ },
syscall.SYS_RESTART_SYSCALL: {},
syscall.SYS_RT_SIGACTION: {},
syscall.SYS_RT_SIGPROCMASK: {},
@@ -295,6 +312,26 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_TOS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_RECVTOS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_TCLASS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+ },
+ {
+ seccomp.AllowAny{},
seccomp.AllowValue(syscall.SOL_IPV6),
seccomp.AllowValue(syscall.IPV6_V6ONLY),
},
@@ -396,6 +433,34 @@ func hostInetFilters() seccomp.SyscallRules {
seccomp.AllowAny{},
seccomp.AllowValue(4),
},
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_TOS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_RECVTOS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_TCLASS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
{
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
new file mode 100644
index 000000000..5335ff82c
--- /dev/null
+++ b/runsc/boot/filter/config_amd64.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+ allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
+ seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
+ seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
+ )
+}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
new file mode 100644
index 000000000..7fa9bbda3
--- /dev/null
+++ b/runsc/boot/filter/config_arm64.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+// Reserve for future customization.
+func init() {
+}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 34c674840..421ccd255 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,7 +16,6 @@ package boot
import (
"fmt"
- "path"
"path/filepath"
"sort"
"strconv"
@@ -52,7 +51,7 @@ const (
rootDevice = "9pfs-/"
// MountPrefix is the annotation prefix for mount hints.
- MountPrefix = "gvisor.dev/spec/mount"
+ MountPrefix = "dev.gvisor.spec.mount."
// Filesystems that runsc supports.
bind = "bind"
@@ -64,6 +63,9 @@ const (
nonefs = "none"
)
+// tmpfs has some extra supported options that we must pass through.
+var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+
func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
// Upper layer uses the same flags as lower, but it must be read-write.
upperFlags := lowerFlags
@@ -172,27 +174,25 @@ func p9MountOptions(fd int, fa FileAccessType) []string {
func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
var out []string
for _, o := range opts {
- kv := strings.Split(o, "=")
- switch len(kv) {
- case 1:
- if specutils.ContainsStr(allowedKeys, o) {
- out = append(out, o)
- continue
- }
- log.Warningf("ignoring unsupported key %q", kv)
- case 2:
- if specutils.ContainsStr(allowedKeys, kv[0]) {
- out = append(out, o)
- continue
- }
- log.Warningf("ignoring unsupported key %q", kv[0])
- default:
- return nil, fmt.Errorf("invalid option %q", o)
+ ok, err := parseMountOption(o, allowedKeys...)
+ if err != nil {
+ return nil, err
+ }
+ if ok {
+ out = append(out, o)
}
}
return out, nil
}
+func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
+ kv := strings.SplitN(opt, "=", 3)
+ if len(kv) > 2 {
+ return false, fmt.Errorf("invalid option %q", opt)
+ }
+ return specutils.ContainsStr(allowedKeys, kv[0]), nil
+}
+
// mountDevice returns a device string based on the fs type and target
// of the mount.
func mountDevice(m specs.Mount) string {
@@ -207,6 +207,8 @@ func mountDevice(m specs.Mount) string {
func mountFlags(opts []string) fs.MountSourceFlags {
mf := fs.MountSourceFlags{}
+ // Note: changes to supported options must be reflected in
+ // isSupportedMountFlag() as well.
for _, o := range opts {
switch o {
case "rw":
@@ -224,6 +226,18 @@ func mountFlags(opts []string) fs.MountSourceFlags {
return mf
}
+func isSupportedMountFlag(fstype, opt string) bool {
+ switch opt {
+ case "rw", "ro", "noatime", "noexec":
+ return true
+ }
+ if fstype == tmpfs {
+ ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
+ return ok && err == nil
+ }
+ return false
+}
+
func mustFindFilesystem(name string) fs.Filesystem {
fs, ok := fs.FindFilesystem(name)
if !ok {
@@ -427,6 +441,46 @@ func (m *mountHint) isSupported() bool {
return m.mount.Type == tmpfs && m.share == pod
}
+// checkCompatible verifies that shared mount is compatible with master.
+// For now enforce that all options are the same. Once bind mount is properly
+// supported, then we should ensure the master is less restrictive than the
+// container, e.g. master can be 'rw' while container mounts as 'ro'.
+func (m *mountHint) checkCompatible(mount specs.Mount) error {
+ // Remove options that don't affect to mount's behavior.
+ masterOpts := filterUnsupportedOptions(m.mount)
+ slaveOpts := filterUnsupportedOptions(mount)
+
+ if len(masterOpts) != len(slaveOpts) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ }
+
+ sort.Strings(masterOpts)
+ sort.Strings(slaveOpts)
+ for i, opt := range masterOpts {
+ if opt != slaveOpts[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ }
+ }
+ return nil
+}
+
+func (m *mountHint) fileAccessType() FileAccessType {
+ if m.share == container {
+ return FileAccessExclusive
+ }
+ return FileAccessShared
+}
+
+func filterUnsupportedOptions(mount specs.Mount) []string {
+ rv := make([]string, 0, len(mount.Options))
+ for _, o := range mount.Options {
+ if isSupportedMountFlag(mount.Type, o) {
+ rv = append(rv, o)
+ }
+ }
+ return rv
+}
+
// podMountHints contains a collection of mountHints for the pod.
type podMountHints struct {
mounts map[string]*mountHint
@@ -435,14 +489,15 @@ type podMountHints struct {
func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
mnts := make(map[string]*mountHint)
for k, v := range spec.Annotations {
- // Look for 'gvisor.dev/spec/mount' annotations and parse them.
+ // Look for 'dev.gvisor.spec.mount' annotations and parse them.
if strings.HasPrefix(k, MountPrefix) {
- parts := strings.Split(k, "/")
- if len(parts) != 5 {
+ // Remove the prefix and split the rest.
+ parts := strings.Split(k[len(MountPrefix):], ".")
+ if len(parts) != 2 {
return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
}
- name := parts[3]
- if len(name) == 0 || path.Clean(name) != name {
+ name := parts[0]
+ if len(name) == 0 {
return nil, fmt.Errorf("invalid mount name: %s", name)
}
mnt := mnts[name]
@@ -450,7 +505,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
mnt = &mountHint{name: name}
mnts[name] = mnt
}
- if err := mnt.setField(parts[4], v); err != nil {
+ if err := mnt.setField(parts[1], v); err != nil {
return nil, err
}
}
@@ -520,6 +575,11 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
func (c *containerMounter) processHints(conf *Config) error {
ctx := c.k.SupervisorContext()
for _, hint := range c.hints.mounts {
+ // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+ // common gofer to mount all shared volumes.
+ if hint.mount.Type != tmpfs {
+ continue
+ }
log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
inode, err := c.mountSharedMaster(ctx, conf, hint)
if err != nil {
@@ -655,6 +715,14 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
log.Infof("Mounting root over 9P, ioFD: %d", fd)
p9FS := mustFindFilesystem("9p")
opts := p9MountOptions(fd, conf.FileAccess)
+
+ if conf.OverlayfsStaleRead {
+ // We can't check for overlayfs here because sandbox is chroot'ed and gofer
+ // can only send mount options for specs.Mounts (specs.Root is missing
+ // Options field). So assume root is always on top of overlayfs.
+ opts = append(opts, "overlayfs_stale_read")
+ }
+
rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
if err != nil {
return nil, fmt.Errorf("creating root mount point: %v", err)
@@ -689,7 +757,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
fsName string
opts []string
useOverlay bool
- err error
)
switch m.Type {
@@ -700,14 +767,16 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
case tmpfs:
fsName = m.Type
- // tmpfs has some extra supported options that we must pass through.
- opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+ var err error
+ opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+ if err != nil {
+ return "", nil, false, err
+ }
case bind:
fd := c.fds.remove()
fsName = "9p"
- // Non-root bind mounts are always shared.
- opts = p9MountOptions(fd, FileAccessShared)
+ opts = p9MountOptions(fd, c.getMountAccessType(m))
// If configured, add overlay to all writable mounts.
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
@@ -717,7 +786,15 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
// for now.
log.Warningf("ignoring unknown filesystem type %q", m.Type)
}
- return fsName, opts, useOverlay, err
+ return fsName, opts, useOverlay, nil
+}
+
+func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+ if hint := c.hints.findMount(mount); hint != nil {
+ return hint.fileAccessType()
+ }
+ // Non-root bind mounts are always shared if no hints were provided.
+ return FileAccessShared
}
// mountSubmount mounts volumes inside the container's root. Because mounts may
@@ -779,24 +856,15 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
return fmt.Errorf("mount %q error: %v", m.Destination, err)
}
- log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+ log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
return nil
}
// mountSharedSubmount binds mount to a previously mounted volume that is shared
// among containers in the same pod.
func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
- // For now enforce that all options are the same. Once bind mount is properly
- // supported, then we should ensure the master is less restrictive than the
- // container, e.g. master can be 'rw' while container mounts as 'ro'.
- if len(mount.Options) != len(source.mount.Options) {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
- }
- sort.Strings(mount.Options)
- for i, opt := range mount.Options {
- if opt != source.mount.Options[i] {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
- }
+ if err := source.checkCompatible(mount); err != nil {
+ return err
}
maxTraversals := uint(0)
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 49ab34b33..912037075 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -15,7 +15,6 @@
package boot
import (
- "path"
"reflect"
"strings"
"testing"
@@ -26,19 +25,19 @@ import (
func TestPodMountHintsHappy(t *testing.T) {
spec := &specs.Spec{
Annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
- path.Join(MountPrefix, "mount2", "source"): "bar",
- path.Join(MountPrefix, "mount2", "type"): "bind",
- path.Join(MountPrefix, "mount2", "share"): "container",
- path.Join(MountPrefix, "mount2", "options"): "rw,private",
+ MountPrefix + "mount2.source": "bar",
+ MountPrefix + "mount2.type": "bind",
+ MountPrefix + "mount2.share": "container",
+ MountPrefix + "mount2.options": "rw,private",
},
}
podHints, err := newPodMountHints(spec)
if err != nil {
- t.Errorf("newPodMountHints failed: %v", err)
+ t.Fatalf("newPodMountHints failed: %v", err)
}
// Check that fields were set correctly.
@@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) {
{
name: "too short",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1"): "foo",
+ MountPrefix + "mount1": "foo",
},
error: "invalid mount annotation",
},
{
name: "no name",
annotations: map[string]string{
- MountPrefix + "//source": "foo",
+ MountPrefix + ".source": "foo",
},
error: "invalid mount name",
},
{
name: "missing source",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
},
error: "source field",
},
{
name: "missing type",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.share": "pod",
},
error: "type field",
},
{
name: "missing share",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
},
error: "share field",
},
{
name: "invalid field name",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "invalid"): "foo",
+ MountPrefix + "mount1.invalid": "foo",
},
error: "invalid mount annotation",
},
{
name: "invalid source",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
},
error: "source cannot be empty",
},
{
name: "invalid type",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "invalid-type",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "invalid-type",
+ MountPrefix + "mount1.share": "pod",
},
error: "invalid type",
},
{
name: "invalid share",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "invalid-share",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "invalid-share",
},
error: "invalid share",
},
{
name: "invalid options",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
- path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
+ MountPrefix + "mount1.options": "invalid-option",
},
error: "unknown mount option",
},
{
name: "duplicate source",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
- path.Join(MountPrefix, "mount2", "source"): "foo",
- path.Join(MountPrefix, "mount2", "type"): "bind",
- path.Join(MountPrefix, "mount2", "share"): "container",
+ MountPrefix + "mount2.source": "foo",
+ MountPrefix + "mount2.type": "bind",
+ MountPrefix + "mount2.share": "container",
},
error: "have the same mount source",
},
@@ -191,3 +190,61 @@ func TestPodMountHintsErrors(t *testing.T) {
})
}
}
+
+func TestGetMountAccessType(t *testing.T) {
+ const source = "foo"
+ for _, tst := range []struct {
+ name string
+ annotations map[string]string
+ want FileAccessType
+ }{
+ {
+ name: "container=exclusive",
+ annotations: map[string]string{
+ MountPrefix + "mount1.source": source,
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "container",
+ },
+ want: FileAccessExclusive,
+ },
+ {
+ name: "pod=shared",
+ annotations: map[string]string{
+ MountPrefix + "mount1.source": source,
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "pod",
+ },
+ want: FileAccessShared,
+ },
+ {
+ name: "shared=shared",
+ annotations: map[string]string{
+ MountPrefix + "mount1.source": source,
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "shared",
+ },
+ want: FileAccessShared,
+ },
+ {
+ name: "default=shared",
+ annotations: map[string]string{
+ MountPrefix + "mount1.source": source + "mismatch",
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "container",
+ },
+ want: FileAccessShared,
+ },
+ } {
+ t.Run(tst.name, func(t *testing.T) {
+ spec := &specs.Spec{Annotations: tst.annotations}
+ podHints, err := newPodMountHints(spec)
+ if err != nil {
+ t.Fatalf("newPodMountHints failed: %v", err)
+ }
+ mounter := containerMounter{hints: podHints}
+ if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want {
+ t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got)
+ }
+ })
+ }
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 823a34619..bc1d0c1bb 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -20,7 +20,6 @@ import (
mrand "math/rand"
"os"
"runtime"
- "strings"
"sync"
"sync/atomic"
"syscall"
@@ -44,7 +43,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/sighandling"
- slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
@@ -55,6 +53,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/raw"
"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/runsc/boot/filter"
@@ -62,10 +61,11 @@ import (
"gvisor.dev/gvisor/runsc/specutils"
// Include supported socket providers.
- "gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+ _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netstack"
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
@@ -93,10 +93,6 @@ type Loader struct {
// spec is the base configuration for the root container.
spec *specs.Spec
- // startSignalForwarding enables forwarding of signals to the sandboxed
- // container. It should be called after the init process is loaded.
- startSignalForwarding func() func()
-
// stopSignalForwarding disables forwarding of signals to the sandboxed
// container. It should be called when a sandbox is destroyed.
stopSignalForwarding func()
@@ -146,9 +142,6 @@ type execProcess struct {
func init() {
// Initialize the random number generator.
mrand.Seed(gtime.Now().UnixNano())
-
- // Register the global syscall table.
- kernel.RegisterSyscallTable(slinux.AMD64)
}
// Args are the arguments for New().
@@ -232,7 +225,7 @@ func New(args Args) (*Loader, error) {
// this point. Netns is configured before Run() is called. Netstack is
// configured using a control uRPC message. Host network is configured inside
// Run().
- networkStack, err := newEmptyNetworkStack(args.Conf, k)
+ networkStack, err := newEmptyNetworkStack(args.Conf, k, k)
if err != nil {
return nil, fmt.Errorf("creating network: %v", err)
}
@@ -300,7 +293,9 @@ func New(args Args) (*Loader, error) {
}
// Create a watchdog.
- dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+ dogOpts := watchdog.DefaultOpts
+ dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
+ dog := watchdog.New(k, dogOpts)
procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
if err != nil {
@@ -337,29 +332,6 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
}
- // Handle signals by forwarding them to the root container process
- // (except for panic signal, which should cause a panic).
- l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
- // Panic signal should cause a panic.
- if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
- panic("Signal-induced panic")
- }
-
- // Otherwise forward to root container.
- deliveryMode := DeliverToProcess
- if args.Console {
- // Since we are running with a console, we should
- // forward the signal to the foreground process group
- // so that job control signals like ^C can be handled
- // properly.
- deliveryMode = DeliverToForegroundProcessGroup
- }
- log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
- if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
- log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
- }
- })
-
// Create the control server using the provided FD.
//
// This must be done *after* we have initialized the kernel since the
@@ -535,23 +507,12 @@ func (l *Loader) run() error {
return err
}
- // Read /etc/passwd for the user's HOME directory and set the HOME
- // environment variable as required by POSIX if it is not overridden by
- // the user.
- hasHomeEnvv := false
- for _, envv := range l.rootProcArgs.Envv {
- if strings.HasPrefix(envv, "HOME=") {
- hasHomeEnvv = true
- }
- }
- if !hasHomeEnvv {
- homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
- if err != nil {
- return fmt.Errorf("error reading exec user: %v", err)
- }
-
- l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir)
+ // Add the HOME enviroment variable if it is not already set.
+ envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+ if err != nil {
+ return err
}
+ l.rootProcArgs.Envv = envv
// Create the root container init task. It will begin running
// when the kernel is started.
@@ -578,8 +539,27 @@ func (l *Loader) run() error {
ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
}
- // Start signal forwarding only after an init process is created.
- l.stopSignalForwarding = l.startSignalForwarding()
+ // Handle signals by forwarding them to the root container process
+ // (except for panic signal, which should cause a panic).
+ l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+ // Panic signal should cause a panic.
+ if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+ panic("Signal-induced panic")
+ }
+
+ // Otherwise forward to root container.
+ deliveryMode := DeliverToProcess
+ if l.console {
+ // Since we are running with a console, we should forward the signal to
+ // the foreground process group so that job control signals like ^C can
+ // be handled properly.
+ deliveryMode = DeliverToForegroundProcessGroup
+ }
+ log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+ if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+ log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+ }
+ })
log.Infof("Process should have started...")
l.watchdog.Start()
@@ -815,6 +795,16 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
})
defer args.MountNamespace.DecRef()
+ // Add the HOME enviroment varible if it is not already set.
+ root := args.MountNamespace.Root()
+ defer root.DecRef()
+ ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+ envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+ if err != nil {
+ return 0, err
+ }
+ args.Envv = envv
+
// Start the process.
proc := control.Proc{Kernel: l.k}
args.PIDNamespace = tg.PIDNamespace()
@@ -906,22 +896,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
return l.k.GlobalInit().ExitStatus()
}
-func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
+func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
switch conf.Network {
case NetworkHost:
return hostinet.NewStack(), nil
case NetworkNone, NetworkSandbox:
// NetworkNone sets up loopback using netstack.
- netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
- protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4}
- s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{
- Clock: clock,
- Stats: epsocket.Metrics,
- HandleLocal: true,
+ netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+ transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+ s := netstack.Stack{stack.New(stack.Options{
+ NetworkProtocols: netProtos,
+ TransportProtocols: transProtos,
+ Clock: clock,
+ Stats: netstack.Metrics,
+ HandleLocal: true,
// Enable raw sockets for users with sufficient
// privileges.
- Raw: true,
+ RawFactory: raw.EndpointFactory{},
+ UniqueID: uniqueID,
})}
// Enable SACK Recovery.
@@ -929,6 +922,10 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
return nil, fmt.Errorf("failed to enable SACK: %v", err)
}
+ // Set default TTLs as required by socket/netstack.
+ s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+ s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
// Enable Receive Buffer Auto-Tuning.
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go
new file mode 100644
index 000000000..b9669f2ac
--- /dev/null
+++ b/runsc/boot/loader_amd64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package boot
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+func init() {
+ // Register the global syscall table.
+ kernel.RegisterSyscallTable(linux.AMD64)
+}
diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go
new file mode 100644
index 000000000..cf64d28c8
--- /dev/null
+++ b/runsc/boot/loader_arm64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package boot
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+func init() {
+ // Register the global syscall table.
+ kernel.RegisterSyscallTable(linux.ARM64)
+}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index ea0d9f790..dd4926bb9 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -50,12 +50,13 @@ type DefaultRoute struct {
// FDBasedLink configures an fd-based link.
type FDBasedLink struct {
- Name string
- MTU int
- Addresses []net.IP
- Routes []Route
- GSOMaxSize uint32
- LinkAddress net.HardwareAddr
+ Name string
+ MTU int
+ Addresses []net.IP
+ Routes []Route
+ GSOMaxSize uint32
+ SoftwareGSOEnabled bool
+ LinkAddress net.HardwareAddr
// NumChannels controls how many underlying FD's are to be used to
// create this endpoint.
@@ -79,7 +80,8 @@ type CreateLinksAndRoutesArgs struct {
LoopbackLinks []LoopbackLink
FDBasedLinks []FDBasedLink
- DefaultGateway DefaultRoute
+ Defaultv4Gateway DefaultRoute
+ Defaultv6Gateway DefaultRoute
}
// Empty returns true if route hasn't been set.
@@ -163,6 +165,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
Address: mac,
PacketDispatchMode: fdbased.RecvMMsg,
GSOMaxSize: link.GSOMaxSize,
+ SoftwareGSOEnabled: link.SoftwareGSOEnabled,
RXChecksumOffload: true,
})
if err != nil {
@@ -184,12 +187,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
}
- if !args.DefaultGateway.Route.Empty() {
- nicID, ok := nicids[args.DefaultGateway.Name]
+ if !args.Defaultv4Gateway.Route.Empty() {
+ nicID, ok := nicids[args.Defaultv4Gateway.Name]
if !ok {
- return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+ return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
}
- route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+ route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+ if err != nil {
+ return err
+ }
+ routes = append(routes, route)
+ }
+
+ if !args.Defaultv6Gateway.Route.Empty() {
+ nicID, ok := nicids[args.Defaultv6Gateway.Name]
+ if !ok {
+ return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+ }
+ route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
if err != nil {
return err
}
@@ -203,14 +218,14 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
// createNICWithAddrs creates a NIC in the network stack and adds the given
// addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error {
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
if loopback {
- if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(linkEP)); err != nil {
- return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+ if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
+ return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, ep, err)
}
} else {
- if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil {
- return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+ if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
+ return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err)
}
}
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
index d1d423a5c..56cc12ee0 100644
--- a/runsc/boot/user.go
+++ b/runsc/boot/user.go
@@ -16,6 +16,7 @@ package boot
import (
"bufio"
+ "fmt"
"io"
"strconv"
"strings"
@@ -23,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
@@ -42,7 +44,7 @@ func (r *fileReader) Read(buf []byte) (int, error) {
// getExecUserHome returns the home directory of the executing user read from
// /etc/passwd as read from the container filesystem.
-func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32) (string, error) {
+func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) {
// The default user home directory to return if no user matching the user
// if found in the /etc/passwd found in the image.
const defaultHome = "/"
@@ -82,7 +84,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32
File: f,
}
- homeDir, err := findHomeInPasswd(uid, r, defaultHome)
+ homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
if err != nil {
return "", err
}
@@ -90,6 +92,28 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32
return homeDir, nil
}
+// maybeAddExecUserHome returns a new slice with the HOME enviroment variable
+// set if the slice does not already contain it, otherwise it returns the
+// original slice unmodified.
+func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+ // Check if the envv already contains HOME.
+ for _, env := range envv {
+ if strings.HasPrefix(env, "HOME=") {
+ // We have it. Return the original slice unmodified.
+ return envv, nil
+ }
+ }
+
+ // Read /etc/passwd for the user's HOME directory and set the HOME
+ // environment variable as required by POSIX if it is not overridden by
+ // the user.
+ homeDir, err := getExecUserHome(ctx, mns, uid)
+ if err != nil {
+ return nil, fmt.Errorf("error reading exec user: %v", err)
+ }
+ return append(envv, "HOME="+homeDir), nil
+}
+
// findHomeInPasswd parses a passwd file and returns the given user's home
// directory. This function does it's best to replicate the runc's behavior.
func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) {
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
index 906baf3e5..9aee2ad07 100644
--- a/runsc/boot/user_test.go
+++ b/runsc/boot/user_test.go
@@ -25,6 +25,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
func setupTempDir() (string, error) {
@@ -68,7 +69,7 @@ func setupPasswd(contents string, perms os.FileMode) func() (string, error) {
// TestGetExecUserHome tests the getExecUserHome function.
func TestGetExecUserHome(t *testing.T) {
tests := map[string]struct {
- uid uint32
+ uid auth.KUID
createRoot func() (string, error)
expected string
}{
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index ab3a25b9b..653ca5f52 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -101,6 +101,14 @@ func getValue(path, name string) (string, error) {
return string(out), nil
}
+func getInt(path, name string) (int, error) {
+ s, err := getValue(path, name)
+ if err != nil {
+ return 0, err
+ }
+ return strconv.Atoi(strings.TrimSpace(s))
+}
+
// fillFromAncestor sets the value of a cgroup file from the first ancestor
// that has content. It does nothing if the file in 'path' has already been set.
func fillFromAncestor(path string) (string, error) {
@@ -323,6 +331,22 @@ func (c *Cgroup) Join() (func(), error) {
return undo, nil
}
+func (c *Cgroup) CPUQuota() (float64, error) {
+ path := c.makePath("cpu")
+ quota, err := getInt(path, "cpu.cfs_quota_us")
+ if err != nil {
+ return -1, err
+ }
+ period, err := getInt(path, "cpu.cfs_period_us")
+ if err != nil {
+ return -1, err
+ }
+ if quota <= 0 || period <= 0 {
+ return -1, err
+ }
+ return float64(quota) / float64(period), nil
+}
+
// NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
func (c *Cgroup) NumCPU() (int, error) {
path := c.makePath("cpuset")
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 7313e473f..f37415810 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,16 +32,17 @@ import (
// Debug implements subcommands.Command for the "debug" command.
type Debug struct {
- pid int
- stacks bool
- signal int
- profileHeap string
- profileCPU string
- profileDelay int
- trace string
- strace string
- logLevel string
- logPackets string
+ pid int
+ stacks bool
+ signal int
+ profileHeap string
+ profileCPU string
+ trace string
+ strace string
+ logLevel string
+ logPackets string
+ duration time.Duration
+ ps bool
}
// Name implements subcommands.Command.
@@ -65,12 +66,13 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
- f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile")
+ f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).")
f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.")
+ f.BoolVar(&d.ps, "ps", false, "lists processes")
}
// Execute implements subcommands.Command.Execute.
@@ -163,7 +165,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if err := c.Sandbox.StartCPUProfile(f); err != nil {
return Errorf(err.Error())
}
- log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU)
+ log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU)
}
if d.trace != "" {
delay = true
@@ -181,8 +183,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if err := c.Sandbox.StartTrace(f); err != nil {
return Errorf(err.Error())
}
- log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace)
-
+ log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace)
}
if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 {
@@ -241,9 +242,20 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
log.Infof("Logging options changed")
}
+ if d.ps {
+ pList, err := c.Processes()
+ if err != nil {
+ Fatalf("getting processes for container: %v", err)
+ }
+ o, err := control.ProcessListToJSON(pList)
+ if err != nil {
+ Fatalf("generating JSON: %v", err)
+ }
+ log.Infof(o)
+ }
if delay {
- time.Sleep(time.Duration(d.profileDelay) * time.Second)
+ time.Sleep(d.duration)
}
return subcommands.ExitSuccess
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index e817eff77..d1e99243b 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -105,11 +105,11 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
// Execute implements subcommands.Command.Execute. It starts a process in an
// already created container.
func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
- e, id, err := ex.parseArgs(f)
+ conf := args[0].(*boot.Config)
+ e, id, err := ex.parseArgs(f, conf.EnableRaw)
if err != nil {
Fatalf("parsing process spec: %v", err)
}
- conf := args[0].(*boot.Config)
waitStatus := args[1].(*syscall.WaitStatus)
c, err := container.Load(conf.RootDir, id)
@@ -117,6 +117,9 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
Fatalf("loading sandbox: %v", err)
}
+ log.Debugf("Exec arguments: %+v", e)
+ log.Debugf("Exec capablities: %+v", e.Capabilities)
+
// Replace empty settings with defaults from container.
if e.WorkingDirectory == "" {
e.WorkingDirectory = c.Spec.Process.Cwd
@@ -127,15 +130,13 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
Fatalf("getting environment variables: %v", err)
}
}
+
if e.Capabilities == nil {
- // enableRaw is set to true to prevent the filtering out of
- // CAP_NET_RAW. This is the opposite of Create() because exec
- // requires the capability to be set explicitly, while 'docker
- // run' sets it by default.
- e.Capabilities, err = specutils.Capabilities(true /* enableRaw */, c.Spec.Process.Capabilities)
+ e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities)
if err != nil {
Fatalf("creating capabilities: %v", err)
}
+ log.Infof("Using exec capabilities from container: %+v", e.Capabilities)
}
// containerd expects an actual process to represent the container being
@@ -282,14 +283,14 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
// parseArgs parses exec information from the command line or a JSON file
// depending on whether the --process flag was used. Returns an ExecArgs and
// the ID of the container to be used.
-func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
+func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) {
if ex.processPath == "" {
// Requires at least a container ID and command.
if f.NArg() < 2 {
f.Usage()
return nil, "", fmt.Errorf("both a container-id and command are required")
}
- e, err := ex.argsFromCLI(f.Args()[1:])
+ e, err := ex.argsFromCLI(f.Args()[1:], enableRaw)
return e, f.Arg(0), err
}
// Requires only the container ID.
@@ -297,11 +298,11 @@ func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) {
f.Usage()
return nil, "", fmt.Errorf("a container-id is required")
}
- e, err := ex.argsFromProcessFile()
+ e, err := ex.argsFromProcessFile(enableRaw)
return e, f.Arg(0), err
}
-func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
+func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) {
extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs))
for _, s := range ex.extraKGIDs {
kgid, err := strconv.Atoi(s)
@@ -314,7 +315,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
var caps *auth.TaskCapabilities
if len(ex.caps) > 0 {
var err error
- caps, err = capabilities(ex.caps)
+ caps, err = capabilities(ex.caps, enableRaw)
if err != nil {
return nil, fmt.Errorf("capabilities error: %v", err)
}
@@ -332,7 +333,7 @@ func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) {
}, nil
}
-func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
+func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) {
f, err := os.Open(ex.processPath)
if err != nil {
return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err)
@@ -342,21 +343,21 @@ func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) {
if err := json.NewDecoder(f).Decode(&p); err != nil {
return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err)
}
- return argsFromProcess(&p)
+ return argsFromProcess(&p, enableRaw)
}
// argsFromProcess performs all the non-IO conversion from the Process struct
// to ExecArgs.
-func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) {
+func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) {
// Create capabilities.
var caps *auth.TaskCapabilities
if p.Capabilities != nil {
var err error
- // enableRaw is set to true to prevent the filtering out of
- // CAP_NET_RAW. This is the opposite of Create() because exec
- // requires the capability to be set explicitly, while 'docker
- // run' sets it by default.
- caps, err = specutils.Capabilities(true /* enableRaw */, p.Capabilities)
+ // Starting from Docker 19, capabilities are explicitly set for exec (instead
+ // of nil like before). So we can't distinguish 'exec' from
+ // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+ // CAP_NET_RAW in the same way as container start.
+ caps, err = specutils.Capabilities(enableRaw, p.Capabilities)
if err != nil {
return nil, fmt.Errorf("error creating capabilities: %v", err)
}
@@ -409,7 +410,7 @@ func resolveEnvs(envs ...[]string) ([]string, error) {
// capabilities takes a list of capabilities as strings and returns an
// auth.TaskCapabilities struct with those capabilities in every capability set.
// This mimics runc's behavior.
-func capabilities(cs []string) (*auth.TaskCapabilities, error) {
+func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) {
var specCaps specs.LinuxCapabilities
for _, cap := range cs {
specCaps.Ambient = append(specCaps.Ambient, cap)
@@ -418,11 +419,11 @@ func capabilities(cs []string) (*auth.TaskCapabilities, error) {
specCaps.Inheritable = append(specCaps.Inheritable, cap)
specCaps.Permitted = append(specCaps.Permitted, cap)
}
- // enableRaw is set to true to prevent the filtering out of
- // CAP_NET_RAW. This is the opposite of Create() because exec requires
- // the capability to be set explicitly, while 'docker run' sets it by
- // default.
- return specutils.Capabilities(true /* enableRaw */, &specCaps)
+ // Starting from Docker 19, capabilities are explicitly set for exec (instead
+ // of nil like before). So we can't distinguish 'exec' from
+ // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter
+ // CAP_NET_RAW in the same way as container start.
+ return specutils.Capabilities(enableRaw, &specCaps)
}
// stringSlice allows a flag to be used multiple times, where each occurrence
diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go
index eb38a431f..a1e980d08 100644
--- a/runsc/cmd/exec_test.go
+++ b/runsc/cmd/exec_test.go
@@ -91,7 +91,7 @@ func TestCLIArgs(t *testing.T) {
}
for _, tc := range testCases {
- e, err := tc.ex.argsFromCLI(tc.argv)
+ e, err := tc.ex.argsFromCLI(tc.argv, true)
if err != nil {
t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err)
} else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) {
@@ -144,7 +144,7 @@ func TestJSONArgs(t *testing.T) {
}
for _, tc := range testCases {
- e, err := argsFromProcess(&tc.p)
+ e, err := argsFromProcess(&tc.p, true)
if err != nil {
t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err)
} else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) {
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 9faabf494..4831210c0 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -27,6 +27,7 @@ import (
"flag"
"github.com/google/subcommands"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/unet"
@@ -135,7 +136,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
//
// Note that all mount points have been mounted in the proper location in
// setupRootFS().
- cleanMounts, err := resolveMounts(spec.Mounts, root)
+ cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
if err != nil {
Fatalf("Failure to resolve mounts: %v", err)
}
@@ -182,6 +183,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
cfg := fsgofer.Config{
ROMount: isReadonlyMount(m.Options),
PanicOnWrite: g.panicOnWrite,
+ HostUDS: conf.FSGoferHostUDS,
}
ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
if err != nil {
@@ -200,6 +202,10 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
}
+ if conf.FSGoferHostUDS {
+ filter.InstallUDSFilters()
+ }
+
if err := filter.Install(); err != nil {
Fatalf("installing seccomp filters: %v", err)
}
@@ -375,7 +381,7 @@ func setupMounts(mounts []specs.Mount, root string) error {
// Otherwise, it may follow symlinks to locations that would be overwritten
// with another mount point and return the wrong location. In short, make sure
// setupMounts() has been called before.
-func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) {
+func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
cleanMounts := make([]specs.Mount, 0, len(mounts))
for _, m := range mounts {
if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
@@ -390,8 +396,15 @@ func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) {
if err != nil {
panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
}
+
+ opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
+ if err != nil {
+ return nil, err
+ }
+
cpy := m
cpy.Destination = filepath.Join("/", relDst)
+ cpy.Options = opts
cleanMounts = append(cleanMounts, cpy)
}
return cleanMounts, nil
@@ -418,7 +431,7 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
path := filepath.Join(base, name)
if !strings.HasPrefix(path, root) {
// One cannot '..' their way out of root.
- path = root
+ base = root
continue
}
fi, err := os.Lstat(path)
@@ -448,3 +461,20 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
}
return base, nil
}
+
+// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
+func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+ rv := make([]string, len(opts))
+ copy(rv, opts)
+
+ if conf.OverlayfsStaleRead {
+ statfs := syscall.Statfs_t{}
+ if err := syscall.Statfs(path, &statfs); err != nil {
+ return nil, err
+ }
+ if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
+ rv = append(rv, "overlayfs_stale_read")
+ }
+ }
+ return rv, nil
+}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index bc1fa25e3..2bd12120d 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -7,6 +7,7 @@ go_library(
srcs = [
"container.go",
"hook.go",
+ "state_file.go",
"status.go",
],
importpath = "gvisor.dev/gvisor/runsc/container",
@@ -47,6 +48,7 @@ go_test(
],
deps = [
"//pkg/abi/linux",
+ "//pkg/bits",
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/kernel",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 7d67c3a75..5ed131a7f 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -28,6 +28,7 @@ import (
"github.com/kr/pty"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/testutil"
@@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) {
// Make sure all the processes are running.
expectedPL := []*control.Process{
// Root container process.
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
// Bash from exec process.
- {PID: 2, Cmd: "bash"},
+ {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
@@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for it to start. Sleep's PPID is bash's PID.
- expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"})
+ expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
}
@@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
// Wait for bash to start.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "bash"},
+ {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatal(err)
@@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for sleep to start.
- expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"})
+ expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatal(err)
}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index bbb364214..68782c4be 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -17,13 +17,11 @@ package container
import (
"context"
- "encoding/json"
"fmt"
"io/ioutil"
"os"
"os/exec"
"os/signal"
- "path/filepath"
"regexp"
"strconv"
"strings"
@@ -31,7 +29,6 @@ import (
"time"
"github.com/cenkalti/backoff"
- "github.com/gofrs/flock"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
@@ -41,17 +38,6 @@ import (
"gvisor.dev/gvisor/runsc/specutils"
)
-const (
- // metadataFilename is the name of the metadata file relative to the
- // container root directory that holds sandbox metadata.
- metadataFilename = "meta.json"
-
- // metadataLockFilename is the name of a lock file in the container
- // root directory that is used to prevent concurrent modifications to
- // the container state and metadata.
- metadataLockFilename = "meta.lock"
-)
-
// validateID validates the container id.
func validateID(id string) error {
// See libcontainer/factory_linux.go.
@@ -99,11 +85,6 @@ type Container struct {
// BundleDir is the directory containing the container bundle.
BundleDir string `json:"bundleDir"`
- // Root is the directory containing the container metadata file. If this
- // container is the root container, Root and RootContainerDir will be the
- // same.
- Root string `json:"root"`
-
// CreatedAt is the time the container was created.
CreatedAt time.Time `json:"createdAt"`
@@ -121,21 +102,24 @@ type Container struct {
// be 0 if the gofer has been killed.
GoferPid int `json:"goferPid"`
+ // Sandbox is the sandbox this container is running in. It's set when the
+ // container is created and reset when the sandbox is destroyed.
+ Sandbox *sandbox.Sandbox `json:"sandbox"`
+
+ // Saver handles load from/save to the state file safely from multiple
+ // processes.
+ Saver StateFile `json:"saver"`
+
+ //
+ // Fields below this line are not saved in the state file and will not
+ // be preserved across commands.
+ //
+
// goferIsChild is set if a gofer process is a child of the current process.
//
// This field isn't saved to json, because only a creator of a gofer
// process will have it as a child process.
goferIsChild bool
-
- // Sandbox is the sandbox this container is running in. It's set when the
- // container is created and reset when the sandbox is destroyed.
- Sandbox *sandbox.Sandbox `json:"sandbox"`
-
- // RootContainerDir is the root directory containing the metadata file of the
- // sandbox root container. It's used to lock in order to serialize creating
- // and deleting this Container's metadata directory. If this container is the
- // root container, this is the same as Root.
- RootContainerDir string
}
// loadSandbox loads all containers that belong to the sandbox with the given
@@ -166,43 +150,35 @@ func loadSandbox(rootDir, id string) ([]*Container, error) {
return containers, nil
}
-// Load loads a container with the given id from a metadata file. id may be an
-// abbreviation of the full container id, in which case Load loads the
-// container to which id unambiguously refers to.
-// Returns ErrNotExist if container doesn't exist.
-func Load(rootDir, id string) (*Container, error) {
- log.Debugf("Load container %q %q", rootDir, id)
- if err := validateID(id); err != nil {
+// Load loads a container with the given id from a metadata file. partialID may
+// be an abbreviation of the full container id, in which case Load loads the
+// container to which id unambiguously refers to. Returns ErrNotExist if
+// container doesn't exist.
+func Load(rootDir, partialID string) (*Container, error) {
+ log.Debugf("Load container %q %q", rootDir, partialID)
+ if err := validateID(partialID); err != nil {
return nil, fmt.Errorf("validating id: %v", err)
}
- cRoot, err := findContainerRoot(rootDir, id)
+ id, err := findContainerID(rootDir, partialID)
if err != nil {
// Preserve error so that callers can distinguish 'not found' errors.
return nil, err
}
- // Lock the container metadata to prevent other runsc instances from
- // writing to it while we are reading it.
- unlock, err := lockContainerMetadata(cRoot)
- if err != nil {
- return nil, err
+ state := StateFile{
+ RootDir: rootDir,
+ ID: id,
}
- defer unlock()
+ defer state.close()
- // Read the container metadata file and create a new Container from it.
- metaFile := filepath.Join(cRoot, metadataFilename)
- metaBytes, err := ioutil.ReadFile(metaFile)
- if err != nil {
+ c := &Container{}
+ if err := state.load(c); err != nil {
if os.IsNotExist(err) {
// Preserve error so that callers can distinguish 'not found' errors.
return nil, err
}
- return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err)
- }
- var c Container
- if err := json.Unmarshal(metaBytes, &c); err != nil {
- return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err)
+ return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
}
// If the status is "Running" or "Created", check that the sandbox
@@ -223,57 +199,37 @@ func Load(rootDir, id string) (*Container, error) {
}
}
- return &c, nil
+ return c, nil
}
-func findContainerRoot(rootDir, partialID string) (string, error) {
+func findContainerID(rootDir, partialID string) (string, error) {
// Check whether the id fully specifies an existing container.
- cRoot := filepath.Join(rootDir, partialID)
- if _, err := os.Stat(cRoot); err == nil {
- return cRoot, nil
+ stateFile := buildStatePath(rootDir, partialID)
+ if _, err := os.Stat(stateFile); err == nil {
+ return partialID, nil
}
// Now see whether id could be an abbreviation of exactly 1 of the
// container ids. If id is ambiguous (it could match more than 1
// container), it is an error.
- cRoot = ""
ids, err := List(rootDir)
if err != nil {
return "", err
}
+ rv := ""
for _, id := range ids {
if strings.HasPrefix(id, partialID) {
- if cRoot != "" {
- return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, cRoot, id)
+ if rv != "" {
+ return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
}
- cRoot = id
+ rv = id
}
}
- if cRoot == "" {
+ if rv == "" {
return "", os.ErrNotExist
}
- log.Debugf("abbreviated id %q resolves to full id %q", partialID, cRoot)
- return filepath.Join(rootDir, cRoot), nil
-}
-
-// List returns all container ids in the given root directory.
-func List(rootDir string) ([]string, error) {
- log.Debugf("List containers %q", rootDir)
- fs, err := ioutil.ReadDir(rootDir)
- if err != nil {
- return nil, fmt.Errorf("reading dir %q: %v", rootDir, err)
- }
- var out []string
- for _, f := range fs {
- // Filter out directories that do no belong to a container.
- cid := f.Name()
- if validateID(cid) == nil {
- if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
- out = append(out, f.Name())
- }
- }
- }
- return out, nil
+ log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv)
+ return rv, nil
}
// Args is used to configure a new container.
@@ -316,44 +272,34 @@ func New(conf *boot.Config, args Args) (*Container, error) {
return nil, err
}
- unlockRoot, err := maybeLockRootContainer(args.Spec, conf.RootDir)
- if err != nil {
- return nil, err
+ if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
+ return nil, fmt.Errorf("creating container root directory: %v", err)
}
- defer unlockRoot()
+
+ c := &Container{
+ ID: args.ID,
+ Spec: args.Spec,
+ ConsoleSocket: args.ConsoleSocket,
+ BundleDir: args.BundleDir,
+ Status: Creating,
+ CreatedAt: time.Now(),
+ Owner: os.Getenv("USER"),
+ Saver: StateFile{
+ RootDir: conf.RootDir,
+ ID: args.ID,
+ },
+ }
+ // The Cleanup object cleans up partially created containers when an error
+ // occurs. Any errors occurring during cleanup itself are ignored.
+ cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+ defer cu.Clean()
// Lock the container metadata file to prevent concurrent creations of
// containers with the same id.
- containerRoot := filepath.Join(conf.RootDir, args.ID)
- unlock, err := lockContainerMetadata(containerRoot)
- if err != nil {
+ if err := c.Saver.lockForNew(); err != nil {
return nil, err
}
- defer unlock()
-
- // Check if the container already exists by looking for the metadata
- // file.
- if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil {
- return nil, fmt.Errorf("container with id %q already exists", args.ID)
- } else if !os.IsNotExist(err) {
- return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err)
- }
-
- c := &Container{
- ID: args.ID,
- Spec: args.Spec,
- ConsoleSocket: args.ConsoleSocket,
- BundleDir: args.BundleDir,
- Root: containerRoot,
- Status: Creating,
- CreatedAt: time.Now(),
- Owner: os.Getenv("USER"),
- RootContainerDir: conf.RootDir,
- }
- // The Cleanup object cleans up partially created containers when an error occurs.
- // Any errors occuring during cleanup itself are ignored.
- cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
- defer cu.Clean()
+ defer c.Saver.unlock()
// If the metadata annotations indicate that this container should be
// started in an existing sandbox, we must do so. The metadata will
@@ -431,7 +377,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
c.changeStatus(Created)
// Save the metadata file.
- if err := c.save(); err != nil {
+ if err := c.saveLocked(); err != nil {
return nil, err
}
@@ -451,17 +397,12 @@ func New(conf *boot.Config, args Args) (*Container, error) {
func (c *Container) Start(conf *boot.Config) error {
log.Debugf("Start container %q", c.ID)
- unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
- if err != nil {
+ if err := c.Saver.lock(); err != nil {
return err
}
- defer unlockRoot()
+ unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+ defer unlock.Clean()
- unlock, err := c.lock()
- if err != nil {
- return err
- }
- defer unlock()
if err := c.requireStatus("start", Created); err != nil {
return err
}
@@ -509,24 +450,31 @@ func (c *Container) Start(conf *boot.Config) error {
}
c.changeStatus(Running)
- if err := c.save(); err != nil {
+ if err := c.saveLocked(); err != nil {
return err
}
- // Adjust the oom_score_adj for sandbox and gofers. This must be done after
- // save().
- return c.adjustOOMScoreAdj(conf)
+ // Release lock before adjusting OOM score because the lock is acquired there.
+ unlock.Clean()
+
+ // Adjust the oom_score_adj for sandbox. This must be done after saveLocked().
+ if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil {
+ return err
+ }
+
+ // Set container's oom_score_adj to the gofer since it is dedicated to
+ // the container, in case the gofer uses up too much memory.
+ return c.adjustGoferOOMScoreAdj()
}
// Restore takes a container and replaces its kernel and file system
// to restore a container from its state file.
func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
log.Debugf("Restore container %q", c.ID)
- unlock, err := c.lock()
- if err != nil {
+ if err := c.Saver.lock(); err != nil {
return err
}
- defer unlock()
+ defer c.Saver.unlock()
if err := c.requireStatus("restore", Created); err != nil {
return err
@@ -544,7 +492,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
return err
}
c.changeStatus(Running)
- return c.save()
+ return c.saveLocked()
}
// Run is a helper that calls Create + Start + Wait.
@@ -704,11 +652,10 @@ func (c *Container) Checkpoint(f *os.File) error {
// The call only succeeds if the container's status is created or running.
func (c *Container) Pause() error {
log.Debugf("Pausing container %q", c.ID)
- unlock, err := c.lock()
- if err != nil {
+ if err := c.Saver.lock(); err != nil {
return err
}
- defer unlock()
+ defer c.Saver.unlock()
if c.Status != Created && c.Status != Running {
return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
@@ -718,18 +665,17 @@ func (c *Container) Pause() error {
return fmt.Errorf("pausing container: %v", err)
}
c.changeStatus(Paused)
- return c.save()
+ return c.saveLocked()
}
// Resume unpauses the container and its kernel.
// The call only succeeds if the container's status is paused.
func (c *Container) Resume() error {
log.Debugf("Resuming container %q", c.ID)
- unlock, err := c.lock()
- if err != nil {
+ if err := c.Saver.lock(); err != nil {
return err
}
- defer unlock()
+ defer c.Saver.unlock()
if c.Status != Paused {
return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
@@ -738,7 +684,7 @@ func (c *Container) Resume() error {
return fmt.Errorf("resuming container: %v", err)
}
c.changeStatus(Running)
- return c.save()
+ return c.saveLocked()
}
// State returns the metadata of the container.
@@ -766,6 +712,17 @@ func (c *Container) Processes() ([]*control.Process, error) {
func (c *Container) Destroy() error {
log.Debugf("Destroy container %q", c.ID)
+ if err := c.Saver.lock(); err != nil {
+ return err
+ }
+ defer func() {
+ c.Saver.unlock()
+ c.Saver.close()
+ }()
+
+ // Stored for later use as stop() sets c.Sandbox to nil.
+ sb := c.Sandbox
+
// We must perform the following cleanup steps:
// * stop the container and gofer processes,
// * remove the container filesystem on the host, and
@@ -775,35 +732,43 @@ func (c *Container) Destroy() error {
// do our best to perform all of the cleanups. Hence, we keep a slice
// of errors return their concatenation.
var errs []string
-
- unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
- if err != nil {
- return err
- }
- defer unlock()
-
if err := c.stop(); err != nil {
err = fmt.Errorf("stopping container: %v", err)
log.Warningf("%v", err)
errs = append(errs, err.Error())
}
- if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) {
- err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err)
+ if err := c.Saver.destroy(); err != nil {
+ err = fmt.Errorf("deleting container state files: %v", err)
log.Warningf("%v", err)
errs = append(errs, err.Error())
}
c.changeStatus(Stopped)
+ // Adjust oom_score_adj for the sandbox. This must be done after the container
+ // is stopped and the directory at c.Root is removed. Adjustment can be
+ // skipped if the root container is exiting, because it brings down the entire
+ // sandbox.
+ //
+ // Use 'sb' to tell whether it has been executed before because Destroy must
+ // be idempotent.
+ if sb != nil && !isRoot(c.Spec) {
+ if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil {
+ errs = append(errs, err.Error())
+ }
+ }
+
// "If any poststop hook fails, the runtime MUST log a warning, but the
- // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
- // Based on the OCI, "The post-stop hooks MUST be called after the container is
- // deleted but before the delete operation returns"
+ // remaining hooks and lifecycle continue as if the hook had
+ // succeeded" - OCI spec.
+ //
+ // Based on the OCI, "The post-stop hooks MUST be called after the container
+ // is deleted but before the delete operation returns"
// Run it here to:
// 1) Conform to the OCI.
- // 2) Make sure it only runs once, because the root has been deleted, the container
- // can't be loaded again.
+ // 2) Make sure it only runs once, because the root has been deleted, the
+ // container can't be loaded again.
if c.Spec.Hooks != nil {
executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
}
@@ -814,18 +779,13 @@ func (c *Container) Destroy() error {
return fmt.Errorf(strings.Join(errs, "\n"))
}
-// save saves the container metadata to a file.
+// saveLocked saves the container metadata to a file.
//
// Precondition: container must be locked with container.lock().
-func (c *Container) save() error {
+func (c *Container) saveLocked() error {
log.Debugf("Save container %q", c.ID)
- metaFile := filepath.Join(c.Root, metadataFilename)
- meta, err := json.Marshal(c)
- if err != nil {
- return fmt.Errorf("invalid container metadata: %v", err)
- }
- if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil {
- return fmt.Errorf("writing container metadata: %v", err)
+ if err := c.Saver.saveLocked(c); err != nil {
+ return fmt.Errorf("saving container metadata: %v", err)
}
return nil
}
@@ -926,7 +886,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
}
if conf.DebugLog != "" {
- debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer")
+ test := ""
+ if len(conf.TestOnlyTestNameEnv) != 0 {
+ // Fetch test name if one is provided and the test only flag was set.
+ if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+ test = t
+ }
+ }
+ debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer", test)
if err != nil {
return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
}
@@ -1079,50 +1046,8 @@ func (c *Container) requireStatus(action string, statuses ...Status) error {
return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
}
-// lock takes a file lock on the container metadata lock file.
-func (c *Container) lock() (func() error, error) {
- return lockContainerMetadata(filepath.Join(c.Root, c.ID))
-}
-
-// lockContainerMetadata takes a file lock on the metadata lock file in the
-// given container root directory.
-func lockContainerMetadata(containerRootDir string) (func() error, error) {
- if err := os.MkdirAll(containerRootDir, 0711); err != nil {
- return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err)
- }
- f := filepath.Join(containerRootDir, metadataLockFilename)
- l := flock.NewFlock(f)
- if err := l.Lock(); err != nil {
- return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", f, err)
- }
- return l.Unlock, nil
-}
-
-// maybeLockRootContainer locks the sandbox root container. It is used to
-// prevent races to create and delete child container sandboxes.
-func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) {
- if isRoot(spec) {
- return func() error { return nil }, nil
- }
-
- sbid, ok := specutils.SandboxID(spec)
- if !ok {
- return nil, fmt.Errorf("no sandbox ID found when locking root container")
- }
- sb, err := Load(rootDir, sbid)
- if err != nil {
- return nil, err
- }
-
- unlock, err := sb.lock()
- if err != nil {
- return nil, err
- }
- return unlock, nil
-}
-
func isRoot(spec *specs.Spec) bool {
- return specutils.ShouldCreateSandbox(spec)
+ return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
}
// runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
@@ -1139,33 +1064,85 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
return fn()
}
-// adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers.
+// adjustGoferOOMScoreAdj sets the oom_store_adj for the container's gofer.
+func (c *Container) adjustGoferOOMScoreAdj() error {
+ if c.GoferPid != 0 && c.Spec.Process.OOMScoreAdj != nil {
+ if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
+ // Ignore NotExist error because it can be returned when the sandbox
+ // exited while OOM score was being adjusted.
+ if !os.IsNotExist(err) {
+ return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+ }
+ log.Warningf("Gofer process (%d) not found setting oom_score_adj", c.GoferPid)
+ }
+ }
+
+ return nil
+}
+
+// adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
// oom_score_adj is set to the lowest oom_score_adj among the containers
// running in the sandbox.
//
// TODO(gvisor.dev/issue/512): This call could race with other containers being
// created at the same time and end up setting the wrong oom_score_adj to the
// sandbox.
-func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
- // If this container's OOMScoreAdj is nil then we can exit early as no
- // change should be made to oom_score_adj for the sandbox.
- if c.Spec.Process.OOMScoreAdj == nil {
- return nil
- }
-
- containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID)
+func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error {
+ containers, err := loadSandbox(rootDir, s.ID)
if err != nil {
return fmt.Errorf("loading sandbox containers: %v", err)
}
+ // Do nothing if the sandbox has been terminated.
+ if len(containers) == 0 {
+ return nil
+ }
+
// Get the lowest score for all containers.
var lowScore int
scoreFound := false
- for _, container := range containers {
- if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+ if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified {
+ // This is a single-container sandbox. Set the oom_score_adj to
+ // the value specified in the OCI bundle.
+ if containers[0].Spec.Process.OOMScoreAdj != nil {
scoreFound = true
- lowScore = *container.Spec.Process.OOMScoreAdj
+ lowScore = *containers[0].Spec.Process.OOMScoreAdj
}
+ } else {
+ for _, container := range containers {
+ // Special multi-container support for CRI. Ignore the root
+ // container when calculating oom_score_adj for the sandbox because
+ // it is the infrastructure (pause) container and always has a very
+ // low oom_score_adj.
+ //
+ // We will use OOMScoreAdj in the single-container case where the
+ // containerd container-type annotation is not present.
+ if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
+ continue
+ }
+
+ if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+ scoreFound = true
+ lowScore = *container.Spec.Process.OOMScoreAdj
+ }
+ }
+ }
+
+ // If the container is destroyed and remaining containers have no
+ // oomScoreAdj specified then we must revert to the oom_score_adj of the
+ // parent process.
+ if !scoreFound && destroy {
+ ppid, err := specutils.GetParentPid(s.Pid)
+ if err != nil {
+ return fmt.Errorf("getting parent pid of sandbox pid %d: %v", s.Pid, err)
+ }
+ pScore, err := specutils.GetOOMScoreAdj(ppid)
+ if err != nil {
+ return fmt.Errorf("getting oom_score_adj of parent %d: %v", ppid, err)
+ }
+
+ scoreFound = true
+ lowScore = pScore
}
// Only set oom_score_adj if one of the containers has oom_score_adj set
@@ -1177,15 +1154,15 @@ func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
}
// Set the lowest of all containers oom_score_adj to the sandbox.
- if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil {
- return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err)
+ if err := setOOMScoreAdj(s.Pid, lowScore); err != nil {
+ // Ignore NotExist error because it can be returned when the sandbox
+ // exited while OOM score was being adjusted.
+ if !os.IsNotExist(err) {
+ return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err)
+ }
+ log.Warningf("Sandbox process (%d) not found setting oom_score_adj", s.Pid)
}
- // Set container's oom_score_adj to the gofer since it is dedicated to the
- // container, in case the gofer uses up too much memory.
- if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
- return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
- }
return nil
}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 2ac12e5b6..c10f85992 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -34,8 +34,10 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -51,8 +53,9 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
err = fmt.Errorf("error getting process data from container: %v", err)
return &backoff.PermanentError{Err: err}
}
- if !procListsEqual(got, want) {
- return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
+ if r, err := procListsEqual(got, want); !r {
+ return fmt.Errorf("container got process list: %s, want: %s: error: %v",
+ procListToString(got), procListToString(want), err)
}
return nil
}
@@ -90,22 +93,34 @@ func blockUntilWaitable(pid int) error {
// procListsEqual is used to check whether 2 Process lists are equal for all
// implemented fields.
-func procListsEqual(got, want []*control.Process) bool {
+func procListsEqual(got, want []*control.Process) (bool, error) {
if len(got) != len(want) {
- return false
+ return false, nil
}
for i := range got {
pd1 := got[i]
pd2 := want[i]
- // Zero out unimplemented and timing dependant fields.
+ // Zero out timing dependant fields.
pd1.Time = ""
pd1.STime = ""
pd1.C = 0
- if *pd1 != *pd2 {
- return false
+ // Ignore TTY field too, since it's not relevant in the cases
+ // where we use this method. Tests that care about the TTY
+ // field should check for it themselves.
+ pd1.TTY = ""
+ pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
+ if err != nil {
+ return false, err
+ }
+ pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
+ if err != nil {
+ return false, err
+ }
+ if pd1Json != pd2Json {
+ return false, nil
}
}
- return true
+ return true, nil
}
// getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
@@ -115,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error {
if err != nil {
return fmt.Errorf("error getting process data from container: %v", err)
}
- if procListsEqual(got, want) {
+ equal, err := procListsEqual(got, want)
+ if err != nil {
+ return err
+ }
+ if equal {
return nil
}
return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
@@ -287,11 +306,12 @@ func TestLifecycle(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
}
// Create the container.
@@ -589,18 +609,20 @@ func TestExec(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
{
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: uid,
+ PID: 2,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{2},
},
}
@@ -1061,18 +1083,20 @@ func TestPauseResume(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
{
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "bash",
+ UID: uid,
+ PID: 2,
+ PPID: 0,
+ C: 0,
+ Cmd: "bash",
+ Threads: []kernel.ThreadID{2},
},
}
@@ -1125,11 +1149,12 @@ func TestPauseResume(t *testing.T) {
expectedPL2 := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
}
@@ -1240,18 +1265,20 @@ func TestCapabilities(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
{
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "exe",
+ UID: uid,
+ PID: 2,
+ PPID: 0,
+ C: 0,
+ Cmd: "exe",
+ Threads: []kernel.ThreadID{2},
},
}
if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
@@ -1547,7 +1574,8 @@ func TestAbbreviatedIDs(t *testing.T) {
}
defer os.RemoveAll(rootDir)
- conf := testutil.TestConfigWithRoot(rootDir)
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
cids := []string{
"foo-" + testutil.UniqueContainerID(),
@@ -2049,6 +2077,156 @@ func TestMountSymlink(t *testing.T) {
}
}
+// Check that --net-raw disables the CAP_NET_RAW capability.
+func TestNetRaw(t *testing.T) {
+ capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10)
+ app, err := testutil.FindFile("runsc/container/test_app/test_app")
+ if err != nil {
+ t.Fatal("error finding test_app:", err)
+ }
+
+ for _, enableRaw := range []bool{true, false} {
+ conf := testutil.TestConfig()
+ conf.EnableRaw = enableRaw
+
+ test := "--enabled"
+ if !enableRaw {
+ test = "--disabled"
+ }
+
+ spec := testutil.NewSpecWithArgs(app, "capability", test, capNetRaw)
+ if err := run(spec, conf); err != nil {
+ t.Fatalf("Error running container: %v", err)
+ }
+ }
+}
+
+// TestOverlayfsStaleRead most basic test that '--overlayfs-stale-read' works.
+func TestOverlayfsStaleRead(t *testing.T) {
+ conf := testutil.TestConfig()
+ conf.OverlayfsStaleRead = true
+
+ in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
+ if err != nil {
+ t.Fatalf("ioutil.TempFile() failed: %v", err)
+ }
+ defer in.Close()
+ if _, err := in.WriteString("stale data"); err != nil {
+ t.Fatalf("in.Write() failed: %v", err)
+ }
+
+ out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out")
+ if err != nil {
+ t.Fatalf("ioutil.TempFile() failed: %v", err)
+ }
+ defer out.Close()
+
+ const want = "foobar"
+ cmd := fmt.Sprintf("cat %q && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
+ spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd)
+ if err := run(spec, conf); err != nil {
+ t.Fatalf("Error running container: %v", err)
+ }
+
+ gotBytes, err := ioutil.ReadAll(out)
+ if err != nil {
+ t.Fatalf("out.Read() failed: %v", err)
+ }
+ got := strings.TrimSpace(string(gotBytes))
+ if want != got {
+ t.Errorf("Wrong content in out file, got: %q. want: %q", got, want)
+ }
+}
+
+// TestTTYField checks TTY field returned by container.Processes().
+func TestTTYField(t *testing.T) {
+ stop := testutil.StartReaper()
+ defer stop()
+
+ testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+ if err != nil {
+ t.Fatal("error finding test_app:", err)
+ }
+
+ testCases := []struct {
+ name string
+ useTTY bool
+ wantTTYField string
+ }{
+ {
+ name: "no tty",
+ useTTY: false,
+ wantTTYField: "?",
+ },
+ {
+ name: "tty used",
+ useTTY: true,
+ wantTTYField: "pts/0",
+ },
+ }
+
+ for _, test := range testCases {
+ t.Run(test.name, func(t *testing.T) {
+ conf := testutil.TestConfig()
+
+ // We will run /bin/sleep, possibly with an open TTY.
+ cmd := []string{"/bin/sleep", "10000"}
+ if test.useTTY {
+ // Run inside the "pty-runner".
+ cmd = append([]string{testApp, "pty-runner"}, cmd...)
+ }
+
+ spec := testutil.NewSpecWithArgs(cmd...)
+ rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ defer os.RemoveAll(bundleDir)
+
+ // Create and start the container.
+ args := Args{
+ ID: testutil.UniqueContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+ c, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer c.Destroy()
+ if err := c.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Wait for sleep to be running, and check the TTY
+ // field.
+ var gotTTYField string
+ cb := func() error {
+ ps, err := c.Processes()
+ if err != nil {
+ err = fmt.Errorf("error getting process data from container: %v", err)
+ return &backoff.PermanentError{Err: err}
+ }
+ for _, p := range ps {
+ if strings.Contains(p.Cmd, "sleep") {
+ gotTTYField = p.TTY
+ return nil
+ }
+ }
+ return fmt.Errorf("sleep not running")
+ }
+ if err := testutil.Poll(cb, 30*time.Second); err != nil {
+ t.Fatalf("error waiting for sleep process: %v", err)
+ }
+
+ if gotTTYField != test.wantTTYField {
+ t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
+ }
+ })
+ }
+}
+
// executeSync synchronously executes a new process.
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index bd45a5118..4ad09ceab 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -60,13 +60,8 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
}
func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
- // Setup root dir if one hasn't been provided.
if len(conf.RootDir) == 0 {
- rootDir, err := testutil.SetupRootDir()
- if err != nil {
- return nil, nil, fmt.Errorf("error creating root dir: %v", err)
- }
- conf.RootDir = rootDir
+ panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
}
var containers []*Container
@@ -78,7 +73,6 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
for _, b := range bundles {
os.RemoveAll(b)
}
- os.RemoveAll(conf.RootDir)
}
for i, spec := range specs {
bundleDir, err := testutil.SetupBundleDir(spec)
@@ -129,11 +123,11 @@ func execMany(execs []execDesc) error {
func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
for _, spec := range pod {
- spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
- spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
- spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+ spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source
+ spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type
+ spec.Annotations[boot.MountPrefix+name+".share"] = "pod"
if len(mount.Options) > 0 {
- spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+ spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",")
}
}
}
@@ -144,6 +138,13 @@ func TestMultiContainerSanity(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// Setup the containers.
sleep := []string{"sleep", "100"}
specs, ids := createSpecs(sleep, sleep)
@@ -155,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
}
expectedPL = []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -175,6 +176,13 @@ func TestMultiPIDNS(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// Setup the containers.
sleep := []string{"sleep", "100"}
testSpecs, ids := createSpecs(sleep, sleep)
@@ -194,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
}
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -213,6 +221,13 @@ func TestMultiPIDNSPath(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// Setup the containers.
sleep := []string{"sleep", "100"}
testSpecs, ids := createSpecs(sleep, sleep, sleep)
@@ -249,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -259,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) {
}
expectedPL = []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -268,13 +283,21 @@ func TestMultiPIDNSPath(t *testing.T) {
}
func TestMultiContainerWait(t *testing.T) {
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
// The first container should run the entire duration of the test.
cmd1 := []string{"sleep", "100"}
// We'll wait on the second container, which is much shorter lived.
cmd2 := []string{"sleep", "1"}
specs, ids := createSpecs(cmd1, cmd2)
- conf := testutil.TestConfig()
containers, cleanup, err := startContainers(conf, specs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -283,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -328,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) {
// After Wait returns, ensure that the root container is running and
// the child has finished.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -344,12 +367,14 @@ func TestExecWait(t *testing.T) {
}
defer os.RemoveAll(rootDir)
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
// The first container should run the entire duration of the test.
cmd1 := []string{"sleep", "100"}
// We'll wait on the second container, which is much shorter lived.
cmd2 := []string{"sleep", "1"}
specs, ids := createSpecs(cmd1, cmd2)
- conf := testutil.TestConfig()
containers, cleanup, err := startContainers(conf, specs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -358,7 +383,7 @@ func TestExecWait(t *testing.T) {
// Check via ps that process is running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -393,7 +418,7 @@ func TestExecWait(t *testing.T) {
// Wait for the exec'd process to exit.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -432,7 +457,15 @@ func TestMultiContainerMount(t *testing.T) {
})
// Setup the containers.
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
containers, cleanup, err := startContainers(conf, sps, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -454,6 +487,13 @@ func TestMultiContainerSignal(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// Setup the containers.
sleep := []string{"sleep", "100"}
specs, ids := createSpecs(sleep, sleep)
@@ -465,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) {
// Check via ps that container 1 process is running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
@@ -479,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) {
// Make sure process 1 is still running.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -548,6 +588,13 @@ func TestMultiContainerDestroy(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// First container will remain intact while the second container is killed.
podSpecs, ids := createSpecs(
[]string{"sleep", "100"},
@@ -586,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) {
if err != nil {
t.Fatalf("error getting process data from sandbox: %v", err)
}
- expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}}
- if !procListsEqual(pss, expectedPL) {
- t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
+ expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+ if r, err := procListsEqual(pss, expectedPL); !r {
+ t.Errorf("container got process list: %s, want: %s: error: %v",
+ procListToString(pss), procListToString(expectedPL), err)
}
// Check that cont.Destroy is safe to call multiple times.
@@ -599,13 +647,21 @@ func TestMultiContainerDestroy(t *testing.T) {
}
func TestMultiContainerProcesses(t *testing.T) {
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
// Note: use curly braces to keep 'sh' process around. Otherwise, shell
// will just execve into 'sleep' and both containers will look the
// same.
specs, ids := createSpecs(
[]string{"sleep", "100"},
[]string{"sh", "-c", "{ sleep 100; }"})
- conf := testutil.TestConfig()
containers, cleanup, err := startContainers(conf, specs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -614,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) {
// Check root's container process list doesn't include other containers.
expectedPL0 := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL0); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
@@ -622,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) {
// Same for the other container.
expectedPL1 := []*control.Process{
- {PID: 2, Cmd: "sh"},
- {PID: 3, PPID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
+ {PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
}
if err := waitForProcessList(containers[1], expectedPL1); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
@@ -637,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) {
if _, err := containers[1].Execute(args); err != nil {
t.Fatalf("error exec'ing: %v", err)
}
- expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"})
+ expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
if err := waitForProcessList(containers[1], expectedPL1); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
}
@@ -650,6 +706,15 @@ func TestMultiContainerProcesses(t *testing.T) {
// TestMultiContainerKillAll checks that all process that belong to a container
// are killed when SIGKILL is sent to *all* processes in that container.
func TestMultiContainerKillAll(t *testing.T) {
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
for _, tc := range []struct {
killContainer bool
}{
@@ -665,7 +730,6 @@ func TestMultiContainerKillAll(t *testing.T) {
specs, ids := createSpecs(
[]string{app, "task-tree", "--depth=2", "--width=2"},
[]string{app, "task-tree", "--depth=4", "--width=2"})
- conf := testutil.TestConfig()
containers, cleanup, err := startContainers(conf, specs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -739,19 +803,13 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
specs, ids := createSpecs(
[]string{"/bin/sleep", "100"},
[]string{"/bin/sleep", "100"})
- rootDir, err := testutil.SetupRootDir()
- if err != nil {
- t.Fatalf("error creating root dir: %v", err)
- }
- defer os.RemoveAll(rootDir)
- conf := testutil.TestConfigWithRoot(rootDir)
-
- // Create and start root container.
- rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+ conf := testutil.TestConfig()
+ rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
if err != nil {
t.Fatalf("error setting up container: %v", err)
}
+ defer os.RemoveAll(rootDir)
defer os.RemoveAll(rootBundleDir)
rootArgs := Args{
@@ -800,19 +858,12 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
}
specs, ids := createSpecs(cmds...)
- rootDir, err := testutil.SetupRootDir()
- if err != nil {
- t.Fatalf("error creating root dir: %v", err)
- }
- defer os.RemoveAll(rootDir)
-
- conf := testutil.TestConfigWithRoot(rootDir)
-
- // Create and start root container.
- rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+ conf := testutil.TestConfig()
+ rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
if err != nil {
t.Fatalf("error setting up container: %v", err)
}
+ defer os.RemoveAll(rootDir)
defer os.RemoveAll(rootBundleDir)
rootArgs := Args{
@@ -886,9 +937,17 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
cmd := []string{"sh", "-c", script}
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
// Make sure overlay is enabled, and none of the root filesystems are
// read-only, otherwise we won't be able to create the file.
- conf := testutil.TestConfig()
conf.Overlay = true
specs, ids := createSpecs(cmdRoot, cmd, cmd)
for _, s := range specs {
@@ -941,26 +1000,21 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
}
allSpecs, allIDs := createSpecs(cmds...)
- rootDir, err := testutil.SetupRootDir()
- if err != nil {
- t.Fatalf("error creating root dir: %v", err)
- }
- defer os.RemoveAll(rootDir)
-
// Split up the specs and IDs.
rootSpec := allSpecs[0]
rootID := allIDs[0]
childrenSpecs := allSpecs[1:]
childrenIDs := allIDs[1:]
- bundleDir, err := testutil.SetupBundleDir(rootSpec)
+ conf := testutil.TestConfig()
+ rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
if err != nil {
- t.Fatalf("error setting up bundle dir: %v", err)
+ t.Fatalf("error setting up container: %v", err)
}
+ defer os.RemoveAll(rootDir)
defer os.RemoveAll(bundleDir)
// Start root container.
- conf := testutil.TestConfigWithRoot(rootDir)
rootArgs := Args{
ID: rootID,
Spec: rootSpec,
@@ -1029,6 +1083,13 @@ func TestMultiContainerSharedMount(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// Setup the containers.
sleep := []string{"sleep", "100"}
podSpec, ids := createSpecs(sleep, sleep)
@@ -1137,6 +1198,13 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// Setup the containers.
sleep := []string{"sleep", "100"}
podSpec, ids := createSpecs(sleep, sleep)
@@ -1197,6 +1265,13 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ conf.RootDir = rootDir
+
// Setup the containers.
sleep := []string{"sleep", "100"}
podSpec, ids := createSpecs(sleep, sleep)
@@ -1297,6 +1372,59 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
}
}
+// Test that unsupported pod mounts options are ignored when matching master and
+// slave mounts.
+func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
+ // Setup the containers.
+ sleep := []string{"/bin/sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: []string{"rw", "rbind", "relatime"},
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ mnt1.Options = []string{"rw", "nosuid"}
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+ createSharedMount(mnt0, "test-mount", podSpec...)
+
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
+ desc: "directory is mounted in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
+ desc: "directory is mounted in container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+}
+
// Test that one container can send an FD to another container, even though
// they have distinct MountNamespaces.
func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
@@ -1329,6 +1457,15 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
Type: "tmpfs",
}
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
// Create the specs.
specs, ids := createSpecs(
[]string{"sleep", "1000"},
@@ -1339,7 +1476,6 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
specs[1].Mounts = append(specs[2].Mounts, sharedMnt, writeableMnt)
specs[2].Mounts = append(specs[1].Mounts, sharedMnt)
- conf := testutil.TestConfig()
containers, cleanup, err := startContainers(conf, specs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -1358,9 +1494,17 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
// Test that container is destroyed when Gofer is killed.
func TestMultiContainerGoferKilled(t *testing.T) {
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
+ conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
sleep := []string{"sleep", "100"}
specs, ids := createSpecs(sleep, sleep, sleep)
- conf := testutil.TestConfig()
containers, cleanup, err := startContainers(conf, specs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -1370,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
// Ensure container is running
c := containers[2]
expectedPL := []*control.Process{
- {PID: 3, Cmd: "sleep"},
+ {PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1398,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
continue // container[2] has been killed.
}
pl := []*control.Process{
- {PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+ {PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
}
if err := waitForProcessList(c, pl); err != nil {
t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1418,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
// and RPC errors out.
impossiblePL := []*control.Process{
- {PID: 100, Cmd: "non-existent-process"},
+ {PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
}
if err := waitForProcessList(c, impossiblePL); err == nil {
t.Fatalf("Sandbox was not killed after gofer death")
@@ -1436,7 +1580,15 @@ func TestMultiContainerGoferKilled(t *testing.T) {
func TestMultiContainerLoadSandbox(t *testing.T) {
sleep := []string{"sleep", "100"}
specs, ids := createSpecs(sleep, sleep, sleep)
+
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
conf := testutil.TestConfig()
+ conf.RootDir = rootDir
// Create containers for the sandbox.
wants, cleanup, err := startContainers(conf, specs, ids)
@@ -1529,7 +1681,15 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
Type: "bind",
})
+ rootDir, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+
conf := testutil.TestConfig()
+ conf.RootDir = rootDir
+
pod, cleanup, err := startContainers(conf, podSpecs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
new file mode 100644
index 000000000..d95151ea5
--- /dev/null
+++ b/runsc/container/state_file.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "sync"
+
+ "github.com/gofrs/flock"
+ "gvisor.dev/gvisor/pkg/log"
+)
+
+const stateFileExtension = ".state"
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
+// The state file is located at: "${s.RootDir}/${s.ID}.state".
+type StateFile struct {
+ // RootDir is the directory containing the container metadata file.
+ RootDir string `json:"rootDir"`
+
+ // ID is the container ID.
+ ID string `json:"id"`
+
+ //
+ // Fields below this line are not saved in the state file and will not
+ // be preserved across commands.
+ //
+
+ once sync.Once
+ flock *flock.Flock
+}
+
+// List returns all container ids in the given root directory.
+func List(rootDir string) ([]string, error) {
+ log.Debugf("List containers %q", rootDir)
+ list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension))
+ if err != nil {
+ return nil, err
+ }
+ var out []string
+ for _, path := range list {
+ // Filter out files that do no belong to a container.
+ fileName := filepath.Base(path)
+ if len(fileName) < len(stateFileExtension) {
+ panic(fmt.Sprintf("invalid file match %q", path))
+ }
+ // Remove the extension.
+ cid := fileName[:len(fileName)-len(stateFileExtension)]
+ if validateID(cid) == nil {
+ out = append(out, cid)
+ }
+ }
+ return out, nil
+}
+
+// lock globally locks all locking operations for the container.
+func (s *StateFile) lock() error {
+ s.once.Do(func() {
+ s.flock = flock.NewFlock(s.lockPath())
+ })
+
+ if err := s.flock.Lock(); err != nil {
+ return fmt.Errorf("acquiring lock on %q: %v", s.flock, err)
+ }
+ return nil
+}
+
+// lockForNew acquires the lock and checks if the state file doesn't exist. This
+// is done to ensure that more than one creation didn't race to create
+// containers with the same ID.
+func (s *StateFile) lockForNew() error {
+ if err := s.lock(); err != nil {
+ return err
+ }
+
+ // Checks if the container already exists by looking for the metadata file.
+ if _, err := os.Stat(s.statePath()); err == nil {
+ s.unlock()
+ return fmt.Errorf("container already exists")
+ } else if !os.IsNotExist(err) {
+ s.unlock()
+ return fmt.Errorf("looking for existing container: %v", err)
+ }
+ return nil
+}
+
+// unlock globally unlocks all locking operations for the container.
+func (s *StateFile) unlock() error {
+ if !s.flock.Locked() {
+ panic("unlock called without lock held")
+ }
+
+ if err := s.flock.Unlock(); err != nil {
+ log.Warningf("Error to release lock on %q: %v", s.flock, err)
+ return fmt.Errorf("releasing lock on %q: %v", s.flock, err)
+ }
+ return nil
+}
+
+// saveLocked saves 'v' to the state file.
+//
+// Preconditions: lock() must been called before.
+func (s *StateFile) saveLocked(v interface{}) error {
+ if !s.flock.Locked() {
+ panic("saveLocked called without lock held")
+ }
+
+ meta, err := json.Marshal(v)
+ if err != nil {
+ return err
+ }
+ if err := ioutil.WriteFile(s.statePath(), meta, 0640); err != nil {
+ return fmt.Errorf("writing json file: %v", err)
+ }
+ return nil
+}
+
+func (s *StateFile) load(v interface{}) error {
+ if err := s.lock(); err != nil {
+ return err
+ }
+ defer s.unlock()
+
+ metaBytes, err := ioutil.ReadFile(s.statePath())
+ if err != nil {
+ return err
+ }
+ return json.Unmarshal(metaBytes, &v)
+}
+
+func (s *StateFile) close() error {
+ if s.flock == nil {
+ return nil
+ }
+ if s.flock.Locked() {
+ panic("Closing locked file")
+ }
+ return s.flock.Close()
+}
+
+func buildStatePath(rootDir, id string) string {
+ return filepath.Join(rootDir, id+stateFileExtension)
+}
+
+// statePath is the full path to the state file.
+func (s *StateFile) statePath() string {
+ return buildStatePath(s.RootDir, s.ID)
+}
+
+// lockPath is the full path to the lock file.
+func (s *StateFile) lockPath() string {
+ return filepath.Join(s.RootDir, s.ID+".lock")
+}
+
+// destroy deletes all state created by the stateFile. It may be called with the
+// lock file held. In that case, the lock file must still be unlocked and
+// properly closed after destroy returns.
+func (s *StateFile) destroy() error {
+ if err := os.Remove(s.statePath()); err != nil && !os.IsNotExist(err) {
+ return err
+ }
+ if err := os.Remove(s.lockPath()); err != nil && !os.IsNotExist(err) {
+ return err
+ }
+ return nil
+}
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 9bf9e6e9d..bfd338bb6 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -15,5 +15,6 @@ go_binary(
"//pkg/unet",
"//runsc/testutil",
"@com_github_google_subcommands//:go_default_library",
+ "@com_github_kr_pty//:go_default_library",
],
)
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 7f735c254..a1c8a741a 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -19,25 +19,31 @@ package main
import (
"context"
"fmt"
+ "io"
+ "io/ioutil"
"log"
"net"
"os"
"os/exec"
+ "regexp"
"strconv"
sys "syscall"
"time"
"flag"
"github.com/google/subcommands"
+ "github.com/kr/pty"
"gvisor.dev/gvisor/runsc/testutil"
)
func main() {
subcommands.Register(subcommands.HelpCommand(), "")
subcommands.Register(subcommands.FlagsCommand(), "")
+ subcommands.Register(new(capability), "")
subcommands.Register(new(fdReceiver), "")
subcommands.Register(new(fdSender), "")
subcommands.Register(new(forkBomb), "")
+ subcommands.Register(new(ptyRunner), "")
subcommands.Register(new(reaper), "")
subcommands.Register(new(syscall), "")
subcommands.Register(new(taskTree), "")
@@ -287,3 +293,102 @@ func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interfac
}
return subcommands.ExitSuccess
}
+
+type capability struct {
+ enabled uint64
+ disabled uint64
+}
+
+// Name implements subcommands.Command.
+func (*capability) Name() string {
+ return "capability"
+}
+
+// Synopsis implements subcommands.Command.
+func (*capability) Synopsis() string {
+ return "checks if effective capabilities are set/unset"
+}
+
+// Usage implements subcommands.Command.
+func (*capability) Usage() string {
+ return "capability [--enabled=number] [--disabled=number]"
+}
+
+// SetFlags implements subcommands.Command.
+func (c *capability) SetFlags(f *flag.FlagSet) {
+ f.Uint64Var(&c.enabled, "enabled", 0, "")
+ f.Uint64Var(&c.disabled, "disabled", 0, "")
+}
+
+// Execute implements subcommands.Command.
+func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ if c.enabled == 0 && c.disabled == 0 {
+ fmt.Println("One of the flags must be set")
+ return subcommands.ExitUsageError
+ }
+
+ status, err := ioutil.ReadFile("/proc/self/status")
+ if err != nil {
+ fmt.Printf("Error reading %q: %v\n", "proc/self/status", err)
+ return subcommands.ExitFailure
+ }
+ re := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n")
+ matches := re.FindStringSubmatch(string(status))
+ if matches == nil || len(matches) != 2 {
+ fmt.Printf("Effective capabilities not found in\n%s\n", status)
+ return subcommands.ExitFailure
+ }
+ caps, err := strconv.ParseUint(matches[1], 16, 64)
+ if err != nil {
+ fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err)
+ return subcommands.ExitFailure
+ }
+
+ if c.enabled != 0 && (caps&c.enabled) != c.enabled {
+ fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps)
+ return subcommands.ExitFailure
+ }
+ if c.disabled != 0 && (caps&c.disabled) != 0 {
+ fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps)
+ return subcommands.ExitFailure
+ }
+
+ return subcommands.ExitSuccess
+}
+
+type ptyRunner struct{}
+
+// Name implements subcommands.Command.
+func (*ptyRunner) Name() string {
+ return "pty-runner"
+}
+
+// Synopsis implements subcommands.Command.
+func (*ptyRunner) Synopsis() string {
+ return "runs the given command with an open pty terminal"
+}
+
+// Usage implements subcommands.Command.
+func (*ptyRunner) Usage() string {
+ return "pty-runner [command]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
+ c := exec.Command(fs.Args()[0], fs.Args()[1:]...)
+ f, err := pty.Start(c)
+ if err != nil {
+ fmt.Printf("pty.Start failed: %v", err)
+ return subcommands.ExitFailure
+ }
+ defer f.Close()
+
+ // Copy stdout from the command to keep this process alive until the
+ // subprocess exits.
+ io.Copy(os.Stdout, f)
+
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/criutil/criutil.go b/runsc/criutil/criutil.go
index c8ddf5a9a..773f5a1c4 100644
--- a/runsc/criutil/criutil.go
+++ b/runsc/criutil/criutil.go
@@ -157,13 +157,55 @@ func (cc *Crictl) RmPod(podID string) error {
return err
}
-// StartPodAndContainer pulls an image, then starts a sandbox and container in
-// that sandbox. It returns the pod ID and container ID.
-func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
+// StartContainer pulls the given image ands starts the container in the
+// sandbox with the given podID.
+func (cc *Crictl) StartContainer(podID, image, sbSpec, contSpec string) (string, error) {
+ // Write the specs to files that can be read by crictl.
+ sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
+ if err != nil {
+ return "", fmt.Errorf("failed to write sandbox spec: %v", err)
+ }
+ contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec)
+ if err != nil {
+ return "", fmt.Errorf("failed to write container spec: %v", err)
+ }
+
+ return cc.startContainer(podID, image, sbSpecFile, contSpecFile)
+}
+
+func (cc *Crictl) startContainer(podID, image, sbSpecFile, contSpecFile string) (string, error) {
if err := cc.Pull(image); err != nil {
- return "", "", fmt.Errorf("failed to pull %s: %v", image, err)
+ return "", fmt.Errorf("failed to pull %s: %v", image, err)
+ }
+
+ contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
+ if err != nil {
+ return "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
+ }
+
+ if _, err := cc.Start(contID); err != nil {
+ return "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
+ }
+
+ return contID, nil
+}
+
+// StopContainer stops and deletes the container with the given container ID.
+func (cc *Crictl) StopContainer(contID string) error {
+ if err := cc.Stop(contID); err != nil {
+ return fmt.Errorf("failed to stop container %q: %v", contID, err)
+ }
+
+ if err := cc.Rm(contID); err != nil {
+ return fmt.Errorf("failed to remove container %q: %v", contID, err)
}
+ return nil
+}
+
+// StartPodAndContainer pulls an image, then starts a sandbox and container in
+// that sandbox. It returns the pod ID and container ID.
+func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) {
// Write the specs to files that can be read by crictl.
sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec)
if err != nil {
@@ -179,28 +221,17 @@ func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string,
return "", "", err
}
- contID, err := cc.Create(podID, contSpecFile, sbSpecFile)
- if err != nil {
- return "", "", fmt.Errorf("failed to create container in pod %q: %v", podID, err)
- }
+ contID, err := cc.startContainer(podID, image, sbSpecFile, contSpecFile)
- if _, err := cc.Start(contID); err != nil {
- return "", "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err)
- }
-
- return podID, contID, nil
+ return podID, contID, err
}
// StopPodAndContainer stops a container and pod.
func (cc *Crictl) StopPodAndContainer(podID, contID string) error {
- if err := cc.Stop(contID); err != nil {
+ if err := cc.StopContainer(contID); err != nil {
return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err)
}
- if err := cc.Rm(contID); err != nil {
- return fmt.Errorf("failed to remove container %q in pod %q: %v", contID, podID, err)
- }
-
if err := cc.StopPod(podID); err != nil {
return fmt.Errorf("failed to stop pod %q: %v", podID, err)
}
diff --git a/runsc/debian/description b/runsc/debian/description
index 6e3b1b2c0..9e8e08805 100644
--- a/runsc/debian/description
+++ b/runsc/debian/description
@@ -1,5 +1 @@
-gVisor is a user-space kernel, written in Go, that implements a substantial
-portion of the Linux system surface. It includes an Open Container Initiative
-(OCI) runtime called runsc that provides an isolation boundary between the
-application and the host kernel. The runsc runtime integrates with Docker and
-Kubernetes, making it simple to run sandboxed containers.
+gVisor container sandbox runtime
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 41f5fe1e8..9b6346ca2 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -240,7 +240,7 @@ func (d *Docker) Stop() error {
// Run calls 'docker run' with the arguments provided. The container starts
// running in the background and the call returns immediately.
func (d *Docker) Run(args ...string) error {
- a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-d"}
+ a := d.runArgs("-d")
a = append(a, args...)
_, err := do(a...)
if err == nil {
@@ -251,7 +251,7 @@ func (d *Docker) Run(args ...string) error {
// RunWithPty is like Run but with an attached pty.
func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
- a := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-it"}
+ a := d.runArgs("-it")
a = append(a, args...)
return doWithPty(a...)
}
@@ -259,8 +259,7 @@ func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) {
// RunFg calls 'docker run' with the arguments provided in the foreground. It
// blocks until the container exits and returns the output.
func (d *Docker) RunFg(args ...string) (string, error) {
- a := []string{"run", "--runtime", d.Runtime, "--name", d.Name}
- a = append(a, args...)
+ a := d.runArgs(args...)
out, err := do(a...)
if err == nil {
d.logDockerID()
@@ -268,6 +267,14 @@ func (d *Docker) RunFg(args ...string) (string, error) {
return string(out), err
}
+func (d *Docker) runArgs(args ...string) []string {
+ // Environment variable RUNSC_TEST_NAME is picked up by the runtime and added
+ // to the log name, so one can easily identify the corresponding logs for
+ // this test.
+ rv := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-e", "RUNSC_TEST_NAME=" + d.Name}
+ return append(rv, args...)
+}
+
// Logs calls 'docker logs'.
func (d *Docker) Logs() (string, error) {
return do("logs", d.Name)
@@ -275,7 +282,22 @@ func (d *Docker) Logs() (string, error) {
// Exec calls 'docker exec' with the arguments provided.
func (d *Docker) Exec(args ...string) (string, error) {
- a := []string{"exec", d.Name}
+ return d.ExecWithFlags(nil, args...)
+}
+
+// ExecWithFlags calls 'docker exec <flags> name <args>'.
+func (d *Docker) ExecWithFlags(flags []string, args ...string) (string, error) {
+ a := []string{"exec"}
+ a = append(a, flags...)
+ a = append(a, d.Name)
+ a = append(a, args...)
+ return do(a...)
+}
+
+// ExecAsUser calls 'docker exec' as the given user with the arguments
+// provided.
+func (d *Docker) ExecAsUser(user string, args ...string) (string, error) {
+ a := []string{"exec", "--user", user, d.Name}
a = append(a, args...)
return do(a...)
}
@@ -358,6 +380,16 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) {
return port, nil
}
+// FindIP returns the IP address of the container as a string.
+func (d *Docker) FindIP() (string, error) {
+ const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
+ out, err := do("inspect", "-f", format, d.Name)
+ if err != nil {
+ return "", fmt.Errorf("error retrieving IP: %v", err)
+ }
+ return strings.TrimSpace(out), nil
+}
+
// SandboxPid returns the PID to the sandbox process.
func (d *Docker) SandboxPid() (int, error) {
out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 80a4aa2fe..afcb41801 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -6,6 +6,8 @@ go_library(
name = "fsgofer",
srcs = [
"fsgofer.go",
+ "fsgofer_amd64_unsafe.go",
+ "fsgofer_arm64_unsafe.go",
"fsgofer_unsafe.go",
],
importpath = "gvisor.dev/gvisor/runsc/fsgofer",
diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD
index e2318a978..bac73f89d 100644
--- a/runsc/fsgofer/filter/BUILD
+++ b/runsc/fsgofer/filter/BUILD
@@ -6,6 +6,8 @@ go_library(
name = "filter",
srcs = [
"config.go",
+ "config_amd64.go",
+ "config_arm64.go",
"extra_filters.go",
"extra_filters_msan.go",
"extra_filters_race.go",
@@ -17,6 +19,7 @@ go_library(
],
deps = [
"//pkg/abi/linux",
+ "//pkg/flipcall",
"//pkg/log",
"//pkg/seccomp",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 8ddfa77d6..a1792330f 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -25,11 +25,7 @@ import (
// allowedSyscalls is the set of syscalls executed by the gofer.
var allowedSyscalls = seccomp.SyscallRules{
- syscall.SYS_ACCEPT: {},
- syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
- {seccomp.AllowValue(linux.ARCH_GET_FS)},
- {seccomp.AllowValue(linux.ARCH_SET_FS)},
- },
+ syscall.SYS_ACCEPT: {},
syscall.SYS_CLOCK_GETTIME: {},
syscall.SYS_CLONE: []seccomp.Rule{
{
@@ -83,6 +79,11 @@ var allowedSyscalls = seccomp.SyscallRules{
seccomp.AllowAny{},
seccomp.AllowValue(syscall.F_GETFD),
},
+ // Used by flipcall.PacketWindowAllocator.Init().
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(unix.F_ADD_SEALS),
+ },
},
syscall.SYS_FSTAT: {},
syscall.SYS_FSTATFS: {},
@@ -103,6 +104,19 @@ var allowedSyscalls = seccomp.SyscallRules{
seccomp.AllowAny{},
seccomp.AllowValue(0),
},
+ // Non-private futex used for flipcall.
+ seccomp.Rule{
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAIT),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ },
+ seccomp.Rule{
+ seccomp.AllowAny{},
+ seccomp.AllowValue(linux.FUTEX_WAKE),
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ },
},
syscall.SYS_GETDENTS64: {},
syscall.SYS_GETPID: {},
@@ -112,6 +126,7 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_LINKAT: {},
syscall.SYS_LSEEK: {},
syscall.SYS_MADVISE: {},
+ unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init().
syscall.SYS_MKDIRAT: {},
syscall.SYS_MMAP: []seccomp.Rule{
{
@@ -136,7 +151,6 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_MPROTECT: {},
syscall.SYS_MUNMAP: {},
syscall.SYS_NANOSLEEP: {},
- syscall.SYS_NEWFSTATAT: {},
syscall.SYS_OPENAT: {},
syscall.SYS_PPOLL: {},
syscall.SYS_PREAD64: {},
@@ -158,8 +172,16 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_RENAMEAT: {},
syscall.SYS_RESTART_SYSCALL: {},
syscall.SYS_RT_SIGPROCMASK: {},
+ syscall.SYS_RT_SIGRETURN: {},
syscall.SYS_SCHED_YIELD: {},
syscall.SYS_SENDMSG: []seccomp.Rule{
+ // Used by fdchannel.Endpoint.SendFD().
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(0),
+ },
+ // Used by unet.SocketWriter.WriteVec().
{
seccomp.AllowAny{},
seccomp.AllowAny{},
@@ -170,7 +192,15 @@ var allowedSyscalls = seccomp.SyscallRules{
{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
},
syscall.SYS_SIGALTSTACK: {},
- syscall.SYS_SYMLINKAT: {},
+ // Used by fdchannel.NewConnectedSockets().
+ syscall.SYS_SOCKETPAIR: {
+ {
+ seccomp.AllowValue(syscall.AF_UNIX),
+ seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_SYMLINKAT: {},
syscall.SYS_TGKILL: []seccomp.Rule{
{
seccomp.AllowValue(uint64(os.Getpid())),
@@ -180,3 +210,28 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_UTIMENSAT: {},
syscall.SYS_WRITE: {},
}
+
+var udsSyscalls = seccomp.SyscallRules{
+ syscall.SYS_SOCKET: []seccomp.Rule{
+ {
+ seccomp.AllowValue(syscall.AF_UNIX),
+ seccomp.AllowValue(syscall.SOCK_STREAM),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_UNIX),
+ seccomp.AllowValue(syscall.SOCK_DGRAM),
+ seccomp.AllowValue(0),
+ },
+ {
+ seccomp.AllowValue(syscall.AF_UNIX),
+ seccomp.AllowValue(syscall.SOCK_SEQPACKET),
+ seccomp.AllowValue(0),
+ },
+ },
+ syscall.SYS_CONNECT: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ },
+ },
+}
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
new file mode 100644
index 000000000..a4b28cb8b
--- /dev/null
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+ allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_GET_FS)},
+ {seccomp.AllowValue(linux.ARCH_SET_FS)},
+ }
+
+ allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go
new file mode 100644
index 000000000..d2697deb7
--- /dev/null
+++ b/runsc/fsgofer/filter/config_arm64.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package filter
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
+func init() {
+ allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{}
+}
diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go
index 65053415f..289886720 100644
--- a/runsc/fsgofer/filter/filter.go
+++ b/runsc/fsgofer/filter/filter.go
@@ -23,11 +23,16 @@ import (
// Install installs seccomp filters.
func Install() error {
- s := allowedSyscalls
-
// Set of additional filters used by -race and -msan. Returns empty
// when not enabled.
- s.Merge(instrumentationFilters())
+ allowedSyscalls.Merge(instrumentationFilters())
+
+ return seccomp.Install(allowedSyscalls)
+}
- return seccomp.Install(s)
+// InstallUDSFilters extends the allowed syscalls to include those necessary for
+// connecting to a host UDS.
+func InstallUDSFilters() {
+ // Add additional filters required for connecting to the host's sockets.
+ allowedSyscalls.Merge(udsSyscalls)
}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 7c4d2b94e..b59e1a70e 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -54,6 +54,7 @@ const (
regular fileType = iota
directory
symlink
+ socket
unknown
)
@@ -66,6 +67,8 @@ func (f fileType) String() string {
return "directory"
case symlink:
return "symlink"
+ case socket:
+ return "socket"
}
return "unknown"
}
@@ -82,6 +85,9 @@ type Config struct {
// PanicOnWrite panics on attempts to write to RO mounts.
PanicOnWrite bool
+
+ // HostUDS signals whether the gofer can mount a host's UDS.
+ HostUDS bool
}
type attachPoint struct {
@@ -119,35 +125,31 @@ func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) {
// Attach implements p9.Attacher.
func (a *attachPoint) Attach() (p9.File, error) {
- // dirFD (1st argument) is ignored because 'prefix' is always absolute.
- stat, err := statAt(-1, a.prefix)
- if err != nil {
- return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err)
- }
- mode := syscall.O_RDWR
- if a.conf.ROMount || (stat.Mode&syscall.S_IFMT) == syscall.S_IFDIR {
- mode = syscall.O_RDONLY
+ a.attachedMu.Lock()
+ defer a.attachedMu.Unlock()
+
+ if a.attached {
+ return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
}
- // Open the root directory.
- f, err := fd.Open(a.prefix, openFlags|mode, 0)
+ f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
+ return fd.Open(a.prefix, openFlags|mode, 0)
+ })
if err != nil {
- return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err)
+ return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
}
- a.attachedMu.Lock()
- defer a.attachedMu.Unlock()
- if a.attached {
- f.Close()
- return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
+ stat, err := stat(f.FD())
+ if err != nil {
+ return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
}
- rv, err := newLocalFile(a, f, a.prefix, stat)
+ lf, err := newLocalFile(a, f, a.prefix, stat)
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
}
a.attached = true
- return rv, nil
+ return lf, nil
}
// makeQID returns a unique QID for the given stat buffer.
@@ -197,6 +199,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
// The reason that the file is not opened initially as read-write is for better
// performance with 'overlay2' storage driver. overlay2 eagerly copies the
// entire file up when it's opened in write mode, and would perform badly when
+// multiple files are only being opened for read (esp. startup).
type localFile struct {
p9.DefaultWalkGetAttr
@@ -263,10 +266,10 @@ func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, erro
// actual file open and is customizable by the caller.
func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
// Attempt to open file in the following mode in order:
- // 1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too.
- // Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option
- // has no effect on regular files.
- // 2. PATH: for symlinks
+ // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
+ // Use non-blocking to prevent getting stuck inside open(2) for
+ // FIFOs. This option has no effect on regular files.
+ // 2. PATH: for symlinks, sockets.
modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
var err error
@@ -295,7 +298,7 @@ func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error)
return file, nil
}
-func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
+func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) {
var ft fileType
switch stat.Mode & syscall.S_IFMT {
case syscall.S_IFREG:
@@ -304,6 +307,11 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
ft = directory
case syscall.S_IFLNK:
ft = symlink
+ case syscall.S_IFSOCK:
+ if !permitSocket {
+ return unknown, syscall.EPERM
+ }
+ ft = socket
default:
return unknown, syscall.EPERM
}
@@ -311,7 +319,7 @@ func getSupportedFileType(stat syscall.Stat_t) (fileType, error) {
}
func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
- ft, err := getSupportedFileType(stat)
+ ft, err := getSupportedFileType(stat, a.conf.HostUDS)
if err != nil {
return nil, err
}
@@ -359,23 +367,24 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error {
}
// Open implements p9.File.
-func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
if l.isOpen() {
panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
}
// Check if control file can be used or if a new open must be created.
var newFile *fd.FD
- if mode == p9.ReadOnly {
- log.Debugf("Open reusing control file, mode: %v, %q", mode, l.hostPath)
+ if flags == p9.ReadOnly {
+ log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath)
newFile = l.file
} else {
// Ideally reopen would call name_to_handle_at (with empty name) and
// open_by_handle_at to reopen the file without using 'hostPath'. However,
// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
- log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath)
+ log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
var err error
- newFile, err = reopenProcFd(l.file, openFlags|mode.OSFlags())
+ // Constrain open flags to the open mode and O_TRUNC.
+ newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC)))
if err != nil {
return nil, p9.QID{}, 0, extractErrno(err)
}
@@ -402,7 +411,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
}
l.file = newFile
}
- l.mode = mode
+ l.mode = flags & p9.OpenFlagsModeMask
return fd, l.attachPoint.makeQID(stat), 0, nil
}
@@ -594,7 +603,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
Mode: p9.FileMode(stat.Mode),
UID: p9.UID(stat.Uid),
GID: p9.GID(stat.Gid),
- NLink: stat.Nlink,
+ NLink: uint64(stat.Nlink),
RDev: stat.Rdev,
Size: uint64(stat.Size),
BlockSize: uint64(stat.Blksize),
@@ -948,14 +957,14 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
}
func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) {
+ var dirents []p9.Dirent
+
// Limit 'count' to cap the slice size that is returned.
const maxCount = 100000
if count > maxCount {
count = maxCount
}
- dirents := make([]p9.Dirent, 0, count)
-
// Pre-allocate buffers that will be reused to get partial results.
direntsBuf := make([]byte, 8192)
names := make([]string, 0, 100)
@@ -1025,8 +1034,48 @@ func (l *localFile) Flush() error {
}
// Connect implements p9.File.
-func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) {
- return nil, syscall.ECONNREFUSED
+func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
+ if !l.attachPoint.conf.HostUDS {
+ return nil, syscall.ECONNREFUSED
+ }
+
+ // TODO(gvisor.dev/issue/1003): Due to different app vs replacement
+ // mappings, the app path may have fit in the sockaddr, but we can't
+ // fit f.path in our sockaddr. We'd need to redirect through a shorter
+ // path in order to actually connect to this socket.
+ if len(l.hostPath) > linux.UnixPathMax {
+ return nil, syscall.ECONNREFUSED
+ }
+
+ var stype int
+ switch flags {
+ case p9.StreamSocket:
+ stype = syscall.SOCK_STREAM
+ case p9.DgramSocket:
+ stype = syscall.SOCK_DGRAM
+ case p9.SeqpacketSocket:
+ stype = syscall.SOCK_SEQPACKET
+ default:
+ return nil, syscall.ENXIO
+ }
+
+ f, err := syscall.Socket(syscall.AF_UNIX, stype, 0)
+ if err != nil {
+ return nil, err
+ }
+
+ if err := syscall.SetNonblock(f, true); err != nil {
+ syscall.Close(f)
+ return nil, err
+ }
+
+ sa := syscall.SockaddrUnix{Name: l.hostPath}
+ if err := syscall.Connect(f, &sa); err != nil {
+ syscall.Close(f)
+ return nil, err
+ }
+
+ return fd.New(f), nil
}
// Close implements p9.File.
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
new file mode 100644
index 000000000..5d4aab597
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package fsgofer
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/syserr"
+)
+
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+ nameBytes, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return syscall.Stat_t{}, err
+ }
+ namePtr := unsafe.Pointer(nameBytes)
+
+ var stat syscall.Stat_t
+ statPtr := unsafe.Pointer(&stat)
+
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_NEWFSTATAT,
+ uintptr(dirFd),
+ uintptr(namePtr),
+ uintptr(statPtr),
+ linux.AT_SYMLINK_NOFOLLOW,
+ 0,
+ 0); errno != 0 {
+
+ return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+ }
+ return stat, nil
+}
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
new file mode 100644
index 000000000..8041fd352
--- /dev/null
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -0,0 +1,49 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package fsgofer
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/syserr"
+)
+
+func statAt(dirFd int, name string) (syscall.Stat_t, error) {
+ nameBytes, err := syscall.BytePtrFromString(name)
+ if err != nil {
+ return syscall.Stat_t{}, err
+ }
+ namePtr := unsafe.Pointer(nameBytes)
+
+ var stat syscall.Stat_t
+ statPtr := unsafe.Pointer(&stat)
+
+ if _, _, errno := syscall.Syscall6(
+ syscall.SYS_FSTATAT,
+ uintptr(dirFd),
+ uintptr(namePtr),
+ uintptr(statPtr),
+ linux.AT_SYMLINK_NOFOLLOW,
+ 0,
+ 0); errno != 0 {
+
+ return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+ }
+ return stat, nil
+}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index cbbe71019..05af7e397 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -665,7 +665,7 @@ func TestAttachInvalidType(t *testing.T) {
}
f, err := a.Attach()
if f != nil || err == nil {
- t.Fatalf("Attach should have failed, got (%v, nil)", f)
+ t.Fatalf("Attach should have failed, got (%v, %v)", f, err)
}
})
}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index ff2556aee..542b54365 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -18,34 +18,9 @@ import (
"syscall"
"unsafe"
- "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/syserr"
)
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
- nameBytes, err := syscall.BytePtrFromString(name)
- if err != nil {
- return syscall.Stat_t{}, err
- }
- namePtr := unsafe.Pointer(nameBytes)
-
- var stat syscall.Stat_t
- statPtr := unsafe.Pointer(&stat)
-
- if _, _, errno := syscall.Syscall6(
- syscall.SYS_NEWFSTATAT,
- uintptr(dirFd),
- uintptr(namePtr),
- uintptr(statPtr),
- linux.AT_SYMLINK_NOFOLLOW,
- 0,
- 0); errno != 0 {
-
- return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
- }
- return stat, nil
-}
-
func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
// utimensat(2) doesn't accept empty name, instead name must be nil to make it
// operate directly on 'dirFd' unlike other *at syscalls.
diff --git a/runsc/main.go b/runsc/main.go
index 0ff68160d..abf929511 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -26,6 +26,7 @@ import (
"path/filepath"
"strings"
"syscall"
+ "time"
"flag"
@@ -41,34 +42,39 @@ import (
var (
// Although these flags are not part of the OCI spec, they are used by
// Docker, and thus should not be changed.
- rootDir = flag.String("root", "", "root directory for storage of container state")
- logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout")
- logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s")
- debug = flag.Bool("debug", false, "enable debug logging")
- showVersion = flag.Bool("version", false, "show version and exit")
+ rootDir = flag.String("root", "", "root directory for storage of container state.")
+ logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+ logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+ debug = flag.Bool("debug", false, "enable debug logging.")
+ showVersion = flag.Bool("version", false, "show version and exit.")
+ // TODO(gvisor.dev/issue/193): support systemd cgroups
+ systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
// These flags are unique to runsc, and are used to configure parts of the
// system that are not covered by the runtime spec.
// Debugging flags.
debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
- logPackets = flag.Bool("log-packets", false, "enable network packet logging")
+ logPackets = flag.Bool("log-packets", false, "enable network packet logging.")
logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.")
- debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
- alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr")
+ debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+ alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
// Debugging flags: strace related
- strace = flag.Bool("strace", false, "enable strace")
+ strace = flag.Bool("strace", false, "enable strace.")
straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
- straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
+ straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
// Flags that control sandbox runtime behavior.
- platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+ platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
- gso = flag.Bool("gso", true, "enable generic segmenation offload")
+ hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+ softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.")
fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+ fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+ overlayfsStaleRead = flag.Bool("overlayfs-stale-read", false, "reopen cached FDs after a file is opened for write to workaround overlayfs limitation on kernels before 4.19.")
watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
@@ -76,9 +82,11 @@ var (
numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+ cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
// Test flags, not to be used outside tests, ever.
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+ testOnlyTestNameEnv = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
)
func main() {
@@ -132,6 +140,12 @@ func main() {
os.Exit(0)
}
+ // TODO(gvisor.dev/issue/193): support systemd cgroups
+ if *systemdCgroup {
+ fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
+ os.Exit(1)
+ }
+
var errorLogger io.Writer
if *logFD > -1 {
errorLogger = os.NewFile(uintptr(*logFD), "error log file")
@@ -194,9 +208,11 @@ func main() {
DebugLog: *debugLog,
DebugLogFormat: *debugLogFormat,
FileAccess: fsAccess,
+ FSGoferHostUDS: *fsGoferHostUDS,
Overlay: *overlay,
Network: netType,
- GSO: *gso,
+ HardwareGSO: *hardwareGSO,
+ SoftwareGSO: *softwareGSO,
LogPackets: *logPackets,
Platform: platformType,
Strace: *strace,
@@ -209,8 +225,11 @@ func main() {
Rootless: *rootless,
AlsoLogToStderr: *alsoLogToStderr,
ReferenceLeakMode: refsLeakMode,
+ OverlayfsStaleRead: *overlayfsStaleRead,
+ CPUNumFromQuota: *cpuNumFromQuota,
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
+ TestOnlyTestNameEnv: *testOnlyTestNameEnv,
}
if len(*straceSyscalls) != 0 {
conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
@@ -221,6 +240,18 @@ func main() {
log.SetLevel(log.Debug)
}
+ // Logging will include the local date and time via the time package.
+ //
+ // On first use, time.Local initializes the local time zone, which
+ // involves opening tzdata files on the host. Since this requires
+ // opening host files, it must be done before syscall filter
+ // installation.
+ //
+ // Generally there will be a log message before filter installation
+ // that will force initialization, but force initialization here in
+ // case that does not occur.
+ _ = time.Local.String()
+
subcommand := flag.CommandLine.Arg(0)
var e log.Emitter
@@ -237,14 +268,14 @@ func main() {
// want with them. Since Docker and Containerd both eat boot's stderr, we
// dup our stderr to the provided log FD so that panics will appear in the
// logs, rather than just disappear.
- if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil {
+ if err := syscall.Dup3(int(f.Fd()), int(os.Stderr.Fd()), 0); err != nil {
cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
}
e = newEmitter(*debugLogFormat, f)
} else if *debugLog != "" {
- f, err := specutils.DebugLogFile(*debugLog, subcommand)
+ f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */)
if err != nil {
cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 7fdceaab6..8001949d5 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -19,6 +19,8 @@ go_library(
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/platform",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/stack",
"//pkg/urpc",
"//runsc/boot",
"//runsc/boot/platforms",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 5634f0707..be8b72b3e 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -28,6 +28,8 @@ import (
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/specutils"
@@ -61,7 +63,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
- if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
+ if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
case boot.NetworkHost:
@@ -136,7 +138,7 @@ func isRootNS() (bool, error) {
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
@@ -182,36 +184,39 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
continue
}
- // Keep only IPv4 addresses.
- var ip4addrs []*net.IPNet
+ var ipAddrs []*net.IPNet
for _, ifaddr := range allAddrs {
ipNet, ok := ifaddr.(*net.IPNet)
if !ok {
return fmt.Errorf("address is not IPNet: %+v", ifaddr)
}
- if ipNet.IP.To4() == nil {
- log.Warningf("IPv6 is not supported, skipping: %v", ipNet)
- continue
- }
- ip4addrs = append(ip4addrs, ipNet)
+ ipAddrs = append(ipAddrs, ipNet)
}
- if len(ip4addrs) == 0 {
- log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name)
+ if len(ipAddrs) == 0 {
+ log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
continue
}
// Scrape the routes before removing the address, since that
// will remove the routes as well.
- routes, def, err := routesForIface(iface)
+ routes, defv4, defv6, err := routesForIface(iface)
if err != nil {
return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
}
- if def != nil {
- if !args.DefaultGateway.Route.Empty() {
- return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+ if defv4 != nil {
+ if !args.Defaultv4Gateway.Route.Empty() {
+ return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
}
- args.DefaultGateway.Route = *def
- args.DefaultGateway.Name = iface.Name
+ args.Defaultv4Gateway.Route = *defv4
+ args.Defaultv4Gateway.Name = iface.Name
+ }
+
+ if defv6 != nil {
+ if !args.Defaultv6Gateway.Route.Empty() {
+ return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
+ }
+ args.Defaultv6Gateway.Route = *defv6
+ args.Defaultv6Gateway.Name = iface.Name
}
link := boot.FDBasedLink{
@@ -232,7 +237,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
// Create the socket for the device.
for i := 0; i < link.NumChannels; i++ {
log.Debugf("Creating Channel %d", i)
- socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
+ socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
if err != nil {
return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
}
@@ -247,9 +252,15 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
}
+ if link.GSOMaxSize == 0 && softwareGSO {
+ // Hardware GSO is disabled. Let's enable software GSO.
+ link.GSOMaxSize = stack.SoftwareGSOMaxSize
+ link.SoftwareGSOEnabled = true
+ }
+
// Collect the addresses for the interface, enable forwarding,
// and remove them from the host.
- for _, addr := range ip4addrs {
+ for _, addr := range ipAddrs {
link.Addresses = append(link.Addresses, addr.IP)
// Steal IP address from NIC.
@@ -345,46 +356,56 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink,
}
// routesForIface iterates over all routes for the given interface and converts
-// them to boot.Routes.
-func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+// them to boot.Routes. It also returns the a default v4/v6 route if found.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
link, err := netlink.LinkByIndex(iface.Index)
if err != nil {
- return nil, nil, err
+ return nil, nil, nil, err
}
rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
if err != nil {
- return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
+ return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
}
- var def *boot.Route
+ var defv4, defv6 *boot.Route
var routes []boot.Route
for _, r := range rs {
// Is it a default route?
if r.Dst == nil {
if r.Gw == nil {
- return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
- }
- if r.Gw.To4() == nil {
- log.Warningf("IPv6 is not supported, skipping default route: %v", r)
- continue
- }
- if def != nil {
- return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r)
+ return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
}
// Create a catch all route to the gateway.
- def = &boot.Route{
- Destination: net.IPNet{
- IP: net.IPv4zero,
- Mask: net.IPMask(net.IPv4zero),
- },
- Gateway: r.Gw,
+ switch len(r.Gw) {
+ case header.IPv4AddressSize:
+ if defv4 != nil {
+ return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
+ }
+ defv4 = &boot.Route{
+ Destination: net.IPNet{
+ IP: net.IPv4zero,
+ Mask: net.IPMask(net.IPv4zero),
+ },
+ Gateway: r.Gw,
+ }
+ case header.IPv6AddressSize:
+ if defv6 != nil {
+ return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
+ }
+
+ defv6 = &boot.Route{
+ Destination: net.IPNet{
+ IP: net.IPv6zero,
+ Mask: net.IPMask(net.IPv6zero),
+ },
+ Gateway: r.Gw,
+ }
+ default:
+ return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
}
continue
}
- if r.Dst.IP.To4() == nil {
- log.Warningf("IPv6 is not supported, skipping route: %v", r)
- continue
- }
+
dst := *r.Dst
dst.IP = dst.IP.Mask(dst.Mask)
routes = append(routes, boot.Route{
@@ -392,7 +413,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
Gateway: r.Gw,
})
}
- return routes, def, nil
+ return routes, defv4, defv6, nil
}
// removeAddress removes IP address from network device. It's equivalent to:
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index df3c0c5ef..ce1452b87 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -18,6 +18,7 @@ package sandbox
import (
"context"
"fmt"
+ "math"
"os"
"os/exec"
"strconv"
@@ -351,7 +352,15 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
nextFD++
}
if conf.DebugLog != "" {
- debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot")
+ test := ""
+ if len(conf.TestOnlyTestNameEnv) != 0 {
+ // Fetch test name if one is provided and the test only flag was set.
+ if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+ test = t
+ }
+ }
+
+ debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test)
if err != nil {
return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
}
@@ -623,6 +632,26 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
if err != nil {
return fmt.Errorf("getting cpu count from cgroups: %v", err)
}
+ if conf.CPUNumFromQuota {
+ // Dropping below 2 CPUs can trigger application to disable
+ // locks that can lead do hard to debug errors, so just
+ // leaving two cores as reasonable default.
+ const minCPUs = 2
+
+ quota, err := s.Cgroup.CPUQuota()
+ if err != nil {
+ return fmt.Errorf("getting cpu qouta from cgroups: %v", err)
+ }
+ if n := int(math.Ceil(quota)); n > 0 {
+ if n < minCPUs {
+ n = minCPUs
+ }
+ if n < cpuNum {
+ // Only lower the cpu number.
+ cpuNum = n
+ }
+ }
+ }
cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
mem, err := s.Cgroup.MemoryLimit()
@@ -996,16 +1025,22 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
// DestroyContainer destroys the given container. If it is the root container,
// then the entire sandbox is destroyed.
func (s *Sandbox) DestroyContainer(cid string) error {
+ if err := s.destroyContainer(cid); err != nil {
+ // If the sandbox isn't running, the container has already been destroyed,
+ // ignore the error in this case.
+ if s.IsRunning() {
+ return err
+ }
+ }
+ return nil
+}
+
+func (s *Sandbox) destroyContainer(cid string) error {
if s.IsRootContainer(cid) {
log.Debugf("Destroying root container %q by destroying sandbox", cid)
return s.destroy()
}
- if !s.IsRunning() {
- // Sandbox isn't running anymore, container is already destroyed.
- return nil
- }
-
log.Debugf("Destroying container %q in sandbox %q", cid, s.ID)
conn, err := s.sandboxConnect()
if err != nil {
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index fbfb8e2f8..205638803 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
go_library(
name = "specutils",
srcs = [
+ "cri.go",
"fs.go",
"namespace.go",
"specutils.go",
@@ -13,6 +14,7 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
+ "//pkg/bits",
"//pkg/log",
"//pkg/sentry/kernel/auth",
"@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/specutils/cri.go b/runsc/specutils/cri.go
new file mode 100644
index 000000000..9c5877cd5
--- /dev/null
+++ b/runsc/specutils/cri.go
@@ -0,0 +1,110 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package specutils
+
+import (
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+const (
+ // ContainerdContainerTypeAnnotation is the OCI annotation set by
+ // containerd to indicate whether the container to create should have
+ // its own sandbox or a container within an existing sandbox.
+ ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+ // ContainerdContainerTypeContainer is the container type value
+ // indicating the container should be created in an existing sandbox.
+ ContainerdContainerTypeContainer = "container"
+ // ContainerdContainerTypeSandbox is the container type value
+ // indicating the container should be created in a new sandbox.
+ ContainerdContainerTypeSandbox = "sandbox"
+
+ // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
+ // which sandbox the container should be created in when the container
+ // is not the first container in the sandbox.
+ ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
+
+ // CRIOContainerTypeAnnotation is the OCI annotation set by
+ // CRI-O to indicate whether the container to create should have
+ // its own sandbox or a container within an existing sandbox.
+ CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType"
+
+ // CRIOContainerTypeContainer is the container type value
+ // indicating the container should be created in an existing sandbox.
+ CRIOContainerTypeContainer = "container"
+ // CRIOContainerTypeSandbox is the container type value
+ // indicating the container should be created in a new sandbox.
+ CRIOContainerTypeSandbox = "sandbox"
+
+ // CRIOSandboxIDAnnotation is the OCI annotation set to indicate
+ // which sandbox the container should be created in when the container
+ // is not the first container in the sandbox.
+ CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID"
+)
+
+// ContainerType represents the type of container requested by the calling container manager.
+type ContainerType int
+
+const (
+ // ContainerTypeUnspecified indicates that no known container type
+ // annotation was found in the spec.
+ ContainerTypeUnspecified ContainerType = iota
+ // ContainerTypeUnknown indicates that a container type was specified
+ // but is unknown to us.
+ ContainerTypeUnknown
+ // ContainerTypeSandbox indicates that the container should be run in a
+ // new sandbox.
+ ContainerTypeSandbox
+ // ContainerTypeContainer indicates that the container should be run in
+ // an existing sandbox.
+ ContainerTypeContainer
+)
+
+// SpecContainerType tries to determine the type of container specified by the
+// container manager using well-known container annotations.
+func SpecContainerType(spec *specs.Spec) ContainerType {
+ if t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]; ok {
+ switch t {
+ case ContainerdContainerTypeSandbox:
+ return ContainerTypeSandbox
+ case ContainerdContainerTypeContainer:
+ return ContainerTypeContainer
+ default:
+ return ContainerTypeUnknown
+ }
+ }
+ if t, ok := spec.Annotations[CRIOContainerTypeAnnotation]; ok {
+ switch t {
+ case CRIOContainerTypeSandbox:
+ return ContainerTypeSandbox
+ case CRIOContainerTypeContainer:
+ return ContainerTypeContainer
+ default:
+ return ContainerTypeUnknown
+ }
+ }
+ return ContainerTypeUnspecified
+}
+
+// SandboxID returns the ID of the sandbox to join and whether an ID was found
+// in the spec.
+func SandboxID(spec *specs.Spec) (string, bool) {
+ if id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]; ok {
+ return id, true
+ }
+ if id, ok := spec.Annotations[CRIOSandboxIDAnnotation]; ok {
+ return id, true
+ }
+ return "", false
+}
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index d441419cb..c7dd3051c 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -33,19 +33,19 @@ import (
func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr {
switch nst {
case specs.IPCNamespace:
- return syscall.CLONE_NEWIPC
+ return unix.CLONE_NEWIPC
case specs.MountNamespace:
- return syscall.CLONE_NEWNS
+ return unix.CLONE_NEWNS
case specs.NetworkNamespace:
- return syscall.CLONE_NEWNET
+ return unix.CLONE_NEWNET
case specs.PIDNamespace:
- return syscall.CLONE_NEWPID
+ return unix.CLONE_NEWPID
case specs.UTSNamespace:
- return syscall.CLONE_NEWUTS
+ return unix.CLONE_NEWUTS
case specs.UserNamespace:
- return syscall.CLONE_NEWUSER
+ return unix.CLONE_NEWUSER
case specs.CgroupNamespace:
- panic("cgroup namespace has no associated clone flag")
+ return unix.CLONE_NEWCGROUP
default:
panic(fmt.Sprintf("unknown namespace %v", nst))
}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 2eec92349..d3c2e4e78 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -23,6 +23,7 @@ import (
"os"
"path"
"path/filepath"
+ "strconv"
"strings"
"syscall"
"time"
@@ -30,6 +31,7 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
@@ -90,7 +92,7 @@ func ValidateSpec(spec *specs.Spec) error {
log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
}
- // TODO(b/72226747): Apply seccomp to application inside sandbox.
+ // TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
if spec.Linux != nil && spec.Linux.Seccomp != nil {
log.Warningf("Seccomp spec is being ignored")
}
@@ -106,23 +108,18 @@ func ValidateSpec(spec *specs.Spec) error {
}
}
- // Two annotations are use by containerd to support multi-container pods.
- // "io.kubernetes.cri.container-type"
- // "io.kubernetes.cri.sandbox-id"
- containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation]
- _, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation]
- switch {
- // Non-containerd use won't set a container type.
- case !hasContainerType:
- case containerType == ContainerdContainerTypeSandbox:
- // When starting a container in an existing sandbox, the sandbox ID
- // must be set.
- case containerType == ContainerdContainerTypeContainer:
- if !hasSandboxID {
- return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType)
+ // CRI specifies whether a container should start a new sandbox, or run
+ // another container in an existing sandbox.
+ switch SpecContainerType(spec) {
+ case ContainerTypeContainer:
+ // When starting a container in an existing sandbox, the
+ // sandbox ID must be set.
+ if _, ok := SandboxID(spec); !ok {
+ return fmt.Errorf("spec has container-type of container, but no sandbox ID set")
}
+ case ContainerTypeUnknown:
+ return fmt.Errorf("unknown container-type")
default:
- return fmt.Errorf("unknown container-type: %s", containerType)
}
return nil
@@ -240,6 +237,15 @@ func AllCapabilities() *specs.LinuxCapabilities {
}
}
+// AllCapabilitiesUint64 returns a bitmask containing all capabilities set.
+func AllCapabilitiesUint64() uint64 {
+ var rv uint64
+ for _, cap := range capFromName {
+ rv |= bits.MaskOf64(int(cap))
+ }
+ return rv
+}
+
var capFromName = map[string]linux.Capability{
"CAP_CHOWN": linux.CAP_CHOWN,
"CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE,
@@ -327,39 +333,6 @@ func IsSupportedDevMount(m specs.Mount) bool {
return true
}
-const (
- // ContainerdContainerTypeAnnotation is the OCI annotation set by
- // containerd to indicate whether the container to create should have
- // its own sandbox or a container within an existing sandbox.
- ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
- // ContainerdContainerTypeContainer is the container type value
- // indicating the container should be created in an existing sandbox.
- ContainerdContainerTypeContainer = "container"
- // ContainerdContainerTypeSandbox is the container type value
- // indicating the container should be created in a new sandbox.
- ContainerdContainerTypeSandbox = "sandbox"
-
- // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
- // which sandbox the container should be created in when the container
- // is not the first container in the sandbox.
- ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
-)
-
-// ShouldCreateSandbox returns true if the spec indicates that a new sandbox
-// should be created for the container. If false, the container should be
-// started in an existing sandbox.
-func ShouldCreateSandbox(spec *specs.Spec) bool {
- t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]
- return !ok || t == ContainerdContainerTypeSandbox
-}
-
-// SandboxID returns the ID of the sandbox to join and whether an ID was found
-// in the spec.
-func SandboxID(spec *specs.Spec) (string, bool) {
- id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]
- return id, ok
-}
-
// WaitForReady waits for a process to become ready. The process is ready when
// the 'ready' function returns true. It continues to wait if 'ready' returns
// false. It returns error on timeout, if the process stops or if 'ready' fails.
@@ -398,13 +371,15 @@ func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) er
// - %TIMESTAMP%: is replaced with a timestamp using the following format:
// <yyyymmdd-hhmmss.uuuuuu>
// - %COMMAND%: is replaced with 'command'
-func DebugLogFile(logPattern, command string) (*os.File, error) {
+// - %TEST%: is replaced with 'test' (omitted by default)
+func DebugLogFile(logPattern, command, test string) (*os.File, error) {
if strings.HasSuffix(logPattern, "/") {
// Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>
logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%"
}
logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1)
+ logPattern = strings.Replace(logPattern, "%TEST%", test, -1)
dir := filepath.Dir(logPattern)
if err := os.MkdirAll(dir, 0775); err != nil {
@@ -503,3 +478,53 @@ func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
}
}
}
+
+// GetOOMScoreAdj reads the given process' oom_score_adj
+func GetOOMScoreAdj(pid int) (int, error) {
+ data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
+ if err != nil {
+ return 0, err
+ }
+ return strconv.Atoi(strings.TrimSpace(string(data)))
+}
+
+// GetParentPid gets the parent process ID of the specified PID.
+func GetParentPid(pid int) (int, error) {
+ data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
+ if err != nil {
+ return 0, err
+ }
+
+ var cpid string
+ var name string
+ var state string
+ var ppid int
+ // Parse after the binary name.
+ _, err = fmt.Sscanf(string(data),
+ "%v %v %v %d",
+ // cpid is ignored.
+ &cpid,
+ // name is ignored.
+ &name,
+ // state is ignored.
+ &state,
+ &ppid)
+
+ if err != nil {
+ return 0, err
+ }
+
+ return ppid, nil
+}
+
+// EnvVar looks for a varible value in the env slice assuming the following
+// format: "NAME=VALUE".
+func EnvVar(env []string, name string) (string, bool) {
+ prefix := name + "="
+ for _, e := range env {
+ if strings.HasPrefix(e, prefix) {
+ return strings.TrimPrefix(e, prefix), true
+ }
+ }
+ return "", false
+}
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index d44ebc906..c96ca2eb6 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -9,6 +9,7 @@ go_library(
importpath = "gvisor.dev/gvisor/runsc/testutil",
visibility = ["//:sandbox"],
deps = [
+ "//pkg/log",
"//runsc/boot",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 57ab73d97..9632776d2 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -25,13 +25,14 @@ import (
"fmt"
"io"
"io/ioutil"
- "log"
+ "math"
"math/rand"
"net/http"
"os"
"os/exec"
"os/signal"
"path/filepath"
+ "strconv"
"strings"
"sync"
"sync/atomic"
@@ -40,6 +41,7 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -149,13 +151,6 @@ func TestConfig() *boot.Config {
}
}
-// TestConfigWithRoot returns the default configuration to use in tests.
-func TestConfigWithRoot(rootDir string) *boot.Config {
- conf := TestConfig()
- conf.RootDir = rootDir
- return conf
-}
-
// NewSpecWithArgs creates a simple spec with the given args suitable for use
// in tests.
func NewSpecWithArgs(args ...string) *specs.Spec {
@@ -284,7 +279,7 @@ func WaitForHTTP(port int, timeout time.Duration) error {
url := fmt.Sprintf("http://localhost:%d/", port)
resp, err := c.Get(url)
if err != nil {
- log.Printf("Waiting %s: %v", url, err)
+ log.Infof("Waiting %s: %v", url, err)
return err
}
resp.Body.Close()
@@ -438,3 +433,44 @@ func IsStatic(filename string) (bool, error) {
}
return true, nil
}
+
+// TestBoundsForShard calculates the beginning and end indices for the test
+// based on the TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. The
+// returned ints are the beginning (inclusive) and end (exclusive) of the
+// subslice corresponding to the shard. If either of the env vars are not
+// present, then the function will return bounds that include all tests. If
+// there are more shards than there are tests, then the returned list may be
+// empty.
+func TestBoundsForShard(numTests int) (int, int, error) {
+ var (
+ begin = 0
+ end = numTests
+ )
+ indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS")
+ if indexStr == "" || totalStr == "" {
+ return begin, end, nil
+ }
+
+ // Parse index and total to ints.
+ shardIndex, err := strconv.Atoi(indexStr)
+ if err != nil {
+ return 0, 0, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err)
+ }
+ shardTotal, err := strconv.Atoi(totalStr)
+ if err != nil {
+ return 0, 0, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err)
+ }
+
+ // Calculate!
+ shardSize := int(math.Ceil(float64(numTests) / float64(shardTotal)))
+ begin = shardIndex * shardSize
+ end = ((shardIndex + 1) * shardSize)
+ if begin > numTests {
+ // Nothing to run.
+ return 0, 0, nil
+ }
+ if end > numTests {
+ end = numTests
+ }
+ return begin, end, nil
+}
diff --git a/runsc/version.go b/runsc/version.go
index ce0573a9b..ab9194b9d 100644
--- a/runsc/version.go
+++ b/runsc/version.go
@@ -15,4 +15,4 @@
package main
// version is set during linking.
-var version = ""
+var version = "VERSION_MISSING"
diff --git a/runsc/version_test.sh b/runsc/version_test.sh
new file mode 100755
index 000000000..cc0ca3f05
--- /dev/null
+++ b/runsc/version_test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euf -x -o pipefail
+
+readonly runsc="${TEST_SRCDIR}/__main__/runsc/linux_amd64_pure_stripped/runsc"
+readonly version=$($runsc --version)
+
+# Version should should not match VERSION, which is the default and which will
+# also appear if something is wrong with workspace_status.sh script.
+if [[ $version =~ "VERSION" ]]; then
+ echo "FAIL: Got bad version $version"
+ exit 1
+fi
+
+# Version should contain at least one number.
+if [[ ! $version =~ [0-9] ]]; then
+ echo "FAIL: Got bad version $version"
+ exit 1
+fi
+
+echo "PASS: Got OK version $version"
+exit 0