diff options
Diffstat (limited to 'runsc')
48 files changed, 1863 insertions, 706 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index e4e8e64a3..e5587421d 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -76,16 +76,24 @@ pkg_tar( genrule( name = "deb-version", + # Note that runsc must appear in the srcs parameter and not the tools + # parameter, otherwise it will not be stamped. This is reasonable, as tools + # may be encoded differently in the build graph (cached more aggressively + # because they are assumes to be hermetic). + srcs = [":runsc"], outs = ["version.txt"], cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@", stamp = 1, - tools = [":runsc"], ) pkg_deb( name = "runsc-debian", architecture = "amd64", data = ":debian-data", + # Note that the description_file will be flatten (all newlines removed), + # and therefore it is kept to a simple one-line description. The expected + # format for debian packages is "short summary\nLonger explanation of + # tool." and this is impossible with the flattening. description_file = "debian/description", homepage = "https://gvisor.dev/", maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>", diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 6fe2b57de..6226b63f8 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "compat.go", "compat_amd64.go", + "compat_arm64.go", "config.go", "controller.go", "debug.go", @@ -15,6 +16,8 @@ go_library( "fs.go", "limits.go", "loader.go", + "loader_amd64.go", + "loader_arm64.go", "network.go", "pprof.go", "strace.go", @@ -60,6 +63,7 @@ go_library( "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netlink/route", + "//pkg/sentry/socket/netlink/uevent", "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix", "//pkg/sentry/state", @@ -107,7 +111,6 @@ go_test( "//pkg/control/server", "//pkg/log", "//pkg/p9", - "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 07e35ab10..352e710d2 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -21,10 +21,8 @@ import ( "syscall" "github.com/golang/protobuf/proto" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" @@ -53,9 +51,9 @@ type compatEmitter struct { } func newCompatEmitter(logFD int) (*compatEmitter, error) { - nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64) + nameMap, ok := getSyscallNameMap() if !ok { - return nil, fmt.Errorf("amd64 Linux syscall table not found") + return nil, fmt.Errorf("Linux syscall table not found") } c := &compatEmitter{ @@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) { } func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { - regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 + regs := us.Registers c.mu.Lock() defer c.mu.Unlock() - sysnr := regs.OrigRax + sysnr := syscallNum(regs) tr := c.trackers[sysnr] if tr == nil { switch sysnr { - case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL: + case syscall.SYS_PRCTL: // args: cmd, ... tr = newArgsTracker(0) @@ -112,10 +110,14 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { tr = newArgsTracker(2) default: - tr = &onceTracker{} + tr = newArchArgsTracker(sysnr) + if tr == nil { + tr = &onceTracker{} + } } c.trackers[sysnr] = tr } + if tr.shouldReport(regs) { c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) tr.onReported(regs) @@ -139,10 +141,10 @@ func (c *compatEmitter) Close() error { // the syscall and arguments. type syscallTracker interface { // shouldReport returns true is the syscall should be reported. - shouldReport(regs *rpb.AMD64Registers) bool + shouldReport(regs *rpb.Registers) bool // onReported marks the syscall as reported. - onReported(regs *rpb.AMD64Registers) + onReported(regs *rpb.Registers) } // onceTracker reports only a single time, used for most syscalls. @@ -150,10 +152,45 @@ type onceTracker struct { reported bool } -func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool { +func (o *onceTracker) shouldReport(_ *rpb.Registers) bool { return !o.reported } -func (o *onceTracker) onReported(_ *rpb.AMD64Registers) { +func (o *onceTracker) onReported(_ *rpb.Registers) { o.reported = true } + +// argsTracker reports only once for each different combination of arguments. +// It's used for generic syscalls like ioctl to report once per 'cmd'. +type argsTracker struct { + // argsIdx is the syscall arguments to use as unique ID. + argsIdx []int + reported map[string]struct{} + count int +} + +func newArgsTracker(argIdx ...int) *argsTracker { + return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} +} + +// key returns the command based on the syscall argument index. +func (a *argsTracker) key(regs *rpb.Registers) string { + var rv string + for _, idx := range a.argsIdx { + rv += fmt.Sprintf("%d|", argVal(idx, regs)) + } + return rv +} + +func (a *argsTracker) shouldReport(regs *rpb.Registers) bool { + if a.count >= reportLimit { + return false + } + _, ok := a.reported[a.key(regs)] + return !ok +} + +func (a *argsTracker) onReported(regs *rpb.Registers) { + a.count++ + a.reported[a.key(regs)] = struct{}{} +} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 43cd0db94..42b0ca8b0 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -16,62 +16,81 @@ package boot import ( "fmt" + "syscall" + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" ) // reportLimit is the max number of events that should be reported per tracker. const reportLimit = 100 -// argsTracker reports only once for each different combination of arguments. -// It's used for generic syscalls like ioctl to report once per 'cmd'. -type argsTracker struct { - // argsIdx is the syscall arguments to use as unique ID. - argsIdx []int - reported map[string]struct{} - count int +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Amd64{ + Amd64: &rpb.AMD64Registers{}, + }, + } } -func newArgsTracker(argIdx ...int) *argsTracker { - return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} -} +func argVal(argIdx int, regs *rpb.Registers) uint32 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 -// cmd returns the command based on the syscall argument index. -func (a *argsTracker) key(regs *rpb.AMD64Registers) string { - var rv string - for _, idx := range a.argsIdx { - rv += fmt.Sprintf("%d|", argVal(idx, regs)) + switch argIdx { + case 0: + return uint32(amd64Regs.Rdi) + case 1: + return uint32(amd64Regs.Rsi) + case 2: + return uint32(amd64Regs.Rdx) + case 3: + return uint32(amd64Regs.R10) + case 4: + return uint32(amd64Regs.R8) + case 5: + return uint32(amd64Regs.R9) } - return rv + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 { +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + switch argIdx { case 0: - return uint32(regs.Rdi) + amd64Regs.Rdi = argVal case 1: - return uint32(regs.Rsi) + amd64Regs.Rsi = argVal case 2: - return uint32(regs.Rdx) + amd64Regs.Rdx = argVal case 3: - return uint32(regs.R10) + amd64Regs.R10 = argVal case 4: - return uint32(regs.R8) + amd64Regs.R8 = argVal case 5: - return uint32(regs.R9) + amd64Regs.R9 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } - panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool { - if a.count >= reportLimit { - return false - } - _, ok := a.reported[a.key(regs)] - return !ok +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.AMD64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + return amd64Regs.OrigRax } -func (a *argsTracker) onReported(regs *rpb.AMD64Registers) { - a.count++ - a.reported[a.key(regs)] = struct{}{} +func newArchArgsTracker(sysnr uint64) syscallTracker { + switch sysnr { + case syscall.SYS_ARCH_PRCTL: + // args: cmd, ... + return newArgsTracker(0) + } + return nil } diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go new file mode 100644 index 000000000..f784cd237 --- /dev/null +++ b/runsc/boot/compat_arm64.go @@ -0,0 +1,91 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +// reportLimit is the max number of events that should be reported per tracker. +const reportLimit = 100 + +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Arm64{ + Arm64: &rpb.ARM64Registers{}, + }, + } +} + +func argVal(argIdx int, regs *rpb.Registers) uint32 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + return uint32(arm64Regs.R0) + case 1: + return uint32(arm64Regs.R1) + case 2: + return uint32(arm64Regs.R2) + case 3: + return uint32(arm64Regs.R3) + case 4: + return uint32(arm64Regs.R4) + case 5: + return uint32(arm64Regs.R5) + } + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) +} + +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + arm64Regs.R0 = argVal + case 1: + arm64Regs.R1 = argVal + case 2: + arm64Regs.R2 = argVal + case 3: + arm64Regs.R3 = argVal + case 4: + arm64Regs.R4 = argVal + case 5: + arm64Regs.R5 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) + } +} + +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.ARM64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + return arm64Regs.R8 +} + +func newArchArgsTracker(sysnr uint64) syscallTracker { + // currently, no arch specific syscalls need to be handled here. + return nil +} diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index 388298d8d..839c5303b 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -16,8 +16,6 @@ package boot import ( "testing" - - rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ) func TestOnceTracker(t *testing.T) { @@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) { func TestArgsTracker(t *testing.T) { for _, tc := range []struct { - name string - idx []int - rdi1 uint64 - rdi2 uint64 - rsi1 uint64 - rsi2 uint64 - want bool + name string + idx []int + arg1_1 uint64 + arg1_2 uint64 + arg2_1 uint64 + arg2_2 uint64 + want bool }{ - {name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false}, - {name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false}, - {name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true}, - {name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true}, - {name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, - {name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false}, - {name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true}, + {name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false}, + {name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false}, + {name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true}, + {name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true}, + {name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false}, + {name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false}, + {name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true}, } { t.Run(tc.name, func(t *testing.T) { c := newArgsTracker(tc.idx...) - regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1} + regs := newRegs() + setArgVal(0, tc.arg1_1, regs) + setArgVal(1, tc.arg2_1, regs) if !c.shouldReport(regs) { t.Error("first call to shouldReport, got: false, want: true") } c.onReported(regs) - regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2 + setArgVal(0, tc.arg1_2, regs) + setArgVal(1, tc.arg2_2, regs) if got := c.shouldReport(regs); tc.want != got { t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want) } @@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) { func TestArgsTrackerLimit(t *testing.T) { c := newArgsTracker(0, 1) for i := 0; i < reportLimit; i++ { - regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)} + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, uint64(i), regs) if !c.shouldReport(regs) { t.Error("shouldReport before limit was reached, got: false, want: true") } @@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) { } // Should hit the count limit now. - regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456} + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, 123456, regs) if c.shouldReport(regs) { t.Error("shouldReport after limit was reached, got: true, want: false") } diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 01a29e8d5..a878bc2ce 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -178,8 +178,11 @@ type Config struct { // capabilities. EnableRaw bool - // GSO indicates that generic segmentation offload is enabled. - GSO bool + // HardwareGSO indicates that hardware segmentation offload is enabled. + HardwareGSO bool + + // SoftwareGSO indicates that software segmentation offload is enabled. + SoftwareGSO bool // LogPackets indicates that all network packets should be logged. LogPackets bool @@ -247,6 +250,12 @@ type Config struct { // multiple tests are run in parallel, since there is no way to pass // parameters to the runtime from docker. TestOnlyTestNameEnv string + + // CPUNumFromQuota sets CPU number count to available CPU quota, using + // least integer value greater than or equal to quota. + // + // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. + CPUNumFromQuota bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -275,8 +284,13 @@ func (c *Config) ToFlags() []string { "--rootless=" + strconv.FormatBool(c.Rootless), "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), + "--gso=" + strconv.FormatBool(c.HardwareGSO), + "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), } + if c.CPUNumFromQuota { + f = append(f, "--cpu-num-from-quota") + } // Only include these if set since it is never to be used by users. if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { f = append(f, "--TESTONLY-unsafe-nonroot=true") diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 5f644b57e..9c9e94864 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -51,7 +51,7 @@ const ( ContainerEvent = "containerManager.Event" // ContainerExecuteAsync is the URPC endpoint for executing a command in a - // container.. + // container. ContainerExecuteAsync = "containerManager.ExecuteAsync" // ContainerPause pauses the container. @@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) { srv.Register(&debug{}) srv.Register(&control.Logging{}) if l.conf.ProfileEnable { - srv.Register(&control.Profile{}) + srv.Register(&control.Profile{ + Kernel: l.k, + }) } return &controller{ @@ -161,7 +163,7 @@ func newController(fd int, l *Loader) (*controller, error) { }, nil } -// containerManager manages sandboes containers. +// containerManager manages sandbox containers. type containerManager struct { // startChan is used to signal when the root container process should // be started. @@ -380,7 +382,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { } // Since we have a new kernel we also must make a new watchdog. - dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dog := watchdog.New(k, dogOpts) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index f5509b6b7..3a9dcfc04 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -6,6 +6,8 @@ go_library( name = "filter", srcs = [ "config.go", + "config_amd64.go", + "config_arm64.go", "extra_filters.go", "extra_filters_msan.go", "extra_filters_race.go", diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index efbf1fd4a..4fb9adca6 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -26,10 +26,6 @@ import ( // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. var allowedSyscalls = seccomp.SyscallRules{ - syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_GET_FS)}, - {seccomp.AllowValue(linux.ARCH_SET_FS)}, - }, syscall.SYS_CLOCK_GETTIME: {}, syscall.SYS_CLONE: []seccomp.Rule{ { @@ -42,9 +38,15 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.CLONE_THREAD), }, }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, - syscall.SYS_DUP2: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_DUP3: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ @@ -132,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(syscall.SOL_SOCKET), seccomp.AllowValue(syscall.SO_SNDBUF), }, - { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), - }, }, syscall.SYS_GETTID: {}, syscall.SYS_GETTIMEOFDAY: {}, @@ -243,6 +240,15 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(0), }, }, + unix.SYS_SENDMMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), + }, + }, syscall.SYS_RESTART_SYSCALL: {}, syscall.SYS_RT_SIGACTION: {}, syscall.SYS_RT_SIGPROCMASK: {}, @@ -306,6 +312,26 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + }, + { + seccomp.AllowAny{}, seccomp.AllowValue(syscall.SOL_IPV6), seccomp.AllowValue(syscall.IPV6_V6ONLY), }, @@ -407,6 +433,34 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.AllowAny{}, seccomp.AllowValue(4), }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go new file mode 100644 index 000000000..5335ff82c --- /dev/null +++ b/runsc/boot/filter/config_amd64.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], + seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, + seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, + ) +} diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go new file mode 100644 index 000000000..7fa9bbda3 --- /dev/null +++ b/runsc/boot/filter/config_arm64.go @@ -0,0 +1,21 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package filter + +// Reserve for future customization. +func init() { +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 76036c147..421ccd255 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "path" "path/filepath" "sort" "strconv" @@ -52,7 +51,7 @@ const ( rootDevice = "9pfs-/" // MountPrefix is the annotation prefix for mount hints. - MountPrefix = "gvisor.dev/spec/mount" + MountPrefix = "dev.gvisor.spec.mount." // Filesystems that runsc supports. bind = "bind" @@ -465,6 +464,13 @@ func (m *mountHint) checkCompatible(mount specs.Mount) error { return nil } +func (m *mountHint) fileAccessType() FileAccessType { + if m.share == container { + return FileAccessExclusive + } + return FileAccessShared +} + func filterUnsupportedOptions(mount specs.Mount) []string { rv := make([]string, 0, len(mount.Options)) for _, o := range mount.Options { @@ -483,14 +489,15 @@ type podMountHints struct { func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { mnts := make(map[string]*mountHint) for k, v := range spec.Annotations { - // Look for 'gvisor.dev/spec/mount' annotations and parse them. + // Look for 'dev.gvisor.spec.mount' annotations and parse them. if strings.HasPrefix(k, MountPrefix) { - parts := strings.Split(k, "/") - if len(parts) != 5 { + // Remove the prefix and split the rest. + parts := strings.Split(k[len(MountPrefix):], ".") + if len(parts) != 2 { return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) } - name := parts[3] - if len(name) == 0 || path.Clean(name) != name { + name := parts[0] + if len(name) == 0 { return nil, fmt.Errorf("invalid mount name: %s", name) } mnt := mnts[name] @@ -498,7 +505,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { mnt = &mountHint{name: name} mnts[name] = mnt } - if err := mnt.setField(parts[4], v); err != nil { + if err := mnt.setField(parts[1], v); err != nil { return nil, err } } @@ -568,6 +575,11 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin func (c *containerMounter) processHints(conf *Config) error { ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs { + continue + } log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) inode, err := c.mountSharedMaster(ctx, conf, hint) if err != nil { @@ -764,8 +776,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( case bind: fd := c.fds.remove() fsName = "9p" - // Non-root bind mounts are always shared. - opts = p9MountOptions(fd, FileAccessShared) + opts = p9MountOptions(fd, c.getMountAccessType(m)) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly @@ -778,6 +789,14 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( return fsName, opts, useOverlay, nil } +func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { + if hint := c.hints.findMount(mount); hint != nil { + return hint.fileAccessType() + } + // Non-root bind mounts are always shared if no hints were provided. + return FileAccessShared +} + // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. @@ -837,7 +856,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns return fmt.Errorf("mount %q error: %v", m.Destination, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts) return nil } diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index 49ab34b33..912037075 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -15,7 +15,6 @@ package boot import ( - "path" "reflect" "strings" "testing" @@ -26,19 +25,19 @@ import ( func TestPodMountHintsHappy(t *testing.T) { spec := &specs.Spec{ Annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", - path.Join(MountPrefix, "mount2", "source"): "bar", - path.Join(MountPrefix, "mount2", "type"): "bind", - path.Join(MountPrefix, "mount2", "share"): "container", - path.Join(MountPrefix, "mount2", "options"): "rw,private", + MountPrefix + "mount2.source": "bar", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", + MountPrefix + "mount2.options": "rw,private", }, } podHints, err := newPodMountHints(spec) if err != nil { - t.Errorf("newPodMountHints failed: %v", err) + t.Fatalf("newPodMountHints failed: %v", err) } // Check that fields were set correctly. @@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) { { name: "too short", annotations: map[string]string{ - path.Join(MountPrefix, "mount1"): "foo", + MountPrefix + "mount1": "foo", }, error: "invalid mount annotation", }, { name: "no name", annotations: map[string]string{ - MountPrefix + "//source": "foo", + MountPrefix + ".source": "foo", }, error: "invalid mount name", }, { name: "missing source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", }, error: "source field", }, { name: "missing type", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.share": "pod", }, error: "type field", }, { name: "missing share", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", }, error: "share field", }, { name: "invalid field name", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "invalid"): "foo", + MountPrefix + "mount1.invalid": "foo", }, error: "invalid mount annotation", }, { name: "invalid source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", }, error: "source cannot be empty", }, { name: "invalid type", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "invalid-type", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "invalid-type", + MountPrefix + "mount1.share": "pod", }, error: "invalid type", }, { name: "invalid share", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "invalid-share", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "invalid-share", }, error: "invalid share", }, { name: "invalid options", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", - path.Join(MountPrefix, "mount1", "options"): "invalid-option", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + MountPrefix + "mount1.options": "invalid-option", }, error: "unknown mount option", }, { name: "duplicate source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", - path.Join(MountPrefix, "mount2", "source"): "foo", - path.Join(MountPrefix, "mount2", "type"): "bind", - path.Join(MountPrefix, "mount2", "share"): "container", + MountPrefix + "mount2.source": "foo", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", }, error: "have the same mount source", }, @@ -191,3 +190,61 @@ func TestPodMountHintsErrors(t *testing.T) { }) } } + +func TestGetMountAccessType(t *testing.T) { + const source = "foo" + for _, tst := range []struct { + name string + annotations map[string]string + want FileAccessType + }{ + { + name: "container=exclusive", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessExclusive, + }, + { + name: "pod=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "pod", + }, + want: FileAccessShared, + }, + { + name: "shared=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "shared", + }, + want: FileAccessShared, + }, + { + name: "default=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source + "mismatch", + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessShared, + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err != nil { + t.Fatalf("newPodMountHints failed: %v", err) + } + mounter := containerMounter{hints: podHints} + if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want { + t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got) + } + }) + } +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c8e5e86ee..bc1d0c1bb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -43,7 +43,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/sighandling" - slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/watchdog" @@ -65,6 +64,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) @@ -93,10 +93,6 @@ type Loader struct { // spec is the base configuration for the root container. spec *specs.Spec - // startSignalForwarding enables forwarding of signals to the sandboxed - // container. It should be called after the init process is loaded. - startSignalForwarding func() func() - // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -146,9 +142,6 @@ type execProcess struct { func init() { // Initialize the random number generator. mrand.Seed(gtime.Now().UnixNano()) - - // Register the global syscall table. - kernel.RegisterSyscallTable(slinux.AMD64) } // Args are the arguments for New(). @@ -232,7 +225,7 @@ func New(args Args) (*Loader, error) { // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). - networkStack, err := newEmptyNetworkStack(args.Conf, k) + networkStack, err := newEmptyNetworkStack(args.Conf, k, k) if err != nil { return nil, fmt.Errorf("creating network: %v", err) } @@ -300,7 +293,9 @@ func New(args Args) (*Loader, error) { } // Create a watchdog. - dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction + dog := watchdog.New(k, dogOpts) procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) if err != nil { @@ -337,29 +332,6 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("ignore child stop signals failed: %v", err) } - // Handle signals by forwarding them to the root container process - // (except for panic signal, which should cause a panic). - l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) { - // Panic signal should cause a panic. - if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) { - panic("Signal-induced panic") - } - - // Otherwise forward to root container. - deliveryMode := DeliverToProcess - if args.Console { - // Since we are running with a console, we should - // forward the signal to the foreground process group - // so that job control signals like ^C can be handled - // properly. - deliveryMode = DeliverToForegroundProcessGroup - } - log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) - if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil { - log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err) - } - }) - // Create the control server using the provided FD. // // This must be done *after* we have initialized the kernel since the @@ -567,8 +539,27 @@ func (l *Loader) run() error { ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup()) } - // Start signal forwarding only after an init process is created. - l.stopSignalForwarding = l.startSignalForwarding() + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { + // Panic signal should cause a panic. + if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if l.console { + // Since we are running with a console, we should forward the signal to + // the foreground process group so that job control signals like ^C can + // be handled properly. + deliveryMode = DeliverToForegroundProcessGroup + } + log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) + if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err) + } + }) log.Infof("Process should have started...") l.watchdog.Start() @@ -905,7 +896,7 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { +func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { switch conf.Network { case NetworkHost: return hostinet.NewStack(), nil @@ -922,7 +913,8 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - UnassociatedFactory: raw.EndpointFactory{}, + RawFactory: raw.EndpointFactory{}, + UniqueID: uniqueID, })} // Enable SACK Recovery. diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go new file mode 100644 index 000000000..b9669f2ac --- /dev/null +++ b/runsc/boot/loader_amd64.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +func init() { + // Register the global syscall table. + kernel.RegisterSyscallTable(linux.AMD64) +} diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go new file mode 100644 index 000000000..cf64d28c8 --- /dev/null +++ b/runsc/boot/loader_arm64.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +func init() { + // Register the global syscall table. + kernel.RegisterSyscallTable(linux.ARM64) +} diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 32cba5ac1..dd4926bb9 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -50,12 +50,13 @@ type DefaultRoute struct { // FDBasedLink configures an fd-based link. type FDBasedLink struct { - Name string - MTU int - Addresses []net.IP - Routes []Route - GSOMaxSize uint32 - LinkAddress net.HardwareAddr + Name string + MTU int + Addresses []net.IP + Routes []Route + GSOMaxSize uint32 + SoftwareGSOEnabled bool + LinkAddress net.HardwareAddr // NumChannels controls how many underlying FD's are to be used to // create this endpoint. @@ -79,7 +80,8 @@ type CreateLinksAndRoutesArgs struct { LoopbackLinks []LoopbackLink FDBasedLinks []FDBasedLink - DefaultGateway DefaultRoute + Defaultv4Gateway DefaultRoute + Defaultv6Gateway DefaultRoute } // Empty returns true if route hasn't been set. @@ -121,10 +123,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct nicID++ nicids[link.Name] = nicID - ep := loopback.New() + linkEP := loopback.New() log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) - if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil { return err } @@ -156,13 +158,14 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } mac := tcpip.LinkAddress(link.LinkAddress) - ep, err := fdbased.New(&fdbased.Options{ + linkEP, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, Address: mac, PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, + SoftwareGSOEnabled: link.SoftwareGSOEnabled, RXChecksumOffload: true, }) if err != nil { @@ -170,7 +173,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) - if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { return err } @@ -184,12 +187,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } } - if !args.DefaultGateway.Route.Empty() { - nicID, ok := nicids[args.DefaultGateway.Name] + if !args.Defaultv4Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv4Gateway.Name] if !ok { - return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name) + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) } - route, err := args.DefaultGateway.Route.toTcpipRoute(nicID) + route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + + if !args.Defaultv6Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv6Gateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) + } + route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) if err != nil { return err } @@ -206,11 +221,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error { if loopback { if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil { - return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err) + return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, ep, err) } } else { if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil { - return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err) + return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, ep, err) } } diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index ab3a25b9b..653ca5f52 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -101,6 +101,14 @@ func getValue(path, name string) (string, error) { return string(out), nil } +func getInt(path, name string) (int, error) { + s, err := getValue(path, name) + if err != nil { + return 0, err + } + return strconv.Atoi(strings.TrimSpace(s)) +} + // fillFromAncestor sets the value of a cgroup file from the first ancestor // that has content. It does nothing if the file in 'path' has already been set. func fillFromAncestor(path string) (string, error) { @@ -323,6 +331,22 @@ func (c *Cgroup) Join() (func(), error) { return undo, nil } +func (c *Cgroup) CPUQuota() (float64, error) { + path := c.makePath("cpu") + quota, err := getInt(path, "cpu.cfs_quota_us") + if err != nil { + return -1, err + } + period, err := getInt(path, "cpu.cfs_period_us") + if err != nil { + return -1, err + } + if quota <= 0 || period <= 0 { + return -1, err + } + return float64(quota) / float64(period), nil +} + // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. func (c *Cgroup) NumCPU() (int, error) { path := c.makePath("cpuset") diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index 7313e473f..f37415810 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -32,16 +32,17 @@ import ( // Debug implements subcommands.Command for the "debug" command. type Debug struct { - pid int - stacks bool - signal int - profileHeap string - profileCPU string - profileDelay int - trace string - strace string - logLevel string - logPackets string + pid int + stacks bool + signal int + profileHeap string + profileCPU string + trace string + strace string + logLevel string + logPackets string + duration time.Duration + ps bool } // Name implements subcommands.Command. @@ -65,12 +66,13 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") - f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile") + f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles") f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`) f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).") f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.") + f.BoolVar(&d.ps, "ps", false, "lists processes") } // Execute implements subcommands.Command.Execute. @@ -163,7 +165,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err := c.Sandbox.StartCPUProfile(f); err != nil { return Errorf(err.Error()) } - log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU) + log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU) } if d.trace != "" { delay = true @@ -181,8 +183,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err := c.Sandbox.StartTrace(f); err != nil { return Errorf(err.Error()) } - log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace) - + log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace) } if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 { @@ -241,9 +242,20 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof("Logging options changed") } + if d.ps { + pList, err := c.Processes() + if err != nil { + Fatalf("getting processes for container: %v", err) + } + o, err := control.ProcessListToJSON(pList) + if err != nil { + Fatalf("generating JSON: %v", err) + } + log.Infof(o) + } if delay { - time.Sleep(time.Duration(d.profileDelay) * time.Second) + time.Sleep(d.duration) } return subcommands.ExitSuccess diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 26d1cd5ab..2bd12120d 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "container.go", "hook.go", + "state_file.go", "status.go", ], importpath = "gvisor.dev/gvisor/runsc/container", diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 7d67c3a75..5ed131a7f 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -28,6 +28,7 @@ import ( "github.com/kr/pty" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/testutil" @@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) { // Make sure all the processes are running. expectedPL := []*control.Process{ // Root container process. - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, // Bash from exec process. - {PID: 2, Cmd: "bash"}, + {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) @@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for it to start. Sleep's PPID is bash's PID. - expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"}) + expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}) if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) } @@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Wait for bash to start. expectedPL := []*control.Process{ - {PID: 1, Cmd: "bash"}, + {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(c, expectedPL); err != nil { t.Fatal(err) @@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for sleep to start. - expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"}) + expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}}) if err := waitForProcessList(c, expectedPL); err != nil { t.Fatal(err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index a721c1c31..68782c4be 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -17,13 +17,11 @@ package container import ( "context" - "encoding/json" "fmt" "io/ioutil" "os" "os/exec" "os/signal" - "path/filepath" "regexp" "strconv" "strings" @@ -31,7 +29,6 @@ import ( "time" "github.com/cenkalti/backoff" - "github.com/gofrs/flock" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" @@ -41,17 +38,6 @@ import ( "gvisor.dev/gvisor/runsc/specutils" ) -const ( - // metadataFilename is the name of the metadata file relative to the - // container root directory that holds sandbox metadata. - metadataFilename = "meta.json" - - // metadataLockFilename is the name of a lock file in the container - // root directory that is used to prevent concurrent modifications to - // the container state and metadata. - metadataLockFilename = "meta.lock" -) - // validateID validates the container id. func validateID(id string) error { // See libcontainer/factory_linux.go. @@ -99,11 +85,6 @@ type Container struct { // BundleDir is the directory containing the container bundle. BundleDir string `json:"bundleDir"` - // Root is the directory containing the container metadata file. If this - // container is the root container, Root and RootContainerDir will be the - // same. - Root string `json:"root"` - // CreatedAt is the time the container was created. CreatedAt time.Time `json:"createdAt"` @@ -121,21 +102,24 @@ type Container struct { // be 0 if the gofer has been killed. GoferPid int `json:"goferPid"` + // Sandbox is the sandbox this container is running in. It's set when the + // container is created and reset when the sandbox is destroyed. + Sandbox *sandbox.Sandbox `json:"sandbox"` + + // Saver handles load from/save to the state file safely from multiple + // processes. + Saver StateFile `json:"saver"` + + // + // Fields below this line are not saved in the state file and will not + // be preserved across commands. + // + // goferIsChild is set if a gofer process is a child of the current process. // // This field isn't saved to json, because only a creator of a gofer // process will have it as a child process. goferIsChild bool - - // Sandbox is the sandbox this container is running in. It's set when the - // container is created and reset when the sandbox is destroyed. - Sandbox *sandbox.Sandbox `json:"sandbox"` - - // RootContainerDir is the root directory containing the metadata file of the - // sandbox root container. It's used to lock in order to serialize creating - // and deleting this Container's metadata directory. If this container is the - // root container, this is the same as Root. - RootContainerDir string } // loadSandbox loads all containers that belong to the sandbox with the given @@ -166,43 +150,35 @@ func loadSandbox(rootDir, id string) ([]*Container, error) { return containers, nil } -// Load loads a container with the given id from a metadata file. id may be an -// abbreviation of the full container id, in which case Load loads the -// container to which id unambiguously refers to. -// Returns ErrNotExist if container doesn't exist. -func Load(rootDir, id string) (*Container, error) { - log.Debugf("Load container %q %q", rootDir, id) - if err := validateID(id); err != nil { +// Load loads a container with the given id from a metadata file. partialID may +// be an abbreviation of the full container id, in which case Load loads the +// container to which id unambiguously refers to. Returns ErrNotExist if +// container doesn't exist. +func Load(rootDir, partialID string) (*Container, error) { + log.Debugf("Load container %q %q", rootDir, partialID) + if err := validateID(partialID); err != nil { return nil, fmt.Errorf("validating id: %v", err) } - cRoot, err := findContainerRoot(rootDir, id) + id, err := findContainerID(rootDir, partialID) if err != nil { // Preserve error so that callers can distinguish 'not found' errors. return nil, err } - // Lock the container metadata to prevent other runsc instances from - // writing to it while we are reading it. - unlock, err := lockContainerMetadata(cRoot) - if err != nil { - return nil, err + state := StateFile{ + RootDir: rootDir, + ID: id, } - defer unlock() + defer state.close() - // Read the container metadata file and create a new Container from it. - metaFile := filepath.Join(cRoot, metadataFilename) - metaBytes, err := ioutil.ReadFile(metaFile) - if err != nil { + c := &Container{} + if err := state.load(c); err != nil { if os.IsNotExist(err) { // Preserve error so that callers can distinguish 'not found' errors. return nil, err } - return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err) - } - var c Container - if err := json.Unmarshal(metaBytes, &c); err != nil { - return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err) + return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err) } // If the status is "Running" or "Created", check that the sandbox @@ -223,57 +199,37 @@ func Load(rootDir, id string) (*Container, error) { } } - return &c, nil + return c, nil } -func findContainerRoot(rootDir, partialID string) (string, error) { +func findContainerID(rootDir, partialID string) (string, error) { // Check whether the id fully specifies an existing container. - cRoot := filepath.Join(rootDir, partialID) - if _, err := os.Stat(cRoot); err == nil { - return cRoot, nil + stateFile := buildStatePath(rootDir, partialID) + if _, err := os.Stat(stateFile); err == nil { + return partialID, nil } // Now see whether id could be an abbreviation of exactly 1 of the // container ids. If id is ambiguous (it could match more than 1 // container), it is an error. - cRoot = "" ids, err := List(rootDir) if err != nil { return "", err } + rv := "" for _, id := range ids { if strings.HasPrefix(id, partialID) { - if cRoot != "" { - return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, cRoot, id) + if rv != "" { + return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id) } - cRoot = id + rv = id } } - if cRoot == "" { + if rv == "" { return "", os.ErrNotExist } - log.Debugf("abbreviated id %q resolves to full id %q", partialID, cRoot) - return filepath.Join(rootDir, cRoot), nil -} - -// List returns all container ids in the given root directory. -func List(rootDir string) ([]string, error) { - log.Debugf("List containers %q", rootDir) - fs, err := ioutil.ReadDir(rootDir) - if err != nil { - return nil, fmt.Errorf("reading dir %q: %v", rootDir, err) - } - var out []string - for _, f := range fs { - // Filter out directories that do no belong to a container. - cid := f.Name() - if validateID(cid) == nil { - if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil { - out = append(out, f.Name()) - } - } - } - return out, nil + log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv) + return rv, nil } // Args is used to configure a new container. @@ -316,44 +272,34 @@ func New(conf *boot.Config, args Args) (*Container, error) { return nil, err } - unlockRoot, err := maybeLockRootContainer(args.Spec, conf.RootDir) - if err != nil { - return nil, err + if err := os.MkdirAll(conf.RootDir, 0711); err != nil { + return nil, fmt.Errorf("creating container root directory: %v", err) } - defer unlockRoot() + + c := &Container{ + ID: args.ID, + Spec: args.Spec, + ConsoleSocket: args.ConsoleSocket, + BundleDir: args.BundleDir, + Status: Creating, + CreatedAt: time.Now(), + Owner: os.Getenv("USER"), + Saver: StateFile{ + RootDir: conf.RootDir, + ID: args.ID, + }, + } + // The Cleanup object cleans up partially created containers when an error + // occurs. Any errors occurring during cleanup itself are ignored. + cu := specutils.MakeCleanup(func() { _ = c.Destroy() }) + defer cu.Clean() // Lock the container metadata file to prevent concurrent creations of // containers with the same id. - containerRoot := filepath.Join(conf.RootDir, args.ID) - unlock, err := lockContainerMetadata(containerRoot) - if err != nil { + if err := c.Saver.lockForNew(); err != nil { return nil, err } - defer unlock() - - // Check if the container already exists by looking for the metadata - // file. - if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil { - return nil, fmt.Errorf("container with id %q already exists", args.ID) - } else if !os.IsNotExist(err) { - return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err) - } - - c := &Container{ - ID: args.ID, - Spec: args.Spec, - ConsoleSocket: args.ConsoleSocket, - BundleDir: args.BundleDir, - Root: containerRoot, - Status: Creating, - CreatedAt: time.Now(), - Owner: os.Getenv("USER"), - RootContainerDir: conf.RootDir, - } - // The Cleanup object cleans up partially created containers when an error occurs. - // Any errors occuring during cleanup itself are ignored. - cu := specutils.MakeCleanup(func() { _ = c.Destroy() }) - defer cu.Clean() + defer c.Saver.unlock() // If the metadata annotations indicate that this container should be // started in an existing sandbox, we must do so. The metadata will @@ -431,7 +377,7 @@ func New(conf *boot.Config, args Args) (*Container, error) { c.changeStatus(Created) // Save the metadata file. - if err := c.save(); err != nil { + if err := c.saveLocked(); err != nil { return nil, err } @@ -451,17 +397,12 @@ func New(conf *boot.Config, args Args) (*Container, error) { func (c *Container) Start(conf *boot.Config) error { log.Debugf("Start container %q", c.ID) - unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir) - if err != nil { + if err := c.Saver.lock(); err != nil { return err } - defer unlockRoot() + unlock := specutils.MakeCleanup(func() { c.Saver.unlock() }) + defer unlock.Clean() - unlock, err := c.lock() - if err != nil { - return err - } - defer unlock() if err := c.requireStatus("start", Created); err != nil { return err } @@ -509,14 +450,15 @@ func (c *Container) Start(conf *boot.Config) error { } c.changeStatus(Running) - if err := c.save(); err != nil { + if err := c.saveLocked(); err != nil { return err } - // Adjust the oom_score_adj for sandbox. This must be done after - // save(). - err = adjustSandboxOOMScoreAdj(c.Sandbox, c.RootContainerDir, false) - if err != nil { + // Release lock before adjusting OOM score because the lock is acquired there. + unlock.Clean() + + // Adjust the oom_score_adj for sandbox. This must be done after saveLocked(). + if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil { return err } @@ -529,11 +471,10 @@ func (c *Container) Start(conf *boot.Config) error { // to restore a container from its state file. func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error { log.Debugf("Restore container %q", c.ID) - unlock, err := c.lock() - if err != nil { + if err := c.Saver.lock(); err != nil { return err } - defer unlock() + defer c.Saver.unlock() if err := c.requireStatus("restore", Created); err != nil { return err @@ -551,7 +492,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str return err } c.changeStatus(Running) - return c.save() + return c.saveLocked() } // Run is a helper that calls Create + Start + Wait. @@ -711,11 +652,10 @@ func (c *Container) Checkpoint(f *os.File) error { // The call only succeeds if the container's status is created or running. func (c *Container) Pause() error { log.Debugf("Pausing container %q", c.ID) - unlock, err := c.lock() - if err != nil { + if err := c.Saver.lock(); err != nil { return err } - defer unlock() + defer c.Saver.unlock() if c.Status != Created && c.Status != Running { return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status) @@ -725,18 +665,17 @@ func (c *Container) Pause() error { return fmt.Errorf("pausing container: %v", err) } c.changeStatus(Paused) - return c.save() + return c.saveLocked() } // Resume unpauses the container and its kernel. // The call only succeeds if the container's status is paused. func (c *Container) Resume() error { log.Debugf("Resuming container %q", c.ID) - unlock, err := c.lock() - if err != nil { + if err := c.Saver.lock(); err != nil { return err } - defer unlock() + defer c.Saver.unlock() if c.Status != Paused { return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) @@ -745,7 +684,7 @@ func (c *Container) Resume() error { return fmt.Errorf("resuming container: %v", err) } c.changeStatus(Running) - return c.save() + return c.saveLocked() } // State returns the metadata of the container. @@ -773,6 +712,17 @@ func (c *Container) Processes() ([]*control.Process, error) { func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) + if err := c.Saver.lock(); err != nil { + return err + } + defer func() { + c.Saver.unlock() + c.Saver.close() + }() + + // Stored for later use as stop() sets c.Sandbox to nil. + sb := c.Sandbox + // We must perform the following cleanup steps: // * stop the container and gofer processes, // * remove the container filesystem on the host, and @@ -782,48 +732,43 @@ func (c *Container) Destroy() error { // do our best to perform all of the cleanups. Hence, we keep a slice // of errors return their concatenation. var errs []string - - unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir) - if err != nil { - return err - } - defer unlock() - - // Stored for later use as stop() sets c.Sandbox to nil. - sb := c.Sandbox - if err := c.stop(); err != nil { err = fmt.Errorf("stopping container: %v", err) log.Warningf("%v", err) errs = append(errs, err.Error()) } - if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { - err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err) + if err := c.Saver.destroy(); err != nil { + err = fmt.Errorf("deleting container state files: %v", err) log.Warningf("%v", err) errs = append(errs, err.Error()) } c.changeStatus(Stopped) - // Adjust oom_score_adj for the sandbox. This must be done after the - // container is stopped and the directory at c.Root is removed. - // We must test if the sandbox is nil because Destroy should be - // idempotent. - if sb != nil { - if err := adjustSandboxOOMScoreAdj(sb, c.RootContainerDir, true); err != nil { + // Adjust oom_score_adj for the sandbox. This must be done after the container + // is stopped and the directory at c.Root is removed. Adjustment can be + // skipped if the root container is exiting, because it brings down the entire + // sandbox. + // + // Use 'sb' to tell whether it has been executed before because Destroy must + // be idempotent. + if sb != nil && !isRoot(c.Spec) { + if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil { errs = append(errs, err.Error()) } } // "If any poststop hook fails, the runtime MUST log a warning, but the - // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. - // Based on the OCI, "The post-stop hooks MUST be called after the container is - // deleted but before the delete operation returns" + // remaining hooks and lifecycle continue as if the hook had + // succeeded" - OCI spec. + // + // Based on the OCI, "The post-stop hooks MUST be called after the container + // is deleted but before the delete operation returns" // Run it here to: // 1) Conform to the OCI. - // 2) Make sure it only runs once, because the root has been deleted, the container - // can't be loaded again. + // 2) Make sure it only runs once, because the root has been deleted, the + // container can't be loaded again. if c.Spec.Hooks != nil { executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } @@ -834,18 +779,13 @@ func (c *Container) Destroy() error { return fmt.Errorf(strings.Join(errs, "\n")) } -// save saves the container metadata to a file. +// saveLocked saves the container metadata to a file. // // Precondition: container must be locked with container.lock(). -func (c *Container) save() error { +func (c *Container) saveLocked() error { log.Debugf("Save container %q", c.ID) - metaFile := filepath.Join(c.Root, metadataFilename) - meta, err := json.Marshal(c) - if err != nil { - return fmt.Errorf("invalid container metadata: %v", err) - } - if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { - return fmt.Errorf("writing container metadata: %v", err) + if err := c.Saver.saveLocked(c); err != nil { + return fmt.Errorf("saving container metadata: %v", err) } return nil } @@ -1106,50 +1046,8 @@ func (c *Container) requireStatus(action string, statuses ...Status) error { return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status) } -// lock takes a file lock on the container metadata lock file. -func (c *Container) lock() (func() error, error) { - return lockContainerMetadata(filepath.Join(c.Root, c.ID)) -} - -// lockContainerMetadata takes a file lock on the metadata lock file in the -// given container root directory. -func lockContainerMetadata(containerRootDir string) (func() error, error) { - if err := os.MkdirAll(containerRootDir, 0711); err != nil { - return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err) - } - f := filepath.Join(containerRootDir, metadataLockFilename) - l := flock.NewFlock(f) - if err := l.Lock(); err != nil { - return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", f, err) - } - return l.Unlock, nil -} - -// maybeLockRootContainer locks the sandbox root container. It is used to -// prevent races to create and delete child container sandboxes. -func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) { - if isRoot(spec) { - return func() error { return nil }, nil - } - - sbid, ok := specutils.SandboxID(spec) - if !ok { - return nil, fmt.Errorf("no sandbox ID found when locking root container") - } - sb, err := Load(rootDir, sbid) - if err != nil { - return nil, err - } - - unlock, err := sb.lock() - if err != nil { - return nil, err - } - return unlock, nil -} - func isRoot(spec *specs.Spec) bool { - return specutils.ShouldCreateSandbox(spec) + return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer } // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute @@ -1170,7 +1068,12 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error { func (c *Container) adjustGoferOOMScoreAdj() error { if c.GoferPid != 0 && c.Spec.Process.OOMScoreAdj != nil { if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil { - return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err) + // Ignore NotExist error because it can be returned when the sandbox + // exited while OOM score was being adjusted. + if !os.IsNotExist(err) { + return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err) + } + log.Warningf("Gofer process (%d) not found setting oom_score_adj", c.GoferPid) } } @@ -1198,7 +1101,7 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) // Get the lowest score for all containers. var lowScore int scoreFound := false - if len(containers) == 1 && len(containers[0].Spec.Annotations[specutils.ContainerdContainerTypeAnnotation]) == 0 { + if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified { // This is a single-container sandbox. Set the oom_score_adj to // the value specified in the OCI bundle. if containers[0].Spec.Process.OOMScoreAdj != nil { @@ -1214,7 +1117,7 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) // // We will use OOMScoreAdj in the single-container case where the // containerd container-type annotation is not present. - if container.Spec.Annotations[specutils.ContainerdContainerTypeAnnotation] == specutils.ContainerdContainerTypeSandbox { + if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { continue } @@ -1252,7 +1155,12 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) // Set the lowest of all containers oom_score_adj to the sandbox. if err := setOOMScoreAdj(s.Pid, lowScore); err != nil { - return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err) + // Ignore NotExist error because it can be returned when the sandbox + // exited while OOM score was being adjusted. + if !os.IsNotExist(err) { + return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err) + } + log.Warningf("Sandbox process (%d) not found setting oom_score_adj", s.Pid) } return nil diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index c4c56b2e0..c10f85992 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" @@ -52,8 +53,9 @@ func waitForProcessList(cont *Container, want []*control.Process) error { err = fmt.Errorf("error getting process data from container: %v", err) return &backoff.PermanentError{Err: err} } - if !procListsEqual(got, want) { - return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) + if r, err := procListsEqual(got, want); !r { + return fmt.Errorf("container got process list: %s, want: %s: error: %v", + procListToString(got), procListToString(want), err) } return nil } @@ -91,22 +93,34 @@ func blockUntilWaitable(pid int) error { // procListsEqual is used to check whether 2 Process lists are equal for all // implemented fields. -func procListsEqual(got, want []*control.Process) bool { +func procListsEqual(got, want []*control.Process) (bool, error) { if len(got) != len(want) { - return false + return false, nil } for i := range got { pd1 := got[i] pd2 := want[i] - // Zero out unimplemented and timing dependant fields. + // Zero out timing dependant fields. pd1.Time = "" pd1.STime = "" pd1.C = 0 - if *pd1 != *pd2 { - return false + // Ignore TTY field too, since it's not relevant in the cases + // where we use this method. Tests that care about the TTY + // field should check for it themselves. + pd1.TTY = "" + pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1}) + if err != nil { + return false, err + } + pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2}) + if err != nil { + return false, err + } + if pd1Json != pd2Json { + return false, nil } } - return true + return true, nil } // getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the @@ -116,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error { if err != nil { return fmt.Errorf("error getting process data from container: %v", err) } - if procListsEqual(got, want) { + equal, err := procListsEqual(got, want) + if err != nil { + return err + } + if equal { return nil } return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) @@ -288,11 +306,12 @@ func TestLifecycle(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, } // Create the container. @@ -590,18 +609,20 @@ func TestExec(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{2}, }, } @@ -1062,18 +1083,20 @@ func TestPauseResume(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "bash", + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "bash", + Threads: []kernel.ThreadID{2}, }, } @@ -1126,11 +1149,12 @@ func TestPauseResume(t *testing.T) { expectedPL2 := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, } @@ -1241,18 +1265,20 @@ func TestCapabilities(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, }, { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "exe", + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + Threads: []kernel.ThreadID{2}, }, } if err := waitForProcessList(cont, expectedPL[:1]); err != nil { @@ -1548,7 +1574,8 @@ func TestAbbreviatedIDs(t *testing.T) { } defer os.RemoveAll(rootDir) - conf := testutil.TestConfigWithRoot(rootDir) + conf := testutil.TestConfig() + conf.RootDir = rootDir cids := []string{ "foo-" + testutil.UniqueContainerID(), @@ -2111,6 +2138,95 @@ func TestOverlayfsStaleRead(t *testing.T) { } } +// TestTTYField checks TTY field returned by container.Processes(). +func TestTTYField(t *testing.T) { + stop := testutil.StartReaper() + defer stop() + + testApp, err := testutil.FindFile("runsc/container/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + testCases := []struct { + name string + useTTY bool + wantTTYField string + }{ + { + name: "no tty", + useTTY: false, + wantTTYField: "?", + }, + { + name: "tty used", + useTTY: true, + wantTTYField: "pts/0", + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + conf := testutil.TestConfig() + + // We will run /bin/sleep, possibly with an open TTY. + cmd := []string{"/bin/sleep", "10000"} + if test.useTTY { + // Run inside the "pty-runner". + cmd = append([]string{testApp, "pty-runner"}, cmd...) + } + + spec := testutil.NewSpecWithArgs(cmd...) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the container. + args := Args{ + ID: testutil.UniqueContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait for sleep to be running, and check the TTY + // field. + var gotTTYField string + cb := func() error { + ps, err := c.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + for _, p := range ps { + if strings.Contains(p.Cmd, "sleep") { + gotTTYField = p.TTY + return nil + } + } + return fmt.Errorf("sleep not running") + } + if err := testutil.Poll(cb, 30*time.Second); err != nil { + t.Fatalf("error waiting for sleep process: %v", err) + } + + if gotTTYField != test.wantTTYField { + t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) + } + }) + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 9e02a825e..4ad09ceab 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -60,13 +60,8 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { } func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) { - // Setup root dir if one hasn't been provided. if len(conf.RootDir) == 0 { - rootDir, err := testutil.SetupRootDir() - if err != nil { - return nil, nil, fmt.Errorf("error creating root dir: %v", err) - } - conf.RootDir = rootDir + panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.") } var containers []*Container @@ -78,7 +73,6 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C for _, b := range bundles { os.RemoveAll(b) } - os.RemoveAll(conf.RootDir) } for i, spec := range specs { bundleDir, err := testutil.SetupBundleDir(spec) @@ -129,11 +123,11 @@ func execMany(execs []execDesc) error { func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { for _, spec := range pod { - spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source - spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type - spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod" + spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source + spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type + spec.Annotations[boot.MountPrefix+name+".share"] = "pod" if len(mount.Options) > 0 { - spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",") + spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",") } } } @@ -144,6 +138,13 @@ func TestMultiContainerSanity(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // Setup the containers. sleep := []string{"sleep", "100"} specs, ids := createSpecs(sleep, sleep) @@ -155,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -175,6 +176,13 @@ func TestMultiPIDNS(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // Setup the containers. sleep := []string{"sleep", "100"} testSpecs, ids := createSpecs(sleep, sleep) @@ -194,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -213,6 +221,13 @@ func TestMultiPIDNSPath(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // Setup the containers. sleep := []string{"sleep", "100"} testSpecs, ids := createSpecs(sleep, sleep, sleep) @@ -249,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -259,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) { } expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -268,13 +283,21 @@ func TestMultiPIDNSPath(t *testing.T) { } func TestMultiContainerWait(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfig() + conf.RootDir = rootDir + // The first container should run the entire duration of the test. cmd1 := []string{"sleep", "100"} // We'll wait on the second container, which is much shorter lived. cmd2 := []string{"sleep", "1"} specs, ids := createSpecs(cmd1, cmd2) - conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -283,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -328,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) { // After Wait returns, ensure that the root container is running and // the child has finished. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) @@ -344,12 +367,14 @@ func TestExecWait(t *testing.T) { } defer os.RemoveAll(rootDir) + conf := testutil.TestConfig() + conf.RootDir = rootDir + // The first container should run the entire duration of the test. cmd1 := []string{"sleep", "100"} // We'll wait on the second container, which is much shorter lived. cmd2 := []string{"sleep", "1"} specs, ids := createSpecs(cmd1, cmd2) - conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -358,7 +383,7 @@ func TestExecWait(t *testing.T) { // Check via ps that process is running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Fatalf("failed to wait for sleep to start: %v", err) @@ -393,7 +418,7 @@ func TestExecWait(t *testing.T) { // Wait for the exec'd process to exit. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Fatalf("failed to wait for second container to stop: %v", err) @@ -432,7 +457,15 @@ func TestMultiContainerMount(t *testing.T) { }) // Setup the containers. + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf := testutil.TestConfig() + conf.RootDir = rootDir + containers, cleanup, err := startContainers(conf, sps, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -454,6 +487,13 @@ func TestMultiContainerSignal(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // Setup the containers. sleep := []string{"sleep", "100"} specs, ids := createSpecs(sleep, sleep) @@ -465,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) { // Check via ps that container 1 process is running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, } if err := waitForProcessList(containers[1], expectedPL); err != nil { @@ -479,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) { // Make sure process 1 is still running. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -548,6 +588,13 @@ func TestMultiContainerDestroy(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // First container will remain intact while the second container is killed. podSpecs, ids := createSpecs( []string{"sleep", "100"}, @@ -586,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) { if err != nil { t.Fatalf("error getting process data from sandbox: %v", err) } - expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}} - if !procListsEqual(pss, expectedPL) { - t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL)) + expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}} + if r, err := procListsEqual(pss, expectedPL); !r { + t.Errorf("container got process list: %s, want: %s: error: %v", + procListToString(pss), procListToString(expectedPL), err) } // Check that cont.Destroy is safe to call multiple times. @@ -599,13 +647,21 @@ func TestMultiContainerDestroy(t *testing.T) { } func TestMultiContainerProcesses(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfig() + conf.RootDir = rootDir + // Note: use curly braces to keep 'sh' process around. Otherwise, shell // will just execve into 'sleep' and both containers will look the // same. specs, ids := createSpecs( []string{"sleep", "100"}, []string{"sh", "-c", "{ sleep 100; }"}) - conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -614,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) { // Check root's container process list doesn't include other containers. expectedPL0 := []*control.Process{ - {PID: 1, Cmd: "sleep"}, + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(containers[0], expectedPL0); err != nil { t.Errorf("failed to wait for process to start: %v", err) @@ -622,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) { // Same for the other container. expectedPL1 := []*control.Process{ - {PID: 2, Cmd: "sh"}, - {PID: 3, PPID: 2, Cmd: "sleep"}, + {PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}}, + {PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}, } if err := waitForProcessList(containers[1], expectedPL1); err != nil { t.Errorf("failed to wait for process to start: %v", err) @@ -637,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) { if _, err := containers[1].Execute(args); err != nil { t.Fatalf("error exec'ing: %v", err) } - expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"}) + expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}}) if err := waitForProcessList(containers[1], expectedPL1); err != nil { t.Errorf("failed to wait for process to start: %v", err) } @@ -650,6 +706,15 @@ func TestMultiContainerProcesses(t *testing.T) { // TestMultiContainerKillAll checks that all process that belong to a container // are killed when SIGKILL is sent to *all* processes in that container. func TestMultiContainerKillAll(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfig() + conf.RootDir = rootDir + for _, tc := range []struct { killContainer bool }{ @@ -665,7 +730,6 @@ func TestMultiContainerKillAll(t *testing.T) { specs, ids := createSpecs( []string{app, "task-tree", "--depth=2", "--width=2"}, []string{app, "task-tree", "--depth=4", "--width=2"}) - conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -739,19 +803,13 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) { specs, ids := createSpecs( []string{"/bin/sleep", "100"}, []string{"/bin/sleep", "100"}) - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - conf := testutil.TestConfigWithRoot(rootDir) - // Create and start root container. - rootBundleDir, err := testutil.SetupBundleDir(specs[0]) + conf := testutil.TestConfig() + rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf) if err != nil { t.Fatalf("error setting up container: %v", err) } + defer os.RemoveAll(rootDir) defer os.RemoveAll(rootBundleDir) rootArgs := Args{ @@ -800,19 +858,12 @@ func TestMultiContainerDestroyStarting(t *testing.T) { } specs, ids := createSpecs(cmds...) - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - conf := testutil.TestConfigWithRoot(rootDir) - - // Create and start root container. - rootBundleDir, err := testutil.SetupBundleDir(specs[0]) + conf := testutil.TestConfig() + rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf) if err != nil { t.Fatalf("error setting up container: %v", err) } + defer os.RemoveAll(rootDir) defer os.RemoveAll(rootBundleDir) rootArgs := Args{ @@ -886,9 +937,17 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) { script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename) cmd := []string{"sh", "-c", script} + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfig() + conf.RootDir = rootDir + // Make sure overlay is enabled, and none of the root filesystems are // read-only, otherwise we won't be able to create the file. - conf := testutil.TestConfig() conf.Overlay = true specs, ids := createSpecs(cmdRoot, cmd, cmd) for _, s := range specs { @@ -941,26 +1000,21 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { } allSpecs, allIDs := createSpecs(cmds...) - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - // Split up the specs and IDs. rootSpec := allSpecs[0] rootID := allIDs[0] childrenSpecs := allSpecs[1:] childrenIDs := allIDs[1:] - bundleDir, err := testutil.SetupBundleDir(rootSpec) + conf := testutil.TestConfig() + rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf) if err != nil { - t.Fatalf("error setting up bundle dir: %v", err) + t.Fatalf("error setting up container: %v", err) } + defer os.RemoveAll(rootDir) defer os.RemoveAll(bundleDir) // Start root container. - conf := testutil.TestConfigWithRoot(rootDir) rootArgs := Args{ ID: rootID, Spec: rootSpec, @@ -1029,6 +1083,13 @@ func TestMultiContainerSharedMount(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // Setup the containers. sleep := []string{"sleep", "100"} podSpec, ids := createSpecs(sleep, sleep) @@ -1137,6 +1198,13 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // Setup the containers. sleep := []string{"sleep", "100"} podSpec, ids := createSpecs(sleep, sleep) @@ -1197,6 +1265,13 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf.RootDir = rootDir + // Setup the containers. sleep := []string{"sleep", "100"} podSpec, ids := createSpecs(sleep, sleep) @@ -1300,8 +1375,14 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { // Test that unsupported pod mounts options are ignored when matching master and // slave mounts. func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf := testutil.TestConfig() - t.Logf("Running test with conf: %+v", conf) + conf.RootDir = rootDir // Setup the containers. sleep := []string{"/bin/sleep", "100"} @@ -1376,6 +1457,15 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) { Type: "tmpfs", } + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfig() + conf.RootDir = rootDir + // Create the specs. specs, ids := createSpecs( []string{"sleep", "1000"}, @@ -1386,7 +1476,6 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) { specs[1].Mounts = append(specs[2].Mounts, sharedMnt, writeableMnt) specs[2].Mounts = append(specs[1].Mounts, sharedMnt) - conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -1405,9 +1494,17 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) { // Test that container is destroyed when Gofer is killed. func TestMultiContainerGoferKilled(t *testing.T) { + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + + conf := testutil.TestConfig() + conf.RootDir = rootDir + sleep := []string{"sleep", "100"} specs, ids := createSpecs(sleep, sleep, sleep) - conf := testutil.TestConfig() containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -1417,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Ensure container is running c := containers[2] expectedPL := []*control.Process{ - {PID: 3, Cmd: "sleep"}, + {PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}}, } if err := waitForProcessList(c, expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -1445,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { continue // container[2] has been killed. } pl := []*control.Process{ - {PID: kernel.ThreadID(i + 1), Cmd: "sleep"}, + {PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}}, } if err := waitForProcessList(c, pl); err != nil { t.Errorf("Container %q was affected by another container: %v", c.ID, err) @@ -1465,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Wait until sandbox stops. waitForProcessList will loop until sandbox exits // and RPC errors out. impossiblePL := []*control.Process{ - {PID: 100, Cmd: "non-existent-process"}, + {PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}}, } if err := waitForProcessList(c, impossiblePL); err == nil { t.Fatalf("Sandbox was not killed after gofer death") @@ -1483,7 +1580,15 @@ func TestMultiContainerGoferKilled(t *testing.T) { func TestMultiContainerLoadSandbox(t *testing.T) { sleep := []string{"sleep", "100"} specs, ids := createSpecs(sleep, sleep, sleep) + + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf := testutil.TestConfig() + conf.RootDir = rootDir // Create containers for the sandbox. wants, cleanup, err := startContainers(conf, specs, ids) @@ -1576,7 +1681,15 @@ func TestMultiContainerRunNonRoot(t *testing.T) { Type: "bind", }) + rootDir, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer os.RemoveAll(rootDir) + conf := testutil.TestConfig() + conf.RootDir = rootDir + pod, cleanup, err := startContainers(conf, podSpecs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go new file mode 100644 index 000000000..d95151ea5 --- /dev/null +++ b/runsc/container/state_file.go @@ -0,0 +1,185 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "sync" + + "github.com/gofrs/flock" + "gvisor.dev/gvisor/pkg/log" +) + +const stateFileExtension = ".state" + +// StateFile handles load from/save to container state safely from multiple +// processes. It uses a lock file to provide synchronization between operations. +// +// The lock file is located at: "${s.RootDir}/${s.ID}.lock". +// The state file is located at: "${s.RootDir}/${s.ID}.state". +type StateFile struct { + // RootDir is the directory containing the container metadata file. + RootDir string `json:"rootDir"` + + // ID is the container ID. + ID string `json:"id"` + + // + // Fields below this line are not saved in the state file and will not + // be preserved across commands. + // + + once sync.Once + flock *flock.Flock +} + +// List returns all container ids in the given root directory. +func List(rootDir string) ([]string, error) { + log.Debugf("List containers %q", rootDir) + list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension)) + if err != nil { + return nil, err + } + var out []string + for _, path := range list { + // Filter out files that do no belong to a container. + fileName := filepath.Base(path) + if len(fileName) < len(stateFileExtension) { + panic(fmt.Sprintf("invalid file match %q", path)) + } + // Remove the extension. + cid := fileName[:len(fileName)-len(stateFileExtension)] + if validateID(cid) == nil { + out = append(out, cid) + } + } + return out, nil +} + +// lock globally locks all locking operations for the container. +func (s *StateFile) lock() error { + s.once.Do(func() { + s.flock = flock.NewFlock(s.lockPath()) + }) + + if err := s.flock.Lock(); err != nil { + return fmt.Errorf("acquiring lock on %q: %v", s.flock, err) + } + return nil +} + +// lockForNew acquires the lock and checks if the state file doesn't exist. This +// is done to ensure that more than one creation didn't race to create +// containers with the same ID. +func (s *StateFile) lockForNew() error { + if err := s.lock(); err != nil { + return err + } + + // Checks if the container already exists by looking for the metadata file. + if _, err := os.Stat(s.statePath()); err == nil { + s.unlock() + return fmt.Errorf("container already exists") + } else if !os.IsNotExist(err) { + s.unlock() + return fmt.Errorf("looking for existing container: %v", err) + } + return nil +} + +// unlock globally unlocks all locking operations for the container. +func (s *StateFile) unlock() error { + if !s.flock.Locked() { + panic("unlock called without lock held") + } + + if err := s.flock.Unlock(); err != nil { + log.Warningf("Error to release lock on %q: %v", s.flock, err) + return fmt.Errorf("releasing lock on %q: %v", s.flock, err) + } + return nil +} + +// saveLocked saves 'v' to the state file. +// +// Preconditions: lock() must been called before. +func (s *StateFile) saveLocked(v interface{}) error { + if !s.flock.Locked() { + panic("saveLocked called without lock held") + } + + meta, err := json.Marshal(v) + if err != nil { + return err + } + if err := ioutil.WriteFile(s.statePath(), meta, 0640); err != nil { + return fmt.Errorf("writing json file: %v", err) + } + return nil +} + +func (s *StateFile) load(v interface{}) error { + if err := s.lock(); err != nil { + return err + } + defer s.unlock() + + metaBytes, err := ioutil.ReadFile(s.statePath()) + if err != nil { + return err + } + return json.Unmarshal(metaBytes, &v) +} + +func (s *StateFile) close() error { + if s.flock == nil { + return nil + } + if s.flock.Locked() { + panic("Closing locked file") + } + return s.flock.Close() +} + +func buildStatePath(rootDir, id string) string { + return filepath.Join(rootDir, id+stateFileExtension) +} + +// statePath is the full path to the state file. +func (s *StateFile) statePath() string { + return buildStatePath(s.RootDir, s.ID) +} + +// lockPath is the full path to the lock file. +func (s *StateFile) lockPath() string { + return filepath.Join(s.RootDir, s.ID+".lock") +} + +// destroy deletes all state created by the stateFile. It may be called with the +// lock file held. In that case, the lock file must still be unlocked and +// properly closed after destroy returns. +func (s *StateFile) destroy() error { + if err := os.Remove(s.statePath()); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Remove(s.lockPath()); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD index 9bf9e6e9d..bfd338bb6 100644 --- a/runsc/container/test_app/BUILD +++ b/runsc/container/test_app/BUILD @@ -15,5 +15,6 @@ go_binary( "//pkg/unet", "//runsc/testutil", "@com_github_google_subcommands//:go_default_library", + "@com_github_kr_pty//:go_default_library", ], ) diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go index 913d781c6..a1c8a741a 100644 --- a/runsc/container/test_app/test_app.go +++ b/runsc/container/test_app/test_app.go @@ -19,6 +19,7 @@ package main import ( "context" "fmt" + "io" "io/ioutil" "log" "net" @@ -31,6 +32,7 @@ import ( "flag" "github.com/google/subcommands" + "github.com/kr/pty" "gvisor.dev/gvisor/runsc/testutil" ) @@ -41,6 +43,7 @@ func main() { subcommands.Register(new(fdReceiver), "") subcommands.Register(new(fdSender), "") subcommands.Register(new(forkBomb), "") + subcommands.Register(new(ptyRunner), "") subcommands.Register(new(reaper), "") subcommands.Register(new(syscall), "") subcommands.Register(new(taskTree), "") @@ -352,3 +355,40 @@ func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...inter return subcommands.ExitSuccess } + +type ptyRunner struct{} + +// Name implements subcommands.Command. +func (*ptyRunner) Name() string { + return "pty-runner" +} + +// Synopsis implements subcommands.Command. +func (*ptyRunner) Synopsis() string { + return "runs the given command with an open pty terminal" +} + +// Usage implements subcommands.Command. +func (*ptyRunner) Usage() string { + return "pty-runner [command]" +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*ptyRunner) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command. +func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus { + c := exec.Command(fs.Args()[0], fs.Args()[1:]...) + f, err := pty.Start(c) + if err != nil { + fmt.Printf("pty.Start failed: %v", err) + return subcommands.ExitFailure + } + defer f.Close() + + // Copy stdout from the command to keep this process alive until the + // subprocess exits. + io.Copy(os.Stdout, f) + + return subcommands.ExitSuccess +} diff --git a/runsc/debian/description b/runsc/debian/description index 6e3b1b2c0..9e8e08805 100644 --- a/runsc/debian/description +++ b/runsc/debian/description @@ -1,5 +1 @@ -gVisor is a user-space kernel, written in Go, that implements a substantial -portion of the Linux system surface. It includes an Open Container Initiative -(OCI) runtime called runsc that provides an isolation boundary between the -application and the host kernel. The runsc runtime integrates with Docker and -Kubernetes, making it simple to run sandboxed containers. +gVisor container sandbox runtime diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go index 57f6ae8de..9b6346ca2 100644 --- a/runsc/dockerutil/dockerutil.go +++ b/runsc/dockerutil/dockerutil.go @@ -380,6 +380,16 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) { return port, nil } +// FindIP returns the IP address of the container as a string. +func (d *Docker) FindIP() (string, error) { + const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}` + out, err := do("inspect", "-f", format, d.Name) + if err != nil { + return "", fmt.Errorf("error retrieving IP: %v", err) + } + return strings.TrimSpace(out), nil +} + // SandboxPid returns the PID to the sandbox process. func (d *Docker) SandboxPid() (int, error) { out, err := do("inspect", "-f={{.State.Pid}}", d.Name) diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 80a4aa2fe..afcb41801 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -6,6 +6,8 @@ go_library( name = "fsgofer", srcs = [ "fsgofer.go", + "fsgofer_amd64_unsafe.go", + "fsgofer_arm64_unsafe.go", "fsgofer_unsafe.go", ], importpath = "gvisor.dev/gvisor/runsc/fsgofer", diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index 02168ad1b..bac73f89d 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -6,6 +6,8 @@ go_library( name = "filter", srcs = [ "config.go", + "config_amd64.go", + "config_arm64.go", "extra_filters.go", "extra_filters_msan.go", "extra_filters_race.go", diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 0bf7507b7..a1792330f 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -25,11 +25,7 @@ import ( // allowedSyscalls is the set of syscalls executed by the gofer. var allowedSyscalls = seccomp.SyscallRules{ - syscall.SYS_ACCEPT: {}, - syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_GET_FS)}, - {seccomp.AllowValue(linux.ARCH_SET_FS)}, - }, + syscall.SYS_ACCEPT: {}, syscall.SYS_CLOCK_GETTIME: {}, syscall.SYS_CLONE: []seccomp.Rule{ { @@ -155,7 +151,6 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MPROTECT: {}, syscall.SYS_MUNMAP: {}, syscall.SYS_NANOSLEEP: {}, - syscall.SYS_NEWFSTATAT: {}, syscall.SYS_OPENAT: {}, syscall.SYS_PPOLL: {}, syscall.SYS_PREAD64: {}, @@ -220,6 +215,18 @@ var udsSyscalls = seccomp.SyscallRules{ syscall.SYS_SOCKET: []seccomp.Rule{ { seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_STREAM), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_DGRAM), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_SEQPACKET), + seccomp.AllowValue(0), }, }, syscall.SYS_CONNECT: []seccomp.Rule{ diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go new file mode 100644 index 000000000..a4b28cb8b --- /dev/null +++ b/runsc/fsgofer/filter/config_amd64.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_GET_FS)}, + {seccomp.AllowValue(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{} +} diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go new file mode 100644 index 000000000..d2697deb7 --- /dev/null +++ b/runsc/fsgofer/filter/config_arm64.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{} +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index ed8b02cf0..b59e1a70e 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -199,6 +199,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // The reason that the file is not opened initially as read-write is for better // performance with 'overlay2' storage driver. overlay2 eagerly copies the // entire file up when it's opened in write mode, and would perform badly when +// multiple files are only being opened for read (esp. startup). type localFile struct { p9.DefaultWalkGetAttr @@ -265,10 +266,10 @@ func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, erro // actual file open and is customizable by the caller. func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { // Attempt to open file in the following mode in order: - // 1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too. - // Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option - // has no effect on regular files. - // 2. PATH: for symlinks + // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. + // Use non-blocking to prevent getting stuck inside open(2) for + // FIFOs. This option has no effect on regular files. + // 2. PATH: for symlinks, sockets. modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} var err error @@ -366,23 +367,24 @@ func fchown(fd int, uid p9.UID, gid p9.GID) error { } // Open implements p9.File. -func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { +func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { if l.isOpen() { panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath)) } // Check if control file can be used or if a new open must be created. var newFile *fd.FD - if mode == p9.ReadOnly { - log.Debugf("Open reusing control file, mode: %v, %q", mode, l.hostPath) + if flags == p9.ReadOnly { + log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath) newFile = l.file } else { // Ideally reopen would call name_to_handle_at (with empty name) and // open_by_handle_at to reopen the file without using 'hostPath'. However, // name_to_handle_at and open_by_handle_at aren't supported by overlay2. - log.Debugf("Open reopening file, mode: %v, %q", mode, l.hostPath) + log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath) var err error - newFile, err = reopenProcFd(l.file, openFlags|mode.OSFlags()) + // Constrain open flags to the open mode and O_TRUNC. + newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC))) if err != nil { return nil, p9.QID{}, 0, extractErrno(err) } @@ -409,7 +411,7 @@ func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { } l.file = newFile } - l.mode = mode + l.mode = flags & p9.OpenFlagsModeMask return fd, l.attachPoint.makeQID(stat), 0, nil } @@ -601,7 +603,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) Mode: p9.FileMode(stat.Mode), UID: p9.UID(stat.Uid), GID: p9.GID(stat.Gid), - NLink: stat.Nlink, + NLink: uint64(stat.Nlink), RDev: stat.Rdev, Size: uint64(stat.Size), BlockSize: uint64(stat.Blksize), @@ -955,14 +957,14 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { } func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) { + var dirents []p9.Dirent + // Limit 'count' to cap the slice size that is returned. const maxCount = 100000 if count > maxCount { count = maxCount } - dirents := make([]p9.Dirent, 0, count) - // Pre-allocate buffers that will be reused to get partial results. direntsBuf := make([]byte, 8192) names := make([]string, 0, 100) @@ -1032,12 +1034,48 @@ func (l *localFile) Flush() error { } // Connect implements p9.File. -func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { - // Check to see if the CLI option has been set to allow the UDS mount. +func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { if !l.attachPoint.conf.HostUDS { return nil, syscall.ECONNREFUSED } - return fd.DialUnix(l.hostPath) + + // TODO(gvisor.dev/issue/1003): Due to different app vs replacement + // mappings, the app path may have fit in the sockaddr, but we can't + // fit f.path in our sockaddr. We'd need to redirect through a shorter + // path in order to actually connect to this socket. + if len(l.hostPath) > linux.UnixPathMax { + return nil, syscall.ECONNREFUSED + } + + var stype int + switch flags { + case p9.StreamSocket: + stype = syscall.SOCK_STREAM + case p9.DgramSocket: + stype = syscall.SOCK_DGRAM + case p9.SeqpacketSocket: + stype = syscall.SOCK_SEQPACKET + default: + return nil, syscall.ENXIO + } + + f, err := syscall.Socket(syscall.AF_UNIX, stype, 0) + if err != nil { + return nil, err + } + + if err := syscall.SetNonblock(f, true); err != nil { + syscall.Close(f) + return nil, err + } + + sa := syscall.SockaddrUnix{Name: l.hostPath} + if err := syscall.Connect(f, &sa); err != nil { + syscall.Close(f) + return nil, err + } + + return fd.New(f), nil } // Close implements p9.File. diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go new file mode 100644 index 000000000..5d4aab597 --- /dev/null +++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go @@ -0,0 +1,49 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package fsgofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserr" +) + +func statAt(dirFd int, name string) (syscall.Stat_t, error) { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return syscall.Stat_t{}, err + } + namePtr := unsafe.Pointer(nameBytes) + + var stat syscall.Stat_t + statPtr := unsafe.Pointer(&stat) + + if _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(dirFd), + uintptr(namePtr), + uintptr(statPtr), + linux.AT_SYMLINK_NOFOLLOW, + 0, + 0); errno != 0 { + + return syscall.Stat_t{}, syserr.FromHost(errno).ToError() + } + return stat, nil +} diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go new file mode 100644 index 000000000..8041fd352 --- /dev/null +++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go @@ -0,0 +1,49 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package fsgofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserr" +) + +func statAt(dirFd int, name string) (syscall.Stat_t, error) { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return syscall.Stat_t{}, err + } + namePtr := unsafe.Pointer(nameBytes) + + var stat syscall.Stat_t + statPtr := unsafe.Pointer(&stat) + + if _, _, errno := syscall.Syscall6( + syscall.SYS_FSTATAT, + uintptr(dirFd), + uintptr(namePtr), + uintptr(statPtr), + linux.AT_SYMLINK_NOFOLLOW, + 0, + 0); errno != 0 { + + return syscall.Stat_t{}, syserr.FromHost(errno).ToError() + } + return stat, nil +} diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index ff2556aee..542b54365 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -18,34 +18,9 @@ import ( "syscall" "unsafe" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/syserr" ) -func statAt(dirFd int, name string) (syscall.Stat_t, error) { - nameBytes, err := syscall.BytePtrFromString(name) - if err != nil { - return syscall.Stat_t{}, err - } - namePtr := unsafe.Pointer(nameBytes) - - var stat syscall.Stat_t - statPtr := unsafe.Pointer(&stat) - - if _, _, errno := syscall.Syscall6( - syscall.SYS_NEWFSTATAT, - uintptr(dirFd), - uintptr(namePtr), - uintptr(statPtr), - linux.AT_SYMLINK_NOFOLLOW, - 0, - 0); errno != 0 { - - return syscall.Stat_t{}, syserr.FromHost(errno).ToError() - } - return stat, nil -} - func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error { // utimensat(2) doesn't accept empty name, instead name must be nil to make it // operate directly on 'dirFd' unlike other *at syscalls. diff --git a/runsc/main.go b/runsc/main.go index 80b2d300c..abf929511 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -26,6 +26,7 @@ import ( "path/filepath" "strings" "syscall" + "time" "flag" @@ -46,6 +47,8 @@ var ( logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.") debug = flag.Bool("debug", false, "enable debug logging.") showVersion = flag.Bool("version", false, "show version and exit.") + // TODO(gvisor.dev/issue/193): support systemd cgroups + systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.") // These flags are unique to runsc, and are used to configure parts of the // system that are not covered by the runtime spec. @@ -66,7 +69,8 @@ var ( // Flags that control sandbox runtime behavior. platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload.") + hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") + softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") @@ -78,6 +82,7 @@ var ( numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") + cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -135,6 +140,12 @@ func main() { os.Exit(0) } + // TODO(gvisor.dev/issue/193): support systemd cgroups + if *systemdCgroup { + fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193") + os.Exit(1) + } + var errorLogger io.Writer if *logFD > -1 { errorLogger = os.NewFile(uintptr(*logFD), "error log file") @@ -200,7 +211,8 @@ func main() { FSGoferHostUDS: *fsGoferHostUDS, Overlay: *overlay, Network: netType, - GSO: *gso, + HardwareGSO: *hardwareGSO, + SoftwareGSO: *softwareGSO, LogPackets: *logPackets, Platform: platformType, Strace: *strace, @@ -214,6 +226,7 @@ func main() { AlsoLogToStderr: *alsoLogToStderr, ReferenceLeakMode: refsLeakMode, OverlayfsStaleRead: *overlayfsStaleRead, + CPUNumFromQuota: *cpuNumFromQuota, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, @@ -227,6 +240,18 @@ func main() { log.SetLevel(log.Debug) } + // Logging will include the local date and time via the time package. + // + // On first use, time.Local initializes the local time zone, which + // involves opening tzdata files on the host. Since this requires + // opening host files, it must be done before syscall filter + // installation. + // + // Generally there will be a log message before filter installation + // that will force initialization, but force initialization here in + // case that does not occur. + _ = time.Local.String() + subcommand := flag.CommandLine.Arg(0) var e log.Emitter diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 7fdceaab6..8001949d5 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -19,6 +19,8 @@ go_library( "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", "//pkg/urpc", "//runsc/boot", "//runsc/boot/platforms", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 5634f0707..be8b72b3e 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -28,6 +28,8 @@ import ( "github.com/vishvananda/netlink" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" @@ -61,7 +63,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -136,7 +138,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -182,36 +184,39 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO continue } - // Keep only IPv4 addresses. - var ip4addrs []*net.IPNet + var ipAddrs []*net.IPNet for _, ifaddr := range allAddrs { ipNet, ok := ifaddr.(*net.IPNet) if !ok { return fmt.Errorf("address is not IPNet: %+v", ifaddr) } - if ipNet.IP.To4() == nil { - log.Warningf("IPv6 is not supported, skipping: %v", ipNet) - continue - } - ip4addrs = append(ip4addrs, ipNet) + ipAddrs = append(ipAddrs, ipNet) } - if len(ip4addrs) == 0 { - log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name) + if len(ipAddrs) == 0 { + log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name) continue } // Scrape the routes before removing the address, since that // will remove the routes as well. - routes, def, err := routesForIface(iface) + routes, defv4, defv6, err := routesForIface(iface) if err != nil { return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) } - if def != nil { - if !args.DefaultGateway.Route.Empty() { - return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway) + if defv4 != nil { + if !args.Defaultv4Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) } - args.DefaultGateway.Route = *def - args.DefaultGateway.Name = iface.Name + args.Defaultv4Gateway.Route = *defv4 + args.Defaultv4Gateway.Name = iface.Name + } + + if defv6 != nil { + if !args.Defaultv6Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) + } + args.Defaultv6Gateway.Route = *defv6 + args.Defaultv6Gateway.Name = iface.Name } link := boot.FDBasedLink{ @@ -232,7 +237,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO // Create the socket for the device. for i := 0; i < link.NumChannels; i++ { log.Debugf("Creating Channel %d", i) - socketEntry, err := createSocket(iface, ifaceLink, enableGSO) + socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO) if err != nil { return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) } @@ -247,9 +252,15 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } + if link.GSOMaxSize == 0 && softwareGSO { + // Hardware GSO is disabled. Let's enable software GSO. + link.GSOMaxSize = stack.SoftwareGSOMaxSize + link.SoftwareGSOEnabled = true + } + // Collect the addresses for the interface, enable forwarding, // and remove them from the host. - for _, addr := range ip4addrs { + for _, addr := range ipAddrs { link.Addresses = append(link.Addresses, addr.IP) // Steal IP address from NIC. @@ -345,46 +356,56 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, } // routesForIface iterates over all routes for the given interface and converts -// them to boot.Routes. -func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { +// them to boot.Routes. It also returns the a default v4/v6 route if found. +func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) { link, err := netlink.LinkByIndex(iface.Index) if err != nil { - return nil, nil, err + return nil, nil, nil, err } rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) if err != nil { - return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) + return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) } - var def *boot.Route + var defv4, defv6 *boot.Route var routes []boot.Route for _, r := range rs { // Is it a default route? if r.Dst == nil { if r.Gw == nil { - return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) - } - if r.Gw.To4() == nil { - log.Warningf("IPv6 is not supported, skipping default route: %v", r) - continue - } - if def != nil { - return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r) + return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) } // Create a catch all route to the gateway. - def = &boot.Route{ - Destination: net.IPNet{ - IP: net.IPv4zero, - Mask: net.IPMask(net.IPv4zero), - }, - Gateway: r.Gw, + switch len(r.Gw) { + case header.IPv4AddressSize: + if defv4 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r) + } + defv4 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv4zero, + Mask: net.IPMask(net.IPv4zero), + }, + Gateway: r.Gw, + } + case header.IPv6AddressSize: + if defv6 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r) + } + + defv6 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv6zero, + Mask: net.IPMask(net.IPv6zero), + }, + Gateway: r.Gw, + } + default: + return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r) } continue } - if r.Dst.IP.To4() == nil { - log.Warningf("IPv6 is not supported, skipping route: %v", r) - continue - } + dst := *r.Dst dst.IP = dst.IP.Mask(dst.Mask) routes = append(routes, boot.Route{ @@ -392,7 +413,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { Gateway: r.Gw, }) } - return routes, def, nil + return routes, defv4, defv6, nil } // removeAddress removes IP address from network device. It's equivalent to: diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ee9327fc8..ce1452b87 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -18,6 +18,7 @@ package sandbox import ( "context" "fmt" + "math" "os" "os/exec" "strconv" @@ -631,6 +632,26 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF if err != nil { return fmt.Errorf("getting cpu count from cgroups: %v", err) } + if conf.CPUNumFromQuota { + // Dropping below 2 CPUs can trigger application to disable + // locks that can lead do hard to debug errors, so just + // leaving two cores as reasonable default. + const minCPUs = 2 + + quota, err := s.Cgroup.CPUQuota() + if err != nil { + return fmt.Errorf("getting cpu qouta from cgroups: %v", err) + } + if n := int(math.Ceil(quota)); n > 0 { + if n < minCPUs { + n = minCPUs + } + if n < cpuNum { + // Only lower the cpu number. + cpuNum = n + } + } + } cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) mem, err := s.Cgroup.MemoryLimit() @@ -1004,16 +1025,22 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { + if err := s.destroyContainer(cid); err != nil { + // If the sandbox isn't running, the container has already been destroyed, + // ignore the error in this case. + if s.IsRunning() { + return err + } + } + return nil +} + +func (s *Sandbox) destroyContainer(cid string) error { if s.IsRootContainer(cid) { log.Debugf("Destroying root container %q by destroying sandbox", cid) return s.destroy() } - if !s.IsRunning() { - // Sandbox isn't running anymore, container is already destroyed. - return nil - } - log.Debugf("Destroying container %q in sandbox %q", cid, s.ID) conn, err := s.sandboxConnect() if err != nil { diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index fa58313a0..205638803 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "specutils", srcs = [ + "cri.go", "fs.go", "namespace.go", "specutils.go", diff --git a/runsc/specutils/cri.go b/runsc/specutils/cri.go new file mode 100644 index 000000000..9c5877cd5 --- /dev/null +++ b/runsc/specutils/cri.go @@ -0,0 +1,110 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + // ContainerdContainerTypeAnnotation is the OCI annotation set by + // containerd to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" + // ContainerdContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + ContainerdContainerTypeContainer = "container" + // ContainerdContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + ContainerdContainerTypeSandbox = "sandbox" + + // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" + + // CRIOContainerTypeAnnotation is the OCI annotation set by + // CRI-O to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType" + + // CRIOContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + CRIOContainerTypeContainer = "container" + // CRIOContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + CRIOContainerTypeSandbox = "sandbox" + + // CRIOSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID" +) + +// ContainerType represents the type of container requested by the calling container manager. +type ContainerType int + +const ( + // ContainerTypeUnspecified indicates that no known container type + // annotation was found in the spec. + ContainerTypeUnspecified ContainerType = iota + // ContainerTypeUnknown indicates that a container type was specified + // but is unknown to us. + ContainerTypeUnknown + // ContainerTypeSandbox indicates that the container should be run in a + // new sandbox. + ContainerTypeSandbox + // ContainerTypeContainer indicates that the container should be run in + // an existing sandbox. + ContainerTypeContainer +) + +// SpecContainerType tries to determine the type of container specified by the +// container manager using well-known container annotations. +func SpecContainerType(spec *specs.Spec) ContainerType { + if t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]; ok { + switch t { + case ContainerdContainerTypeSandbox: + return ContainerTypeSandbox + case ContainerdContainerTypeContainer: + return ContainerTypeContainer + default: + return ContainerTypeUnknown + } + } + if t, ok := spec.Annotations[CRIOContainerTypeAnnotation]; ok { + switch t { + case CRIOContainerTypeSandbox: + return ContainerTypeSandbox + case CRIOContainerTypeContainer: + return ContainerTypeContainer + default: + return ContainerTypeUnknown + } + } + return ContainerTypeUnspecified +} + +// SandboxID returns the ID of the sandbox to join and whether an ID was found +// in the spec. +func SandboxID(spec *specs.Spec) (string, bool) { + if id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]; ok { + return id, true + } + if id, ok := spec.Annotations[CRIOSandboxIDAnnotation]; ok { + return id, true + } + return "", false +} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 3d9ced1b6..d3c2e4e78 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -108,23 +108,18 @@ func ValidateSpec(spec *specs.Spec) error { } } - // Two annotations are use by containerd to support multi-container pods. - // "io.kubernetes.cri.container-type" - // "io.kubernetes.cri.sandbox-id" - containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation] - _, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation] - switch { - // Non-containerd use won't set a container type. - case !hasContainerType: - case containerType == ContainerdContainerTypeSandbox: - // When starting a container in an existing sandbox, the sandbox ID - // must be set. - case containerType == ContainerdContainerTypeContainer: - if !hasSandboxID { - return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType) + // CRI specifies whether a container should start a new sandbox, or run + // another container in an existing sandbox. + switch SpecContainerType(spec) { + case ContainerTypeContainer: + // When starting a container in an existing sandbox, the + // sandbox ID must be set. + if _, ok := SandboxID(spec); !ok { + return fmt.Errorf("spec has container-type of container, but no sandbox ID set") } + case ContainerTypeUnknown: + return fmt.Errorf("unknown container-type") default: - return fmt.Errorf("unknown container-type: %s", containerType) } return nil @@ -338,39 +333,6 @@ func IsSupportedDevMount(m specs.Mount) bool { return true } -const ( - // ContainerdContainerTypeAnnotation is the OCI annotation set by - // containerd to indicate whether the container to create should have - // its own sandbox or a container within an existing sandbox. - ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" - // ContainerdContainerTypeContainer is the container type value - // indicating the container should be created in an existing sandbox. - ContainerdContainerTypeContainer = "container" - // ContainerdContainerTypeSandbox is the container type value - // indicating the container should be created in a new sandbox. - ContainerdContainerTypeSandbox = "sandbox" - - // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate - // which sandbox the container should be created in when the container - // is not the first container in the sandbox. - ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" -) - -// ShouldCreateSandbox returns true if the spec indicates that a new sandbox -// should be created for the container. If false, the container should be -// started in an existing sandbox. -func ShouldCreateSandbox(spec *specs.Spec) bool { - t, ok := spec.Annotations[ContainerdContainerTypeAnnotation] - return !ok || t == ContainerdContainerTypeSandbox -} - -// SandboxID returns the ID of the sandbox to join and whether an ID was found -// in the spec. -func SandboxID(spec *specs.Spec) (string, bool) { - id, ok := spec.Annotations[ContainerdSandboxIDAnnotation] - return id, ok -} - // WaitForReady waits for a process to become ready. The process is ready when // the 'ready' function returns true. It continues to wait if 'ready' returns // false. It returns error on timeout, if the process stops or if 'ready' fails. diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index d44ebc906..c96ca2eb6 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -9,6 +9,7 @@ go_library( importpath = "gvisor.dev/gvisor/runsc/testutil", visibility = ["//:sandbox"], deps = [ + "//pkg/log", "//runsc/boot", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go index edf8b126c..9632776d2 100644 --- a/runsc/testutil/testutil.go +++ b/runsc/testutil/testutil.go @@ -25,7 +25,6 @@ import ( "fmt" "io" "io/ioutil" - "log" "math" "math/rand" "net/http" @@ -42,6 +41,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" ) @@ -151,13 +151,6 @@ func TestConfig() *boot.Config { } } -// TestConfigWithRoot returns the default configuration to use in tests. -func TestConfigWithRoot(rootDir string) *boot.Config { - conf := TestConfig() - conf.RootDir = rootDir - return conf -} - // NewSpecWithArgs creates a simple spec with the given args suitable for use // in tests. func NewSpecWithArgs(args ...string) *specs.Spec { @@ -286,7 +279,7 @@ func WaitForHTTP(port int, timeout time.Duration) error { url := fmt.Sprintf("http://localhost:%d/", port) resp, err := c.Get(url) if err != nil { - log.Printf("Waiting %s: %v", url, err) + log.Infof("Waiting %s: %v", url, err) return err } resp.Body.Close() |