diff options
Diffstat (limited to 'runsc')
112 files changed, 26384 insertions, 0 deletions
diff --git a/runsc/BUILD b/runsc/BUILD new file mode 100644 index 000000000..757f6d44c --- /dev/null +++ b/runsc/BUILD @@ -0,0 +1,123 @@ +load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar") + +package(licenses = ["notice"]) + +go_binary( + name = "runsc", + srcs = [ + "main.go", + "version.go", + ], + pure = True, + visibility = [ + "//visibility:public", + ], + x_defs = {"main.version": "{STABLE_VERSION}"}, + deps = [ + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/platform", + "//runsc/boot", + "//runsc/cmd", + "//runsc/flag", + "//runsc/specutils", + "@com_github_google_subcommands//:go_default_library", + ], +) + +# The runsc-race target is a race-compatible BUILD target. This must be built +# via: bazel build --features=race :runsc-race +# +# This is neccessary because the race feature must apply to all dependencies +# due a bug in gazelle file selection. The pure attribute must be off because +# the race detector requires linking with non-Go components, although we still +# require a static binary. +# +# Note that in the future this might be convertible to a compatible target by +# using the pure and static attributes within a select function, but select is +# not currently compatible with string attributes [1]. +# +# [1] https://github.com/bazelbuild/bazel/issues/1698 +go_binary( + name = "runsc-race", + srcs = [ + "main.go", + "version.go", + ], + static = True, + visibility = [ + "//visibility:public", + ], + x_defs = {"main.version": "{STABLE_VERSION}"}, + deps = [ + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/platform", + "//runsc/boot", + "//runsc/cmd", + "//runsc/flag", + "//runsc/specutils", + "@com_github_google_subcommands//:go_default_library", + ], +) + +pkg_tar( + name = "runsc-bin", + srcs = [":runsc"], + mode = "0755", + package_dir = "/usr/bin", + strip_prefix = "/runsc/linux_amd64_pure_stripped", +) + +pkg_tar( + name = "debian-data", + extension = "tar.gz", + deps = [ + ":runsc-bin", + ], +) + +genrule( + name = "deb-version", + # Note that runsc must appear in the srcs parameter and not the tools + # parameter, otherwise it will not be stamped. This is reasonable, as tools + # may be encoded differently in the build graph (cached more aggressively + # because they are assumes to be hermetic). + srcs = [":runsc"], + outs = ["version.txt"], + # Note that the little dance here is necessary because files in the $(SRCS) + # attribute are not executable by default, and we can't touch in place. + cmd = "cp $(location :runsc) $(@D)/runsc && \ + chmod a+x $(@D)/runsc && \ + $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \ + rm -f $(@D)/runsc", + stamp = 1, +) + +pkg_deb( + name = "runsc-debian", + architecture = "amd64", + data = ":debian-data", + # Note that the description_file will be flatten (all newlines removed), + # and therefore it is kept to a simple one-line description. The expected + # format for debian packages is "short summary\nLonger explanation of + # tool." and this is impossible with the flattening. + description_file = "debian/description", + homepage = "https://gvisor.dev/", + maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>", + package = "runsc", + postinst = "debian/postinst.sh", + version_file = ":version.txt", + visibility = [ + "//visibility:public", + ], +) + +sh_test( + name = "version_test", + size = "small", + srcs = ["version_test.sh"], + args = ["$(location :runsc)"], + data = [":runsc"], + tags = ["noguitar"], +) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD new file mode 100644 index 000000000..aad2a41de --- /dev/null +++ b/runsc/boot/BUILD @@ -0,0 +1,137 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "boot", + srcs = [ + "compat.go", + "compat_amd64.go", + "compat_arm64.go", + "config.go", + "controller.go", + "debug.go", + "events.go", + "fs.go", + "limits.go", + "loader.go", + "network.go", + "strace.go", + "vfs.go", + ], + visibility = [ + "//pkg/test:__subpackages__", + "//runsc:__subpackages__", + "//test:__subpackages__", + ], + deps = [ + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/context", + "//pkg/control/server", + "//pkg/cpuid", + "//pkg/eventchannel", + "//pkg/fspath", + "//pkg/log", + "//pkg/memutil", + "//pkg/rand", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/arch:registers_go_proto", + "//pkg/sentry/control", + "//pkg/sentry/devices/memdev", + "//pkg/sentry/devices/ttydev", + "//pkg/sentry/devices/tundev", + "//pkg/sentry/fdimport", + "//pkg/sentry/fs", + "//pkg/sentry/fs/dev", + "//pkg/sentry/fs/gofer", + "//pkg/sentry/fs/host", + "//pkg/sentry/fs/proc", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fs/sys", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/fs/tty", + "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/devpts", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/fuse", + "//pkg/sentry/fsimpl/gofer", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/overlay", + "//pkg/sentry/fsimpl/proc", + "//pkg/sentry/fsimpl/sys", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel:uncaught_signal_go_proto", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/sighandling", + "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/netlink", + "//pkg/sentry/socket/netlink/route", + "//pkg/sentry/socket/netlink/uevent", + "//pkg/sentry/socket/netstack", + "//pkg/sentry/socket/unix", + "//pkg/sentry/state", + "//pkg/sentry/strace", + "//pkg/sentry/syscalls/linux/vfs2", + "//pkg/sentry/time", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/sentry/watchdog", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/link/fdbased", + "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/qdisc/fifo", + "//pkg/tcpip/link/sniffer", + "//pkg/tcpip/network/arp", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/raw", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "//pkg/urpc", + "//runsc/boot/filter", + "//runsc/boot/platforms", + "//runsc/boot/pprof", + "//runsc/specutils", + "@com_github_golang_protobuf//proto:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "boot_test", + size = "small", + srcs = [ + "compat_test.go", + "fs_test.go", + "loader_test.go", + ], + library = ":boot", + deps = [ + "//pkg/control/server", + "//pkg/fspath", + "//pkg/log", + "//pkg/p9", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/unet", + "//runsc/fsgofer", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go new file mode 100644 index 000000000..84c67cbc2 --- /dev/null +++ b/runsc/boot/compat.go @@ -0,0 +1,202 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "os" + "syscall" + + "github.com/golang/protobuf/proto" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/log" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" + spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sync" +) + +func initCompatLogs(fd int) error { + ce, err := newCompatEmitter(fd) + if err != nil { + return err + } + eventchannel.AddEmitter(ce) + return nil +} + +type compatEmitter struct { + sink *log.BasicLogger + nameMap strace.SyscallMap + + // mu protects the fields below. + mu sync.Mutex + + // trackers map syscall number to the respective tracker instance. + // Protected by 'mu'. + trackers map[uint64]syscallTracker +} + +func newCompatEmitter(logFD int) (*compatEmitter, error) { + nameMap, ok := getSyscallNameMap() + if !ok { + return nil, fmt.Errorf("Linux syscall table not found") + } + + c := &compatEmitter{ + // Always logs to default logger. + sink: log.Log(), + nameMap: nameMap, + trackers: make(map[uint64]syscallTracker), + } + + if logFD > 0 { + f := os.NewFile(uintptr(logFD), "user log file") + target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}} + c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} + } + return c, nil +} + +// Emit implements eventchannel.Emitter. +func (c *compatEmitter) Emit(msg proto.Message) (bool, error) { + switch m := msg.(type) { + case *spb.UnimplementedSyscall: + c.emitUnimplementedSyscall(m) + case *ucspb.UncaughtSignal: + c.emitUncaughtSignal(m) + } + + return false, nil +} + +func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { + regs := us.Registers + + c.mu.Lock() + defer c.mu.Unlock() + + sysnr := syscallNum(regs) + tr := c.trackers[sysnr] + if tr == nil { + switch sysnr { + case syscall.SYS_PRCTL: + // args: cmd, ... + tr = newArgsTracker(0) + + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE: + // args: fd/addr, cmd, ... + tr = newArgsTracker(1) + + case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT: + // args: fd, level, name, ... + tr = newArgsTracker(1, 2) + + case syscall.SYS_SEMCTL: + // args: semid, semnum, cmd, ... + tr = newArgsTracker(2) + + default: + tr = newArchArgsTracker(sysnr) + if tr == nil { + tr = &onceTracker{} + } + } + c.trackers[sysnr] = tr + } + + if tr.shouldReport(regs) { + name := c.nameMap.Name(uintptr(sysnr)) + c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+ + "likely that you can safely ignore this message and that this is not "+ + "the cause of any error. Please, refer to %s/%s for more information.", + name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs), + argVal(4, regs), argVal(5, regs), syscallLink, name) + + tr.onReported(regs) + } +} + +func (c *compatEmitter) emitUncaughtSignal(msg *ucspb.UncaughtSignal) { + sig := syscall.Signal(msg.SignalNumber) + c.sink.Infof( + "Uncaught signal: %q (%d), PID: %d, TID: %d, fault addr: %#x", + sig, msg.SignalNumber, msg.Pid, msg.Tid, msg.FaultAddr) +} + +// Close implements eventchannel.Emitter. +func (c *compatEmitter) Close() error { + c.sink = nil + return nil +} + +// syscallTracker interface allows filters to apply differently depending on +// the syscall and arguments. +type syscallTracker interface { + // shouldReport returns true is the syscall should be reported. + shouldReport(regs *rpb.Registers) bool + + // onReported marks the syscall as reported. + onReported(regs *rpb.Registers) +} + +// onceTracker reports only a single time, used for most syscalls. +type onceTracker struct { + reported bool +} + +func (o *onceTracker) shouldReport(_ *rpb.Registers) bool { + return !o.reported +} + +func (o *onceTracker) onReported(_ *rpb.Registers) { + o.reported = true +} + +// argsTracker reports only once for each different combination of arguments. +// It's used for generic syscalls like ioctl to report once per 'cmd'. +type argsTracker struct { + // argsIdx is the syscall arguments to use as unique ID. + argsIdx []int + reported map[string]struct{} + count int +} + +func newArgsTracker(argIdx ...int) *argsTracker { + return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} +} + +// key returns the command based on the syscall argument index. +func (a *argsTracker) key(regs *rpb.Registers) string { + var rv string + for _, idx := range a.argsIdx { + rv += fmt.Sprintf("%d|", argVal(idx, regs)) + } + return rv +} + +func (a *argsTracker) shouldReport(regs *rpb.Registers) bool { + if a.count >= reportLimit { + return false + } + _, ok := a.reported[a.key(regs)] + return !ok +} + +func (a *argsTracker) onReported(regs *rpb.Registers) { + a.count++ + a.reported[a.key(regs)] = struct{}{} +} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go new file mode 100644 index 000000000..8eb76b2ba --- /dev/null +++ b/runsc/boot/compat_amd64.go @@ -0,0 +1,100 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +const ( + // reportLimit is the max number of events that should be reported per + // tracker. + reportLimit = 100 + syscallLink = "https://gvisor.dev/c/linux/amd64" +) + +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Amd64{ + Amd64: &rpb.AMD64Registers{}, + }, + } +} + +func argVal(argIdx int, regs *rpb.Registers) uint64 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + + switch argIdx { + case 0: + return amd64Regs.Rdi + case 1: + return amd64Regs.Rsi + case 2: + return amd64Regs.Rdx + case 3: + return amd64Regs.R10 + case 4: + return amd64Regs.R8 + case 5: + return amd64Regs.R9 + } + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) +} + +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + + switch argIdx { + case 0: + amd64Regs.Rdi = argVal + case 1: + amd64Regs.Rsi = argVal + case 2: + amd64Regs.Rdx = argVal + case 3: + amd64Regs.R10 = argVal + case 4: + amd64Regs.R8 = argVal + case 5: + amd64Regs.R9 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) + } +} + +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.AMD64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + return amd64Regs.OrigRax +} + +func newArchArgsTracker(sysnr uint64) syscallTracker { + switch sysnr { + case syscall.SYS_ARCH_PRCTL: + // args: cmd, ... + return newArgsTracker(0) + } + return nil +} diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go new file mode 100644 index 000000000..bce9d95b3 --- /dev/null +++ b/runsc/boot/compat_arm64.go @@ -0,0 +1,95 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +const ( + // reportLimit is the max number of events that should be reported per + // tracker. + reportLimit = 100 + syscallLink = "https://gvisor.dev/c/linux/arm64" +) + +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Arm64{ + Arm64: &rpb.ARM64Registers{}, + }, + } +} + +func argVal(argIdx int, regs *rpb.Registers) uint64 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + return arm64Regs.R0 + case 1: + return arm64Regs.R1 + case 2: + return arm64Regs.R2 + case 3: + return arm64Regs.R3 + case 4: + return arm64Regs.R4 + case 5: + return arm64Regs.R5 + } + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) +} + +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + arm64Regs.R0 = argVal + case 1: + arm64Regs.R1 = argVal + case 2: + arm64Regs.R2 = argVal + case 3: + arm64Regs.R3 = argVal + case 4: + arm64Regs.R4 = argVal + case 5: + arm64Regs.R5 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) + } +} + +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.ARM64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + return arm64Regs.R8 +} + +func newArchArgsTracker(sysnr uint64) syscallTracker { + // currently, no arch specific syscalls need to be handled here. + return nil +} diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go new file mode 100644 index 000000000..839c5303b --- /dev/null +++ b/runsc/boot/compat_test.go @@ -0,0 +1,90 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "testing" +) + +func TestOnceTracker(t *testing.T) { + o := onceTracker{} + if !o.shouldReport(nil) { + t.Error("first call to checkAndMark, got: false, want: true") + } + o.onReported(nil) + for i := 0; i < 2; i++ { + if o.shouldReport(nil) { + t.Error("after first call to checkAndMark, got: true, want: false") + } + } +} + +func TestArgsTracker(t *testing.T) { + for _, tc := range []struct { + name string + idx []int + arg1_1 uint64 + arg1_2 uint64 + arg2_1 uint64 + arg2_2 uint64 + want bool + }{ + {name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false}, + {name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false}, + {name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true}, + {name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true}, + {name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false}, + {name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false}, + {name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true}, + } { + t.Run(tc.name, func(t *testing.T) { + c := newArgsTracker(tc.idx...) + regs := newRegs() + setArgVal(0, tc.arg1_1, regs) + setArgVal(1, tc.arg2_1, regs) + if !c.shouldReport(regs) { + t.Error("first call to shouldReport, got: false, want: true") + } + c.onReported(regs) + + setArgVal(0, tc.arg1_2, regs) + setArgVal(1, tc.arg2_2, regs) + if got := c.shouldReport(regs); tc.want != got { + t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want) + } + }) + } +} + +func TestArgsTrackerLimit(t *testing.T) { + c := newArgsTracker(0, 1) + for i := 0; i < reportLimit; i++ { + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, uint64(i), regs) + if !c.shouldReport(regs) { + t.Error("shouldReport before limit was reached, got: false, want: true") + } + c.onReported(regs) + } + + // Should hit the count limit now. + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, 123456, regs) + if c.shouldReport(regs) { + t.Error("shouldReport after limit was reached, got: true, want: false") + } +} diff --git a/runsc/boot/config.go b/runsc/boot/config.go new file mode 100644 index 000000000..bb01b8fb5 --- /dev/null +++ b/runsc/boot/config.go @@ -0,0 +1,329 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "strconv" + "strings" + + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/watchdog" +) + +// FileAccessType tells how the filesystem is accessed. +type FileAccessType int + +const ( + // FileAccessShared sends IO requests to a Gofer process that validates the + // requests and forwards them to the host. + FileAccessShared FileAccessType = iota + + // FileAccessExclusive is the same as FileAccessShared, but enables + // extra caching for improved performance. It should only be used if + // the sandbox has exclusive access to the filesystem. + FileAccessExclusive +) + +// MakeFileAccessType converts type from string. +func MakeFileAccessType(s string) (FileAccessType, error) { + switch s { + case "shared": + return FileAccessShared, nil + case "exclusive": + return FileAccessExclusive, nil + default: + return 0, fmt.Errorf("invalid file access type %q", s) + } +} + +func (f FileAccessType) String() string { + switch f { + case FileAccessShared: + return "shared" + case FileAccessExclusive: + return "exclusive" + default: + return fmt.Sprintf("unknown(%d)", f) + } +} + +// NetworkType tells which network stack to use. +type NetworkType int + +const ( + // NetworkSandbox uses internal network stack, isolated from the host. + NetworkSandbox NetworkType = iota + + // NetworkHost redirects network related syscalls to the host network. + NetworkHost + + // NetworkNone sets up just loopback using netstack. + NetworkNone +) + +// MakeNetworkType converts type from string. +func MakeNetworkType(s string) (NetworkType, error) { + switch s { + case "sandbox": + return NetworkSandbox, nil + case "host": + return NetworkHost, nil + case "none": + return NetworkNone, nil + default: + return 0, fmt.Errorf("invalid network type %q", s) + } +} + +func (n NetworkType) String() string { + switch n { + case NetworkSandbox: + return "sandbox" + case NetworkHost: + return "host" + case NetworkNone: + return "none" + default: + return fmt.Sprintf("unknown(%d)", n) + } +} + +// MakeWatchdogAction converts type from string. +func MakeWatchdogAction(s string) (watchdog.Action, error) { + switch strings.ToLower(s) { + case "log", "logwarning": + return watchdog.LogWarning, nil + case "panic": + return watchdog.Panic, nil + default: + return 0, fmt.Errorf("invalid watchdog action %q", s) + } +} + +// MakeRefsLeakMode converts type from string. +func MakeRefsLeakMode(s string) (refs.LeakMode, error) { + switch strings.ToLower(s) { + case "disabled": + return refs.NoLeakChecking, nil + case "log-names": + return refs.LeaksLogWarning, nil + case "log-traces": + return refs.LeaksLogTraces, nil + default: + return 0, fmt.Errorf("invalid refs leakmode %q", s) + } +} + +func refsLeakModeToString(mode refs.LeakMode) string { + switch mode { + // If not set, default it to disabled. + case refs.UninitializedLeakChecking, refs.NoLeakChecking: + return "disabled" + case refs.LeaksLogWarning: + return "log-names" + case refs.LeaksLogTraces: + return "log-traces" + default: + panic(fmt.Sprintf("Invalid leakmode: %d", mode)) + } +} + +// Config holds configuration that is not part of the runtime spec. +type Config struct { + // RootDir is the runtime root directory. + RootDir string + + // Debug indicates that debug logging should be enabled. + Debug bool + + // LogFilename is the filename to log to, if not empty. + LogFilename string + + // LogFormat is the log format. + LogFormat string + + // DebugLog is the path to log debug information to, if not empty. + DebugLog string + + // PanicLog is the path to log GO's runtime messages, if not empty. + PanicLog string + + // DebugLogFormat is the log format for debug. + DebugLogFormat string + + // FileAccess indicates how the filesystem is accessed. + FileAccess FileAccessType + + // Overlay is whether to wrap the root filesystem in an overlay. + Overlay bool + + // FSGoferHostUDS enables the gofer to mount a host UDS. + FSGoferHostUDS bool + + // Network indicates what type of network to use. + Network NetworkType + + // EnableRaw indicates whether raw sockets should be enabled. Raw + // sockets are disabled by stripping CAP_NET_RAW from the list of + // capabilities. + EnableRaw bool + + // HardwareGSO indicates that hardware segmentation offload is enabled. + HardwareGSO bool + + // SoftwareGSO indicates that software segmentation offload is enabled. + SoftwareGSO bool + + // TXChecksumOffload indicates that TX Checksum Offload is enabled. + TXChecksumOffload bool + + // RXChecksumOffload indicates that RX Checksum Offload is enabled. + RXChecksumOffload bool + + // QDisc indicates the type of queuening discipline to use by default + // for non-loopback interfaces. + QDisc QueueingDiscipline + + // LogPackets indicates that all network packets should be logged. + LogPackets bool + + // Platform is the platform to run on. + Platform string + + // Strace indicates that strace should be enabled. + Strace bool + + // StraceSyscalls is the set of syscalls to trace. If StraceEnable is + // true and this list is empty, then all syscalls will be traced. + StraceSyscalls []string + + // StraceLogSize is the max size of data blobs to display. + StraceLogSize uint + + // DisableSeccomp indicates whether seccomp syscall filters should be + // disabled. Pardon the double negation, but default to enabled is important. + DisableSeccomp bool + + // WatchdogAction sets what action the watchdog takes when triggered. + WatchdogAction watchdog.Action + + // PanicSignal registers signal handling that panics. Usually set to + // SIGUSR2(12) to troubleshoot hangs. -1 disables it. + PanicSignal int + + // ProfileEnable is set to prepare the sandbox to be profiled. + ProfileEnable bool + + // RestoreFile is the path to the saved container image + RestoreFile string + + // NumNetworkChannels controls the number of AF_PACKET sockets that map + // to the same underlying network device. This allows netstack to better + // scale for high throughput use cases. + NumNetworkChannels int + + // Rootless allows the sandbox to be started with a user that is not root. + // Defense is depth measures are weaker with rootless. Specifically, the + // sandbox and Gofer process run as root inside a user namespace with root + // mapped to the caller's user. + Rootless bool + + // AlsoLogToStderr allows to send log messages to stderr. + AlsoLogToStderr bool + + // ReferenceLeakMode sets reference leak check mode + ReferenceLeakMode refs.LeakMode + + // OverlayfsStaleRead instructs the sandbox to assume that the root mount + // is on a Linux overlayfs mount, which does not necessarily preserve + // coherence between read-only and subsequent writable file descriptors + // representing the "same" file. + OverlayfsStaleRead bool + + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in + // tests. It allows runsc to start the sandbox process as the current + // user, and without chrooting the sandbox process. This can be + // necessary in test environments that have limited capabilities. + TestOnlyAllowRunAsCurrentUserWithoutChroot bool + + // TestOnlyTestNameEnv should only be used in tests. It looks up for the + // test name in the container environment variables and adds it to the debug + // log file name. This is done to help identify the log with the test when + // multiple tests are run in parallel, since there is no way to pass + // parameters to the runtime from docker. + TestOnlyTestNameEnv string + + // CPUNumFromQuota sets CPU number count to available CPU quota, using + // least integer value greater than or equal to quota. + // + // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. + CPUNumFromQuota bool + + // Enables VFS2 (not plumbled through yet). + VFS2 bool +} + +// ToFlags returns a slice of flags that correspond to the given Config. +func (c *Config) ToFlags() []string { + f := []string{ + "--root=" + c.RootDir, + "--debug=" + strconv.FormatBool(c.Debug), + "--log=" + c.LogFilename, + "--log-format=" + c.LogFormat, + "--debug-log=" + c.DebugLog, + "--panic-log=" + c.PanicLog, + "--debug-log-format=" + c.DebugLogFormat, + "--file-access=" + c.FileAccess.String(), + "--overlay=" + strconv.FormatBool(c.Overlay), + "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), + "--network=" + c.Network.String(), + "--log-packets=" + strconv.FormatBool(c.LogPackets), + "--platform=" + c.Platform, + "--strace=" + strconv.FormatBool(c.Strace), + "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), + "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), + "--watchdog-action=" + c.WatchdogAction.String(), + "--panic-signal=" + strconv.Itoa(c.PanicSignal), + "--profile=" + strconv.FormatBool(c.ProfileEnable), + "--net-raw=" + strconv.FormatBool(c.EnableRaw), + "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), + "--rootless=" + strconv.FormatBool(c.Rootless), + "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), + "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), + "--gso=" + strconv.FormatBool(c.HardwareGSO), + "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), + "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload), + "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload), + "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), + "--qdisc=" + c.QDisc.String(), + } + if c.CPUNumFromQuota { + f = append(f, "--cpu-num-from-quota") + } + // Only include these if set since it is never to be used by users. + if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { + f = append(f, "--TESTONLY-unsafe-nonroot=true") + } + if len(c.TestOnlyTestNameEnv) != 0 { + f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) + } + + if c.VFS2 { + f = append(f, "--vfs2=true") + } + + return f +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go new file mode 100644 index 000000000..8125d5061 --- /dev/null +++ b/runsc/boot/controller.go @@ -0,0 +1,506 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "errors" + "fmt" + "os" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/state" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/specutils" +) + +const ( + // ContainerCheckpoint checkpoints a container. + ContainerCheckpoint = "containerManager.Checkpoint" + + // ContainerCreate creates a container. + ContainerCreate = "containerManager.Create" + + // ContainerDestroy is used to stop a non-root container and free all + // associated resources in the sandbox. + ContainerDestroy = "containerManager.Destroy" + + // ContainerEvent is the URPC endpoint for getting stats about the + // container used by "runsc events". + ContainerEvent = "containerManager.Event" + + // ContainerExecuteAsync is the URPC endpoint for executing a command in a + // container. + ContainerExecuteAsync = "containerManager.ExecuteAsync" + + // ContainerPause pauses the container. + ContainerPause = "containerManager.Pause" + + // ContainerProcesses is the URPC endpoint for getting the list of + // processes running in a container. + ContainerProcesses = "containerManager.Processes" + + // ContainerRestore restores a container from a statefile. + ContainerRestore = "containerManager.Restore" + + // ContainerResume unpauses the paused container. + ContainerResume = "containerManager.Resume" + + // ContainerSignal is used to send a signal to a container. + ContainerSignal = "containerManager.Signal" + + // ContainerSignalProcess is used to send a signal to a particular + // process in a container. + ContainerSignalProcess = "containerManager.SignalProcess" + + // ContainerStart is the URPC endpoint for running a non-root container + // within a sandbox. + ContainerStart = "containerManager.Start" + + // ContainerWait is used to wait on the init process of the container + // and return its ExitStatus. + ContainerWait = "containerManager.Wait" + + // ContainerWaitPID is used to wait on a process with a certain PID in + // the sandbox and return its ExitStatus. + ContainerWaitPID = "containerManager.WaitPID" + + // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links + // and routes in a network stack. + NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" + + // RootContainerStart is the URPC endpoint for starting a new sandbox + // with root container. + RootContainerStart = "containerManager.StartRoot" + + // SandboxStacks collects sandbox stacks for debugging. + SandboxStacks = "debug.Stacks" +) + +// Profiling related commands (see pprof.go for more details). +const ( + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" + GoroutineProfile = "Profile.GoroutineProfile" + BlockProfile = "Profile.BlockProfile" + MutexProfile = "Profile.MutexProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" +) + +// Logging related commands (see logging.go for more details). +const ( + ChangeLogging = "Logging.Change" +) + +// ControlSocketAddr generates an abstract unix socket name for the given ID. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-sandbox.%s", id) +} + +// controller holds the control server, and is used for communication into the +// sandbox. +type controller struct { + // srv is the control server. + srv *server.Server + + // manager holds the containerManager methods. + manager *containerManager +} + +// newController creates a new controller. The caller must call +// controller.srv.StartServing() to start the controller. +func newController(fd int, l *Loader) (*controller, error) { + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + manager := &containerManager{ + startChan: make(chan struct{}), + startResultChan: make(chan error), + l: l, + } + srv.Register(manager) + + if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { + net := &Network{ + Stack: eps.Stack, + } + srv.Register(net) + } + + srv.Register(&debug{}) + srv.Register(&control.Logging{}) + if l.conf.ProfileEnable { + srv.Register(&control.Profile{ + Kernel: l.k, + }) + } + + return &controller{ + srv: srv, + manager: manager, + }, nil +} + +// containerManager manages sandbox containers. +type containerManager struct { + // startChan is used to signal when the root container process should + // be started. + startChan chan struct{} + + // startResultChan is used to signal when the root container has + // started. Any errors encountered during startup will be sent to the + // channel. A nil value indicates success. + startResultChan chan error + + // l is the loader that creates containers and sandboxes. + l *Loader +} + +// StartRoot will start the root container process. +func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { + log.Debugf("containerManager.StartRoot %q", *cid) + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("starting sandbox: %v", err) + } + return nil +} + +// Processes retrieves information about processes running in the sandbox. +func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error { + log.Debugf("containerManager.Processes: %q", *cid) + return control.Processes(cm.l.k, *cid, out) +} + +// Create creates a container within a sandbox. +func (cm *containerManager) Create(cid *string, _ *struct{}) error { + log.Debugf("containerManager.Create: %q", *cid) + return cm.l.createContainer(*cid) +} + +// StartArgs contains arguments to the Start method. +type StartArgs struct { + // Spec is the spec of the container to start. + Spec *specs.Spec + + // Config is the runsc-specific configuration for the sandbox. + Conf *Config + + // CID is the ID of the container to start. + CID string + + // FilePayload contains, in order: + // * stdin, stdout, and stderr. + // * the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload +} + +// Start runs a created container within a sandbox. +func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { + log.Debugf("containerManager.Start: %+v", args) + + // Validate arguments. + if args == nil { + return errors.New("start missing arguments") + } + if args.Spec == nil { + return errors.New("start arguments missing spec") + } + if args.Conf == nil { + return errors.New("start arguments missing config") + } + if args.CID == "" { + return errors.New("start argument missing container ID") + } + if len(args.FilePayload.Files) < 4 { + return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + } + + // All validation passed, logs the spec for debugging. + specutils.LogSpec(args.Spec) + + err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) + if err != nil { + log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) + return err + } + log.Debugf("Container %q started", args.CID) + + return nil +} + +// Destroy stops a container if it is still running and cleans up its +// filesystem. +func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { + log.Debugf("containerManager.destroy %q", *cid) + return cm.l.destroyContainer(*cid) +} + +// ExecuteAsync starts running a command on a created or running sandbox. It +// returns the PID of the new process. +func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { + log.Debugf("containerManager.ExecuteAsync: %+v", args) + tgid, err := cm.l.executeAsync(args) + if err != nil { + log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err) + return err + } + *pid = int32(tgid) + return nil +} + +// Checkpoint pauses a sandbox and saves its state. +func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { + log.Debugf("containerManager.Checkpoint") + state := control.State{ + Kernel: cm.l.k, + Watchdog: cm.l.watchdog, + } + return state.Save(o, nil) +} + +// Pause suspends a container. +func (cm *containerManager) Pause(_, _ *struct{}) error { + log.Debugf("containerManager.Pause") + cm.l.k.Pause() + return nil +} + +// RestoreOpts contains options related to restoring a container's file system. +type RestoreOpts struct { + // FilePayload contains the state file to be restored, followed by the + // platform device file if necessary. + urpc.FilePayload + + // SandboxID contains the ID of the sandbox. + SandboxID string +} + +// Restore loads a container from a statefile. +// The container's current kernel is destroyed, a restore environment is +// created, and the kernel is recreated with the restore state file. The +// container then sends the signal to start. +func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { + log.Debugf("containerManager.Restore") + + var specFile, deviceFile *os.File + switch numFiles := len(o.FilePayload.Files); numFiles { + case 2: + // The device file is donated to the platform. + // Can't take ownership away from os.File. dup them to get a new FD. + fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) + if err != nil { + return fmt.Errorf("failed to dup file: %v", err) + } + deviceFile = os.NewFile(uintptr(fd), "platform device") + fallthrough + case 1: + specFile = o.FilePayload.Files[0] + case 0: + return fmt.Errorf("at least one file must be passed to Restore") + default: + return fmt.Errorf("at most two files may be passed to Restore") + } + + // Pause the kernel while we build a new one. + cm.l.k.Pause() + + p, err := createPlatform(cm.l.conf, deviceFile) + if err != nil { + return fmt.Errorf("creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + mf, err := createMemoryFile() + if err != nil { + return fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + networkStack := cm.l.k.RootNetworkNamespace().Stack() + cm.l.k = k + + // Set up the restore environment. + mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints) + renv, err := mntr.createRestoreEnvironment(cm.l.conf) + if err != nil { + return fmt.Errorf("creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) + + // Prepare to load from the state file. + if eps, ok := networkStack.(*netstack.Stack); ok { + stack.StackFromEnv = eps.Stack // FIXME(b/36201077) + } + info, err := specFile.Stat() + if err != nil { + return err + } + if info.Size() == 0 { + return fmt.Errorf("file cannot be empty") + } + + if cm.l.conf.ProfileEnable { + // pprof.Initialize opens /proc/self/maps, so has to be called before + // installing seccomp filters. + pprof.Initialize() + } + + // Seccomp filters have to be applied before parsing the state file. + if err := cm.l.installSeccompFilters(); err != nil { + return err + } + + // Load the state. + loadOpts := state.LoadOpts{Source: specFile} + if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil { + return err + } + + // Since we have a new kernel we also must make a new watchdog. + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dog := watchdog.New(k, dogOpts) + + // Change the loader fields to reflect the changes made when restoring. + cm.l.k = k + cm.l.watchdog = dog + cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.restore = true + + // Reinitialize the sandbox ID and processes map. Note that it doesn't + // restore the state of multiple containers, nor exec processes. + cm.l.sandboxID = o.SandboxID + cm.l.mu.Lock() + eid := execID{cid: o.SandboxID} + cm.l.processes = map[execID]*execProcess{ + eid: { + tg: cm.l.k.GlobalInit(), + }, + } + cm.l.mu.Unlock() + + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("starting sandbox: %v", err) + } + + return nil +} + +// Resume unpauses a container. +func (cm *containerManager) Resume(_, _ *struct{}) error { + log.Debugf("containerManager.Resume") + cm.l.k.Unpause() + return nil +} + +// Wait waits for the init process in the given container. +func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") + err := cm.l.waitContainer(*cid, waitStatus) + log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err) + return err +} + +// WaitPIDArgs are arguments to the WaitPID method. +type WaitPIDArgs struct { + // PID is the PID in the container's PID namespace. + PID int32 + + // CID is the container ID. + CID string +} + +// WaitPID waits for the process with PID 'pid' in the sandbox. +func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) +} + +// SignalDeliveryMode enumerates different signal delivery modes. +type SignalDeliveryMode int + +const ( + // DeliverToProcess delivers the signal to the container process with + // the specified PID. If PID is 0, then the container init process is + // signaled. + DeliverToProcess SignalDeliveryMode = iota + + // DeliverToAllProcesses delivers the signal to all processes in the + // container. PID must be 0. + DeliverToAllProcesses + + // DeliverToForegroundProcessGroup delivers the signal to the + // foreground process group in the same TTY session as the specified + // process. If PID is 0, then the signal is delivered to the foreground + // process group for the TTY for the init process. + DeliverToForegroundProcessGroup +) + +func (s SignalDeliveryMode) String() string { + switch s { + case DeliverToProcess: + return "Process" + case DeliverToAllProcesses: + return "All" + case DeliverToForegroundProcessGroup: + return "Foreground Process Group" + } + return fmt.Sprintf("unknown signal delivery mode: %d", s) +} + +// SignalArgs are arguments to the Signal method. +type SignalArgs struct { + // CID is the container ID. + CID string + + // Signo is the signal to send to the process. + Signo int32 + + // PID is the process ID in the given container that will be signaled. + // If 0, the root container will be signalled. + PID int32 + + // Mode is the signal delivery mode. + Mode SignalDeliveryMode +} + +// Signal sends a signal to one or more processes in a container. If args.PID +// is 0, then the container init process is used. Depending on the +// args.SignalDeliveryMode option, the signal may be sent directly to the +// indicated process, to all processes in the container, or to the foreground +// process group. +func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal %+v", args) + return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) +} diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go new file mode 100644 index 000000000..1fb32c527 --- /dev/null +++ b/runsc/boot/debug.go @@ -0,0 +1,29 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.dev/gvisor/pkg/log" +) + +type debug struct { +} + +// Stacks collects all sandbox stacks and copies them to 'stacks'. +func (*debug) Stacks(_ *struct{}, stacks *string) error { + buf := log.Stacks(true) + *stacks = string(buf) + return nil +} diff --git a/runsc/boot/events.go b/runsc/boot/events.go new file mode 100644 index 000000000..422f4da00 --- /dev/null +++ b/runsc/boot/events.go @@ -0,0 +1,81 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usage" +) + +// Event struct for encoding the event data to JSON. Corresponds to runc's +// main.event struct. +type Event struct { + Type string `json:"type"` + ID string `json:"id"` + Data interface{} `json:"data,omitempty"` +} + +// Stats is the runc specific stats structure for stability when encoding and +// decoding stats. +type Stats struct { + Memory Memory `json:"memory"` + Pids Pids `json:"pids"` +} + +// Pids contains stats on processes. +type Pids struct { + Current uint64 `json:"current,omitempty"` + Limit uint64 `json:"limit,omitempty"` +} + +// MemoryEntry contains stats on a kind of memory. +type MemoryEntry struct { + Limit uint64 `json:"limit"` + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +// Memory contains stats on memory. +type Memory struct { + Cache uint64 `json:"cache,omitempty"` + Usage MemoryEntry `json:"usage,omitempty"` + Swap MemoryEntry `json:"swap,omitempty"` + Kernel MemoryEntry `json:"kernel,omitempty"` + KernelTCP MemoryEntry `json:"kernelTCP,omitempty"` + Raw map[string]uint64 `json:"raw,omitempty"` +} + +// Event gets the events from the container. +func (cm *containerManager) Event(_ *struct{}, out *Event) error { + stats := &Stats{} + stats.populateMemory(cm.l.k) + stats.populatePIDs(cm.l.k) + *out = Event{Type: "stats", Data: stats} + return nil +} + +func (s *Stats) populateMemory(k *kernel.Kernel) { + mem := k.MemoryFile() + mem.UpdateUsage() + _, totalUsage := usage.MemoryAccounting.Copy() + s.Memory.Usage = MemoryEntry{ + Usage: totalUsage, + } +} + +func (s *Stats) populatePIDs(k *kernel.Kernel) { + s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups())) +} diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD new file mode 100644 index 000000000..ed18f0047 --- /dev/null +++ b/runsc/boot/filter/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "filter", + srcs = [ + "config.go", + "config_amd64.go", + "config_arm64.go", + "config_profile.go", + "extra_filters.go", + "extra_filters_msan.go", + "extra_filters_race.go", + "filter.go", + ], + visibility = [ + "//runsc/boot:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/seccomp", + "//pkg/sentry/platform", + "//pkg/tcpip/link/fdbased", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go new file mode 100644 index 000000000..60e33425f --- /dev/null +++ b/runsc/boot/filter/config.go @@ -0,0 +1,559 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "os" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" +) + +// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. +var allowedSyscalls = seccomp.SyscallRules{ + syscall.SYS_CLOCK_GETTIME: {}, + syscall.SYS_CLONE: []seccomp.Rule{ + { + seccomp.AllowValue( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + }, + }, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_DUP3: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_CLOEXEC), + }, + }, + syscall.SYS_EPOLL_CREATE1: {}, + syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EVENTFD2: []seccomp.Rule{ + { + seccomp.AllowValue(0), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FALLOCATE: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCNTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_SETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFD), + }, + }, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + }, + // Non-private variants are included for flipcall support. They are otherwise + // unncessary, as the sentry will use only private futexes internally. + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE), + seccomp.AllowAny{}, + }, + }, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_DOMAIN), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_TYPE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_ERROR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + }, + }, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, + // SYS_IOCTL is needed for terminal support, but we only allow + // setting/getting termios and winsize. + syscall.SYS_IOCTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCGETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSF), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSW), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCSWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCGWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + }, + syscall.SYS_LSEEK: {}, + syscall.SYS_MADVISE: {}, + syscall.SYS_MINCORE: {}, + // Used by the Go runtime as a temporarily workaround for a Linux + // 5.2-5.4 bug. + // + // See src/runtime/os_linux_x86.go. + // + // TODO(b/148688965): Remove once this is gone from Go. + syscall.SYS_MLOCK: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(4096), + }, + }, + syscall.SYS_MMAP: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_SHARED), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ), + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + }, + }, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_PPOLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PREADV: {}, + unix.SYS_PREADV2: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_PWRITEV: {}, + unix.SYS_PWRITEV2: {}, + syscall.SYS_READ: {}, + syscall.SYS_RECVMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + }, + }, + syscall.SYS_RECVMMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(fdbased.MaxMsgsPerRecv), + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), + }, + }, + unix.SYS_SENDMMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_RESTART_SYSCALL: {}, + syscall.SYS_RT_SIGACTION: {}, + syscall.SYS_RT_SIGPROCMASK: {}, + syscall.SYS_RT_SIGRETURN: {}, + syscall.SYS_SCHED_YIELD: {}, + syscall.SYS_SENDMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + }, + }, + syscall.SYS_SETITIMER: {}, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + // Used by fs/host to shutdown host sockets. + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + // Used by unet to shutdown connections. + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + }, + syscall.SYS_SIGALTSTACK: {}, + unix.SYS_STATX: {}, + syscall.SYS_SYNC_FILE_RANGE: {}, + syscall.SYS_TEE: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(1), /* len */ + seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */ + }, + }, + syscall.SYS_TGKILL: []seccomp.Rule{ + { + seccomp.AllowValue(uint64(os.Getpid())), + }, + }, + syscall.SYS_UTIMENSAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* null pathname */ + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* flags */ + }, + }, + syscall.SYS_WRITE: {}, + // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with + // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR + // option is enabled for a packet socket. + syscall.SYS_WRITEV: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(2), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(3), + }, + }, +} + +// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. +func hostInetFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ACCEPT4: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + }, + }, + syscall.SYS_BIND: {}, + syscall.SYS_CONNECT: {}, + syscall.SYS_GETPEERNAME: {}, + syscall.SYS_GETSOCKNAME: {}, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_V6ONLY), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_ERROR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_KEEPALIVE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_REUSEADDR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_TYPE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_LINGER), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_NODELAY), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_INFO), + }, + }, + syscall.SYS_IOCTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.TIOCOUTQ), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.TIOCINQ), + }, + }, + syscall.SYS_LISTEN: {}, + syscall.SYS_READV: {}, + syscall.SYS_RECVFROM: {}, + syscall.SYS_RECVMSG: {}, + syscall.SYS_SENDMSG: {}, + syscall.SYS_SENDTO: {}, + syscall.SYS_SETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_V6ONLY), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_REUSEADDR), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_NODELAY), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + }, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_RD), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_WR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_RDWR), + }, + }, + syscall.SYS_SOCKET: []seccomp.Rule{ + { + seccomp.AllowValue(syscall.AF_INET), + seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET), + seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET6), + seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET6), + seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_WRITEV: {}, + } +} + +func controlServerFilters(fd int) seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ACCEPT: []seccomp.Rule{ + { + seccomp.AllowValue(fd), + }, + }, + syscall.SYS_LISTEN: []seccomp.Rule{ + { + seccomp.AllowValue(fd), + seccomp.AllowValue(16 /* unet.backlog */), + }, + }, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_PEERCRED), + }, + }, + } +} diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go new file mode 100644 index 000000000..5335ff82c --- /dev/null +++ b/runsc/boot/filter/config_amd64.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], + seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, + seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, + ) +} diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go new file mode 100644 index 000000000..7fa9bbda3 --- /dev/null +++ b/runsc/boot/filter/config_arm64.go @@ -0,0 +1,21 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package filter + +// Reserve for future customization. +func init() { +} diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go new file mode 100644 index 000000000..194952a7b --- /dev/null +++ b/runsc/boot/filter/config_profile.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// profileFilters returns extra syscalls made by runtime/pprof package. +func profileFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_OPENAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + }, + }, + } +} diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go new file mode 100644 index 000000000..e28d4b8d6 --- /dev/null +++ b/runsc/boot/filter/extra_filters.go @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !msan,!race + +package filter + +import ( + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by +// Go instrumentation tools, e.g. -race, -msan. +// Returns empty when disabled. +func instrumentationFilters() seccomp.SyscallRules { + return nil +} diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go new file mode 100644 index 000000000..209e646a7 --- /dev/null +++ b/runsc/boot/filter/extra_filters_msan.go @@ -0,0 +1,34 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build msan + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by MSAN. +func instrumentationFilters() seccomp.SyscallRules { + Report("MSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_CLONE: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_SCHED_GETAFFINITY: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + } +} diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go new file mode 100644 index 000000000..9ff80276a --- /dev/null +++ b/runsc/boot/filter/extra_filters_race.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by TSAN. +func instrumentationFilters() seccomp.SyscallRules { + Report("TSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_BRK: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_MUNLOCK: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_OPEN: {}, + syscall.SYS_OPENAT: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + // Used within glibc's malloc. + syscall.SYS_TIME: {}, + } +} diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go new file mode 100644 index 000000000..e80c171b3 --- /dev/null +++ b/runsc/boot/filter/filter.go @@ -0,0 +1,60 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filter defines all syscalls the sandbox is allowed to make +// to the host, and installs seccomp filters to prevent prohibited +// syscalls in case it's compromised. +package filter + +import ( + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/sentry/platform" +) + +// Options are seccomp filter related options. +type Options struct { + Platform platform.Platform + HostNetwork bool + ProfileEnable bool + ControllerFD int +} + +// Install installs seccomp filters for based on the given platform. +func Install(opt Options) error { + s := allowedSyscalls + s.Merge(controlServerFilters(opt.ControllerFD)) + + // Set of additional filters used by -race and -msan. Returns empty + // when not enabled. + s.Merge(instrumentationFilters()) + + if opt.HostNetwork { + Report("host networking enabled: syscall filters less restrictive!") + s.Merge(hostInetFilters()) + } + if opt.ProfileEnable { + Report("profile enabled: syscall filters less restrictive!") + s.Merge(profileFilters()) + } + + s.Merge(opt.Platform.SyscallFilters()) + + return seccomp.Install(s) +} + +// Report writes a warning message to the log. +func Report(msg string) { + log.Warningf("*** SECCOMP WARNING: %s", msg) +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go new file mode 100644 index 000000000..59639ba19 --- /dev/null +++ b/runsc/boot/fs.go @@ -0,0 +1,1034 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path/filepath" + "sort" + "strconv" + "strings" + "syscall" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + _ "gvisor.dev/gvisor/pkg/sentry/fs/host" + _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" + "gvisor.dev/gvisor/pkg/sentry/vfs" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/gofer" + "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + procvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + sysvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/specutils" +) + +const ( + // Device name for root mount. + rootDevice = "9pfs-/" + + // MountPrefix is the annotation prefix for mount hints. + MountPrefix = "dev.gvisor.spec.mount." + + // Supported filesystems that map to different internal filesystem. + bind = "bind" + nonefs = "none" +) + +// tmpfs has some extra supported options that we must pass through. +var tmpfsAllowedData = []string{"mode", "uid", "gid"} + +func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { + // Upper layer uses the same flags as lower, but it must be read-write. + upperFlags := lowerFlags + upperFlags.ReadOnly = false + + tmpFS := mustFindFilesystem("tmpfs") + if !fs.IsDir(lower.StableAttr) { + // Create overlay on top of mount file, e.g. /etc/hostname. + msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags) + return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags) + } + + // Create overlay on top of mount dir. + upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil) + if err != nil { + return nil, fmt.Errorf("creating tmpfs overlay: %v", err) + } + + // Replicate permissions and owner from lower to upper mount point. + attr, err := lower.UnstableAttr(ctx) + if err != nil { + return nil, fmt.Errorf("reading attributes from lower mount point: %v", err) + } + if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) { + return nil, fmt.Errorf("error setting permission to upper mount point") + } + if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil { + return nil, fmt.Errorf("setting owner to upper mount point: %v", err) + } + + return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) +} + +// compileMounts returns the supported mounts from the mount spec, adding any +// mandatory mounts that are required by the OCI specification. +func compileMounts(spec *specs.Spec) []specs.Mount { + // Keep track of whether proc and sys were mounted. + var procMounted, sysMounted bool + var mounts []specs.Mount + + // Always mount /dev. + mounts = append(mounts, specs.Mount{ + Type: devtmpfs.Name, + Destination: "/dev", + }) + + mounts = append(mounts, specs.Mount{ + Type: devpts.Name, + Destination: "/dev/pts", + }) + + // Mount all submounts from the spec. + for _, m := range spec.Mounts { + if !specutils.IsSupportedDevMount(m) { + log.Warningf("ignoring dev mount at %q", m.Destination) + continue + } + mounts = append(mounts, m) + switch filepath.Clean(m.Destination) { + case "/proc": + procMounted = true + case "/sys": + sysMounted = true + } + } + + // Mount proc and sys even if the user did not ask for it, as the spec + // says we SHOULD. + var mandatoryMounts []specs.Mount + if !procMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: procvfs2.Name, + Destination: "/proc", + }) + } + if !sysMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: sysvfs2.Name, + Destination: "/sys", + }) + } + + // The mandatory mounts should be ordered right after the root, in case + // there are submounts of these mandatory mounts already in the spec. + mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...) + + return mounts +} + +// p9MountData creates a slice of p9 mount data. +func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + } + if !vfs2 { + // privateunixsocket is always enabled in VFS2. VFS1 requires explicit + // enablement. + opts = append(opts, "privateunixsocket=true") + } + if fa == FileAccessShared { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} + +// parseAndFilterOptions parses a MountOptions slice and filters by the allowed +// keys. +func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { + var out []string + for _, o := range opts { + ok, err := parseMountOption(o, allowedKeys...) + if err != nil { + return nil, err + } + if ok { + out = append(out, o) + } + } + return out, nil +} + +func parseMountOption(opt string, allowedKeys ...string) (bool, error) { + kv := strings.SplitN(opt, "=", 3) + if len(kv) > 2 { + return false, fmt.Errorf("invalid option %q", opt) + } + return specutils.ContainsStr(allowedKeys, kv[0]), nil +} + +// mountDevice returns a device string based on the fs type and target +// of the mount. +func mountDevice(m specs.Mount) string { + if m.Type == bind { + // Make a device string that includes the target, which is consistent across + // S/R and uniquely identifies the connection. + return "9pfs-" + m.Destination + } + // All other fs types use device "none". + return "none" +} + +func mountFlags(opts []string) fs.MountSourceFlags { + mf := fs.MountSourceFlags{} + // Note: changes to supported options must be reflected in + // isSupportedMountFlag() as well. + for _, o := range opts { + switch o { + case "rw": + mf.ReadOnly = false + case "ro": + mf.ReadOnly = true + case "noatime": + mf.NoAtime = true + case "noexec": + mf.NoExec = true + default: + log.Warningf("ignoring unknown mount option %q", o) + } + } + return mf +} + +func isSupportedMountFlag(fstype, opt string) bool { + switch opt { + case "rw", "ro", "noatime", "noexec": + return true + } + if fstype == tmpfsvfs2.Name { + ok, err := parseMountOption(opt, tmpfsAllowedData...) + return ok && err == nil + } + return false +} + +func mustFindFilesystem(name string) fs.Filesystem { + fs, ok := fs.FindFilesystem(name) + if !ok { + panic(fmt.Sprintf("could not find filesystem %q", name)) + } + return fs +} + +// addSubmountOverlay overlays the inode over a ramfs tree containing the given +// paths. +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { + // Construct a ramfs tree of mount points. The contents never + // change, so this can be fully caching. There's no real + // filesystem backing this tree, so we set the filesystem to + // nil. + msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{}) + mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts) + if err != nil { + return nil, fmt.Errorf("creating mount tree: %v", err) + } + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + if err != nil { + return nil, fmt.Errorf("adding mount overlay: %v", err) + } + return overlayInode, err +} + +// subtargets takes a set of Mounts and returns only the targets that are +// children of the given root. The returned paths are relative to the root. +func subtargets(root string, mnts []specs.Mount) []string { + var targets []string + for _, mnt := range mnts { + if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath { + targets = append(targets, relPath) + } + } + return targets +} + +func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if conf.VFS2 { + return setupContainerVFS2(ctx, conf, mntr, procArgs) + } + mns, err := mntr.setupFS(conf, procArgs) + if err != nil { + return err + } + + // Set namespace here so that it can be found in ctx. + procArgs.MountNamespace = mns + + // Resolve the executable path from working dir and environment. + resolved, err := user.ResolveExecutablePath(ctx, procArgs) + if err != nil { + return err + } + procArgs.Filename = resolved + return nil +} + +func adjustDirentCache(k *kernel.Kernel) error { + var hl syscall.Rlimit + if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { + return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) + } + if int64(hl.Cur) != syscall.RLIM_INFINITY { + newSize := hl.Cur / 2 + if newSize < gofer.DefaultDirentCacheSize { + log.Infof("Setting gofer dirent cache size to %d", newSize) + gofer.DefaultDirentCacheSize = newSize + k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) + } + } + return nil +} + +type fdDispenser struct { + fds []int +} + +func (f *fdDispenser) remove() int { + if f.empty() { + panic("fdDispenser out of fds") + } + rv := f.fds[0] + f.fds = f.fds[1:] + return rv +} + +func (f *fdDispenser) empty() bool { + return len(f.fds) == 0 +} + +type shareType int + +const ( + invalid shareType = iota + + // container shareType indicates that the mount is used by a single container. + container + + // pod shareType indicates that the mount is used by more than one container + // inside the pod. + pod + + // shared shareType indicates that the mount can also be shared with a process + // outside the pod, e.g. NFS. + shared +) + +func parseShare(val string) (shareType, error) { + switch val { + case "container": + return container, nil + case "pod": + return pod, nil + case "shared": + return shared, nil + default: + return 0, fmt.Errorf("invalid share value %q", val) + } +} + +func (s shareType) String() string { + switch s { + case invalid: + return "invalid" + case container: + return "container" + case pod: + return "pod" + case shared: + return "shared" + default: + return fmt.Sprintf("invalid share value %d", s) + } +} + +// mountHint represents extra information about mounts that are provided via +// annotations. They can override mount type, and provide sharing information +// so that mounts can be correctly shared inside the pod. +type mountHint struct { + name string + share shareType + mount specs.Mount + + // root is the inode where the volume is mounted. For mounts with 'pod' share + // the volume is mounted once and then bind mounted inside the containers. + root *fs.Inode + + // vfsMount is the master mount for the volume. For mounts with 'pod' share + // the master volume is bind mounted inside the containers. + vfsMount *vfs.Mount +} + +func (m *mountHint) setField(key, val string) error { + switch key { + case "source": + if len(val) == 0 { + return fmt.Errorf("source cannot be empty") + } + m.mount.Source = val + case "type": + return m.setType(val) + case "share": + share, err := parseShare(val) + if err != nil { + return err + } + m.share = share + case "options": + return m.setOptions(val) + default: + return fmt.Errorf("invalid mount annotation: %s=%s", key, val) + } + return nil +} + +func (m *mountHint) setType(val string) error { + switch val { + case "tmpfs", "bind": + m.mount.Type = val + default: + return fmt.Errorf("invalid type %q", val) + } + return nil +} + +func (m *mountHint) setOptions(val string) error { + opts := strings.Split(val, ",") + if err := specutils.ValidateMountOptions(opts); err != nil { + return err + } + // Sort options so it can be compared with container mount options later on. + sort.Strings(opts) + m.mount.Options = opts + return nil +} + +func (m *mountHint) isSupported() bool { + return m.mount.Type == tmpfsvfs2.Name && m.share == pod +} + +// checkCompatible verifies that shared mount is compatible with master. +// For now enforce that all options are the same. Once bind mount is properly +// supported, then we should ensure the master is less restrictive than the +// container, e.g. master can be 'rw' while container mounts as 'ro'. +func (m *mountHint) checkCompatible(mount specs.Mount) error { + // Remove options that don't affect to mount's behavior. + masterOpts := filterUnsupportedOptions(m.mount) + slaveOpts := filterUnsupportedOptions(mount) + + if len(masterOpts) != len(slaveOpts) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + } + + sort.Strings(masterOpts) + sort.Strings(slaveOpts) + for i, opt := range masterOpts { + if opt != slaveOpts[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + } + } + return nil +} + +func (m *mountHint) fileAccessType() FileAccessType { + if m.share == container { + return FileAccessExclusive + } + return FileAccessShared +} + +func filterUnsupportedOptions(mount specs.Mount) []string { + rv := make([]string, 0, len(mount.Options)) + for _, o := range mount.Options { + if isSupportedMountFlag(mount.Type, o) { + rv = append(rv, o) + } + } + return rv +} + +// podMountHints contains a collection of mountHints for the pod. +type podMountHints struct { + mounts map[string]*mountHint +} + +func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { + mnts := make(map[string]*mountHint) + for k, v := range spec.Annotations { + // Look for 'dev.gvisor.spec.mount' annotations and parse them. + if strings.HasPrefix(k, MountPrefix) { + // Remove the prefix and split the rest. + parts := strings.Split(k[len(MountPrefix):], ".") + if len(parts) != 2 { + return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) + } + name := parts[0] + if len(name) == 0 { + return nil, fmt.Errorf("invalid mount name: %s", name) + } + mnt := mnts[name] + if mnt == nil { + mnt = &mountHint{name: name} + mnts[name] = mnt + } + if err := mnt.setField(parts[1], v); err != nil { + return nil, err + } + } + } + + // Validate all hints after done parsing. + for name, m := range mnts { + log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share) + if m.share == invalid { + return nil, fmt.Errorf("share field for %q has not been set", m.name) + } + if len(m.mount.Source) == 0 { + return nil, fmt.Errorf("source field for %q has not been set", m.name) + } + if len(m.mount.Type) == 0 { + return nil, fmt.Errorf("type field for %q has not been set", m.name) + } + + // Check for duplicate mount sources. + for name2, m2 := range mnts { + if name != name2 && m.mount.Source == m2.mount.Source { + return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source) + } + } + } + + return &podMountHints{mounts: mnts}, nil +} + +func (p *podMountHints) findMount(mount specs.Mount) *mountHint { + for _, m := range p.mounts { + if m.mount.Source == mount.Source { + return m + } + } + return nil +} + +type containerMounter struct { + root *specs.Root + + // mounts is the set of submounts for the container. It's a copy from the spec + // that may be freely modified without affecting the original spec. + mounts []specs.Mount + + // fds is the list of FDs to be dispensed for mounts that require it. + fds fdDispenser + + k *kernel.Kernel + + hints *podMountHints +} + +func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { + return &containerMounter{ + root: spec.Root, + mounts: compileMounts(spec), + fds: fdDispenser{fds: goferFDs}, + k: k, + hints: hints, + } +} + +// processHints processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error { + if conf.VFS2 { + return c.processHintsVFS2(conf, creds) + } + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfsvfs2.Name { + continue + } + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + inode, err := c.mountSharedMaster(ctx, conf, hint) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.root = inode + } + return nil +} + +// setupFS is used to set up the file system for all containers. This is the +// main entry point method, with most of the other being internal only. It +// returns the mount namespace that is created for the container. +func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { + log.Infof("Configuring container's file system") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := rootProcArgs.NewContext(c.k) + + mns, err := c.createMountNamespace(rootCtx, conf) + if err != nil { + return nil, err + } + + // Set namespace here so that it can be found in rootCtx. + rootProcArgs.MountNamespace = mns + + if err := c.mountSubmounts(rootCtx, conf, mns); err != nil { + return nil, err + } + return mns, nil +} + +func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) { + rootInode, err := c.createRootMount(ctx, conf) + if err != nil { + return nil, fmt.Errorf("creating filesystem for container: %v", err) + } + mns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + return nil, fmt.Errorf("creating new mount namespace for container: %v", err) + } + return mns, nil +} + +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { + root := mns.Root() + defer root.DecRef() + + for _, m := range c.mounts { + log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) + if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err) + } + } else { + if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { + return fmt.Errorf("mount submount %q: %v", m.Destination, err) + } + } + } + + if err := c.mountTmp(ctx, conf, mns, root); err != nil { + return fmt.Errorf("mount submount %q: %v", "tmp", err) + } + + if err := c.checkDispenser(); err != nil { + return err + } + return nil +} + +func (c *containerMounter) checkDispenser() error { + if !c.fds.empty() { + return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds) + } + return nil +} + +// mountSharedMaster mounts the master of a volume that is shared among +// containers in a pod. It returns the root mount's inode. +func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + + // Mount with revalidate because it's shared among containers. + opts = append(opts, "cache=revalidate") + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(hint.mount.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating mount %q: %v", hint.name, err) + } + + if useOverlay { + log.Debugf("Adding overlay on top of shared mount %q", hint.name) + inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf) + if err != nil { + return nil, err + } + } + + return inode, nil +} + +// createRootMount creates the root filesystem. +func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { + // First construct the filesystem from the spec.Root. + mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} + + fd := c.fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + p9FS := mustFindFilesystem("9p") + opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) + + if conf.OverlayfsStaleRead { + // We can't check for overlayfs here because sandbox is chroot'ed and gofer + // can only send mount options for specs.Mounts (specs.Root is missing + // Options field). So assume root is always on top of overlayfs. + opts = append(opts, "overlayfs_stale_read") + } + + rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating root mount point: %v", err) + } + + // We need to overlay the root on top of a ramfs with stub directories + // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always + // mounted even if they are not in the spec. + submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + if err != nil { + return nil, fmt.Errorf("adding submount overlay: %v", err) + } + + if conf.Overlay && !c.root.Readonly { + log.Debugf("Adding overlay on top of root mount") + // Overlay a tmpfs filesystem on top of the root. + rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + if err != nil { + return nil, err + } + } + + log.Infof("Mounted %q to %q type root", c.root.Path, "/") + return rootInode, nil +} + +// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + ) + + switch m.Type { + case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name: + fsName = m.Type + case nonefs: + fsName = sysvfs2.Name + case tmpfsvfs2.Name: + fsName = m.Type + + var err error + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) + if err != nil { + return "", nil, false, err + } + + case bind: + fd := c.fds.remove() + fsName = gofervfs2.Name + opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, nil +} + +func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { + if hint := c.hints.findMount(mount); hint != nil { + return hint.fileAccessType() + } + // Non-root bind mounts are always shared if no hints were provided. + return FileAccessShared +} + +// mountSubmount mounts volumes inside the container's root. Because mounts may +// be readonly, a lower ramfs overlay is added to create the mount point dir. +// Another overlay is added with tmpfs on top if Config.Overlay is true. +// 'm.Destination' must be an absolute path with '..' and symlinks resolved. +func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(m.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) + if err != nil { + err := fmt.Errorf("creating mount with source %q: %v", m.Source, err) + // Check to see if this is a common error due to a Linux bug. + // This error is generated here in order to cause it to be + // printed to the user using Docker via 'runsc create' etc. rather + // than simply printed to the logs for the 'runsc boot' command. + // + // We check the error message string rather than type because the + // actual error types (syscall.EIO, syscall.EPIPE) are lost by file system + // implementation (e.g. p9). + // TODO(gvisor.dev/issue/1765): Remove message when bug is resolved. + if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) { + return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug")) + } + return err + } + + // If there are submounts, we need to overlay the mount on top of a ramfs + // with stub directories for submount paths. + submounts := subtargets(m.Destination, c.mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("adding submount overlay: %v", err) + } + } + + if useOverlay { + log.Debugf("Adding overlay on top of mount %q", m.Destination) + inode, err = addOverlay(ctx, conf, inode, m.Type, mf) + if err != nil { + return err + } + } + + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) + } + defer dirent.DecRef() + if err := mns.Mount(ctx, dirent, inode); err != nil { + return fmt.Errorf("mount %q error: %v", m.Destination, err) + } + + log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts) + return nil +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error { + if err := source.checkCompatible(mount); err != nil { + return err + } + + maxTraversals := uint(0) + target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) + } + defer target.DecRef() + + // Take a ref on the inode that is about to be (re)-mounted. + source.root.IncRef() + if err := mns.Mount(ctx, target, source.root); err != nil { + source.root.DecRef() + return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) + } + + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} + +// addRestoreMount adds a mount to the MountSources map used for restoring a +// checkpointed container. +func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + newMount := fs.MountArgs{ + Dev: mountDevice(m), + Flags: mountFlags(m.Options), + DataString: strings.Join(opts, ","), + } + if useOverlay { + newMount.Flags.ReadOnly = true + } + renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) + log.Infof("Added mount at %q: %+v", fsName, newMount) + return nil +} + +// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding +// the mounts to the environment. +func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { + renv := &fs.RestoreEnvironment{ + MountSources: make(map[string][]fs.MountArgs), + } + + // Add root mount. + fd := c.fds.remove() + opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) + + mf := fs.MountSourceFlags{} + if c.root.Readonly || conf.Overlay { + mf.ReadOnly = true + } + + rootMount := fs.MountArgs{ + Dev: rootDevice, + Flags: mf, + DataString: strings.Join(opts, ","), + } + renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount) + + // Add submounts. + var tmpMounted bool + for _, m := range c.mounts { + if err := c.addRestoreMount(conf, renv, m); err != nil { + return nil, err + } + if filepath.Clean(m.Destination) == "/tmp" { + tmpMounted = true + } + } + + // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). + if !tmpMounted { + tmpMount := specs.Mount{ + Type: tmpfsvfs2.Name, + Destination: "/tmp", + } + if err := c.addRestoreMount(conf, renv, tmpMount); err != nil { + return nil, err + } + } + + return renv, nil +} + +// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. +// Technically we don't have to mount tmpfs at /tmp, as we could just rely on +// the host /tmp, but this is a nice optimization, and fixes some apps that call +// mknod in /tmp. It's unsafe to mount tmpfs if: +// 1. /tmp is mounted explicitly: we should not override user's wish +// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp +// +// Note that when there are submounts inside of '/tmp', directories for the +// mount points must be present, making '/tmp' not empty anymore. +func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { + for _, m := range c.mounts { + if filepath.Clean(m.Destination) == "/tmp" { + log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) + return nil + } + } + + maxTraversals := uint(0) + tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals) + switch err { + case nil: + // Found '/tmp' in filesystem, check if it's empty. + defer tmp.DecRef() + f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) + if err != nil { + return err + } + defer f.DecRef() + serializer := &fs.CollectEntriesSerializer{} + if err := f.Readdir(ctx, serializer); err != nil { + return err + } + // If more than "." and ".." is found, skip internal tmpfs to prevent hiding + // existing files. + if len(serializer.Order) > 2 { + log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp") + return nil + } + log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp") + fallthrough + + case syserror.ENOENT: + // No '/tmp' found (or fallthrough from above). Safe to mount internal + // tmpfs. + tmpMount := specs.Mount{ + Type: tmpfsvfs2.Name, + Destination: "/tmp", + // Sticky bit is added to prevent accidental deletion of files from + // another user. This is normally done for /tmp. + Options: []string{"mode=01777"}, + } + return c.mountSubmount(ctx, conf, mns, root, tmpMount) + + default: + return err + } +} diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go new file mode 100644 index 000000000..912037075 --- /dev/null +++ b/runsc/boot/fs_test.go @@ -0,0 +1,250 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "reflect" + "strings" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestPodMountHintsHappy(t *testing.T) { + spec := &specs.Spec{ + Annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + + MountPrefix + "mount2.source": "bar", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", + MountPrefix + "mount2.options": "rw,private", + }, + } + podHints, err := newPodMountHints(spec) + if err != nil { + t.Fatalf("newPodMountHints failed: %v", err) + } + + // Check that fields were set correctly. + mount1 := podHints.mounts["mount1"] + if want := "mount1"; want != mount1.name { + t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name) + } + if want := "foo"; want != mount1.mount.Source { + t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source) + } + if want := "tmpfs"; want != mount1.mount.Type { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type) + } + if want := pod; want != mount1.share { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share) + } + if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options) + } + + mount2 := podHints.mounts["mount2"] + if want := "mount2"; want != mount2.name { + t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name) + } + if want := "bar"; want != mount2.mount.Source { + t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source) + } + if want := "bind"; want != mount2.mount.Type { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type) + } + if want := container; want != mount2.share { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share) + } + if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options) + } +} + +func TestPodMountHintsErrors(t *testing.T) { + for _, tst := range []struct { + name string + annotations map[string]string + error string + }{ + { + name: "too short", + annotations: map[string]string{ + MountPrefix + "mount1": "foo", + }, + error: "invalid mount annotation", + }, + { + name: "no name", + annotations: map[string]string{ + MountPrefix + ".source": "foo", + }, + error: "invalid mount name", + }, + { + name: "missing source", + annotations: map[string]string{ + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + }, + error: "source field", + }, + { + name: "missing type", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.share": "pod", + }, + error: "type field", + }, + { + name: "missing share", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + }, + error: "share field", + }, + { + name: "invalid field name", + annotations: map[string]string{ + MountPrefix + "mount1.invalid": "foo", + }, + error: "invalid mount annotation", + }, + { + name: "invalid source", + annotations: map[string]string{ + MountPrefix + "mount1.source": "", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + }, + error: "source cannot be empty", + }, + { + name: "invalid type", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "invalid-type", + MountPrefix + "mount1.share": "pod", + }, + error: "invalid type", + }, + { + name: "invalid share", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "invalid-share", + }, + error: "invalid share", + }, + { + name: "invalid options", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + MountPrefix + "mount1.options": "invalid-option", + }, + error: "unknown mount option", + }, + { + name: "duplicate source", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + + MountPrefix + "mount2.source": "foo", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", + }, + error: "have the same mount source", + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err == nil || !strings.Contains(err.Error(), tst.error) { + t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err) + } + if podHints != nil { + t.Errorf("newPodMountHints must return nil on failure: %+v", podHints) + } + }) + } +} + +func TestGetMountAccessType(t *testing.T) { + const source = "foo" + for _, tst := range []struct { + name string + annotations map[string]string + want FileAccessType + }{ + { + name: "container=exclusive", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessExclusive, + }, + { + name: "pod=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "pod", + }, + want: FileAccessShared, + }, + { + name: "shared=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "shared", + }, + want: FileAccessShared, + }, + { + name: "default=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source + "mismatch", + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessShared, + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err != nil { + t.Fatalf("newPodMountHints failed: %v", err) + } + mounter := containerMounter{hints: podHints} + if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want { + t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got) + } + }) + } +} diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go new file mode 100644 index 000000000..ce62236e5 --- /dev/null +++ b/runsc/boot/limits.go @@ -0,0 +1,154 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" +) + +// Mapping from linux resource names to limits.LimitType. +var fromLinuxResource = map[string]limits.LimitType{ + "RLIMIT_AS": limits.AS, + "RLIMIT_CORE": limits.Core, + "RLIMIT_CPU": limits.CPU, + "RLIMIT_DATA": limits.Data, + "RLIMIT_FSIZE": limits.FileSize, + "RLIMIT_LOCKS": limits.Locks, + "RLIMIT_MEMLOCK": limits.MemoryLocked, + "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, + "RLIMIT_NICE": limits.Nice, + "RLIMIT_NOFILE": limits.NumberOfFiles, + "RLIMIT_NPROC": limits.ProcessCount, + "RLIMIT_RSS": limits.Rss, + "RLIMIT_RTPRIO": limits.RealTimePriority, + "RLIMIT_RTTIME": limits.Rttime, + "RLIMIT_SIGPENDING": limits.SignalsPending, + "RLIMIT_STACK": limits.Stack, +} + +func findName(lt limits.LimitType) string { + for k, v := range fromLinuxResource { + if v == lt { + return k + } + } + return "unknown" +} + +var defaults defs + +type defs struct { + mu sync.Mutex + set *limits.LimitSet + err error +} + +func (d *defs) get() (*limits.LimitSet, error) { + d.mu.Lock() + defer d.mu.Unlock() + + if d.err != nil { + return nil, d.err + } + if d.set == nil { + if err := d.initDefaults(); err != nil { + d.err = err + return nil, err + } + } + return d.set, nil +} + +func (d *defs) initDefaults() error { + ls, err := limits.NewLinuxLimitSet() + if err != nil { + return err + } + + // Set default limits based on what containers get by default, ex: + // $ docker run --rm debian prlimit + ls.SetUnchecked(limits.AS, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Core, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.CPU, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) + ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) + ls.SetUnchecked(limits.ProcessCount, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Rss, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.RealTimePriority, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.Rttime, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}) + + // Read host limits that directly affect the sandbox and adjust the defaults + // based on them. + for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} { + var hl syscall.Rlimit + if err := syscall.Getrlimit(res, &hl); err != nil { + return err + } + + lt, ok := limits.FromLinuxResource[res] + if !ok { + return fmt.Errorf("unknown rlimit type %v", res) + } + hostLimit := limits.Limit{ + Cur: limits.FromLinux(hl.Cur), + Max: limits.FromLinux(hl.Max), + } + + defaultLimit := ls.Get(lt) + if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur { + log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur) + } + if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max { + log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max) + ls.SetUnchecked(lt, hostLimit) + } + } + + d.set = ls + return nil +} + +func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { + ls, err := defaults.get() + if err != nil { + return nil, err + } + + // Then apply overwrites on top of defaults. + for _, rl := range spec.Process.Rlimits { + lt, ok := fromLinuxResource[rl.Type] + if !ok { + return nil, fmt.Errorf("unknown resource %q", rl.Type) + } + ls.SetUnchecked(lt, limits.Limit{ + Cur: rl.Soft, + Max: rl.Hard, + }) + } + return ls, nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go new file mode 100644 index 000000000..0c0423ab2 --- /dev/null +++ b/runsc/boot/loader.go @@ -0,0 +1,1284 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package boot loads the kernel and runs a container. +package boot + +import ( + "fmt" + mrand "math/rand" + "os" + "runtime" + "sync/atomic" + "syscall" + gtime "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fdimport" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/raw" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/runsc/boot/filter" + _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. + "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/specutils" + + // Include supported socket providers. + "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" +) + +// Loader keeps state needed to start the kernel and run the container.. +type Loader struct { + // k is the kernel. + k *kernel.Kernel + + // ctrl is the control server. + ctrl *controller + + conf *Config + + // console is set to true if terminal is enabled. + console bool + + watchdog *watchdog.Watchdog + + // stdioFDs contains stdin, stdout, and stderr. + stdioFDs []int + + // goferFDs are the FDs that attach the sandbox to the gofers. + goferFDs []int + + // spec is the base configuration for the root container. + spec *specs.Spec + + // stopSignalForwarding disables forwarding of signals to the sandboxed + // container. It should be called when a sandbox is destroyed. + stopSignalForwarding func() + + // restore is set to true if we are restoring a container. + restore bool + + // rootProcArgs refers to the root sandbox init task. + rootProcArgs kernel.CreateProcessArgs + + // sandboxID is the ID for the whole sandbox. + sandboxID string + + // mu guards processes. + mu sync.Mutex + + // processes maps containers init process and invocation of exec. Root + // processes are keyed with container ID and pid=0, while exec invocations + // have the corresponding pid set. + // + // processes is guardded by mu. + processes map[execID]*execProcess + + // mountHints provides extra information about mounts for containers that + // apply to the entire pod. + mountHints *podMountHints +} + +// execID uniquely identifies a sentry process that is executed in a container. +type execID struct { + cid string + pid kernel.ThreadID +} + +// execProcess contains the thread group and host TTY of a sentry process. +type execProcess struct { + // tg will be nil for containers that haven't started yet. + tg *kernel.ThreadGroup + + // tty will be nil if the process is not attached to a terminal. + tty *host.TTYFileOperations + + // tty will be nil if the process is not attached to a terminal. + ttyVFS2 *hostvfs2.TTYFileDescription + + // pidnsPath is the pid namespace path in spec + pidnsPath string +} + +func init() { + // Initialize the random number generator. + mrand.Seed(gtime.Now().UnixNano()) +} + +// Args are the arguments for New(). +type Args struct { + // Id is the sandbox ID. + ID string + // Spec is the sandbox specification. + Spec *specs.Spec + // Conf is the system configuration. + Conf *Config + // ControllerFD is the FD to the URPC controller. The Loader takes ownership + // of this FD and may close it at any time. + ControllerFD int + // Device is an optional argument that is passed to the platform. The Loader + // takes ownership of this file and may close it at any time. + Device *os.File + // GoferFDs is an array of FDs used to connect with the Gofer. The Loader + // takes ownership of these FDs and may close them at any time. + GoferFDs []int + // StdioFDs is the stdio for the application. The Loader takes ownership of + // these FDs and may close them at any time. + StdioFDs []int + // Console is set to true if using TTY. + Console bool + // NumCPU is the number of CPUs to create inside the sandbox. + NumCPU int + // TotalMem is the initial amount of total memory to report back to the + // container. + TotalMem uint64 + // UserLogFD is the file descriptor to write user logs to. + UserLogFD int +} + +// make sure stdioFDs are always the same on initial start and on restore +const startingStdioFD = 64 + +// New initializes a new kernel loader configured by spec. +// New also handles setting up a kernel for restoring a container. +func New(args Args) (*Loader, error) { + // We initialize the rand package now to make sure /dev/urandom is pre-opened + // on kernels that do not support getrandom(2). + if err := rand.Init(); err != nil { + return nil, fmt.Errorf("setting up rand: %v", err) + } + + if err := usage.Init(); err != nil { + return nil, fmt.Errorf("setting up memory usage: %v", err) + } + + // Is this a VFSv2 kernel? + if args.Conf.VFS2 { + kernel.VFS2Enabled = true + vfs2.Override() + } + + // Create kernel and platform. + p, err := createPlatform(args.Conf, args.Device) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + + // Create memory file. + mf, err := createMemoryFile() + if err != nil { + return nil, fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + + // Create VDSO. + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + if err := enableStrace(args.Conf); err != nil { + return nil, fmt.Errorf("enabling strace: %v", err) + } + + // Create root network namespace/stack. + netns, err := newRootNetworkNamespace(args.Conf, k, k) + if err != nil { + return nil, fmt.Errorf("creating network: %v", err) + } + + // Create capabilities. + caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("converting capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids)) + for _, GID := range args.Spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. + creds := auth.NewUserCredentials( + auth.KUID(args.Spec.Process.User.UID), + auth.KGID(args.Spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) + + if args.NumCPU == 0 { + args.NumCPU = runtime.NumCPU() + } + log.Infof("CPUs: %d", args.NumCPU) + + if args.TotalMem > 0 { + // Adjust the total memory returned by the Sentry so that applications that + // use /proc/meminfo can make allocations based on this limit. + usage.MinimumTotalMemoryBytes = args.TotalMem + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30)) + } + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + RootNetworkNamespace: netns, + ApplicationCores: uint(args.NumCPU), + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + if kernel.VFS2Enabled { + if err := registerFilesystems(k); err != nil { + return nil, fmt.Errorf("registering filesystems: %w", err) + } + } + + if err := adjustDirentCache(k); err != nil { + return nil, err + } + + // Turn on packet logging if enabled. + if args.Conf.LogPackets { + log.Infof("Packet logging enabled") + atomic.StoreUint32(&sniffer.LogPackets, 1) + } else { + log.Infof("Packet logging disabled") + atomic.StoreUint32(&sniffer.LogPackets, 0) + } + + // Create a watchdog. + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction + dog := watchdog.New(k, dogOpts) + + procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) + if err != nil { + return nil, fmt.Errorf("creating init process for root container: %v", err) + } + + if err := initCompatLogs(args.UserLogFD); err != nil { + return nil, fmt.Errorf("initializing compat logs: %v", err) + } + + mountHints, err := newPodMountHints(args.Spec) + if err != nil { + return nil, fmt.Errorf("creating pod mount hints: %v", err) + } + + if kernel.VFS2Enabled { + // Set up host mount that will be used for imported fds. + hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS()) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) + } + defer hostFilesystem.DecRef() + hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs mount: %v", err) + } + k.SetHostMount(hostMount) + } + + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + var stdioFDs []int + newfd := startingStdioFD + for _, fd := range args.StdioFDs { + err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + } + stdioFDs = append(stdioFDs, newfd) + err = syscall.Close(fd) + if err != nil { + return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + } + newfd++ + } + + eid := execID{cid: args.ID} + l := &Loader{ + k: k, + conf: args.Conf, + console: args.Console, + watchdog: dog, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: stdioFDs, + rootProcArgs: procArgs, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + } + + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("ignore child stop signals failed: %v", err) + } + + // Create the control server using the provided FD. + // + // This must be done *after* we have initialized the kernel since the + // controller is used to configure the kernel's network stack. + ctrl, err := newController(args.ControllerFD, l) + if err != nil { + return nil, fmt.Errorf("creating control server: %v", err) + } + l.ctrl = ctrl + + // Only start serving after Loader is set to controller and controller is set + // to Loader, because they are both used in the urpc methods. + if err := ctrl.srv.StartServing(); err != nil { + return nil, fmt.Errorf("starting control server: %v", err) + } + + return l, nil +} + +// newProcess creates a process that can be run with kernel.CreateProcess. +func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { + // Create initial limits. + ls, err := createLimitSet(spec) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) + } + + wd := spec.Process.Cwd + if wd == "" { + wd = "/" + } + + // Create the process arguments. + procArgs := kernel.CreateProcessArgs{ + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: wd, + Credentials: creds, + Umask: 0022, + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: k.RootUTSNamespace(), + IPCNamespace: k.RootIPCNamespace(), + AbstractSocketNamespace: k.RootAbstractSocketNamespace(), + ContainerID: id, + PIDNamespace: pidns, + } + + return procArgs, nil +} + +// Destroy cleans up all resources used by the loader. +// +// Note that this will block until all open control server connections have +// been closed. For that reason, this should NOT be called in a defer, because +// a panic in a control server rpc would then hang forever. +func (l *Loader) Destroy() { + if l.ctrl != nil { + l.ctrl.srv.Stop() + } + if l.stopSignalForwarding != nil { + l.stopSignalForwarding() + } + l.watchdog.Stop() +} + +func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { + p, err := platform.Lookup(conf.Platform) + if err != nil { + panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) + } + log.Infof("Platform: %s", conf.Platform) + return p.New(deviceFile) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "runsc-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if + // there are memory cgroups specified, because at this point we're already + // in a mount namespace in which the relevant cgroupfs is not visible. + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} + +func (l *Loader) installSeccompFilters() error { + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. Running in less secure mode.") + } else { + opts := filter.Options{ + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), + } + if err := filter.Install(opts); err != nil { + return fmt.Errorf("installing seccomp filters: %v", err) + } + } + return nil +} + +// Run runs the root container. +func (l *Loader) Run() error { + err := l.run() + l.ctrl.manager.startResultChan <- err + if err != nil { + // Give the controller some time to send the error to the + // runtime. If we return too quickly here the process will exit + // and the control connection will be closed before the error + // is returned. + gtime.Sleep(2 * gtime.Second) + return err + } + return nil +} + +func (l *Loader) run() error { + if l.conf.Network == NetworkHost { + // Delay host network configuration to this point because network namespace + // is configured after the loader is created and before Run() is called. + log.Debugf("Configuring host network") + stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) + if err := stack.Configure(); err != nil { + return err + } + } + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: l.sandboxID} + ep, ok := l.processes[eid] + if !ok { + return fmt.Errorf("trying to start deleted container %q", l.sandboxID) + } + + // If we are restoring, we do not want to create a process. + // l.restore is set by the container manager when a restore call is made. + var ttyFile *host.TTYFileOperations + var ttyFileVFS2 *hostvfs2.TTYFileDescription + if !l.restore { + if l.conf.ProfileEnable { + pprof.Initialize() + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if err := l.installSeccompFilters(); err != nil { + return err + } + + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. + ctx := l.rootProcArgs.NewContext(l.k) + var err error + + // CreateProcess takes a reference on FDMap if successful. We won't need + // ours either way. + l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + + // Setup the root container file system. + l.startGoferMonitor(l.sandboxID, l.goferFDs) + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil { + return err + } + if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { + return err + } + + // Add the HOME enviroment variable if it is not already set. + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + + } else { + envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + } + if err != nil { + return err + } + l.rootProcArgs.Envv = envv + + // Create the root container init task. It will begin running + // when the kernel is started. + if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { + return fmt.Errorf("creating init process: %v", err) + } + + // CreateProcess takes a reference on FDTable if successful. + l.rootProcArgs.FDTable.DecRef() + } + + ep.tg = l.k.GlobalInit() + if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok { + ep.pidnsPath = ns.Path + } + if l.console { + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + } + } + + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { + // Panic signal should cause a panic. + if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if l.console { + // Since we are running with a console, we should forward the signal to + // the foreground process group so that job control signals like ^C can + // be handled properly. + deliveryMode = DeliverToForegroundProcessGroup + } + log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) + if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err) + } + }) + + // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again + // either in createFDTable() during initial start or in descriptor.initAfterLoad() + // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the + // passed FDs, so only close for VFS1. + if !kernel.VFS2Enabled { + for _, fd := range l.stdioFDs { + err := syscall.Close(fd) + if err != nil { + return fmt.Errorf("close dup()ed stdioFDs: %v", err) + } + } + } + + log.Infof("Process should have started...") + l.watchdog.Start() + return l.k.Start() +} + +// createContainer creates a new container inside the sandbox. +func (l *Loader) createContainer(cid string) error { + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; ok { + return fmt.Errorf("container %q already exists", cid) + } + l.processes[eid] = &execProcess{} + return nil +} + +// startContainer starts a child container. It returns the thread group ID of +// the newly created process. Caller owns 'files' and may close them after +// this method returns. +func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { + // Create capabilities. + caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) + if err != nil { + return fmt.Errorf("creating capabilities: %v", err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; !ok { + return fmt.Errorf("trying to start a deleted container %q", cid) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. We reuse the root user namespace because the + // sentry currently supports only 1 mount namespace, which is tied to a + // single user namespace. Thus we must run in the same user namespace + // to access mounts. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + l.k.RootUserNamespace()) + + var pidns *kernel.PIDNamespace + if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok { + if ns.Path != "" { + for _, p := range l.processes { + if ns.Path == p.pidnsPath { + pidns = p.tg.PIDNamespace() + break + } + } + } + if pidns == nil { + pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) + } + l.processes[eid].pidnsPath = ns.Path + } else { + pidns = l.k.RootPIDNamespace() + } + procArgs, err := newProcess(cid, spec, creds, l.k, pidns) + if err != nil { + return fmt.Errorf("creating new process: %v", err) + } + + // setupContainerFS() dups stdioFDs, so we don't need to dup them here. + var stdioFDs []int + for _, f := range files[:3] { + stdioFDs = append(stdioFDs, int(f.Fd())) + } + + // Create the FD map, which will set stdin, stdout, and stderr. + ctx := procArgs.NewContext(l.k) + fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on fdTable if successful. We won't + // need ours either way. + procArgs.FDTable = fdTable + + // Can't take ownership away from os.File. dup them to get a new FDs. + var goferFDs []int + for _, f := range files[3:] { + fd, err := syscall.Dup(int(f.Fd())) + if err != nil { + return fmt.Errorf("failed to dup file: %v", err) + } + goferFDs = append(goferFDs, fd) + } + + // Setup the child container file system. + l.startGoferMonitor(cid, goferFDs) + + mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) + if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { + return err + } + + // Add the HOME enviroment variable if it is not already set. + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2, + procArgs.Credentials.RealKUID, procArgs.Envv) + + } else { + envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace, + procArgs.Credentials.RealKUID, procArgs.Envv) + } + if err != nil { + return err + } + procArgs.Envv = envv + + // Create and start the new process. + tg, _, err := l.k.CreateProcess(procArgs) + if err != nil { + return fmt.Errorf("creating process: %v", err) + } + l.k.StartProcess(tg) + + // CreateProcess takes a reference on FDTable if successful. + procArgs.FDTable.DecRef() + + l.processes[eid].tg = tg + return nil +} + +// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on +// the gofer FDs looking for disconnects, and destroys the container if a +// disconnect occurs in any of the gofer FDs. +func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { + go func() { + log.Debugf("Monitoring gofer health for container %q", cid) + var events []unix.PollFd + for _, fd := range goferFDs { + events = append(events, unix.PollFd{ + Fd: int32(fd), + Events: unix.POLLHUP | unix.POLLRDHUP, + }) + } + _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { + // Use ppoll instead of poll because it's already whilelisted in seccomp. + n, err := unix.Ppoll(events, nil, nil) + return uintptr(n), 0, err + }) + if err != nil { + panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err)) + } + + // Check if the gofer has stopped as part of normal container destruction. + // This is done just to avoid sending an annoying error message to the log. + // Note that there is a small race window in between mu.Unlock() and the + // lock being reacquired in destroyContainer(), but it's harmless to call + // destroyContainer() multiple times. + l.mu.Lock() + _, ok := l.processes[execID{cid: cid}] + l.mu.Unlock() + if ok { + log.Infof("Gofer socket disconnected, destroying container %q", cid) + if err := l.destroyContainer(cid); err != nil { + log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err) + } + } + }() +} + +// destroyContainer stops a container if it is still running and cleans up its +// filesystem. +func (l *Loader) destroyContainer(cid string) error { + l.mu.Lock() + defer l.mu.Unlock() + + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) + if err != nil { + // Container doesn't exist. + return err + } + + // The container exists, but has it been started? + if tg != nil { + if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { + return fmt.Errorf("sending SIGKILL to all container processes: %v", err) + } + // Wait for all processes that belong to the container to exit (including + // exec'd processes). + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + + // At this point, all processes inside of the container have exited, + // releasing all references to the container's MountNamespace and + // causing all submounts and overlays to be unmounted. + // + // Since the container's MountNamespace has been released, + // MountNamespace.destroy() will have executed, but that function may + // trigger async close operations. We must wait for those to complete + // before returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + fs.AsyncBarrier() + } + + // No more failure from this point on. Remove all container thread groups + // from the map. + for key := range l.processes { + if key.cid == cid { + delete(l.processes, key) + } + } + + log.Debugf("Container destroyed %q", cid) + return nil +} + +func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { + // Hold the lock for the entire operation to ensure that exec'd process is + // added to 'processes' in case it races with destroyContainer(). + l.mu.Lock() + defer l.mu.Unlock() + + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) + if err != nil { + return 0, err + } + if tg == nil { + return 0, fmt.Errorf("container %q not started", args.ContainerID) + } + + // Get the container MountNamespace from the Task. + if kernel.VFS2Enabled { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() + args.MountNamespaceVFS2.IncRef() + } else { + tg.Leader().WithMuLocked(func(t *kernel.Task) { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespace = t.MountNamespace() + args.MountNamespace.IncRef() + }) + } + + // Add the HOME environment variable if it is not already set. + if kernel.VFS2Enabled { + defer args.MountNamespaceVFS2.DecRef() + + root := args.MountNamespaceVFS2.Root() + defer root.DecRef() + ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } else { + defer args.MountNamespace.DecRef() + + root := args.MountNamespace.Root() + defer root.DecRef() + ctx := fs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } + + // Start the process. + proc := control.Proc{Kernel: l.k} + args.PIDNamespace = tg.PIDNamespace() + newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args) + if err != nil { + return 0, err + } + + eid := execID{cid: args.ContainerID, pid: tgid} + l.processes[eid] = &execProcess{ + tg: newTG, + tty: ttyFile, + ttyVFS2: ttyFileVFS2, + } + log.Debugf("updated processes: %v", l.processes) + + return tgid, nil +} + +// waitContainer waits for the init process of a container to exit. +func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { + // Don't defer unlock, as doing so would make it impossible for + // multiple clients to wait on the same container. + tg, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("can't wait for container %q: %v", cid, err) + } + + // If the thread either has already exited or exits during waiting, + // consider the container exited. + ws := l.wait(tg) + *waitStatus = ws + return nil +} + +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { + if tgid <= 0 { + return fmt.Errorf("PID (%d) must be positive", tgid) + } + + // Try to find a process that was exec'd + eid := execID{cid: cid, pid: tgid} + execTG, err := l.threadGroupFromID(eid) + if err == nil { + ws := l.wait(execTG) + *waitStatus = ws + + l.mu.Lock() + delete(l.processes, eid) + log.Debugf("updated processes (removal): %v", l.processes) + l.mu.Unlock() + return nil + } + + // The caller may be waiting on a process not started directly via exec. + // In this case, find the process in the container's PID namespace. + initTG, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("waiting for PID %d: %v", tgid, err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("waiting for PID %d: no such process", tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) + } + ws := l.wait(tg) + *waitStatus = ws + return nil +} + +// wait waits for the process with TGID 'tgid' in a container's PID namespace +// to exit. +func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { + tg.WaitExited() + return tg.ExitStatus().Status() +} + +// WaitForStartSignal waits for a start signal from the control server. +func (l *Loader) WaitForStartSignal() { + <-l.ctrl.manager.startChan +} + +// WaitExit waits for the root container to exit, and returns its exit status. +func (l *Loader) WaitExit() kernel.ExitStatus { + // Wait for container. + l.k.WaitExited() + + return l.k.GlobalInit().ExitStatus() +} + +func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). + switch conf.Network { + case NetworkHost: + // No network namespacing support for hostinet yet, hence creator is nil. + return inet.NewRootNamespace(hostinet.NewStack(), nil), nil + + case NetworkNone, NetworkSandbox: + s, err := newEmptySandboxNetworkStack(clock, uniqueID) + if err != nil { + return nil, err + } + creator := &sandboxNetstackCreator{ + clock: clock, + uniqueID: uniqueID, + } + return inet.NewRootNamespace(s, creator), nil + + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } + +} + +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := netstack.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: netstack.Metrics, + HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. + RawFactory: raw.EndpointFactory{}, + UniqueID: uniqueID, + })} + + // Enable SACK Recovery. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %s", err) + } + + // Set default TTLs as required by socket/netstack. + s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + + // Enable Receive Buffer Auto-Tuning. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) + } + + return &s, nil +} + +// sandboxNetstackCreator implements kernel.NetworkStackCreator. +// +// +stateify savable +type sandboxNetstackCreator struct { + clock tcpip.Clock + uniqueID stack.UniqueID +} + +// CreateStack implements kernel.NetworkStackCreator.CreateStack. +func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + if err != nil { + return nil, err + } + + // Setup loopback. + n := &Network{Stack: s.(*netstack.Stack).Stack} + nicID := tcpip.NICID(f.uniqueID.UniqueID()) + link := DefaultLoopbackLink + linkEP := loopback.New() + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return nil, err + } + + return s, nil +} + +// signal sends a signal to one or more processes in a container. If PID is 0, +// then the container init process is used. Depending on the SignalDeliveryMode +// option, the signal may be sent directly to the indicated process, to all +// processes in the container, or to the foreground process group. +func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { + if pid < 0 { + return fmt.Errorf("PID (%d) must be positive", pid) + } + + switch mode { + case DeliverToProcess: + if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err) + } + return nil + + case DeliverToForegroundProcessGroup: + if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err) + } + return nil + + case DeliverToAllProcesses: + if pid != 0 { + return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) + } + // Check that the container has actually started before signaling it. + if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil { + return err + } + if err := l.signalAllProcesses(cid, signo); err != nil { + return fmt.Errorf("signaling all processes in container %q: %v", cid, err) + } + return nil + + default: + panic(fmt.Sprintf("unknown signal delivery mode %v", mode)) + } +} + +func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { + execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + if err == nil { + // Send signal directly to the identified process. + return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo}) + } + + // The caller may be signaling a process not started directly via exec. + // In this case, find the process in the container's PID namespace and + // signal it. + initTG, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("no thread group found: %v", err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("no such process with PID %d", tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) + } + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) +} + +// signalForegrondProcessGroup looks up foreground process group from the TTY +// for the given "tgid" inside container "cid", and send the signal to it. +func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { + l.mu.Lock() + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid}) + if err != nil { + l.mu.Unlock() + return fmt.Errorf("no thread group found: %v", err) + } + if tg == nil { + l.mu.Unlock() + return fmt.Errorf("container %q not started", cid) + } + + tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid}) + l.mu.Unlock() + if err != nil { + return fmt.Errorf("no thread group found: %v", err) + } + + var pg *kernel.ProcessGroup + switch { + case ttyVFS2 != nil: + pg = ttyVFS2.ForegroundProcessGroup() + case tty != nil: + pg = tty.ForegroundProcessGroup() + default: + return fmt.Errorf("no TTY attached") + } + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) + } + // Send the signal to all processes in the process group. + var lastErr error + for _, tg := range l.k.TaskSet().Root.ThreadGroups() { + if tg.ProcessGroup() != pg { + continue + } + if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil { + lastErr = err + } + } + return lastErr +} + +// signalAllProcesses that belong to specified container. It's a noop if the +// container hasn't started or has exited. +func (l *Loader) signalAllProcesses(cid string, signo int32) error { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) +} + +// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it +// acquires mutex before calling it and fails in case container hasn't started +// yet. +func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) { + l.mu.Lock() + defer l.mu.Unlock() + tg, err := l.tryThreadGroupFromIDLocked(key) + if err != nil { + return nil, err + } + if tg == nil { + return nil, fmt.Errorf("container %q not started", key.cid) + } + return tg, nil +} + +// tryThreadGroupFromIDLocked returns the thread group for the given execution +// ID. It may return nil in case the container has not started yet. Returns +// error if execution ID is invalid or if the container cannot be found (maybe +// it has been deleted). Caller must hold 'mu'. +func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) { + ep := l.processes[key] + if ep == nil { + return nil, fmt.Errorf("container %q not found", key.cid) + } + return ep.tg, nil +} + +// ttyFromIDLocked returns the TTY files for the given execution ID. It may +// return nil in case the container has not started yet. Returns error if +// execution ID is invalid or if the container cannot be found (maybe it has +// been deleted). Caller must hold 'mu'. +func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + ep := l.processes[key] + if ep == nil { + return nil, nil, fmt.Errorf("container %q not found", key.cid) + } + return ep.tty, ep.ttyVFS2, nil +} + +func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + if len(stdioFDs) != 3 { + return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) + } + + k := kernel.KernelFromContext(ctx) + fdTable := k.NewFDTable() + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) + if err != nil { + fdTable.DecRef() + return nil, nil, nil, err + } + return fdTable, ttyFile, ttyFileVFS2, nil +} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go new file mode 100644 index 000000000..b723e4335 --- /dev/null +++ b/runsc/boot/loader_test.go @@ -0,0 +1,715 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "math/rand" + "os" + "reflect" + "syscall" + "testing" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/fsgofer" +) + +func init() { + log.SetLevel(log.Debug) + rand.Seed(time.Now().UnixNano()) + if err := fsgofer.OpenProcSelfFD(); err != nil { + panic(err) + } +} + +func testConfig() *Config { + return &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + DisableSeccomp: true, + Platform: "ptrace", + } +} + +// testSpec returns a simple spec that can be used in tests. +func testSpec() *specs.Spec { + return &specs.Spec{ + // The host filesystem root is the sandbox root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + } +} + +// startGofer starts a new gofer routine serving 'root' path. It returns the +// sandbox side of the connection, and a function that when called will stop the +// gofer. +func startGofer(root string) (int, func(), error) { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return 0, nil, err + } + sandboxEnd, goferEnd := fds[0], fds[1] + + socket, err := unet.NewSocket(goferEnd) + if err != nil { + syscall.Close(sandboxEnd) + syscall.Close(goferEnd) + return 0, nil, fmt.Errorf("error creating server on FD %d: %v", goferEnd, err) + } + at, err := fsgofer.NewAttachPoint(root, fsgofer.Config{ROMount: true}) + if err != nil { + return 0, nil, err + } + go func() { + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + log.Infof("Gofer is stopping. FD: %d, err: %v\n", goferEnd, err) + } + }() + // Closing the gofer socket will stop the gofer and exit goroutine above. + cleanup := func() { + if err := socket.Close(); err != nil { + log.Warningf("Error closing gofer socket: %v", err) + } + } + return sandboxEnd, cleanup, nil +} + +func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) { + fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) + if err != nil { + return nil, nil, err + } + conf := testConfig() + conf.VFS2 = vfsEnabled + + sandEnd, cleanup, err := startGofer(spec.Root.Path) + if err != nil { + return nil, nil, err + } + + // Loader takes ownership of stdio. + var stdio []int + for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { + newFd, err := unix.Dup(int(f.Fd())) + if err != nil { + return nil, nil, err + } + stdio = append(stdio, newFd) + } + + args := Args{ + ID: "foo", + Spec: spec, + Conf: conf, + ControllerFD: fd, + GoferFDs: []int{sandEnd}, + StdioFDs: stdio, + } + l, err := New(args) + if err != nil { + cleanup() + return nil, nil, err + } + return l, cleanup, nil +} + +// TestRun runs a simple application in a sandbox and checks that it succeeds. +func TestRun(t *testing.T) { + doRun(t, false) +} + +// TestRunVFS2 runs TestRun in VFSv2. +func TestRunVFS2(t *testing.T) { + doRun(t, true) +} + +func doRun(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled, testSpec()) + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + + defer l.Destroy() + defer cleanup() + + // Start a goroutine to read the start chan result, otherwise Run will + // block forever. + var resultChanErr error + var wg sync.WaitGroup + wg.Add(1) + go func() { + resultChanErr = <-l.ctrl.manager.startResultChan + wg.Done() + }() + + // Run the container. + if err := l.Run(); err != nil { + t.Errorf("error running container: %v", err) + } + + // We should have not gotten an error on the startResultChan. + wg.Wait() + if resultChanErr != nil { + t.Errorf("error on startResultChan: %v", resultChanErr) + } + + // Wait for the application to exit. It should succeed. + if status := l.WaitExit(); status.Code != 0 || status.Signo != 0 { + t.Errorf("application exited with status %+v, want 0", status) + } +} + +// TestStartSignal tests that the controller Start message will cause +// WaitForStartSignal to return. +func TestStartSignal(t *testing.T) { + doStartSignal(t, false) +} + +// TestStartSignalVFS2 does TestStartSignal with VFS2. +func TestStartSignalVFS2(t *testing.T) { + doStartSignal(t, true) +} + +func doStartSignal(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled, testSpec()) + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + defer l.Destroy() + defer cleanup() + + // We aren't going to wait on this application, so the control server + // needs to be shut down manually. + defer l.ctrl.srv.Stop() + + // Start a goroutine that calls WaitForStartSignal and writes to a + // channel when it returns. + waitFinished := make(chan struct{}) + go func() { + l.WaitForStartSignal() + // Pretend that Run() executed and returned no error. + l.ctrl.manager.startResultChan <- nil + waitFinished <- struct{}{} + }() + + // Nothing has been written to the channel, so waitFinished should not + // return. Give it a little bit of time to make sure the goroutine has + // started. + select { + case <-waitFinished: + t.Errorf("WaitForStartSignal completed but it should not have") + case <-time.After(50 * time.Millisecond): + // OK. + } + + // Trigger the control server StartRoot method. + cid := "foo" + if err := l.ctrl.manager.StartRoot(&cid, nil); err != nil { + t.Errorf("error calling StartRoot: %v", err) + } + + // Now WaitForStartSignal should return (within a short amount of + // time). + select { + case <-waitFinished: + // OK. + case <-time.After(50 * time.Millisecond): + t.Errorf("WaitForStartSignal did not complete but it should have") + } + +} + +type CreateMountTestcase struct { + name string + // Spec that will be used to create the mount manager. Note + // that we can't mount procfs without a kernel, so each spec + // MUST contain something other than procfs mounted at /proc. + spec specs.Spec + // Paths that are expected to exist in the resulting fs. + expectedPaths []string +} + +func createMountTestcases(vfs2 bool) []*CreateMountTestcase { + testCases := []*CreateMountTestcase{ + &CreateMountTestcase{ + // Only proc. + name: "only proc mount", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /proc, /dev, and /sys should always be mounted. + expectedPaths: []string{"/proc", "/dev", "/sys"}, + }, + { + // Mount at a deep path, with many components that do + // not exist in the root. + name: "deep mount path", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/some/very/very/deep/path", + Type: "tmpfs", + }, + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /some/deep/path should be mounted, along with /proc, + // /dev, and /sys. + expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + &CreateMountTestcase{ + // Mounts are nested inside each other. + name: "nested mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/foo", + Type: "tmpfs", + }, + { + Destination: "/foo/qux", + Type: "tmpfs", + }, + { + // File mounts with the same prefix. + Destination: "/foo/qux-quz", + Type: "tmpfs", + }, + { + Destination: "/foo/bar", + Type: "tmpfs", + }, + { + Destination: "/foo/bar/baz", + Type: "tmpfs", + }, + { + // A deep path that is in foo but not the other mounts. + Destination: "/foo/some/very/very/deep/path", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", + "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + &CreateMountTestcase{ + name: "mount inside /dev", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/dev", + Type: "tmpfs", + }, + { + // Mounted by runsc by default. + Destination: "/dev/fd", + Type: "tmpfs", + }, + { + // Mount with the same prefix. + Destination: "/dev/fd-foo", + Type: "tmpfs", + }, + { + // Unsupported fs type. + Destination: "/dev/mqueue", + Type: "mqueue", + }, + { + Destination: "/dev/foo", + Type: "tmpfs", + }, + { + Destination: "/dev/bar", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, + }, + } + + vfsCase := &CreateMountTestcase{ + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports + // MkDirAt in VFS2 (and remove the reduntant append). + // { + // Destination: "/sys/bar", + // Type: "tmpfs", + // }, + // + { + Destination: "/tmp/baz", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, + } + + if !vfs2 { + vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) + vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") + } + return append(testCases, vfsCase) +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespace(t *testing.T) { + for _, tc := range createMountTestcases(false /* vfs2 */) { + t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + ctx := contexttest.Context(t) + + sandEnd, cleanup, err := startGofer(tc.spec.Root.Path) + if err != nil { + t.Fatalf("failed to create gofer: %v", err) + } + defer cleanup() + + mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{}) + mns, err := mntr.createMountNamespace(ctx, conf) + if err != nil { + t.Fatalf("failed to create mount namespace: %v", err) + } + ctx = fs.WithRoot(ctx, mns.Root()) + if err := mntr.mountSubmounts(ctx, conf, mns); err != nil { + t.Fatalf("failed to create mount namespace: %v", err) + } + + root := mns.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + maxTraversals := uint(0) + if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() + } + } + }) + } +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespaceVFS2(t *testing.T) { + for _, tc := range createMountTestcases(true /* vfs2 */) { + t.Run(tc.name, func(t *testing.T) { + spec := testSpec() + spec.Mounts = tc.spec.Mounts + spec.Root = tc.spec.Root + + t.Logf("Using root: %q", spec.Root.Path) + l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec) + if err != nil { + t.Fatalf("failed to create loader: %v", err) + } + defer l.Destroy() + defer loaderCleanup() + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil { + t.Fatalf("failed process hints: %v", err) + } + + ctx := l.k.SupervisorContext() + mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + if err != nil { + t.Fatalf("failed to setupVFS2: %v", err) + } + + root := mns.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(p), + } + + if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() + } + } + }) + } +} + +// TestRestoreEnvironment tests that the correct mounts are collected from the spec and config +// in order to build the environment for restoring. +func TestRestoreEnvironment(t *testing.T) { + testCases := []struct { + name string + spec *specs.Spec + ioFDs []int + errorExpected bool + expectedRenv fs.RestoreEnvironment + }{ + { + name: "basic spec test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/some/very/very/deep/path", + Type: "tmpfs", + }, + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + ioFDs: []int{0}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + }, + }, + "tmpfs": { + { + Dev: "none", + }, + { + Dev: "none", + }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, + }, + }, + }, + { + name: "bind type test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "bind", + }, + }, + }, + ioFDs: []int{0, 1}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + }, + { + Dev: "9pfs-/dev/fd-foo", + DataString: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", + }, + }, + "tmpfs": { + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, + }, + }, + }, + { + name: "options test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "tmpfs", + Options: []string{"uid=1022", "noatime"}, + }, + }, + }, + ioFDs: []int{0}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + }, + }, + "tmpfs": { + { + Dev: "none", + Flags: fs.MountSourceFlags{NoAtime: true}, + DataString: "uid=1022", + }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, + }, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{}) + actualRenv, err := mntr.createRestoreEnvironment(conf) + if !tc.errorExpected && err != nil { + t.Fatalf("could not create restore environment for test:%s", tc.name) + } else if tc.errorExpected { + if err == nil { + t.Errorf("expected an error, but no error occurred.") + } + } else { + if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { + t.Errorf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv) + } + } + }) + } +} diff --git a/runsc/boot/network.go b/runsc/boot/network.go new file mode 100644 index 000000000..14d2f56a5 --- /dev/null +++ b/runsc/boot/network.go @@ -0,0 +1,341 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "net" + "runtime" + "strings" + "syscall" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" +) + +var ( + // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and + // "::1/8" on "lo" interface. + DefaultLoopbackLink = LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []Route{ + { + Destination: net.IPNet{ + IP: net.IPv4(0x7f, 0, 0, 0), + Mask: net.IPv4Mask(0xff, 0, 0, 0), + }, + }, + { + Destination: net.IPNet{ + IP: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), + }, + }, + }, + } +) + +// Network exposes methods that can be used to configure a network stack. +type Network struct { + Stack *stack.Stack +} + +// Route represents a route in the network stack. +type Route struct { + Destination net.IPNet + Gateway net.IP +} + +// DefaultRoute represents a catch all route to the default gateway. +type DefaultRoute struct { + Route Route + Name string +} + +// QueueingDiscipline is used to specify the kind of Queueing Discipline to +// apply for a give FDBasedLink. +type QueueingDiscipline int + +const ( + // QDiscNone disables any queueing for the underlying FD. + QDiscNone QueueingDiscipline = iota + + // QDiscFIFO applies a simple fifo based queue to the underlying + // FD. + QDiscFIFO +) + +// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s +// else returns an error. +func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) { + switch s { + case "none": + return QDiscNone, nil + case "fifo": + return QDiscFIFO, nil + default: + return 0, fmt.Errorf("unsupported qdisc specified: %q", s) + } +} + +// String implements fmt.Stringer. +func (q QueueingDiscipline) String() string { + switch q { + case QDiscNone: + return "none" + case QDiscFIFO: + return "fifo" + default: + panic(fmt.Sprintf("Invalid queueing discipline: %d", q)) + } +} + +// FDBasedLink configures an fd-based link. +type FDBasedLink struct { + Name string + MTU int + Addresses []net.IP + Routes []Route + GSOMaxSize uint32 + SoftwareGSOEnabled bool + TXChecksumOffload bool + RXChecksumOffload bool + LinkAddress net.HardwareAddr + QDisc QueueingDiscipline + + // NumChannels controls how many underlying FD's are to be used to + // create this endpoint. + NumChannels int +} + +// LoopbackLink configures a loopback li nk. +type LoopbackLink struct { + Name string + Addresses []net.IP + Routes []Route +} + +// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. +type CreateLinksAndRoutesArgs struct { + // FilePayload contains the fds associated with the FDBasedLinks. The + // number of fd's should match the sum of the NumChannels field of the + // FDBasedLink entries below. + urpc.FilePayload + + LoopbackLinks []LoopbackLink + FDBasedLinks []FDBasedLink + + Defaultv4Gateway DefaultRoute + Defaultv6Gateway DefaultRoute +} + +// Empty returns true if route hasn't been set. +func (r *Route) Empty() bool { + return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil +} + +func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) { + subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask)) + if err != nil { + return tcpip.Route{}, err + } + return tcpip.Route{ + Destination: subnet, + Gateway: ipToAddress(r.Gateway), + NIC: id, + }, nil +} + +// CreateLinksAndRoutes creates links and routes in a network stack. It should +// only be called once. +func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { + wantFDs := 0 + for _, l := range args.FDBasedLinks { + wantFDs += l.NumChannels + } + if got := len(args.FilePayload.Files); got != wantFDs { + return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs) + } + + var nicID tcpip.NICID + nicids := make(map[string]tcpip.NICID) + + // Collect routes from all links. + var routes []tcpip.Route + + // Loopback normally appear before other interfaces. + for _, link := range args.LoopbackLinks { + nicID++ + nicids[link.Name] = nicID + + linkEP := loopback.New() + + log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + route, err := r.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + } + + fdOffset := 0 + for _, link := range args.FDBasedLinks { + nicID++ + nicids[link.Name] = nicID + + FDs := []int{} + for j := 0; j < link.NumChannels; j++ { + // Copy the underlying FD. + oldFD := args.FilePayload.Files[fdOffset].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + FDs = append(FDs, newFD) + fdOffset++ + } + + mac := tcpip.LinkAddress(link.LinkAddress) + log.Infof("gso max size is: %d", link.GSOMaxSize) + + linkEP, err := fdbased.New(&fdbased.Options{ + FDs: FDs, + MTU: uint32(link.MTU), + EthernetHeader: true, + Address: mac, + PacketDispatchMode: fdbased.RecvMMsg, + GSOMaxSize: link.GSOMaxSize, + SoftwareGSOEnabled: link.SoftwareGSOEnabled, + TXChecksumOffload: link.TXChecksumOffload, + RXChecksumOffload: link.RXChecksumOffload, + }) + if err != nil { + return err + } + + switch link.QDisc { + case QDiscNone: + case QDiscFIFO: + log.Infof("Enabling FIFO QDisc on %q", link.Name) + linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) + } + + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + route, err := r.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + } + + if !args.Defaultv4Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv4Gateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) + } + route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + + if !args.Defaultv6Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv6Gateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) + } + route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + + log.Infof("Setting routes %+v", routes) + n.Stack.SetRouteTable(routes) + return nil +} + +// createNICWithAddrs creates a NIC in the network stack and adds the given +// addresses. +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error { + opts := stack.NICOptions{Name: name} + if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil { + return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) + } + + // Always start with an arp address for the NIC. + if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + + for _, addr := range addrs { + proto, tcpipAddr := ipToAddressAndProto(addr) + if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil { + return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) + } + } + return nil +} + +// ipToAddressAndProto converts IP to tcpip.Address and a protocol number. +// +// Note: don't use 'len(ip)' to determine IP version because length is always 16. +func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) { + if i4 := ip.To4(); i4 != nil { + return ipv4.ProtocolNumber, tcpip.Address(i4) + } + return ipv6.ProtocolNumber, tcpip.Address(ip) +} + +// ipToAddress converts IP to tcpip.Address, ignoring the protocol. +func ipToAddress(ip net.IP) tcpip.Address { + _, addr := ipToAddressAndProto(ip) + return addr +} + +// ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the +// protocol. +func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask { + return tcpip.AddressMask(ipToAddress(net.IP(ipMask))) +} diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD new file mode 100644 index 000000000..77774f43c --- /dev/null +++ b/runsc/boot/platforms/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "platforms", + srcs = ["platforms.go"], + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + ], +) diff --git a/runsc/boot/platforms/platforms.go b/runsc/boot/platforms/platforms.go new file mode 100644 index 000000000..056b46ad5 --- /dev/null +++ b/runsc/boot/platforms/platforms.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package platforms imports all available platform packages. +package platforms + +import ( + // Import platforms that runsc might use. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +const ( + // Ptrace runs the sandbox with the ptrace platform. + Ptrace = "ptrace" + + // KVM runs the sandbox with the KVM platform. + KVM = "kvm" +) diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD new file mode 100644 index 000000000..29cb42b2f --- /dev/null +++ b/runsc/boot/pprof/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "pprof", + srcs = ["pprof.go"], + visibility = [ + "//runsc:__subpackages__", + ], +) diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go new file mode 100644 index 000000000..1ded20dee --- /dev/null +++ b/runsc/boot/pprof/pprof.go @@ -0,0 +1,20 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pprof provides a stub to initialize custom profilers. +package pprof + +// Initialize will be called at boot for initializing custom profilers. +func Initialize() { +} diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go new file mode 100644 index 000000000..fbfd3b07c --- /dev/null +++ b/runsc/boot/strace.go @@ -0,0 +1,40 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +func enableStrace(conf *Config) error { + // We must initialize even if strace is not enabled. + strace.Initialize() + + if !conf.Strace { + return nil + } + + max := conf.StraceLogSize + if max == 0 { + max = 1024 + } + strace.LogMaximumSize = max + + if len(conf.StraceSyscalls) == 0 { + strace.EnableAll(strace.SinkTypeLog) + return nil + } + return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog) +} diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go new file mode 100644 index 000000000..6ee6fae04 --- /dev/null +++ b/runsc/boot/vfs.go @@ -0,0 +1,482 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path" + "sort" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" + "gvisor.dev/gvisor/pkg/sentry/devices/tundev" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +func registerFilesystems(k *kernel.Kernel) error { + ctx := k.SupervisorContext() + creds := auth.NewRootCredentials(k.RootUserNamespace()) + vfsObj := k.VFS() + + vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + // TODO(b/29356795): Users may mount this once the terminals are in a + // usable state. + AllowUserMount: false, + }) + vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + // Setup files in devtmpfs. + if err := memdev.Register(vfsObj); err != nil { + return fmt.Errorf("registering memdev: %w", err) + } + if err := ttydev.Register(vfsObj); err != nil { + return fmt.Errorf("registering ttydev: %w", err) + } + + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } + if err := tundev.Register(vfsObj); err != nil { + return fmt.Errorf("registering tundev: %v", err) + } + a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name) + if err != nil { + return fmt.Errorf("creating devtmpfs accessor: %w", err) + } + defer a.Release() + + if err := a.UserspaceInit(ctx); err != nil { + return fmt.Errorf("initializing userspace: %w", err) + } + if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating memdev devtmpfs files: %w", err) + } + if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating ttydev devtmpfs files: %w", err) + } + if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating tundev devtmpfs files: %v", err) + } + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } + return nil +} + +func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + mns, err := mntr.setupVFS2(ctx, conf, procArgs) + if err != nil { + return fmt.Errorf("failed to setupFS: %w", err) + } + procArgs.MountNamespaceVFS2 = mns + + // Resolve the executable path from working dir and environment. + resolved, err := user.ResolveExecutablePath(ctx, procArgs) + if err != nil { + return err + } + procArgs.Filename = resolved + return nil +} + +func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { + log.Infof("Configuring container's file system with VFS2") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = rootCreds + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := procArgs.NewContext(c.k) + + mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) + if err != nil { + return nil, fmt.Errorf("creating mount namespace: %w", err) + } + rootProcArgs.MountNamespaceVFS2 = mns + + // Mount submounts. + if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { + return nil, fmt.Errorf("mounting submounts vfs2: %w", err) + } + return mns, nil +} + +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { + fd := c.fds.remove() + opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mount namespace: %w", err) + } + return mns, nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + mounts, err := c.prepareMountsVFS2() + if err != nil { + return err + } + + for i := range mounts { + submount := &mounts[i] + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + } + } else { + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { + return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + } + } + } + + if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil { + return fmt.Errorf(`mount submount "\tmp": %w`, err) + } + return nil +} + +type mountAndFD struct { + specs.Mount + fd int +} + +func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { + // Associate bind mounts with their FDs before sorting since there is an + // undocumented assumption that FDs are dispensed in the order in which + // they are required by mounts. + var mounts []mountAndFD + for _, m := range c.mounts { + fd := -1 + // Only bind mounts use host FDs; see + // containerMounter.getMountNameAndOptionsVFS2. + if m.Type == bind { + fd = c.fds.remove() + } + mounts = append(mounts, mountAndFD{ + Mount: m, + fd: fd, + }) + } + if err := c.checkDispenser(); err != nil { + return nil, err + } + + // Sort the mounts so that we don't place children before parents. + sort.Slice(mounts, func(i, j int) bool { + return len(mounts[i].Destination) < len(mounts[j].Destination) + }) + + return mounts, nil +} + +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { + root := mns.Root() + defer root.DecRef() + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) + if err != nil { + return fmt.Errorf("mountOptions failed: %w", err) + } + if len(fsName) == 0 { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { + return err + } + if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { + return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + } + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) + return nil +} + +// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { + fsName := m.Type + var data []string + + // Find filesystem name and FS specific data field. + switch m.Type { + case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: + // Nothing to do. + + case nonefs: + fsName = sys.Name + + case tmpfs.Name: + var err error + data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) + if err != nil { + return "", nil, err + } + + case bind: + fsName = gofer.Name + if m.fd == 0 { + // Check that an FD was provided to fails fast. Technically FD=0 is valid, + // but unlikely to be correct in this context. + return "", nil, fmt.Errorf("9P mount requires a connection FD") + } + data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + return "", nil, nil + } + + opts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + }, + InternalMount: true, + } + + for _, o := range m.Options { + switch o { + case "rw": + opts.ReadOnly = false + case "ro": + opts.ReadOnly = true + case "noatime": + opts.Flags.NoATime = true + case "noexec": + opts.Flags.NoExec = true + default: + log.Warningf("ignoring unknown mount option %q", o) + } + } + + if conf.Overlay { + // All writes go to upper, be paranoid and make lower readonly. + opts.ReadOnly = true + } + return fsName, opts, nil +} + +func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(currentPath), + } + _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) + if err == nil { + log.Debugf("Mount point %q already exists", currentPath) + return nil + } + if err != syserror.ENOENT { + return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err) + } + + // Recurse to ensure parent is created and then create the mount point. + if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { + return err + } + log.Debugf("Creating dir %q for mount point", currentPath) + mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { + return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err) + } + return nil +} + +// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. +// Technically we don't have to mount tmpfs at /tmp, as we could just rely on +// the host /tmp, but this is a nice optimization, and fixes some apps that call +// mknod in /tmp. It's unsafe to mount tmpfs if: +// 1. /tmp is mounted explicitly: we should not override user's wish +// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp +// +// Note that when there are submounts inside of '/tmp', directories for the +// mount points must be present, making '/tmp' not empty anymore. +func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { + for _, m := range c.mounts { + // m.Destination has been cleaned, so it's to use equality here. + if m.Destination == "/tmp" { + log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m) + return nil + } + } + + root := mns.Root() + defer root.DecRef() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse("/tmp"), + } + // TODO(gvisor.dev/issue/2782): Use O_PATH when available. + statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{}) + switch err { + case nil: + // Found '/tmp' in filesystem, check if it's empty. + if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory { + // Not a dir?! Leave it be. + return nil + } + if statx.Nlink > 2 { + // If more than "." and ".." is found, skip internal tmpfs to prevent + // hiding existing files. + log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) + return nil + } + log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) + fallthrough + + case syserror.ENOENT: + // No '/tmp' found (or fallthrough from above). It's safe to mount internal + // tmpfs. + tmpMount := specs.Mount{ + Type: tmpfs.Name, + Destination: "/tmp", + // Sticky bit is added to prevent accidental deletion of files from + // another user. This is normally done for /tmp. + Options: []string{"mode=01777"}, + } + return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + + default: + return fmt.Errorf(`stating "/tmp" inside container: %w`, err) + } +} + +// processHintsVFS2 processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs.Name { + continue + } + + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.vfsMount = mnt + } + return nil +} + +// mountSharedMasterVFS2 mounts the master of a volume that is shared among +// containers in a pod. +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + mntFD := &mountAndFD{Mount: hint.mount} + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error { + if err := source.checkCompatible(mount); err != nil { + return err + } + + _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + if err != nil { + return err + } + newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) + if err != nil { + return err + } + defer newMnt.DecRef() + + root := mns.Root() + defer root.DecRef() + if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { + return err + } + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mount.Destination), + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { + return err + } + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD new file mode 100644 index 000000000..7e34a284a --- /dev/null +++ b/runsc/cgroup/BUILD @@ -0,0 +1,27 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "cgroup", + srcs = ["cgroup.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/cleanup", + "//pkg/log", + "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) + +go_test( + name = "cgroup_test", + size = "small", + srcs = ["cgroup_test.go"], + library = ":cgroup", + tags = ["local"], + deps = [ + "//pkg/test/testutil", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go new file mode 100644 index 000000000..e5cc9d622 --- /dev/null +++ b/runsc/cgroup/cgroup.go @@ -0,0 +1,576 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cgroup provides an interface to read and write configuration to +// cgroup. +package cgroup + +import ( + "bufio" + "context" + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + "github.com/cenkalti/backoff" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/log" +) + +const ( + cgroupRoot = "/sys/fs/cgroup" +) + +var controllers = map[string]config{ + "blkio": config{ctrlr: &blockIO{}}, + "cpu": config{ctrlr: &cpu{}}, + "cpuset": config{ctrlr: &cpuSet{}}, + "hugetlb": config{ctrlr: &hugeTLB{}, optional: true}, + "memory": config{ctrlr: &memory{}}, + "net_cls": config{ctrlr: &networkClass{}}, + "net_prio": config{ctrlr: &networkPrio{}}, + "pids": config{ctrlr: &pids{}}, + + // These controllers either don't have anything in the OCI spec or is + // irrelevant for a sandbox. + "devices": config{ctrlr: &noop{}}, + "freezer": config{ctrlr: &noop{}}, + "perf_event": config{ctrlr: &noop{}}, + "rdma": config{ctrlr: &noop{}, optional: true}, + "systemd": config{ctrlr: &noop{}}, +} + +func setOptionalValueInt(path, name string, val *int64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatInt(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint(path, name string, val *uint64) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(*val, 10) + return setValue(path, name, str) +} + +func setOptionalValueUint32(path, name string, val *uint32) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setOptionalValueUint16(path, name string, val *uint16) error { + if val == nil || *val == 0 { + return nil + } + str := strconv.FormatUint(uint64(*val), 10) + return setValue(path, name, str) +} + +func setValue(path, name, data string) error { + fullpath := filepath.Join(path, name) + return ioutil.WriteFile(fullpath, []byte(data), 0700) +} + +func getValue(path, name string) (string, error) { + fullpath := filepath.Join(path, name) + out, err := ioutil.ReadFile(fullpath) + if err != nil { + return "", err + } + return string(out), nil +} + +func getInt(path, name string) (int, error) { + s, err := getValue(path, name) + if err != nil { + return 0, err + } + return strconv.Atoi(strings.TrimSpace(s)) +} + +// fillFromAncestor sets the value of a cgroup file from the first ancestor +// that has content. It does nothing if the file in 'path' has already been set. +func fillFromAncestor(path string) (string, error) { + out, err := ioutil.ReadFile(path) + if err != nil { + return "", err + } + val := strings.TrimSpace(string(out)) + if val != "" { + // File is set, stop here. + return val, nil + } + + // File is not set, recurse to parent and then set here. + name := filepath.Base(path) + parent := filepath.Dir(filepath.Dir(path)) + val, err = fillFromAncestor(filepath.Join(parent, name)) + if err != nil { + return "", err + } + if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil { + return "", err + } + return val, nil +} + +// countCpuset returns the number of CPU in a string formatted like: +// "0-2,7,12-14 # bits 0, 1, 2, 7, 12, 13, and 14 set" - man 7 cpuset +func countCpuset(cpuset string) (int, error) { + var count int + for _, p := range strings.Split(cpuset, ",") { + interval := strings.Split(p, "-") + switch len(interval) { + case 1: + if _, err := strconv.Atoi(interval[0]); err != nil { + return 0, err + } + count++ + + case 2: + start, err := strconv.Atoi(interval[0]) + if err != nil { + return 0, err + } + end, err := strconv.Atoi(interval[1]) + if err != nil { + return 0, err + } + if start < 0 || end < 0 || start > end { + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + count += end - start + 1 + + default: + return 0, fmt.Errorf("invalid cpuset: %q", p) + } + } + return count, nil +} + +// LoadPaths loads cgroup paths for given 'pid', may be set to 'self'. +func LoadPaths(pid string) (map[string]string, error) { + f, err := os.Open(filepath.Join("/proc", pid, "cgroup")) + if err != nil { + return nil, err + } + defer f.Close() + + paths := make(map[string]string) + scanner := bufio.NewScanner(f) + for scanner.Scan() { + // Format: ID:controller1,controller2:path + // Example: 2:cpu,cpuacct:/user.slice + tokens := strings.Split(scanner.Text(), ":") + if len(tokens) != 3 { + return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text()) + } + for _, ctrlr := range strings.Split(tokens[1], ",") { + paths[ctrlr] = tokens[2] + } + } + if err := scanner.Err(); err != nil { + return nil, err + } + return paths, nil +} + +// Cgroup represents a group inside all controllers. For example: +// Name='/foo/bar' maps to /sys/fs/cgroup/<controller>/foo/bar on +// all controllers. +type Cgroup struct { + Name string `json:"name"` + Parents map[string]string `json:"parents"` + Own bool `json:"own"` +} + +// New creates a new Cgroup instance if the spec includes a cgroup path. +// Returns nil otherwise. +func New(spec *specs.Spec) (*Cgroup, error) { + if spec.Linux == nil || spec.Linux.CgroupsPath == "" { + return nil, nil + } + var parents map[string]string + if !filepath.IsAbs(spec.Linux.CgroupsPath) { + var err error + parents, err = LoadPaths("self") + if err != nil { + return nil, fmt.Errorf("finding current cgroups: %v", err) + } + } + return &Cgroup{ + Name: spec.Linux.CgroupsPath, + Parents: parents, + }, nil +} + +// Install creates and configures cgroups according to 'res'. If cgroup path +// already exists, it means that the caller has already provided a +// pre-configured cgroups, and 'res' is ignored. +func (c *Cgroup) Install(res *specs.LinuxResources) error { + if _, err := os.Stat(c.makePath("memory")); err == nil { + // If cgroup has already been created; it has been setup by caller. Don't + // make any changes to configuration, just join when sandbox/gofer starts. + log.Debugf("Using pre-created cgroup %q", c.Name) + return nil + } + + log.Debugf("Creating cgroup %q", c.Name) + + // Mark that cgroup resources are owned by me. + c.Own = true + + // The Cleanup object cleans up partially created cgroups when an error occurs. + // Errors occuring during cleanup itself are ignored. + clean := cleanup.Make(func() { _ = c.Uninstall() }) + defer clean.Clean() + + for key, cfg := range controllers { + path := c.makePath(key) + if err := os.MkdirAll(path, 0755); err != nil { + if cfg.optional && errors.Is(err, syscall.EROFS) { + log.Infof("Skipping cgroup %q", key) + continue + } + return err + } + if res != nil { + if err := cfg.ctrlr.set(res, path); err != nil { + return err + } + } + } + clean.Release() + return nil +} + +// Uninstall removes the settings done in Install(). If cgroup path already +// existed when Install() was called, Uninstall is a noop. +func (c *Cgroup) Uninstall() error { + if !c.Own { + // cgroup is managed by caller, don't touch it. + return nil + } + log.Debugf("Deleting cgroup %q", c.Name) + for key := range controllers { + path := c.makePath(key) + log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) + + // If we try to remove the cgroup too soon after killing the + // sandbox we might get EBUSY, so we retry for a few seconds + // until it succeeds. + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + if err := backoff.Retry(func() error { + err := syscall.Rmdir(path) + if os.IsNotExist(err) { + return nil + } + return err + }, b); err != nil { + return fmt.Errorf("removing cgroup path %q: %v", path, err) + } + } + return nil +} + +// Join adds the current process to the all controllers. Returns function that +// restores cgroup to the original state. +func (c *Cgroup) Join() (func(), error) { + // First save the current state so it can be restored. + undo := func() {} + paths, err := LoadPaths("self") + if err != nil { + return undo, err + } + var undoPaths []string + for ctrlr, path := range paths { + // Skip controllers we don't handle. + if _, ok := controllers[ctrlr]; ok { + fullPath := filepath.Join(cgroupRoot, ctrlr, path) + undoPaths = append(undoPaths, fullPath) + break + } + } + + // Replace empty undo with the real thing before changes are made to cgroups. + undo = func() { + for _, path := range undoPaths { + log.Debugf("Restoring cgroup %q", path) + if err := setValue(path, "cgroup.procs", "0"); err != nil { + log.Warningf("Error restoring cgroup %q: %v", path, err) + } + } + } + + // Now join the cgroups. + for key, cfg := range controllers { + path := c.makePath(key) + log.Debugf("Joining cgroup %q", path) + if err := setValue(path, "cgroup.procs", "0"); err != nil { + if cfg.optional && os.IsNotExist(err) { + continue + } + return undo, err + } + } + return undo, nil +} + +func (c *Cgroup) CPUQuota() (float64, error) { + path := c.makePath("cpu") + quota, err := getInt(path, "cpu.cfs_quota_us") + if err != nil { + return -1, err + } + period, err := getInt(path, "cpu.cfs_period_us") + if err != nil { + return -1, err + } + if quota <= 0 || period <= 0 { + return -1, err + } + return float64(quota) / float64(period), nil +} + +// NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. +func (c *Cgroup) NumCPU() (int, error) { + path := c.makePath("cpuset") + cpuset, err := getValue(path, "cpuset.cpus") + if err != nil { + return 0, err + } + return countCpuset(strings.TrimSpace(cpuset)) +} + +// MemoryLimit returns the memory limit. +func (c *Cgroup) MemoryLimit() (uint64, error) { + path := c.makePath("memory") + limStr, err := getValue(path, "memory.limit_in_bytes") + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64) +} + +func (c *Cgroup) makePath(controllerName string) string { + path := c.Name + if parent, ok := c.Parents[controllerName]; ok { + path = filepath.Join(parent, c.Name) + } + return filepath.Join(cgroupRoot, controllerName, path) +} + +type config struct { + ctrlr controller + optional bool +} + +type controller interface { + set(*specs.LinuxResources, string) error +} + +type noop struct{} + +func (*noop) set(*specs.LinuxResources, string) error { + return nil +} + +type memory struct{} + +func (*memory) set(spec *specs.LinuxResources, path string) error { + if spec.Memory == nil { + return nil + } + if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.soft_limit_in_bytes", spec.Memory.Reservation); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.memsw.limit_in_bytes", spec.Memory.Swap); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.limit_in_bytes", spec.Memory.Kernel); err != nil { + return err + } + if err := setOptionalValueInt(path, "memory.kmem.tcp.limit_in_bytes", spec.Memory.KernelTCP); err != nil { + return err + } + if err := setOptionalValueUint(path, "memory.swappiness", spec.Memory.Swappiness); err != nil { + return err + } + + if spec.Memory.DisableOOMKiller != nil && *spec.Memory.DisableOOMKiller { + if err := setValue(path, "memory.oom_control", "1"); err != nil { + return err + } + } + return nil +} + +type cpu struct{} + +func (*cpu) set(spec *specs.LinuxResources, path string) error { + if spec.CPU == nil { + return nil + } + if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil { + return err + } + if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil { + return err + } + if err := setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period); err != nil { + return err + } + if err := setOptionalValueUint(path, "cpu.rt_period_us", spec.CPU.RealtimePeriod); err != nil { + return err + } + return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime) +} + +type cpuSet struct{} + +func (*cpuSet) set(spec *specs.LinuxResources, path string) error { + // cpuset.cpus and mems are required fields, but are not set on a new cgroup. + // If not set in the spec, get it from one of the ancestors cgroup. + if spec.CPU == nil || spec.CPU.Cpus == "" { + if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil { + return err + } + } else { + if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { + return err + } + } + + if spec.CPU == nil || spec.CPU.Mems == "" { + _, err := fillFromAncestor(filepath.Join(path, "cpuset.mems")) + return err + } + mems := spec.CPU.Mems + return setValue(path, "cpuset.mems", mems) +} + +type blockIO struct{} + +func (*blockIO) set(spec *specs.LinuxResources, path string) error { + if spec.BlockIO == nil { + return nil + } + + if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil { + return err + } + if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil { + return err + } + + for _, dev := range spec.BlockIO.WeightDevice { + if dev.Weight != nil { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight) + if err := setValue(path, "blkio.weight_device", val); err != nil { + return err + } + } + if dev.LeafWeight != nil { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight) + if err := setValue(path, "blkio.leaf_weight_device", val); err != nil { + return err + } + } + } + if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil { + return err + } + if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil { + return err + } + return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice) +} + +func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error { + for _, dev := range devs { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate) + if err := setValue(path, name, val); err != nil { + return err + } + } + return nil +} + +type networkClass struct{} + +func (*networkClass) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) +} + +type networkPrio struct{} + +func (*networkPrio) set(spec *specs.LinuxResources, path string) error { + if spec.Network == nil { + return nil + } + for _, prio := range spec.Network.Priorities { + val := fmt.Sprintf("%s %d", prio.Name, prio.Priority) + if err := setValue(path, "net_prio.ifpriomap", val); err != nil { + return err + } + } + return nil +} + +type pids struct{} + +func (*pids) set(spec *specs.LinuxResources, path string) error { + if spec.Pids == nil || spec.Pids.Limit <= 0 { + return nil + } + val := strconv.FormatInt(spec.Pids.Limit, 10) + return setValue(path, "pids.max", val) +} + +type hugeTLB struct{} + +func (*hugeTLB) set(spec *specs.LinuxResources, path string) error { + for _, limit := range spec.HugepageLimits { + name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize) + val := strconv.FormatUint(limit.Limit, 10) + if err := setValue(path, name, val); err != nil { + return err + } + } + return nil +} diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go new file mode 100644 index 000000000..4db5ee5c3 --- /dev/null +++ b/runsc/cgroup/cgroup_test.go @@ -0,0 +1,649 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cgroup + +import ( + "io/ioutil" + "os" + "path/filepath" + "strings" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +func TestUninstallEnoent(t *testing.T) { + c := Cgroup{ + // set a non-existent name + Name: "runsc-test-uninstall-656e6f656e740a", + Own: true, + } + if err := c.Uninstall(); err != nil { + t.Errorf("Uninstall() failed: %v", err) + } +} + +func TestCountCpuset(t *testing.T) { + for _, tc := range []struct { + str string + want int + error bool + }{ + {str: "0", want: 1}, + {str: "0,1,2,8,9,10", want: 6}, + {str: "0-1", want: 2}, + {str: "0-7", want: 8}, + {str: "0-7,16,32-39,64,65", want: 19}, + {str: "a", error: true}, + {str: "5-a", error: true}, + {str: "a-5", error: true}, + {str: "-10", error: true}, + {str: "15-", error: true}, + {str: "-", error: true}, + {str: "--", error: true}, + } { + t.Run(tc.str, func(t *testing.T) { + got, err := countCpuset(tc.str) + if tc.error { + if err == nil { + t.Errorf("countCpuset(%q) should have failed", tc.str) + } + } else { + if err != nil { + t.Errorf("countCpuset(%q) failed: %v", tc.str, err) + } + if tc.want != got { + t.Errorf("countCpuset(%q) want: %d, got: %d", tc.str, tc.want, got) + } + } + }) + } +} + +func uint16Ptr(v uint16) *uint16 { + return &v +} + +func uint32Ptr(v uint32) *uint32 { + return &v +} + +func int64Ptr(v int64) *int64 { + return &v +} + +func uint64Ptr(v uint64) *uint64 { + return &v +} + +func boolPtr(v bool) *bool { + return &v +} + +func checkDir(t *testing.T, dir string, contents map[string]string) { + all, err := ioutil.ReadDir(dir) + if err != nil { + t.Fatalf("ReadDir(%q): %v", dir, err) + } + fileCount := 0 + for _, file := range all { + if file.IsDir() { + // Only want to compare files. + continue + } + fileCount++ + + want, ok := contents[file.Name()] + if !ok { + t.Errorf("file not expected: %q", file.Name()) + continue + } + gotBytes, err := ioutil.ReadFile(filepath.Join(dir, file.Name())) + if err != nil { + t.Fatal(err.Error()) + } + got := strings.TrimSuffix(string(gotBytes), "\n") + if got != want { + t.Errorf("wrong file content, file: %q, want: %q, got: %q", file.Name(), want, got) + } + } + if fileCount != len(contents) { + t.Errorf("file is missing, want: %v, got: %v", contents, all) + } +} + +func makeLinuxWeightDevice(major, minor int64, weight, leafWeight *uint16) specs.LinuxWeightDevice { + rv := specs.LinuxWeightDevice{ + Weight: weight, + LeafWeight: leafWeight, + } + rv.Major = major + rv.Minor = minor + return rv +} + +func makeLinuxThrottleDevice(major, minor int64, rate uint64) specs.LinuxThrottleDevice { + rv := specs.LinuxThrottleDevice{ + Rate: rate, + } + rv.Major = major + rv.Minor = minor + return rv +} + +func TestBlockIO(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxBlockIO + wants map[string]string + }{ + { + name: "simple", + spec: &specs.LinuxBlockIO{ + Weight: uint16Ptr(1), + LeafWeight: uint16Ptr(2), + }, + wants: map[string]string{ + "blkio.weight": "1", + "blkio.leaf_weight": "2", + }, + }, + { + name: "weight_device", + spec: &specs.LinuxBlockIO{ + WeightDevice: []specs.LinuxWeightDevice{ + makeLinuxWeightDevice(1, 2, uint16Ptr(3), uint16Ptr(4)), + }, + }, + wants: map[string]string{ + "blkio.weight_device": "1:2 3", + "blkio.leaf_weight_device": "1:2 4", + }, + }, + { + name: "weight_device_nil_values", + spec: &specs.LinuxBlockIO{ + WeightDevice: []specs.LinuxWeightDevice{ + makeLinuxWeightDevice(1, 2, nil, nil), + }, + }, + }, + { + name: "throttle", + spec: &specs.LinuxBlockIO{ + ThrottleReadBpsDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(1, 2, 3), + }, + ThrottleReadIOPSDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(4, 5, 6), + }, + ThrottleWriteBpsDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(7, 8, 9), + }, + ThrottleWriteIOPSDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(10, 11, 12), + }, + }, + wants: map[string]string{ + "blkio.throttle.read_bps_device": "1:2 3", + "blkio.throttle.read_iops_device": "4:5 6", + "blkio.throttle.write_bps_device": "7:8 9", + "blkio.throttle.write_iops_device": "10:11 12", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxBlockIO{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + BlockIO: tc.spec, + } + ctrlr := blockIO{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestCPU(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxCPU + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxCPU{ + Shares: uint64Ptr(1), + Quota: int64Ptr(2), + Period: uint64Ptr(3), + RealtimeRuntime: int64Ptr(4), + RealtimePeriod: uint64Ptr(5), + }, + wants: map[string]string{ + "cpu.shares": "1", + "cpu.cfs_quota_us": "2", + "cpu.cfs_period_us": "3", + "cpu.rt_runtime_us": "4", + "cpu.rt_period_us": "5", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxCPU{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + CPU: tc.spec, + } + ctrlr := cpu{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestCPUSet(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxCPU + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxCPU{ + Cpus: "foo", + Mems: "bar", + }, + wants: map[string]string{ + "cpuset.cpus": "foo", + "cpuset.mems": "bar", + }, + }, + // Don't test nil values because they are copied from the parent. + // See TestCPUSetAncestor(). + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + CPU: tc.spec, + } + ctrlr := cpuSet{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +// TestCPUSetAncestor checks that, when not available, value is read from +// parent directory. +func TestCPUSetAncestor(t *testing.T) { + // Prepare master directory with cgroup files that will be propagated to + // children. + grandpa, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(grandpa) + + if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.cpus"), []byte("parent-cpus"), 0666); err != nil { + t.Fatalf("ioutil.WriteFile(): %v", err) + } + if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.mems"), []byte("parent-mems"), 0666); err != nil { + t.Fatalf("ioutil.WriteFile(): %v", err) + } + + for _, tc := range []struct { + name string + spec *specs.LinuxCPU + }{ + { + name: "nil_values", + spec: &specs.LinuxCPU{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + // Create empty files in intermediate directory. They should be ignored + // when reading, and then populated from parent. + parent, err := ioutil.TempDir(grandpa, "parent") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(parent) + if _, err := os.Create(filepath.Join(parent, "cpuset.cpus")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + if _, err := os.Create(filepath.Join(parent, "cpuset.mems")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + + // cgroup files mmust exist. + dir, err := ioutil.TempDir(parent, "child") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + if _, err := os.Create(filepath.Join(dir, "cpuset.cpus")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + if _, err := os.Create(filepath.Join(dir, "cpuset.mems")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + + spec := &specs.LinuxResources{ + CPU: tc.spec, + } + ctrlr := cpuSet{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + want := map[string]string{ + "cpuset.cpus": "parent-cpus", + "cpuset.mems": "parent-mems", + } + // Both path and dir must have been populated from grandpa. + checkDir(t, parent, want) + checkDir(t, dir, want) + }) + } +} + +func TestHugeTlb(t *testing.T) { + for _, tc := range []struct { + name string + spec []specs.LinuxHugepageLimit + wants map[string]string + }{ + { + name: "single", + spec: []specs.LinuxHugepageLimit{ + { + Pagesize: "1G", + Limit: 123, + }, + }, + wants: map[string]string{ + "hugetlb.1G.limit_in_bytes": "123", + }, + }, + { + name: "multiple", + spec: []specs.LinuxHugepageLimit{ + { + Pagesize: "1G", + Limit: 123, + }, + { + Pagesize: "2G", + Limit: 456, + }, + { + Pagesize: "1P", + Limit: 789, + }, + }, + wants: map[string]string{ + "hugetlb.1G.limit_in_bytes": "123", + "hugetlb.2G.limit_in_bytes": "456", + "hugetlb.1P.limit_in_bytes": "789", + }, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + HugepageLimits: tc.spec, + } + ctrlr := hugeTLB{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestMemory(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxMemory + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxMemory{ + Limit: int64Ptr(1), + Reservation: int64Ptr(2), + Swap: int64Ptr(3), + Kernel: int64Ptr(4), + KernelTCP: int64Ptr(5), + Swappiness: uint64Ptr(6), + DisableOOMKiller: boolPtr(true), + }, + wants: map[string]string{ + "memory.limit_in_bytes": "1", + "memory.soft_limit_in_bytes": "2", + "memory.memsw.limit_in_bytes": "3", + "memory.kmem.limit_in_bytes": "4", + "memory.kmem.tcp.limit_in_bytes": "5", + "memory.swappiness": "6", + "memory.oom_control": "1", + }, + }, + { + // Disable OOM killer should only write when set to true. + name: "oomkiller", + spec: &specs.LinuxMemory{ + DisableOOMKiller: boolPtr(false), + }, + }, + { + name: "nil_values", + spec: &specs.LinuxMemory{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Memory: tc.spec, + } + ctrlr := memory{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestNetworkClass(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxNetwork + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxNetwork{ + ClassID: uint32Ptr(1), + }, + wants: map[string]string{ + "net_cls.classid": "1", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxNetwork{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Network: tc.spec, + } + ctrlr := networkClass{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestNetworkPriority(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxNetwork + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxNetwork{ + Priorities: []specs.LinuxInterfacePriority{ + { + Name: "foo", + Priority: 1, + }, + }, + }, + wants: map[string]string{ + "net_prio.ifpriomap": "foo 1", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxNetwork{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Network: tc.spec, + } + ctrlr := networkPrio{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestPids(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxPids + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxPids{Limit: 1}, + wants: map[string]string{ + "pids.max": "1", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxPids{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Pids: tc.spec, + } + ctrlr := pids{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD new file mode 100644 index 000000000..dae9b3b3e --- /dev/null +++ b/runsc/cmd/BUILD @@ -0,0 +1,95 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "cmd", + srcs = [ + "boot.go", + "capability.go", + "checkpoint.go", + "chroot.go", + "cmd.go", + "create.go", + "debug.go", + "delete.go", + "do.go", + "error.go", + "events.go", + "exec.go", + "gofer.go", + "help.go", + "install.go", + "kill.go", + "list.go", + "path.go", + "pause.go", + "ps.go", + "restore.go", + "resume.go", + "run.go", + "spec.go", + "start.go", + "state.go", + "statefile.go", + "syscalls.go", + "wait.go", + ], + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/log", + "//pkg/p9", + "//pkg/sentry/control", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/platform", + "//pkg/state/pretty", + "//pkg/state/statefile", + "//pkg/sync", + "//pkg/unet", + "//pkg/urpc", + "//runsc/boot", + "//runsc/console", + "//runsc/container", + "//runsc/flag", + "//runsc/fsgofer", + "//runsc/fsgofer/filter", + "//runsc/specutils", + "@com_github_google_subcommands//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "cmd_test", + size = "small", + srcs = [ + "capability_test.go", + "delete_test.go", + "exec_test.go", + "gofer_test.go", + ], + data = [ + "//runsc", + ], + library = ":cmd", + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/test/testutil", + "//pkg/urpc", + "//runsc/boot", + "//runsc/container", + "//runsc/specutils", + "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", + ], +) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go new file mode 100644 index 000000000..01204ab4d --- /dev/null +++ b/runsc/cmd/boot.go @@ -0,0 +1,290 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "os" + "runtime/debug" + "strings" + "syscall" + + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +// Boot implements subcommands.Command for the "boot" command which starts a +// new sandbox. It should not be called directly. +type Boot struct { + // bundleDir is the directory containing the OCI spec. + bundleDir string + + // specFD is the file descriptor that the spec will be read from. + specFD int + + // controllerFD is the file descriptor of a stream socket for the + // control server that is donated to this process. + controllerFD int + + // deviceFD is the file descriptor for the platform device file. + deviceFD int + + // ioFDs is the list of FDs used to connect to FS gofers. + ioFDs intFlags + + // stdioFDs are the fds for stdin, stdout, and stderr. They must be + // provided in that order. + stdioFDs intFlags + + // console is set to true if the sandbox should allow terminal ioctl(2) + // syscalls. + console bool + + // applyCaps determines if capabilities defined in the spec should be applied + // to the process. + applyCaps bool + + // setUpChroot is set to true if the sandbox is started in an empty root. + setUpRoot bool + + // cpuNum number of CPUs to create inside the sandbox. + cpuNum int + + // totalMem sets the initial amount of total memory to report back to the + // container. + totalMem uint64 + + // userLogFD is the file descriptor to write user logs to. + userLogFD int + + // startSyncFD is the file descriptor to synchronize runsc and sandbox. + startSyncFD int + + // mountsFD is the file descriptor to read list of mounts after they have + // been resolved (direct paths, no symlinks). They are resolved outside the + // sandbox (e.g. gofer) and sent through this FD. + mountsFD int + + // pidns is set if the sandbox is in its own pid namespace. + pidns bool + + // attached is set to true to kill the sandbox process when the parent process + // terminates. This flag is set when the command execve's itself because + // parent death signal doesn't propagate through execve when uid/gid changes. + attached bool +} + +// Name implements subcommands.Command.Name. +func (*Boot) Name() string { + return "boot" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Boot) Synopsis() string { + return "launch a sandbox process (internal use only)" +} + +// Usage implements subcommands.Command.Usage. +func (*Boot) Usage() string { + return `boot [flags] <container id>` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (b *Boot) SetFlags(f *flag.FlagSet) { + f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") + f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") + f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") + f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") + f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") + f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") + f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") + f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") + f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") + f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") + f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") + f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") + f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") + f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") + f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") + f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") +} + +// Execute implements subcommands.Command.Execute. It starts a sandbox in a +// waiting state. +func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + // Ensure that if there is a panic, all goroutine stacks are printed. + debug.SetTraceback("system") + + conf := args[0].(*boot.Config) + + if b.attached { + // Ensure this process is killed after parent process terminates when + // attached mode is enabled. In the unfortunate event that the parent + // terminates before this point, this process leaks. + if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil { + Fatalf("error setting parent death signal: %v", err) + } + } + + if b.setUpRoot { + if err := setUpChroot(b.pidns); err != nil { + Fatalf("error setting up chroot: %v", err) + } + + if !b.applyCaps && !conf.Rootless { + // Remove --apply-caps arg to call myself. It has already been done. + args := prepareArgs(b.attached, "setup-root") + + // Note that we've already read the spec from the spec FD, and + // we will read it again after the exec call. This works + // because the ReadSpecFromFile function seeks to the beginning + // of the file before reading. + if err := callSelfAsNobody(args); err != nil { + Fatalf("%v", err) + } + panic("callSelfAsNobody must never return success") + } + } + + // Get the spec from the specFD. + specFile := os.NewFile(uintptr(b.specFD), "spec file") + defer specFile.Close() + spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile) + if err != nil { + Fatalf("reading spec: %v", err) + } + specutils.LogSpec(spec) + + if b.applyCaps { + caps := spec.Process.Capabilities + if caps == nil { + caps = &specs.LinuxCapabilities{} + } + + gPlatform, err := platform.Lookup(conf.Platform) + if err != nil { + Fatalf("loading platform: %v", err) + } + if gPlatform.Requirements().RequiresCapSysPtrace { + // Ptrace platform requires extra capabilities. + const c = "CAP_SYS_PTRACE" + caps.Bounding = append(caps.Bounding, c) + caps.Effective = append(caps.Effective, c) + caps.Permitted = append(caps.Permitted, c) + } + + // Remove --apply-caps and --setup-root arg to call myself. Both have + // already been done. + args := prepareArgs(b.attached, "setup-root", "apply-caps") + + // Note that we've already read the spec from the spec FD, and + // we will read it again after the exec call. This works + // because the ReadSpecFromFile function seeks to the beginning + // of the file before reading. + if err := setCapsAndCallSelf(args, caps); err != nil { + Fatalf("%v", err) + } + panic("setCapsAndCallSelf must never return success") + } + + // Read resolved mount list and replace the original one from the spec. + mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file") + cleanMounts, err := specutils.ReadMounts(mountsFile) + if err != nil { + mountsFile.Close() + Fatalf("Error reading mounts file: %v", err) + } + mountsFile.Close() + spec.Mounts = cleanMounts + + // Create the loader. + bootArgs := boot.Args{ + ID: f.Arg(0), + Spec: spec, + Conf: conf, + ControllerFD: b.controllerFD, + Device: os.NewFile(uintptr(b.deviceFD), "platform device"), + GoferFDs: b.ioFDs.GetArray(), + StdioFDs: b.stdioFDs.GetArray(), + Console: b.console, + NumCPU: b.cpuNum, + TotalMem: b.totalMem, + UserLogFD: b.userLogFD, + } + l, err := boot.New(bootArgs) + if err != nil { + Fatalf("creating loader: %v", err) + } + + // Fatalf exits the process and doesn't run defers. + // 'l' must be destroyed explicitly after this point! + + // Notify the parent process the sandbox has booted (and that the controller + // is up). + startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") + buf := make([]byte, 1) + if w, err := startSyncFile.Write(buf); err != nil || w != 1 { + l.Destroy() + Fatalf("unable to write into the start-sync descriptor: %v", err) + } + // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits. + startSyncFile.Close() + + // Wait for the start signal from runsc. + l.WaitForStartSignal() + + // Run the application and wait for it to finish. + if err := l.Run(); err != nil { + l.Destroy() + Fatalf("running sandbox: %v", err) + } + + ws := l.WaitExit() + log.Infof("application exiting with %+v", ws) + waitStatus := args[1].(*syscall.WaitStatus) + *waitStatus = syscall.WaitStatus(ws.Status()) + l.Destroy() + return subcommands.ExitSuccess +} + +func prepareArgs(attached bool, exclude ...string) []string { + var args []string + for _, arg := range os.Args { + for _, excl := range exclude { + if strings.Contains(arg, excl) { + goto skip + } + } + args = append(args, arg) + if attached && arg == "boot" { + // Strategicaly place "--attached" after the command. This is needed + // to ensure the new process is killed when the parent process terminates. + args = append(args, "--attached") + } + skip: + } + return args +} diff --git a/runsc/cmd/capability.go b/runsc/cmd/capability.go new file mode 100644 index 000000000..abfbb7cfc --- /dev/null +++ b/runsc/cmd/capability.go @@ -0,0 +1,157 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "fmt" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" + "gvisor.dev/gvisor/pkg/log" +) + +var allCapTypes = []capability.CapType{ + capability.BOUNDS, + capability.EFFECTIVE, + capability.PERMITTED, + capability.INHERITABLE, + capability.AMBIENT, +} + +// applyCaps applies the capabilities in the spec to the current thread. +// +// Note that it must be called with current thread locked. +func applyCaps(caps *specs.LinuxCapabilities) error { + // Load current capabilities to trim the ones not permitted. + curCaps, err := capability.NewPid2(0) + if err != nil { + return err + } + if err := curCaps.Load(); err != nil { + return err + } + + // Create an empty capability set to populate. + newCaps, err := capability.NewPid2(0) + if err != nil { + return err + } + + for _, c := range allCapTypes { + if !newCaps.Empty(c) { + panic("unloaded capabilities must be empty") + } + set, err := trimCaps(getCaps(c, caps), curCaps) + if err != nil { + return err + } + newCaps.Set(c, set...) + } + + if err := newCaps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS); err != nil { + return err + } + log.Infof("Capabilities applied: %+v", newCaps) + return nil +} + +func getCaps(which capability.CapType, caps *specs.LinuxCapabilities) []string { + switch which { + case capability.BOUNDS: + return caps.Bounding + case capability.EFFECTIVE: + return caps.Effective + case capability.PERMITTED: + return caps.Permitted + case capability.INHERITABLE: + return caps.Inheritable + case capability.AMBIENT: + return caps.Ambient + } + panic(fmt.Sprint("invalid capability type:", which)) +} + +func trimCaps(names []string, setter capability.Capabilities) ([]capability.Cap, error) { + wantedCaps, err := capsFromNames(names) + if err != nil { + return nil, err + } + + // Trim down capabilities that aren't possible to acquire. + var caps []capability.Cap + for _, c := range wantedCaps { + // Capability rules are more complicated than this, but this catches most + // problems with tests running with non-privileged user. + if setter.Get(capability.PERMITTED, c) { + caps = append(caps, c) + } else { + log.Warningf("Capability %q is not permitted, dropping it.", c) + } + } + return caps, nil +} + +func capsFromNames(names []string) ([]capability.Cap, error) { + var caps []capability.Cap + for _, name := range names { + cap, ok := capFromName[name] + if !ok { + return nil, fmt.Errorf("invalid capability %q", name) + } + caps = append(caps, cap) + } + return caps, nil +} + +var capFromName = map[string]capability.Cap{ + "CAP_CHOWN": capability.CAP_CHOWN, + "CAP_DAC_OVERRIDE": capability.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": capability.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": capability.CAP_FOWNER, + "CAP_FSETID": capability.CAP_FSETID, + "CAP_KILL": capability.CAP_KILL, + "CAP_SETGID": capability.CAP_SETGID, + "CAP_SETUID": capability.CAP_SETUID, + "CAP_SETPCAP": capability.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE, + "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST, + "CAP_NET_ADMIN": capability.CAP_NET_ADMIN, + "CAP_NET_RAW": capability.CAP_NET_RAW, + "CAP_IPC_LOCK": capability.CAP_IPC_LOCK, + "CAP_IPC_OWNER": capability.CAP_IPC_OWNER, + "CAP_SYS_MODULE": capability.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": capability.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": capability.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": capability.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": capability.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": capability.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": capability.CAP_SYS_BOOT, + "CAP_SYS_NICE": capability.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": capability.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": capability.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": capability.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": capability.CAP_MKNOD, + "CAP_LEASE": capability.CAP_LEASE, + "CAP_AUDIT_WRITE": capability.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": capability.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": capability.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": capability.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": capability.CAP_MAC_ADMIN, + "CAP_SYSLOG": capability.CAP_SYSLOG, + "CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND, + "CAP_AUDIT_READ": capability.CAP_AUDIT_READ, +} diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go new file mode 100644 index 000000000..a84067112 --- /dev/null +++ b/runsc/cmd/capability_test.go @@ -0,0 +1,127 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "flag" + "fmt" + "os" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/specutils" +) + +func init() { + log.SetLevel(log.Debug) + if err := testutil.ConfigureExePath(); err != nil { + panic(err.Error()) + } +} + +func checkProcessCaps(pid int, wantCaps *specs.LinuxCapabilities) error { + curCaps, err := capability.NewPid2(pid) + if err != nil { + return fmt.Errorf("capability.NewPid2(%d) failed: %v", pid, err) + } + if err := curCaps.Load(); err != nil { + return fmt.Errorf("unable to load capabilities: %v", err) + } + fmt.Printf("Capabilities (PID: %d): %v\n", pid, curCaps) + + for _, c := range allCapTypes { + if err := checkCaps(c, curCaps, wantCaps); err != nil { + return err + } + } + return nil +} + +func checkCaps(which capability.CapType, curCaps capability.Capabilities, wantCaps *specs.LinuxCapabilities) error { + wantNames := getCaps(which, wantCaps) + for name, c := range capFromName { + want := specutils.ContainsStr(wantNames, name) + got := curCaps.Get(which, c) + if want != got { + if want { + return fmt.Errorf("capability %v:%s should be set", which, name) + } + return fmt.Errorf("capability %v:%s should NOT be set", which, name) + } + } + return nil +} + +func TestCapabilities(t *testing.T) { + stop := testutil.StartReaper() + defer stop() + + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + caps := []string{ + "CAP_CHOWN", + "CAP_SYS_PTRACE", // ptrace is added due to the platform choice. + } + spec.Process.Capabilities = &specs.LinuxCapabilities{ + Permitted: caps, + Bounding: caps, + Effective: caps, + Inheritable: caps, + } + + conf := testutil.TestConfig(t) + + // Use --network=host to make sandbox use spec's capabilities. + conf.Network = boot.NetworkHost + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := container.Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := container.New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Check that sandbox and gofer have the proper capabilities. + if err := checkProcessCaps(c.Sandbox.Pid, spec.Process.Capabilities); err != nil { + t.Error(err) + } + if err := checkProcessCaps(c.GoferPid, goferCaps); err != nil { + t.Error(err) + } +} + +func TestMain(m *testing.M) { + flag.Parse() + specutils.MaybeRunAsRoot() + os.Exit(m.Run()) +} diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go new file mode 100644 index 000000000..8a29e521e --- /dev/null +++ b/runsc/cmd/checkpoint.go @@ -0,0 +1,155 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "os" + "path/filepath" + "syscall" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +// File containing the container's saved image/state within the given image-path's directory. +const checkpointFileName = "checkpoint.img" + +// Checkpoint implements subcommands.Command for the "checkpoint" command. +type Checkpoint struct { + imagePath string + leaveRunning bool +} + +// Name implements subcommands.Command.Name. +func (*Checkpoint) Name() string { + return "checkpoint" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Checkpoint) Synopsis() string { + return "checkpoint current state of container (experimental)" +} + +// Usage implements subcommands.Command.Usage. +func (*Checkpoint) Usage() string { + return `checkpoint [flags] <container id> - save current state of container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Checkpoint) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.imagePath, "image-path", "", "directory path to saved container image") + f.BoolVar(&c.leaveRunning, "leave-running", false, "restart the container after checkpointing") + + // Unimplemented flags necessary for compatibility with docker. + var wp string + f.StringVar(&wp, "work-path", "", "ignored") +} + +// Execute implements subcommands.Command.Execute. +func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + cont, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container: %v", err) + } + + if c.imagePath == "" { + Fatalf("image-path flag must be provided") + } + + if err := os.MkdirAll(c.imagePath, 0755); err != nil { + Fatalf("making directories at path provided: %v", err) + } + + fullImagePath := filepath.Join(c.imagePath, checkpointFileName) + + // Create the image file and open for writing. + file, err := os.OpenFile(fullImagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + Fatalf("os.OpenFile(%q) failed: %v", fullImagePath, err) + } + defer file.Close() + + if err := cont.Checkpoint(file); err != nil { + Fatalf("checkpoint failed: %v", err) + } + + if !c.leaveRunning { + return subcommands.ExitSuccess + } + + // TODO(b/110843694): Make it possible to restore into same container. + // For now, we can fake it by destroying the container and making a + // new container with the same ID. This hack does not work with docker + // which uses the container pid to ensure that the restore-container is + // actually the same as the checkpoint-container. By restoring into + // the same container, we will solve the docker incompatibility. + + // Restore into new container with same ID. + bundleDir := cont.BundleDir + if bundleDir == "" { + Fatalf("setting bundleDir") + } + + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + Fatalf("reading spec: %v", err) + } + + specutils.LogSpec(spec) + + if cont.ConsoleSocket != "" { + log.Warningf("ignoring console socket since it cannot be restored") + } + + if err := cont.Destroy(); err != nil { + Fatalf("destroying container: %v", err) + } + + contArgs := container.Args{ + ID: id, + Spec: spec, + BundleDir: bundleDir, + } + cont, err = container.New(conf, contArgs) + if err != nil { + Fatalf("restoring container: %v", err) + } + defer cont.Destroy() + + if err := cont.Restore(spec, conf, fullImagePath); err != nil { + Fatalf("starting container: %v", err) + } + + ws, err := cont.Wait() + *waitStatus = ws + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/chroot.go b/runsc/cmd/chroot.go new file mode 100644 index 000000000..189244765 --- /dev/null +++ b/runsc/cmd/chroot.go @@ -0,0 +1,97 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/specutils" +) + +// mountInChroot creates the destination mount point in the given chroot and +// mounts the source. +func mountInChroot(chroot, src, dst, typ string, flags uint32) error { + chrootDst := filepath.Join(chroot, dst) + log.Infof("Mounting %q at %q", src, chrootDst) + + if err := specutils.Mount(src, chrootDst, typ, flags); err != nil { + return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) + } + return nil +} + +func pivotRoot(root string) error { + if err := os.Chdir(root); err != nil { + return fmt.Errorf("error changing working directory: %v", err) + } + // pivot_root(new_root, put_old) moves the root filesystem (old_root) + // of the calling process to the directory put_old and makes new_root + // the new root filesystem of the calling process. + // + // pivot_root(".", ".") makes a mount of the working directory the new + // root filesystem, so it will be moved in "/" and then the old_root + // will be moved to "/" too. The parent mount of the old_root will be + // new_root, so after umounting the old_root, we will see only + // the new_root in "/". + if err := syscall.PivotRoot(".", "."); err != nil { + return fmt.Errorf("pivot_root failed, make sure that the root mount has a parent: %v", err) + } + + if err := syscall.Unmount(".", syscall.MNT_DETACH); err != nil { + return fmt.Errorf("error umounting the old root file system: %v", err) + } + return nil +} + +// setUpChroot creates an empty directory with runsc mounted at /runsc and proc +// mounted at /proc. +func setUpChroot(pidns bool) error { + // We are a new mount namespace, so we can use /tmp as a directory to + // construct a new root. + chroot := os.TempDir() + + log.Infof("Setting up sandbox chroot in %q", chroot) + + // Convert all shared mounts into slave to be sure that nothing will be + // propagated outside of our namespace. + if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("error converting mounts: %v", err) + } + + if err := syscall.Mount("runsc-root", chroot, "tmpfs", syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_NOEXEC, ""); err != nil { + return fmt.Errorf("error mounting tmpfs in choot: %v", err) + } + + if pidns { + flags := uint32(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC | syscall.MS_RDONLY) + if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil { + return fmt.Errorf("error mounting proc in chroot: %v", err) + } + } else { + if err := mountInChroot(chroot, "/proc", "/proc", "bind", syscall.MS_BIND|syscall.MS_RDONLY|syscall.MS_REC); err != nil { + return fmt.Errorf("error mounting proc in chroot: %v", err) + } + } + + if err := syscall.Mount("", chroot, "", syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("error remounting chroot in read-only: %v", err) + } + + return pivotRoot(chroot) +} diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go new file mode 100644 index 000000000..f1a4887ef --- /dev/null +++ b/runsc/cmd/cmd.go @@ -0,0 +1,98 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cmd holds implementations of the runsc commands. +package cmd + +import ( + "fmt" + "runtime" + "strconv" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/specutils" +) + +// intFlags can be used with int flags that appear multiple times. +type intFlags []int + +// String implements flag.Value. +func (i *intFlags) String() string { + return fmt.Sprintf("%v", *i) +} + +// Get implements flag.Value. +func (i *intFlags) Get() interface{} { + return i +} + +// GetArray returns array of FDs. +func (i *intFlags) GetArray() []int { + return *i +} + +// Set implements flag.Value. +func (i *intFlags) Set(s string) error { + fd, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("invalid flag value: %v", err) + } + if fd < 0 { + return fmt.Errorf("flag value must be greater than 0: %d", fd) + } + *i = append(*i, fd) + return nil +} + +// setCapsAndCallSelf sets capabilities to the current thread and then execve's +// itself again with the arguments specified in 'args' to restart the process +// with the desired capabilities. +func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error { + // Keep thread locked while capabilities are changed. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if err := applyCaps(caps); err != nil { + return fmt.Errorf("applyCaps() failed: %v", err) + } + binPath := specutils.ExePath + + log.Infof("Execve %q again, bye!", binPath) + err := syscall.Exec(binPath, args, []string{}) + return fmt.Errorf("error executing %s: %v", binPath, err) +} + +// callSelfAsNobody sets UID and GID to nobody and then execve's itself again. +func callSelfAsNobody(args []string) error { + // Keep thread locked while user/group are changed. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + const nobody = 65534 + + if _, _, err := syscall.RawSyscall(syscall.SYS_SETGID, uintptr(nobody), 0, 0); err != 0 { + return fmt.Errorf("error setting uid: %v", err) + } + if _, _, err := syscall.RawSyscall(syscall.SYS_SETUID, uintptr(nobody), 0, 0); err != 0 { + return fmt.Errorf("error setting gid: %v", err) + } + + binPath := specutils.ExePath + + log.Infof("Execve %q again, bye!", binPath) + err := syscall.Exec(binPath, args, []string{}) + return fmt.Errorf("error executing %s: %v", binPath, err) +} diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go new file mode 100644 index 000000000..910e97577 --- /dev/null +++ b/runsc/cmd/create.go @@ -0,0 +1,115 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +// Create implements subcommands.Command for the "create" command. +type Create struct { + // bundleDir is the path to the bundle directory (defaults to the + // current working directory). + bundleDir string + + // pidFile is the filename that the sandbox pid will be written to. + // This file should only be created once the container process inside + // the sandbox is ready to use. + pidFile string + + // consoleSocket is the path to an AF_UNIX socket which will receive a + // file descriptor referencing the master end of the console's + // pseudoterminal. This is ignored unless spec.Process.Terminal is + // true. + consoleSocket string + + // userLog is the path to send user-visible logs to. This log is different + // from debug logs. The former is meant to be consumed by the users and should + // contain only information that is relevant to the person running the + // container, e.g. unsuported syscalls, while the later is more verbose and + // consumed by developers. + userLog string +} + +// Name implements subcommands.Command.Name. +func (*Create) Name() string { + return "create" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Create) Synopsis() string { + return "create a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Create) Usage() string { + return `create [flags] <container id> - create a secure container +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Create) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") + f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") + f.StringVar(&c.pidFile, "pid-file", "", "filename that the container pid will be written to") + f.StringVar(&c.userLog, "user-log", "", "filename to send user-visible logs to. Empty means no logging.") +} + +// Execute implements subcommands.Command.Execute. +func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", c.Name()) + } + + bundleDir := c.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + return Errorf("reading spec: %v", err) + } + specutils.LogSpec(spec) + + // Create the container. A new sandbox will be created for the + // container unless the metadata specifies that it should be run in an + // existing container. + contArgs := container.Args{ + ID: id, + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: c.consoleSocket, + PIDFile: c.pidFile, + UserLog: c.userLog, + } + if _, err := container.New(conf, contArgs); err != nil { + return Errorf("creating container: %v", err) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go new file mode 100644 index 000000000..b5de2588b --- /dev/null +++ b/runsc/cmd/debug.go @@ -0,0 +1,304 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "os" + "strconv" + "strings" + "syscall" + "time" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Debug implements subcommands.Command for the "debug" command. +type Debug struct { + pid int + stacks bool + signal int + profileHeap string + profileCPU string + profileGoroutine string + profileBlock string + profileMutex string + trace string + strace string + logLevel string + logPackets string + duration time.Duration + ps bool +} + +// Name implements subcommands.Command. +func (*Debug) Name() string { + return "debug" +} + +// Synopsis implements subcommands.Command. +func (*Debug) Synopsis() string { + return "shows a variety of debug information" +} + +// Usage implements subcommands.Command. +func (*Debug) Usage() string { + return `debug [flags] <container id>` +} + +// SetFlags implements subcommands.Command. +func (d *Debug) SetFlags(f *flag.FlagSet) { + f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set") + f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") + f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") + f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") + f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.") + f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.") + f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.") + f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles") + f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") + f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") + f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`) + f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).") + f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.") + f.BoolVar(&d.ps, "ps", false, "lists processes") +} + +// Execute implements subcommands.Command.Execute. +func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + var c *container.Container + conf := args[0].(*boot.Config) + + if d.pid == 0 { + // No pid, container ID must have been provided. + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + var err error + c, err = container.Load(conf.RootDir, f.Arg(0)) + if err != nil { + return Errorf("loading container %q: %v", f.Arg(0), err) + } + } else { + if f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + // Go over all sandboxes and find the one that matches PID. + ids, err := container.List(conf.RootDir) + if err != nil { + return Errorf("listing containers: %v", err) + } + for _, id := range ids { + candidate, err := container.Load(conf.RootDir, id) + if err != nil { + return Errorf("loading container %q: %v", id, err) + } + if candidate.SandboxPid() == d.pid { + c = candidate + break + } + } + if c == nil { + return Errorf("container with PID %d not found", d.pid) + } + } + + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return Errorf("container sandbox is not running") + } + log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) + + if d.signal > 0 { + log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid) + if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil { + return Errorf("failed to send signal %d to processs %d", d.signal, c.Sandbox.Pid) + } + } + if d.stacks { + log.Infof("Retrieving sandbox stacks") + stacks, err := c.Sandbox.Stacks() + if err != nil { + return Errorf("retrieving stacks: %v", err) + } + log.Infof(" *** Stack dump ***\n%s", stacks) + } + if d.profileHeap != "" { + f, err := os.Create(d.profileHeap) + if err != nil { + return Errorf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.HeapProfile(f); err != nil { + return Errorf(err.Error()) + } + log.Infof("Heap profile written to %q", d.profileHeap) + } + if d.profileGoroutine != "" { + f, err := os.Create(d.profileGoroutine) + if err != nil { + return Errorf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.GoroutineProfile(f); err != nil { + return Errorf(err.Error()) + } + log.Infof("Goroutine profile written to %q", d.profileGoroutine) + } + if d.profileBlock != "" { + f, err := os.Create(d.profileBlock) + if err != nil { + return Errorf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.BlockProfile(f); err != nil { + return Errorf(err.Error()) + } + log.Infof("Block profile written to %q", d.profileBlock) + } + if d.profileMutex != "" { + f, err := os.Create(d.profileMutex) + if err != nil { + return Errorf(err.Error()) + } + defer f.Close() + + if err := c.Sandbox.MutexProfile(f); err != nil { + return Errorf(err.Error()) + } + log.Infof("Mutex profile written to %q", d.profileMutex) + } + + delay := false + if d.profileCPU != "" { + delay = true + f, err := os.Create(d.profileCPU) + if err != nil { + return Errorf(err.Error()) + } + defer func() { + f.Close() + if err := c.Sandbox.StopCPUProfile(); err != nil { + Fatalf(err.Error()) + } + log.Infof("CPU profile written to %q", d.profileCPU) + }() + if err := c.Sandbox.StartCPUProfile(f); err != nil { + return Errorf(err.Error()) + } + log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU) + } + if d.trace != "" { + delay = true + f, err := os.Create(d.trace) + if err != nil { + return Errorf(err.Error()) + } + defer func() { + f.Close() + if err := c.Sandbox.StopTrace(); err != nil { + Fatalf(err.Error()) + } + log.Infof("Trace written to %q", d.trace) + }() + if err := c.Sandbox.StartTrace(f); err != nil { + return Errorf(err.Error()) + } + log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace) + } + + if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 { + args := control.LoggingArgs{} + switch strings.ToLower(d.strace) { + case "": + // strace not set, nothing to do here. + + case "off": + log.Infof("Disabling strace") + args.SetStrace = true + + case "all": + log.Infof("Enabling all straces") + args.SetStrace = true + args.EnableStrace = true + + default: + log.Infof("Enabling strace for syscalls: %s", d.strace) + args.SetStrace = true + args.EnableStrace = true + args.StraceWhitelist = strings.Split(d.strace, ",") + } + + if len(d.logLevel) != 0 { + args.SetLevel = true + switch strings.ToLower(d.logLevel) { + case "warning", "0": + args.Level = log.Warning + case "info", "1": + args.Level = log.Info + case "debug", "2": + args.Level = log.Debug + default: + return Errorf("invalid log level %q", d.logLevel) + } + log.Infof("Setting log level %v", args.Level) + } + + if len(d.logPackets) != 0 { + args.SetLogPackets = true + lp, err := strconv.ParseBool(d.logPackets) + if err != nil { + return Errorf("invalid value for log_packets %q", d.logPackets) + } + args.LogPackets = lp + if args.LogPackets { + log.Infof("Enabling packet logging") + } else { + log.Infof("Disabling packet logging") + } + } + + if err := c.Sandbox.ChangeLogging(args); err != nil { + return Errorf(err.Error()) + } + log.Infof("Logging options changed") + } + if d.ps { + pList, err := c.Processes() + if err != nil { + Fatalf("getting processes for container: %v", err) + } + o, err := control.ProcessListToJSON(pList) + if err != nil { + Fatalf("generating JSON: %v", err) + } + log.Infof(o) + } + + if delay { + time.Sleep(d.duration) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go new file mode 100644 index 000000000..0e4863f50 --- /dev/null +++ b/runsc/cmd/delete.go @@ -0,0 +1,87 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + "os" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Delete implements subcommands.Command for the "delete" command. +type Delete struct { + // force indicates that the container should be terminated if running. + force bool +} + +// Name implements subcommands.Command.Name. +func (*Delete) Name() string { + return "delete" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Delete) Synopsis() string { + return "delete resources held by a container" +} + +// Usage implements subcommands.Command.Usage. +func (*Delete) Usage() string { + return `delete [flags] <container ids>` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (d *Delete) SetFlags(f *flag.FlagSet) { + f.BoolVar(&d.force, "force", false, "terminate container if running") +} + +// Execute implements subcommands.Command.Execute. +func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() == 0 { + f.Usage() + return subcommands.ExitUsageError + } + + conf := args[0].(*boot.Config) + if err := d.execute(f.Args(), conf); err != nil { + Fatalf("%v", err) + } + return subcommands.ExitSuccess +} + +func (d *Delete) execute(ids []string, conf *boot.Config) error { + for _, id := range ids { + c, err := container.Load(conf.RootDir, id) + if err != nil { + if os.IsNotExist(err) && d.force { + log.Warningf("couldn't find container %q: %v", id, err) + return nil + } + return fmt.Errorf("loading container %q: %v", id, err) + } + if !d.force && c.Status != container.Created && c.Status != container.Stopped { + return fmt.Errorf("cannot delete container that is not stopped without --force flag") + } + if err := c.Destroy(); err != nil { + return fmt.Errorf("destroying container: %v", err) + } + } + return nil +} diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go new file mode 100644 index 000000000..cb59516a3 --- /dev/null +++ b/runsc/cmd/delete_test.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "io/ioutil" + "testing" + + "gvisor.dev/gvisor/runsc/boot" +) + +func TestNotFound(t *testing.T) { + ids := []string{"123"} + dir, err := ioutil.TempDir("", "metadata") + if err != nil { + t.Fatalf("error creating dir: %v", err) + } + conf := &boot.Config{RootDir: dir} + + d := Delete{} + if err := d.execute(ids, conf); err == nil { + t.Error("Deleting non-existent container should have failed") + } + + d = Delete{force: true} + if err := d.execute(ids, conf); err != nil { + t.Errorf("Deleting non-existent container with --force should NOT have failed: %v", err) + } +} diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go new file mode 100644 index 000000000..7d1310c96 --- /dev/null +++ b/runsc/cmd/do.go @@ -0,0 +1,385 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "math/rand" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" + + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +// Do implements subcommands.Command for the "do" command. It sets up a simple +// sandbox and executes the command inside it. See Usage() for more details. +type Do struct { + root string + cwd string + ip string + quiet bool +} + +// Name implements subcommands.Command.Name. +func (*Do) Name() string { + return "do" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Do) Synopsis() string { + return "Simplistic way to execute a command inside the sandbox. It's to be used for testing only." +} + +// Usage implements subcommands.Command.Usage. +func (*Do) Usage() string { + return `do [flags] <cmd> - runs a command. + +This command starts a sandbox with host filesystem mounted inside as readonly, +with a writable tmpfs overlay on top of it. The given command is executed inside +the sandbox. It's to be used to quickly test applications without having to +install or run docker. It doesn't give nearly as many options and it's to be +used for testing only. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Do) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) + f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory") + f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox") + f.BoolVar(&c.quiet, "quiet", false, "suppress runsc messages to stdout. Application output is still sent to stdout and stderr") +} + +// Execute implements subcommands.Command.Execute. +func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if len(f.Args()) == 0 { + c.Usage() + return subcommands.ExitUsageError + } + + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + if conf.Rootless { + if err := specutils.MaybeRunAsRoot(); err != nil { + return Errorf("Error executing inside namespace: %v", err) + } + // Execution will continue here if no more capabilities are needed... + } + + hostname, err := os.Hostname() + if err != nil { + return Errorf("Error to retrieve hostname: %v", err) + } + + // Map the entire host file system, but make it readonly with a writable + // overlay on top (ignore --overlay option). + conf.Overlay = true + absRoot, err := resolvePath(c.root) + if err != nil { + return Errorf("Error resolving root: %v", err) + } + absCwd, err := resolvePath(c.cwd) + if err != nil { + return Errorf("Error resolving current directory: %v", err) + } + + spec := &specs.Spec{ + Root: &specs.Root{ + Path: absRoot, + }, + Process: &specs.Process{ + Cwd: absCwd, + Args: f.Args(), + Env: os.Environ(), + Capabilities: specutils.AllCapabilities(), + }, + Hostname: hostname, + } + + specutils.LogSpec(spec) + + cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) + if conf.Network == boot.NetworkNone { + netns := specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + } + if spec.Linux != nil { + panic("spec.Linux is not nil") + } + spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}} + + } else if conf.Rootless { + if conf.Network == boot.NetworkSandbox { + c.notifyUser("*** Warning: using host network due to --rootless ***") + conf.Network = boot.NetworkHost + } + + } else { + clean, err := c.setupNet(cid, spec) + if err != nil { + return Errorf("Error setting up network: %v", err) + } + defer clean() + } + + out, err := json.Marshal(spec) + if err != nil { + return Errorf("Error to marshal spec: %v", err) + } + tmpDir, err := ioutil.TempDir("", "runsc-do") + if err != nil { + return Errorf("Error to create tmp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + log.Infof("Changing configuration RootDir to %q", tmpDir) + conf.RootDir = tmpDir + + cfgPath := filepath.Join(tmpDir, "config.json") + if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil { + return Errorf("Error write spec: %v", err) + } + + containerArgs := container.Args{ + ID: cid, + Spec: spec, + BundleDir: tmpDir, + Attached: true, + } + ct, err := container.New(conf, containerArgs) + if err != nil { + return Errorf("creating container: %v", err) + } + defer ct.Destroy() + + if err := ct.Start(conf); err != nil { + return Errorf("starting container: %v", err) + } + + // Forward signals to init in the container. Thus if we get SIGINT from + // ^C, the container gracefully exit, and we can clean up. + // + // N.B. There is a still a window before this where a signal may kill + // this process, skipping cleanup. + stopForwarding := ct.ForwardSignals(0 /* pid */, false /* fgProcess */) + defer stopForwarding() + + ws, err := ct.Wait() + if err != nil { + return Errorf("waiting for container: %v", err) + } + + *waitStatus = ws + return subcommands.ExitSuccess +} + +func (c *Do) notifyUser(format string, v ...interface{}) { + if !c.quiet { + fmt.Printf(format+"\n", v...) + } + log.Warningf(format, v...) +} + +func resolvePath(path string) (string, error) { + var err error + path, err = filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("resolving %q: %v", path, err) + } + path = filepath.Clean(path) + if err := syscall.Access(path, 0); err != nil { + return "", fmt.Errorf("unable to access %q: %v", path, err) + } + return path, nil +} + +func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { + dev, err := defaultDevice() + if err != nil { + return nil, err + } + peerIP, err := calculatePeerIP(c.ip) + if err != nil { + return nil, err + } + veth, peer := deviceNames(cid) + + cmds := []string{ + fmt.Sprintf("ip link add %s type veth peer name %s", veth, peer), + + // Setup device outside the namespace. + fmt.Sprintf("ip addr add %s/24 dev %s", peerIP, peer), + fmt.Sprintf("ip link set %s up", peer), + + // Setup device inside the namespace. + fmt.Sprintf("ip netns add %s", cid), + fmt.Sprintf("ip link set %s netns %s", veth, cid), + fmt.Sprintf("ip netns exec %s ip addr add %s/24 dev %s", cid, c.ip, veth), + fmt.Sprintf("ip netns exec %s ip link set %s up", cid, veth), + fmt.Sprintf("ip netns exec %s ip link set lo up", cid), + fmt.Sprintf("ip netns exec %s ip route add default via %s", cid, peerIP), + + // Enable network access. + "sysctl -w net.ipv4.ip_forward=1", + fmt.Sprintf("iptables -t nat -A POSTROUTING -s %s -o %s -j MASQUERADE", c.ip, dev), + fmt.Sprintf("iptables -A FORWARD -i %s -o %s -j ACCEPT", dev, peer), + fmt.Sprintf("iptables -A FORWARD -o %s -i %s -j ACCEPT", dev, peer), + } + + for _, cmd := range cmds { + log.Debugf("Run %q", cmd) + args := strings.Split(cmd, " ") + cmd := exec.Command(args[0], args[1:]...) + if err := cmd.Run(); err != nil { + c.cleanupNet(cid, dev, "", "", "") + return nil, fmt.Errorf("failed to run %q: %v", cmd, err) + } + } + + resolvPath, err := makeFile("/etc/resolv.conf", "nameserver 8.8.8.8\n", spec) + if err != nil { + c.cleanupNet(cid, dev, "", "", "") + return nil, err + } + hostnamePath, err := makeFile("/etc/hostname", cid+"\n", spec) + if err != nil { + c.cleanupNet(cid, dev, resolvPath, "", "") + return nil, err + } + hosts := fmt.Sprintf("127.0.0.1\tlocalhost\n%s\t%s\n", c.ip, cid) + hostsPath, err := makeFile("/etc/hosts", hosts, spec) + if err != nil { + c.cleanupNet(cid, dev, resolvPath, hostnamePath, "") + return nil, err + } + + if spec.Linux == nil { + spec.Linux = &specs.Linux{} + } + netns := specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: filepath.Join("/var/run/netns", cid), + } + spec.Linux.Namespaces = append(spec.Linux.Namespaces, netns) + + return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil +} + +// cleanupNet tries to cleanup the network setup in setupNet. +// +// It may be called when setupNet is only partially complete, in which case it +// will cleanup as much as possible, logging warnings for the rest. +// +// Unfortunately none of this can be automatically cleaned up on process exit, +// we must do so explicitly. +func (c *Do) cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath string) { + _, peer := deviceNames(cid) + + cmds := []string{ + fmt.Sprintf("ip link delete %s", peer), + fmt.Sprintf("ip netns delete %s", cid), + } + + for _, cmd := range cmds { + log.Debugf("Run %q", cmd) + args := strings.Split(cmd, " ") + c := exec.Command(args[0], args[1:]...) + if err := c.Run(); err != nil { + log.Warningf("Failed to run %q: %v", cmd, err) + } + } + + tryRemove(resolvPath) + tryRemove(hostnamePath) + tryRemove(hostsPath) +} + +func deviceNames(cid string) (string, string) { + // Device name is limited to 15 letters. + return "ve-" + cid, "vp-" + cid + +} + +func defaultDevice() (string, error) { + out, err := exec.Command("ip", "route", "list", "default").CombinedOutput() + if err != nil { + return "", err + } + parts := strings.Split(string(out), " ") + if len(parts) < 5 { + return "", fmt.Errorf("malformed %q output: %q", "ip route list default", string(out)) + } + return parts[4], nil +} + +func makeFile(dest, content string, spec *specs.Spec) (string, error) { + tmpFile, err := ioutil.TempFile("", filepath.Base(dest)) + if err != nil { + return "", err + } + if _, err := tmpFile.WriteString(content); err != nil { + if err := os.Remove(tmpFile.Name()); err != nil { + log.Warningf("Failed to remove %q: %v", tmpFile, err) + } + return "", err + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Source: tmpFile.Name(), + Destination: dest, + Type: "bind", + Options: []string{"ro"}, + }) + return tmpFile.Name(), nil +} + +func tryRemove(path string) { + if path == "" { + return + } + + if err := os.Remove(path); err != nil { + log.Warningf("Failed to remove %q: %v", path, err) + } +} + +func calculatePeerIP(ip string) (string, error) { + parts := strings.Split(ip, ".") + if len(parts) != 4 { + return "", fmt.Errorf("invalid IP format %q", ip) + } + n, err := strconv.Atoi(parts[3]) + if err != nil { + return "", fmt.Errorf("invalid IP format %q: %v", ip, err) + } + n++ + if n > 255 { + n = 1 + } + return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil +} diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go new file mode 100644 index 000000000..3585b5448 --- /dev/null +++ b/runsc/cmd/error.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "fmt" + "io" + "os" + "time" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" +) + +// ErrorLogger is where error messages should be written to. These messages are +// consumed by containerd and show up to users of command line tools, +// like docker/kubectl. +var ErrorLogger io.Writer + +type jsonError struct { + Msg string `json:"msg"` + Level string `json:"level"` + Time time.Time `json:"time"` +} + +// Errorf logs error to containerd log (--log), to stderr, and debug logs. It +// returns subcommands.ExitFailure for convenience with subcommand.Execute() +// methods: +// return Errorf("Danger! Danger!") +// +func Errorf(format string, args ...interface{}) subcommands.ExitStatus { + // If runsc is being invoked by docker or cri-o, then we might not have + // access to stderr, so we log a serious-looking warning in addition to + // writing to stderr. + log.Warningf("FATAL ERROR: "+format, args...) + fmt.Fprintf(os.Stderr, format+"\n", args...) + + j := jsonError{ + Msg: fmt.Sprintf(format, args...), + Level: "error", + Time: time.Now(), + } + b, err := json.Marshal(j) + if err != nil { + panic(err) + } + if ErrorLogger != nil { + ErrorLogger.Write(b) + } + + return subcommands.ExitFailure +} + +// Fatalf logs the same way as Errorf() does, plus *exits* the process. +func Fatalf(format string, args ...interface{}) { + Errorf(format, args...) + // Return an error that is unlikely to be used by the application. + os.Exit(128) +} diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go new file mode 100644 index 000000000..51f6a98ed --- /dev/null +++ b/runsc/cmd/events.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "os" + "time" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Events implements subcommands.Command for the "events" command. +type Events struct { + // The interval between stats reporting. + intervalSec int + // If true, events will print a single group of stats and exit. + stats bool +} + +// Name implements subcommands.Command.Name. +func (*Events) Name() string { + return "events" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Events) Synopsis() string { + return "display container events such as OOM notifications, cpu, memory, and IO usage statistics" +} + +// Usage implements subcommands.Command.Usage. +func (*Events) Usage() string { + return `<container-id> + +Where "<container-id>" is the name for the instance of the container. + +The events command displays information about the container. By default the +information is displayed once every 5 seconds. + +OPTIONS: +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (evs *Events) SetFlags(f *flag.FlagSet) { + f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds") + f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit") +} + +// Execute implements subcommands.Command.Execute. +func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading sandbox: %v", err) + } + + // Repeatedly get stats from the container. + for { + // Get the event and print it as JSON. + ev, err := c.Event() + if err != nil { + log.Warningf("Error getting events for container: %v", err) + } + // err must be preserved because it is used below when breaking + // out of the loop. + b, err := json.Marshal(ev) + if err != nil { + log.Warningf("Error while marshalling event %v: %v", ev, err) + } else { + os.Stdout.Write(b) + } + + // If we're only running once, break. If we're only running + // once and there was an error, the command failed. + if evs.stats { + if err != nil { + return subcommands.ExitFailure + } + break + } + + time.Sleep(time.Duration(evs.intervalSec) * time.Second) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go new file mode 100644 index 000000000..d9a94903e --- /dev/null +++ b/runsc/cmd/exec.go @@ -0,0 +1,481 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/console" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +// Exec implements subcommands.Command for the "exec" command. +type Exec struct { + cwd string + env stringSlice + // user contains the UID and GID with which to run the new process. + user user + extraKGIDs stringSlice + caps stringSlice + detach bool + processPath string + pidFile string + internalPidFile string + + // consoleSocket is the path to an AF_UNIX socket which will receive a + // file descriptor referencing the master end of the console's + // pseudoterminal. + consoleSocket string +} + +// Name implements subcommands.Command.Name. +func (*Exec) Name() string { + return "exec" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Exec) Synopsis() string { + return "execute new process inside the container" +} + +// Usage implements subcommands.Command.Usage. +func (*Exec) Usage() string { + return `exec [command options] <container-id> <command> [command options] || --process process.json <container-id> + + +Where "<container-id>" is the name for the instance of the container and +"<command>" is the command to be executed in the container. +"<command>" can't be empty unless a "-process" flag provided. + +EXAMPLE: +If the container is configured to run /bin/ps the following will +output a list of processes running in the container: + + # runc exec <container-id> ps + +OPTIONS: +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (ex *Exec) SetFlags(f *flag.FlagSet) { + f.StringVar(&ex.cwd, "cwd", "", "current working directory") + f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')") + f.Var(&ex.user, "user", "UID (format: <uid>[:<gid>])") + f.Var(&ex.extraKGIDs, "additional-gids", "additional gids") + f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process") + f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") + f.StringVar(&ex.processPath, "process", "", "path to the process.json") + f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") + f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") + f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") +} + +// Execute implements subcommands.Command.Execute. It starts a process in an +// already created container. +func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + conf := args[0].(*boot.Config) + e, id, err := ex.parseArgs(f, conf.EnableRaw) + if err != nil { + Fatalf("parsing process spec: %v", err) + } + waitStatus := args[1].(*syscall.WaitStatus) + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading sandbox: %v", err) + } + + log.Debugf("Exec arguments: %+v", e) + log.Debugf("Exec capablities: %+v", e.Capabilities) + + // Replace empty settings with defaults from container. + if e.WorkingDirectory == "" { + e.WorkingDirectory = c.Spec.Process.Cwd + } + if e.Envv == nil { + e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env) + if err != nil { + Fatalf("getting environment variables: %v", err) + } + } + + if e.Capabilities == nil { + e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities) + if err != nil { + Fatalf("creating capabilities: %v", err) + } + log.Infof("Using exec capabilities from container: %+v", e.Capabilities) + } + + // containerd expects an actual process to represent the container being + // executed. If detach was specified, starts a child in non-detach mode, + // write the child's PID to the pid file. So when the container returns, the + // child process will also return and signal containerd. + if ex.detach { + return ex.execChildAndWait(waitStatus) + } + return ex.exec(c, e, waitStatus) +} + +func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus { + // Start the new process and get it pid. + pid, err := c.Execute(e) + if err != nil { + return Errorf("executing processes for container: %v", err) + } + + if e.StdioIsPty { + // Forward signals sent to this process to the foreground + // process in the sandbox. + stopForwarding := c.ForwardSignals(pid, true /* fgProcess */) + defer stopForwarding() + } + + // Write the sandbox-internal pid if required. + if ex.internalPidFile != "" { + pidStr := []byte(strconv.Itoa(int(pid))) + if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil { + return Errorf("writing internal pid file %q: %v", ex.internalPidFile, err) + } + } + + // Generate the pid file after the internal pid file is generated, so that + // users can safely assume that the internal pid file is ready after + // `runsc exec -d` returns. + if ex.pidFile != "" { + if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { + return Errorf("writing pid file: %v", err) + } + } + + // Wait for the process to exit. + ws, err := c.WaitPID(pid) + if err != nil { + return Errorf("waiting on pid %d: %v", pid, err) + } + *waitStatus = ws + return subcommands.ExitSuccess +} + +func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { + var args []string + for _, a := range os.Args[1:] { + if !strings.Contains(a, "detach") { + args = append(args, a) + } + } + + // The command needs to write a pid file so that execChildAndWait can tell + // when it has started. If no pid-file was provided, we should use a + // filename in a temp directory. + pidFile := ex.pidFile + if pidFile == "" { + tmpDir, err := ioutil.TempDir("", "exec-pid-") + if err != nil { + Fatalf("creating TempDir: %v", err) + } + defer os.RemoveAll(tmpDir) + pidFile = filepath.Join(tmpDir, "pid") + args = append(args, "--pid-file="+pidFile) + } + + cmd := exec.Command(specutils.ExePath, args...) + cmd.Args[0] = "runsc-exec" + + // Exec stdio defaults to current process stdio. + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the TTY on the sandbox process. + if ex.consoleSocket != "" { + // Create a new TTY pair and send the master on the provided socket. + tty, err := console.NewWithSocket(ex.consoleSocket) + if err != nil { + Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err) + } + defer tty.Close() + + // Set stdio to the new TTY slave. + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setsid: true, + Setctty: true, + // The Ctty FD must be the FD in the child process's FD + // table. Since we set cmd.Stdin/Stdout/Stderr to the + // tty FD, we can use any of 0, 1, or 2 here. + // See https://github.com/golang/go/issues/29458. + Ctty: 0, + } + } + + if err := cmd.Start(); err != nil { + Fatalf("failure to start child exec process, err: %v", err) + } + + log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args) + + // Wait for PID file to ensure that child process has started. Otherwise, + // '--process' file is deleted as soon as this process returns and the child + // may fail to read it. + ready := func() (bool, error) { + pidb, err := ioutil.ReadFile(pidFile) + if err == nil { + // File appeared, check whether pid is fully written. + pid, err := strconv.Atoi(string(pidb)) + if err != nil { + return false, nil + } + return pid == cmd.Process.Pid, nil + } + if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT { + return false, err + } + // No file yet, continue to wait... + return false, nil + } + if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil { + // Don't log fatal error here, otherwise it will override the error logged + // by the child process that has failed to start. + log.Warningf("Unexpected error waiting for PID file, err: %v", err) + return subcommands.ExitFailure + } + + *waitStatus = 0 + return subcommands.ExitSuccess +} + +// parseArgs parses exec information from the command line or a JSON file +// depending on whether the --process flag was used. Returns an ExecArgs and +// the ID of the container to be used. +func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) { + if ex.processPath == "" { + // Requires at least a container ID and command. + if f.NArg() < 2 { + f.Usage() + return nil, "", fmt.Errorf("both a container-id and command are required") + } + e, err := ex.argsFromCLI(f.Args()[1:], enableRaw) + return e, f.Arg(0), err + } + // Requires only the container ID. + if f.NArg() != 1 { + f.Usage() + return nil, "", fmt.Errorf("a container-id is required") + } + e, err := ex.argsFromProcessFile(enableRaw) + return e, f.Arg(0), err +} + +func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) { + extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs)) + for _, s := range ex.extraKGIDs { + kgid, err := strconv.Atoi(s) + if err != nil { + Fatalf("parsing GID: %s, %v", s, err) + } + extraKGIDs = append(extraKGIDs, auth.KGID(kgid)) + } + + var caps *auth.TaskCapabilities + if len(ex.caps) > 0 { + var err error + caps, err = capabilities(ex.caps, enableRaw) + if err != nil { + return nil, fmt.Errorf("capabilities error: %v", err) + } + } + + return &control.ExecArgs{ + Argv: argv, + WorkingDirectory: ex.cwd, + KUID: ex.user.kuid, + KGID: ex.user.kgid, + ExtraKGIDs: extraKGIDs, + Capabilities: caps, + StdioIsPty: ex.consoleSocket != "", + FilePayload: urpc.FilePayload{[]*os.File{os.Stdin, os.Stdout, os.Stderr}}, + }, nil +} + +func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) { + f, err := os.Open(ex.processPath) + if err != nil { + return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err) + } + defer f.Close() + var p specs.Process + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err) + } + return argsFromProcess(&p, enableRaw) +} + +// argsFromProcess performs all the non-IO conversion from the Process struct +// to ExecArgs. +func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) { + // Create capabilities. + var caps *auth.TaskCapabilities + if p.Capabilities != nil { + var err error + // Starting from Docker 19, capabilities are explicitly set for exec (instead + // of nil like before). So we can't distinguish 'exec' from + // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter + // CAP_NET_RAW in the same way as container start. + caps, err = specutils.Capabilities(enableRaw, p.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(p.User.AdditionalGids)) + for _, GID := range p.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + return &control.ExecArgs{ + Argv: p.Args, + Envv: p.Env, + WorkingDirectory: p.Cwd, + KUID: auth.KUID(p.User.UID), + KGID: auth.KGID(p.User.GID), + ExtraKGIDs: extraKGIDs, + Capabilities: caps, + StdioIsPty: p.Terminal, + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + }, nil +} + +// resolveEnvs transforms lists of environment variables into a single list of +// environment variables. If a variable is defined multiple times, the last +// value is used. +func resolveEnvs(envs ...[]string) ([]string, error) { + // First create a map of variable names to values. This removes any + // duplicates. + envMap := make(map[string]string) + for _, env := range envs { + for _, str := range env { + parts := strings.SplitN(str, "=", 2) + if len(parts) != 2 { + return nil, fmt.Errorf("invalid variable: %s", str) + } + envMap[parts[0]] = parts[1] + } + } + // Reassemble envMap into a list of environment variables of the form + // NAME=VALUE. + env := make([]string, 0, len(envMap)) + for k, v := range envMap { + env = append(env, fmt.Sprintf("%s=%s", k, v)) + } + return env, nil +} + +// capabilities takes a list of capabilities as strings and returns an +// auth.TaskCapabilities struct with those capabilities in every capability set. +// This mimics runc's behavior. +func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) { + var specCaps specs.LinuxCapabilities + for _, cap := range cs { + specCaps.Ambient = append(specCaps.Ambient, cap) + specCaps.Bounding = append(specCaps.Bounding, cap) + specCaps.Effective = append(specCaps.Effective, cap) + specCaps.Inheritable = append(specCaps.Inheritable, cap) + specCaps.Permitted = append(specCaps.Permitted, cap) + } + // Starting from Docker 19, capabilities are explicitly set for exec (instead + // of nil like before). So we can't distinguish 'exec' from + // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter + // CAP_NET_RAW in the same way as container start. + return specutils.Capabilities(enableRaw, &specCaps) +} + +// stringSlice allows a flag to be used multiple times, where each occurrence +// adds a value to the flag. For example, a flag called "x" could be invoked +// via "runsc exec -x foo -x bar", and the corresponding stringSlice would be +// {"x", "y"}. +type stringSlice []string + +// String implements flag.Value.String. +func (ss *stringSlice) String() string { + return fmt.Sprintf("%v", *ss) +} + +// Get implements flag.Value.Get. +func (ss *stringSlice) Get() interface{} { + return ss +} + +// Set implements flag.Value.Set. +func (ss *stringSlice) Set(s string) error { + *ss = append(*ss, s) + return nil +} + +// user allows -user to convey a UID and, optionally, a GID separated by a +// colon. +type user struct { + kuid auth.KUID + kgid auth.KGID +} + +func (u *user) String() string { + return fmt.Sprintf("%+v", *u) +} + +func (u *user) Get() interface{} { + return u +} + +func (u *user) Set(s string) error { + parts := strings.SplitN(s, ":", 2) + kuid, err := strconv.Atoi(parts[0]) + if err != nil { + return fmt.Errorf("couldn't parse UID: %s", parts[0]) + } + u.kuid = auth.KUID(kuid) + if len(parts) > 1 { + kgid, err := strconv.Atoi(parts[1]) + if err != nil { + return fmt.Errorf("couldn't parse GID: %s", parts[1]) + } + u.kgid = auth.KGID(kgid) + } + return nil +} diff --git a/runsc/cmd/exec_test.go b/runsc/cmd/exec_test.go new file mode 100644 index 000000000..a1e980d08 --- /dev/null +++ b/runsc/cmd/exec_test.go @@ -0,0 +1,154 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "os" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/urpc" +) + +func TestUser(t *testing.T) { + testCases := []struct { + input string + want user + wantErr bool + }{ + {input: "0", want: user{kuid: 0, kgid: 0}}, + {input: "7", want: user{kuid: 7, kgid: 0}}, + {input: "49:343", want: user{kuid: 49, kgid: 343}}, + {input: "0:2401", want: user{kuid: 0, kgid: 2401}}, + {input: "", wantErr: true}, + {input: "foo", wantErr: true}, + {input: ":123", wantErr: true}, + {input: "1:2:3", wantErr: true}, + } + + for _, tc := range testCases { + var u user + if err := u.Set(tc.input); err != nil && tc.wantErr { + // We got an error and wanted one. + continue + } else if err == nil && tc.wantErr { + t.Errorf("user.Set(%s): got no error, but wanted one", tc.input) + } else if err != nil && !tc.wantErr { + t.Errorf("user.Set(%s): got error %v, but wanted none", tc.input, err) + } else if u != tc.want { + t.Errorf("user.Set(%s): got %+v, but wanted %+v", tc.input, u, tc.want) + } + } +} + +func TestCLIArgs(t *testing.T) { + testCases := []struct { + ex Exec + argv []string + expected control.ExecArgs + }{ + { + ex: Exec{ + cwd: "/foo/bar", + user: user{kuid: 0, kgid: 0}, + extraKGIDs: []string{"1", "2", "3"}, + caps: []string{"CAP_DAC_OVERRIDE"}, + processPath: "", + }, + argv: []string{"ls", "/"}, + expected: control.ExecArgs{ + Argv: []string{"ls", "/"}, + WorkingDirectory: "/foo/bar", + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: 0, + KGID: 0, + ExtraKGIDs: []auth.KGID{1, 2, 3}, + Capabilities: &auth.TaskCapabilities{ + BoundingCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + PermittedCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + }, + }, + }, + } + + for _, tc := range testCases { + e, err := tc.ex.argsFromCLI(tc.argv, true) + if err != nil { + t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err) + } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { + t.Errorf("argsFromCLI(%+v): got %+v, but expected %+v", tc.ex, *e, tc.expected) + } + } +} + +func TestJSONArgs(t *testing.T) { + testCases := []struct { + // ex is provided to make sure it is overridden by p. + ex Exec + p specs.Process + expected control.ExecArgs + }{ + { + ex: Exec{ + cwd: "/baz/quux", + user: user{kuid: 1, kgid: 1}, + extraKGIDs: []string{"4", "5", "6"}, + caps: []string{"CAP_SETGID"}, + processPath: "/bin/foo", + }, + p: specs.Process{ + User: specs.User{UID: 0, GID: 0, AdditionalGids: []uint32{1, 2, 3}}, + Args: []string{"ls", "/"}, + Cwd: "/foo/bar", + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{"CAP_DAC_OVERRIDE"}, + Effective: []string{"CAP_DAC_OVERRIDE"}, + Inheritable: []string{"CAP_DAC_OVERRIDE"}, + Permitted: []string{"CAP_DAC_OVERRIDE"}, + }, + }, + expected: control.ExecArgs{ + Argv: []string{"ls", "/"}, + WorkingDirectory: "/foo/bar", + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: 0, + KGID: 0, + ExtraKGIDs: []auth.KGID{1, 2, 3}, + Capabilities: &auth.TaskCapabilities{ + BoundingCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + PermittedCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + }, + }, + }, + } + + for _, tc := range testCases { + e, err := argsFromProcess(&tc.p, true) + if err != nil { + t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err) + } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { + t.Errorf("argsFromProcess(%+v): got %+v, but expected %+v", tc.p, *e, tc.expected) + } + } +} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go new file mode 100644 index 000000000..3966e2d21 --- /dev/null +++ b/runsc/cmd/gofer.go @@ -0,0 +1,484 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "syscall" + + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/fsgofer" + "gvisor.dev/gvisor/runsc/fsgofer/filter" + "gvisor.dev/gvisor/runsc/specutils" +) + +var caps = []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_DAC_READ_SEARCH", + "CAP_FOWNER", + "CAP_FSETID", + "CAP_SYS_CHROOT", +} + +// goferCaps is the minimal set of capabilities needed by the Gofer to operate +// on files. +var goferCaps = &specs.LinuxCapabilities{ + Bounding: caps, + Effective: caps, + Permitted: caps, +} + +// Gofer implements subcommands.Command for the "gofer" command, which starts a +// filesystem gofer. This command should not be called directly. +type Gofer struct { + bundleDir string + ioFDs intFlags + applyCaps bool + setUpRoot bool + + panicOnWrite bool + specFD int + mountsFD int +} + +// Name implements subcommands.Command. +func (*Gofer) Name() string { + return "gofer" +} + +// Synopsis implements subcommands.Command. +func (*Gofer) Synopsis() string { + return "launch a gofer process that serves files over 9P protocol (internal use only)" +} + +// Usage implements subcommands.Command. +func (*Gofer) Usage() string { + return `gofer [flags]` +} + +// SetFlags implements subcommands.Command. +func (g *Gofer) SetFlags(f *flag.FlagSet) { + f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") + f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") + f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") + f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") + f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") + f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") + f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") +} + +// Execute implements subcommands.Command. +func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 { + f.Usage() + return subcommands.ExitUsageError + } + + specFile := os.NewFile(uintptr(g.specFD), "spec file") + defer specFile.Close() + spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile) + if err != nil { + Fatalf("reading spec: %v", err) + } + + conf := args[0].(*boot.Config) + + if g.setUpRoot { + if err := setupRootFS(spec, conf); err != nil { + Fatalf("Error setting up root FS: %v", err) + } + } + if g.applyCaps { + // Disable caps when calling myself again. + // Note: minimal argument handling for the default case to keep it simple. + args := os.Args + args = append(args, "--apply-caps=false", "--setup-root=false") + if err := setCapsAndCallSelf(args, goferCaps); err != nil { + Fatalf("Unable to apply caps: %v", err) + } + panic("unreachable") + } + + // Find what path is going to be served by this gofer. + root := spec.Root.Path + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + root = "/root" + } + + // Resolve mount points paths, then replace mounts from our spec and send the + // mount list over to the sandbox, so they are both in sync. + // + // Note that all mount points have been mounted in the proper location in + // setupRootFS(). + cleanMounts, err := resolveMounts(conf, spec.Mounts, root) + if err != nil { + Fatalf("Failure to resolve mounts: %v", err) + } + spec.Mounts = cleanMounts + go func() { + if err := g.writeMounts(cleanMounts); err != nil { + panic(fmt.Sprintf("Failed to write mounts: %v", err)) + } + }() + + specutils.LogSpec(spec) + + // fsgofer should run with a umask of 0, because we want to preserve file + // modes exactly as sent by the sandbox, which will have applied its own umask. + syscall.Umask(0) + + if err := fsgofer.OpenProcSelfFD(); err != nil { + Fatalf("failed to open /proc/self/fd: %v", err) + } + + if err := syscall.Chroot(root); err != nil { + Fatalf("failed to chroot to %q: %v", root, err) + } + if err := syscall.Chdir("/"); err != nil { + Fatalf("changing working dir: %v", err) + } + log.Infof("Process chroot'd to %q", root) + + // Start with root mount, then add any other additional mount as needed. + ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) + ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{ + ROMount: spec.Root.Readonly || conf.Overlay, + PanicOnWrite: g.panicOnWrite, + }) + if err != nil { + Fatalf("creating attach point: %v", err) + } + ats = append(ats, ap) + log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], spec.Root.Readonly) + + mountIdx := 1 // first one is the root + for _, m := range spec.Mounts { + if specutils.Is9PMount(m) { + cfg := fsgofer.Config{ + ROMount: isReadonlyMount(m.Options) || conf.Overlay, + PanicOnWrite: g.panicOnWrite, + HostUDS: conf.FSGoferHostUDS, + } + ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) + if err != nil { + Fatalf("creating attach point: %v", err) + } + ats = append(ats, ap) + + if mountIdx >= len(g.ioFDs) { + Fatalf("no FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) + } + log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfg.ROMount) + mountIdx++ + } + } + if mountIdx != len(g.ioFDs) { + Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) + } + + if conf.FSGoferHostUDS { + filter.InstallUDSFilters() + } + + if err := filter.Install(); err != nil { + Fatalf("installing seccomp filters: %v", err) + } + + runServers(ats, g.ioFDs) + return subcommands.ExitSuccess +} + +func runServers(ats []p9.Attacher, ioFDs []int) { + // Run the loops and wait for all to exit. + var wg sync.WaitGroup + for i, ioFD := range ioFDs { + wg.Add(1) + go func(ioFD int, at p9.Attacher) { + socket, err := unet.NewSocket(ioFD) + if err != nil { + Fatalf("creating server on FD %d: %v", ioFD, err) + } + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + Fatalf("P9 server returned error. Gofer is shutting down. FD: %d, err: %v", ioFD, err) + } + wg.Done() + }(ioFD, ats[i]) + } + wg.Wait() + log.Infof("All 9P servers exited.") +} + +func (g *Gofer) writeMounts(mounts []specs.Mount) error { + bytes, err := json.Marshal(mounts) + if err != nil { + return err + } + + f := os.NewFile(uintptr(g.mountsFD), "mounts file") + defer f.Close() + + for written := 0; written < len(bytes); { + w, err := f.Write(bytes[written:]) + if err != nil { + return err + } + written += w + } + return nil +} + +func isReadonlyMount(opts []string) bool { + for _, o := range opts { + if o == "ro" { + return true + } + } + return false +} + +func setupRootFS(spec *specs.Spec, conf *boot.Config) error { + // Convert all shared mounts into slaves to be sure that nothing will be + // propagated outside of our namespace. + if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + Fatalf("error converting mounts: %v", err) + } + + root := spec.Root.Path + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + // runsc can't be re-executed without /proc, so we create a tmpfs mount, + // mount ./proc and ./root there, then move this mount to the root and after + // setCapsAndCallSelf, runsc will chroot into /root. + // + // We need a directory to construct a new root and we know that + // runsc can't start without /proc, so we can use it for this. + flags := uintptr(syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC) + if err := syscall.Mount("runsc-root", "/proc", "tmpfs", flags, ""); err != nil { + Fatalf("error mounting tmpfs: %v", err) + } + + // Prepare tree structure for pivot_root(2). + os.Mkdir("/proc/proc", 0755) + os.Mkdir("/proc/root", 0755) + if err := syscall.Mount("runsc-proc", "/proc/proc", "proc", flags|syscall.MS_RDONLY, ""); err != nil { + Fatalf("error mounting proc: %v", err) + } + root = "/proc/root" + } + + // Mount root path followed by submounts. + if err := syscall.Mount(spec.Root.Path, root, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting root on root (%q) err: %v", root, err) + } + + flags := uint32(syscall.MS_SLAVE | syscall.MS_REC) + if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { + flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation}) + } + if err := syscall.Mount("", root, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err) + } + + // Replace the current spec, with the clean spec with symlinks resolved. + if err := setupMounts(conf, spec.Mounts, root); err != nil { + Fatalf("error setting up FS: %v", err) + } + + // Create working directory if needed. + if spec.Process.Cwd != "" { + dst, err := resolveSymlinks(root, spec.Process.Cwd) + if err != nil { + return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) + } + if err := os.MkdirAll(dst, 0755); err != nil { + return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err) + } + } + + // Check if root needs to be remounted as readonly. + if spec.Root.Readonly || conf.Overlay { + // If root is a mount point but not read-only, we can change mount options + // to make it read-only for extra safety. + log.Infof("Remounting root as readonly: %q", root) + flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC) + if err := syscall.Mount(root, root, "bind", flags, ""); err != nil { + return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err) + } + } + + if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + if err := pivotRoot("/proc"); err != nil { + Fatalf("failed to change the root file system: %v", err) + } + if err := os.Chdir("/"); err != nil { + Fatalf("failed to change working directory") + } + } + return nil +} + +// setupMounts binds mount all mounts specified in the spec in their correct +// location inside root. It will resolve relative paths and symlinks. It also +// creates directories as needed. +func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error { + for _, m := range mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + continue + } + + dst, err := resolveSymlinks(root, m.Destination) + if err != nil { + return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) + } + + flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND + if conf.Overlay { + // Force mount read-only if writes are not going to be sent to it. + flags |= syscall.MS_RDONLY + } + + log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) + if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { + return fmt.Errorf("mounting %v: %v", m, err) + } + + // Set propagation options that cannot be set together with other options. + flags = specutils.PropOptionsToFlags(m.Options) + if flags != 0 { + if err := syscall.Mount("", dst, "", uintptr(flags), ""); err != nil { + return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err) + } + } + } + return nil +} + +// resolveMounts resolved relative paths and symlinks to mount points. +// +// Note: mount points must already be in place for resolution to work. +// Otherwise, it may follow symlinks to locations that would be overwritten +// with another mount point and return the wrong location. In short, make sure +// setupMounts() has been called before. +func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) { + cleanMounts := make([]specs.Mount, 0, len(mounts)) + for _, m := range mounts { + if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + cleanMounts = append(cleanMounts, m) + continue + } + dst, err := resolveSymlinks(root, m.Destination) + if err != nil { + return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) + } + relDst, err := filepath.Rel(root, dst) + if err != nil { + panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err)) + } + + opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options) + if err != nil { + return nil, err + } + + cpy := m + cpy.Destination = filepath.Join("/", relDst) + cpy.Options = opts + cleanMounts = append(cleanMounts, cpy) + } + return cleanMounts, nil +} + +// ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are +// symlinks, they are evaluated relative to 'root' to ensure the end result is +// the same as if the process was running inside the container. +func resolveSymlinks(root, rel string) (string, error) { + return resolveSymlinksImpl(root, root, rel, 255) +} + +func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { + if followCount == 0 { + return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) + } + + rel = filepath.Clean(rel) + for _, name := range strings.Split(rel, string(filepath.Separator)) { + if name == "" { + continue + } + // Note that Join() resolves things like ".." and returns a clean path. + path := filepath.Join(base, name) + if !strings.HasPrefix(path, root) { + // One cannot '..' their way out of root. + base = root + continue + } + fi, err := os.Lstat(path) + if err != nil { + if !os.IsNotExist(err) { + return "", err + } + // Not found means there is no symlink to check. Just keep walking dirs. + base = path + continue + } + if fi.Mode()&os.ModeSymlink != 0 { + link, err := os.Readlink(path) + if err != nil { + return "", err + } + if filepath.IsAbs(link) { + base = root + } + base, err = resolveSymlinksImpl(root, base, link, followCount-1) + if err != nil { + return "", err + } + continue + } + base = path + } + return base, nil +} + +// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs. +func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) { + rv := make([]string, len(opts)) + copy(rv, opts) + + if conf.OverlayfsStaleRead { + statfs := syscall.Statfs_t{} + if err := syscall.Statfs(path, &statfs); err != nil { + return nil, err + } + if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC { + rv = append(rv, "overlayfs_stale_read") + } + } + return rv, nil +} diff --git a/runsc/cmd/gofer_test.go b/runsc/cmd/gofer_test.go new file mode 100644 index 000000000..cbea7f127 --- /dev/null +++ b/runsc/cmd/gofer_test.go @@ -0,0 +1,164 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "testing" +) + +func tmpDir() string { + dir := os.Getenv("TEST_TMPDIR") + if dir == "" { + dir = "/tmp" + } + return dir +} + +type dir struct { + rel string + link string +} + +func construct(root string, dirs []dir) error { + for _, d := range dirs { + p := path.Join(root, d.rel) + if d.link == "" { + if err := os.MkdirAll(p, 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + } else { + if err := os.MkdirAll(path.Dir(p), 0755); err != nil { + return fmt.Errorf("error creating dir: %v", err) + } + if err := os.Symlink(d.link, p); err != nil { + return fmt.Errorf("error creating symlink: %v", err) + } + } + } + return nil +} + +func TestResolveSymlinks(t *testing.T) { + root, err := ioutil.TempDir(tmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"dir1/dir11/dir111/dir1111", ""}, // Just a boring dir + {"dir1/lnk12", "dir11"}, // Link to sibling + {"dir1/lnk13", "./dir11"}, // Link to sibling through self + {"dir1/lnk14", "../dir1/dir11"}, // Link to sibling through parent + {"dir1/dir15/lnk151", ".."}, // Link to parent + {"dir1/lnk16", "dir11/dir111"}, // Link to child + {"dir1/lnk17", "."}, // Link to self + {"dir1/lnk18", "lnk13"}, // Link to link + {"lnk2", "dir1/lnk13"}, // Link to link to link + {"dir3/dir21/lnk211", "../.."}, // Link to root relative + {"dir3/lnk22", "/"}, // Link to root absolute + {"dir3/lnk23", "/dir1"}, // Link to dir absolute + {"dir3/lnk24", "/dir1/lnk12"}, // Link to link absolute + {"lnk5", "../../.."}, // Link outside root + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + + tests := []struct { + name string + rel string + want string + compareHost bool + }{ + {name: "root", rel: "/", want: "/", compareHost: true}, + {name: "basic dir", rel: "/dir1/dir11/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 1", rel: "/dir1/dir11/./dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dot 2", rel: "/dir1/././dir11/./././././dir111/.", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "dotdot 1", rel: "/dir1/dir11/../dir15", want: "/dir1/dir15", compareHost: true}, + {name: "dotdot 2", rel: "/dir1/dir11/dir1111/../..", want: "/dir1", compareHost: true}, + + {name: "link sibling", rel: "/dir1/lnk12", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling + dir", rel: "/dir1/lnk12/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link sibling through self", rel: "/dir1/lnk13", want: "/dir1/dir11", compareHost: true}, + {name: "link sibling through parent", rel: "/dir1/lnk14", want: "/dir1/dir11", compareHost: true}, + + {name: "link parent", rel: "/dir1/dir15/lnk151", want: "/dir1", compareHost: true}, + {name: "link parent + dir", rel: "/dir1/dir15/lnk151/dir11", want: "/dir1/dir11", compareHost: true}, + {name: "link child", rel: "/dir1/lnk16", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link child + dir", rel: "/dir1/lnk16/dir1111", want: "/dir1/dir11/dir111/dir1111", compareHost: true}, + {name: "link self", rel: "/dir1/lnk17", want: "/dir1", compareHost: true}, + {name: "link self + dir", rel: "/dir1/lnk17/dir11", want: "/dir1/dir11", compareHost: true}, + + {name: "link^2", rel: "/dir1/lnk18", want: "/dir1/dir11", compareHost: true}, + {name: "link^2 + dir", rel: "/dir1/lnk18/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + {name: "link^3", rel: "/lnk2", want: "/dir1/dir11", compareHost: true}, + {name: "link^3 + dir", rel: "/lnk2/dir111", want: "/dir1/dir11/dir111", compareHost: true}, + + {name: "link abs", rel: "/dir3/lnk23", want: "/dir1"}, + {name: "link abs + dir", rel: "/dir3/lnk23/dir11", want: "/dir1/dir11"}, + {name: "link^2 abs", rel: "/dir3/lnk24", want: "/dir1/dir11"}, + {name: "link^2 abs + dir", rel: "/dir3/lnk24/dir111", want: "/dir1/dir11/dir111"}, + + {name: "root link rel", rel: "/dir3/dir21/lnk211", want: "/", compareHost: true}, + {name: "root link abs", rel: "/dir3/lnk22", want: "/"}, + {name: "root contain link", rel: "/lnk5/dir1", want: "/dir1"}, + {name: "root contain dotdot", rel: "/dir1/dir11/../../../../../../../..", want: "/"}, + + {name: "crazy", rel: "/dir3/dir21/lnk211/dir3/lnk22/dir1/dir11/../../lnk5/dir3/../dir3/lnk24/dir111/dir1111/..", want: "/dir1/dir11/dir111"}, + } + for _, tst := range tests { + t.Run(tst.name, func(t *testing.T) { + got, err := resolveSymlinks(root, tst.rel) + if err != nil { + t.Errorf("resolveSymlinks(root, %q) failed: %v", tst.rel, err) + } + want := path.Join(root, tst.want) + if got != want { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, got, want) + } + if tst.compareHost { + // Check that host got to the same end result. + host, err := filepath.EvalSymlinks(path.Join(root, tst.rel)) + if err != nil { + t.Errorf("path.EvalSymlinks(root, %q) failed: %v", tst.rel, err) + } + if host != got { + t.Errorf("resolveSymlinks(root, %q) got: %q, want: %q", tst.rel, host, got) + } + } + }) + } +} + +func TestResolveSymlinksLoop(t *testing.T) { + root, err := ioutil.TempDir(tmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + dirs := []dir{ + {"loop1", "loop2"}, + {"loop2", "loop1"}, + } + if err := construct(root, dirs); err != nil { + t.Fatal("construct failed:", err) + } + if _, err := resolveSymlinks(root, "loop1"); err == nil { + t.Errorf("resolveSymlinks() should have failed") + } +} diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go new file mode 100644 index 000000000..cd85dabbb --- /dev/null +++ b/runsc/cmd/help.go @@ -0,0 +1,120 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/flag" +) + +// NewHelp returns a help command for the given commander. +func NewHelp(cdr *subcommands.Commander) *Help { + return &Help{ + cdr: cdr, + } +} + +// Help implements subcommands.Command for the "help" command. The 'help' +// command prints help for commands registered to a Commander but also allows for +// registering additional help commands that print other documentation. +type Help struct { + cdr *subcommands.Commander + commands []subcommands.Command + help bool +} + +// Name implements subcommands.Command.Name. +func (*Help) Name() string { + return "help" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Help) Synopsis() string { + return "Print help documentation." +} + +// Usage implements subcommands.Command.Usage. +func (*Help) Usage() string { + return `help [<subcommand>]: + With an argument, prints detailed information on the use of + the specified topic or subcommand. With no argument, print a list of + all commands and a brief description of each. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (h *Help) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + switch f.NArg() { + case 0: + fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name()) + fmt.Fprintf(h.cdr.Output, `runsc is the gVisor container runtime. + +Functionality is provided by subcommands. For help with a specific subcommand, +use "%s %s <subcommand>". + +`, h.cdr.Name(), h.Name()) + h.cdr.VisitGroups(func(g *subcommands.CommandGroup) { + h.cdr.ExplainGroup(h.cdr.Output, g) + }) + + fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s <topic>\" to see help on the topic):\n", h.cdr.Name(), h.Name()) + for _, cmd := range h.commands { + fmt.Fprintf(h.cdr.Output, "\t%-15s %s\n", cmd.Name(), cmd.Synopsis()) + } + fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name()) + return subcommands.ExitSuccess + default: + // Look for commands registered to the commander and print help explanation if found. + found := false + h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) { + if f.Arg(0) == cmd.Name() { + h.cdr.ExplainCommand(h.cdr.Output, cmd) + found = true + } + }) + if found { + return subcommands.ExitSuccess + } + + // Next check commands registered to the help command. + for _, cmd := range h.commands { + if f.Arg(0) == cmd.Name() { + fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError) + fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) } + cmd.SetFlags(fs) + if fs.Parse(f.Args()[1:]) != nil { + return subcommands.ExitUsageError + } + return cmd.Execute(ctx, f, args...) + } + } + + fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0)) + } + + f.Usage() + return subcommands.ExitUsageError +} + +// Register registers a new help command. +func (h *Help) Register(cmd subcommands.Command) { + h.commands = append(h.commands, cmd) +} diff --git a/runsc/cmd/install.go b/runsc/cmd/install.go new file mode 100644 index 000000000..2e223e3be --- /dev/null +++ b/runsc/cmd/install.go @@ -0,0 +1,210 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "log" + "os" + "path" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/flag" +) + +// Install implements subcommands.Command. +type Install struct { + ConfigFile string + Runtime string + Experimental bool +} + +// Name implements subcommands.Command.Name. +func (*Install) Name() string { + return "install" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Install) Synopsis() string { + return "adds a runtime to docker daemon configuration" +} + +// Usage implements subcommands.Command.Usage. +func (*Install) Usage() string { + return `install [flags] <name> [-- [args...]] -- if provided, args are passed to the runtime +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (i *Install) SetFlags(fs *flag.FlagSet) { + fs.StringVar(&i.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file") + fs.StringVar(&i.Runtime, "runtime", "runsc", "runtime name") + fs.BoolVar(&i.Experimental, "experimental", false, "enable experimental features") +} + +// Execute implements subcommands.Command.Execute. +func (i *Install) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + // Grab the name and arguments. + runtimeArgs := f.Args() + + // Extract the executable. + path, err := os.Executable() + if err != nil { + log.Fatalf("Error reading current exectuable: %v", err) + } + + // Load the configuration file. + c, err := readConfig(i.ConfigFile) + if err != nil { + log.Fatalf("Error reading config file %q: %v", i.ConfigFile, err) + } + + // Add the given runtime. + var rts map[string]interface{} + if i, ok := c["runtimes"]; ok { + rts = i.(map[string]interface{}) + } else { + rts = make(map[string]interface{}) + c["runtimes"] = rts + } + rts[i.Runtime] = struct { + Path string `json:"path,omitempty"` + RuntimeArgs []string `json:"runtimeArgs,omitempty"` + }{ + Path: path, + RuntimeArgs: runtimeArgs, + } + + // Set experimental if required. + if i.Experimental { + c["experimental"] = true + } + + // Write out the runtime. + if err := writeConfig(c, i.ConfigFile); err != nil { + log.Fatalf("Error writing config file %q: %v", i.ConfigFile, err) + } + + // Success. + log.Printf("Added runtime %q with arguments %v to %q.", i.Runtime, runtimeArgs, i.ConfigFile) + return subcommands.ExitSuccess +} + +// Uninstall implements subcommands.Command. +type Uninstall struct { + ConfigFile string + Runtime string +} + +// Name implements subcommands.Command.Name. +func (*Uninstall) Name() string { + return "uninstall" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Uninstall) Synopsis() string { + return "removes a runtime from docker daemon configuration" +} + +// Usage implements subcommands.Command.Usage. +func (*Uninstall) Usage() string { + return `uninstall [flags] <name> +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (u *Uninstall) SetFlags(fs *flag.FlagSet) { + fs.StringVar(&u.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file") + fs.StringVar(&u.Runtime, "runtime", "runsc", "runtime name") +} + +// Execute implements subcommands.Command.Execute. +func (u *Uninstall) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + log.Printf("Removing runtime %q from %q.", u.Runtime, u.ConfigFile) + + c, err := readConfig(u.ConfigFile) + if err != nil { + log.Fatalf("Error reading config file %q: %v", u.ConfigFile, err) + } + + var rts map[string]interface{} + if i, ok := c["runtimes"]; ok { + rts = i.(map[string]interface{}) + } else { + log.Fatalf("runtime %q not found", u.Runtime) + } + if _, ok := rts[u.Runtime]; !ok { + log.Fatalf("runtime %q not found", u.Runtime) + } + delete(rts, u.Runtime) + + if err := writeConfig(c, u.ConfigFile); err != nil { + log.Fatalf("Error writing config file %q: %v", u.ConfigFile, err) + } + return subcommands.ExitSuccess +} + +func readConfig(path string) (map[string]interface{}, error) { + // Read the configuration data. + configBytes, err := ioutil.ReadFile(path) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + + // Unmarshal the configuration. + c := make(map[string]interface{}) + if len(configBytes) > 0 { + if err := json.Unmarshal(configBytes, &c); err != nil { + return nil, err + } + } + + return c, nil +} + +func writeConfig(c map[string]interface{}, filename string) error { + // Marshal the configuration. + b, err := json.MarshalIndent(c, "", " ") + if err != nil { + return err + } + + // Copy the old configuration. + old, err := ioutil.ReadFile(filename) + if err != nil { + if !os.IsNotExist(err) { + return fmt.Errorf("error reading config file %q: %v", filename, err) + } + } else { + if err := ioutil.WriteFile(filename+"~", old, 0644); err != nil { + return fmt.Errorf("error backing up config file %q: %v", filename, err) + } + } + + // Make the necessary directories. + if err := os.MkdirAll(path.Dir(filename), 0755); err != nil { + return fmt.Errorf("error creating config directory for %q: %v", filename, err) + } + + // Write the new configuration. + if err := ioutil.WriteFile(filename, b, 0644); err != nil { + return fmt.Errorf("error writing config file %q: %v", filename, err) + } + + return nil +} diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go new file mode 100644 index 000000000..8282ea0e0 --- /dev/null +++ b/runsc/cmd/kill.go @@ -0,0 +1,154 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + "strconv" + "strings" + "syscall" + + "github.com/google/subcommands" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Kill implements subcommands.Command for the "kill" command. +type Kill struct { + all bool + pid int +} + +// Name implements subcommands.Command.Name. +func (*Kill) Name() string { + return "kill" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Kill) Synopsis() string { + return "sends a signal to the container" +} + +// Usage implements subcommands.Command.Usage. +func (*Kill) Usage() string { + return `kill <container id> [signal]` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (k *Kill) SetFlags(f *flag.FlagSet) { + f.BoolVar(&k.all, "all", false, "send the specified signal to all processes inside the container") + f.IntVar(&k.pid, "pid", 0, "send the specified signal to a specific process") +} + +// Execute implements subcommands.Command.Execute. +func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() == 0 || f.NArg() > 2 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + if k.pid != 0 && k.all { + Fatalf("it is invalid to specify both --all and --pid") + } + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container: %v", err) + } + + // The OCI command-line spec says that the signal should be specified + // via a flag, but runc (and things that call runc) pass it as an + // argument. + signal := f.Arg(1) + if signal == "" { + signal = "TERM" + } + + sig, err := parseSignal(signal) + if err != nil { + Fatalf("%v", err) + } + + if k.pid != 0 { + if err := c.SignalProcess(sig, int32(k.pid)); err != nil { + Fatalf("failed to signal pid %d: %v", k.pid, err) + } + } else { + if err := c.SignalContainer(sig, k.all); err != nil { + Fatalf("%v", err) + } + } + return subcommands.ExitSuccess +} + +func parseSignal(s string) (syscall.Signal, error) { + n, err := strconv.Atoi(s) + if err == nil { + sig := syscall.Signal(n) + for _, msig := range signalMap { + if sig == msig { + return sig, nil + } + } + return -1, fmt.Errorf("unknown signal %q", s) + } + if sig, ok := signalMap[strings.TrimPrefix(strings.ToUpper(s), "SIG")]; ok { + return sig, nil + } + return -1, fmt.Errorf("unknown signal %q", s) +} + +var signalMap = map[string]syscall.Signal{ + "ABRT": unix.SIGABRT, + "ALRM": unix.SIGALRM, + "BUS": unix.SIGBUS, + "CHLD": unix.SIGCHLD, + "CLD": unix.SIGCLD, + "CONT": unix.SIGCONT, + "FPE": unix.SIGFPE, + "HUP": unix.SIGHUP, + "ILL": unix.SIGILL, + "INT": unix.SIGINT, + "IO": unix.SIGIO, + "IOT": unix.SIGIOT, + "KILL": unix.SIGKILL, + "PIPE": unix.SIGPIPE, + "POLL": unix.SIGPOLL, + "PROF": unix.SIGPROF, + "PWR": unix.SIGPWR, + "QUIT": unix.SIGQUIT, + "SEGV": unix.SIGSEGV, + "STKFLT": unix.SIGSTKFLT, + "STOP": unix.SIGSTOP, + "SYS": unix.SIGSYS, + "TERM": unix.SIGTERM, + "TRAP": unix.SIGTRAP, + "TSTP": unix.SIGTSTP, + "TTIN": unix.SIGTTIN, + "TTOU": unix.SIGTTOU, + "URG": unix.SIGURG, + "USR1": unix.SIGUSR1, + "USR2": unix.SIGUSR2, + "VTALRM": unix.SIGVTALRM, + "WINCH": unix.SIGWINCH, + "XCPU": unix.SIGXCPU, + "XFSZ": unix.SIGXFSZ, +} diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go new file mode 100644 index 000000000..d8d906fe3 --- /dev/null +++ b/runsc/cmd/list.go @@ -0,0 +1,117 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "fmt" + "os" + "text/tabwriter" + "time" + + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// List implements subcommands.Command for the "list" command for the "list" command. +type List struct { + quiet bool + format string +} + +// Name implements subcommands.command.name. +func (*List) Name() string { + return "list" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*List) Synopsis() string { + return "list containers started by runsc with the given root" +} + +// Usage implements subcommands.Command.Usage. +func (*List) Usage() string { + return `list [flags]` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (l *List) SetFlags(f *flag.FlagSet) { + f.BoolVar(&l.quiet, "quiet", false, "only list container ids") + f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'") +} + +// Execute implements subcommands.Command.Execute. +func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + + conf := args[0].(*boot.Config) + ids, err := container.List(conf.RootDir) + if err != nil { + Fatalf("%v", err) + } + + if l.quiet { + for _, id := range ids { + fmt.Println(id) + } + return subcommands.ExitSuccess + } + + // Collect the containers. + var containers []*container.Container + for _, id := range ids { + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container %q: %v", id, err) + } + containers = append(containers, c) + } + + switch l.format { + case "text": + // Print a nice table. + w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0) + fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") + for _, c := range containers { + fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", + c.ID, + c.SandboxPid(), + c.Status, + c.BundleDir, + c.CreatedAt.Format(time.RFC3339Nano), + c.Owner) + } + w.Flush() + case "json": + // Print just the states. + var states []specs.State + for _, c := range containers { + states = append(states, c.State()) + } + if err := json.NewEncoder(os.Stdout).Encode(states); err != nil { + Fatalf("marshaling container state: %v", err) + } + default: + Fatalf("unknown list format %q", l.format) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go new file mode 100644 index 000000000..0e9ef7fa5 --- /dev/null +++ b/runsc/cmd/path.go @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "os" +) + +// getwdOrDie returns the current working directory and dies if it cannot. +func getwdOrDie() string { + wd, err := os.Getwd() + if err != nil { + Fatalf("getting current working directory: %v", err) + } + return wd +} diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go new file mode 100644 index 000000000..6f95a9837 --- /dev/null +++ b/runsc/cmd/pause.go @@ -0,0 +1,68 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Pause implements subcommands.Command for the "pause" command. +type Pause struct{} + +// Name implements subcommands.Command.Name. +func (*Pause) Name() string { + return "pause" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Pause) Synopsis() string { + return "pause suspends all processes in a container" +} + +// Usage implements subcommands.Command.Usage. +func (*Pause) Usage() string { + return `pause <container id> - pause process in instance of container.` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*Pause) SetFlags(f *flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. +func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + cont, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container: %v", err) + } + + if err := cont.Pause(); err != nil { + Fatalf("pause failed: %v", err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go new file mode 100644 index 000000000..7fb8041af --- /dev/null +++ b/runsc/cmd/ps.go @@ -0,0 +1,86 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// PS implements subcommands.Command for the "ps" command. +type PS struct { + format string +} + +// Name implements subcommands.Command.Name. +func (*PS) Name() string { + return "ps" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*PS) Synopsis() string { + return "ps displays the processes running inside a container" +} + +// Usage implements subcommands.Command.Usage. +func (*PS) Usage() string { + return "<container-id> [ps options]" +} + +// SetFlags implements subcommands.Command.SetFlags. +func (ps *PS) SetFlags(f *flag.FlagSet) { + f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)") +} + +// Execute implements subcommands.Command.Execute. +func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading sandbox: %v", err) + } + pList, err := c.Processes() + if err != nil { + Fatalf("getting processes for container: %v", err) + } + + switch ps.format { + case "table": + fmt.Println(control.ProcessListToTable(pList)) + case "json": + o, err := control.PrintPIDsJSON(pList) + if err != nil { + Fatalf("generating JSON: %v", err) + } + fmt.Println(o) + default: + Fatalf("unsupported format: %s", ps.format) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go new file mode 100644 index 000000000..72584b326 --- /dev/null +++ b/runsc/cmd/restore.go @@ -0,0 +1,119 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "path/filepath" + "syscall" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +// Restore implements subcommands.Command for the "restore" command. +type Restore struct { + // Restore flags are a super-set of those for Create. + Create + + // imagePath is the path to the saved container image + imagePath string + + // detach indicates that runsc has to start a process and exit without waiting it. + detach bool +} + +// Name implements subcommands.Command.Name. +func (*Restore) Name() string { + return "restore" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Restore) Synopsis() string { + return "restore a saved state of container (experimental)" +} + +// Usage implements subcommands.Command.Usage. +func (*Restore) Usage() string { + return `restore [flags] <container id> - restore saved state of container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Restore) SetFlags(f *flag.FlagSet) { + r.Create.SetFlags(f) + f.StringVar(&r.imagePath, "image-path", "", "directory path to saved container image") + f.BoolVar(&r.detach, "detach", false, "detach from the container's process") + + // Unimplemented flags necessary for compatibility with docker. + + var nsr bool + f.BoolVar(&nsr, "no-subreaper", false, "ignored") + + var wp string + f.StringVar(&wp, "work-path", "", "ignored") +} + +// Execute implements subcommands.Command.Execute. +func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", r.Name()) + } + + bundleDir := r.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + return Errorf("reading spec: %v", err) + } + specutils.LogSpec(spec) + + if r.imagePath == "" { + return Errorf("image-path flag must be provided") + } + + conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName) + + runArgs := container.Args{ + ID: id, + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: r.consoleSocket, + PIDFile: r.pidFile, + UserLog: r.userLog, + Attached: !r.detach, + } + ws, err := container.Run(conf, runArgs) + if err != nil { + return Errorf("running container: %v", err) + } + *waitStatus = ws + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go new file mode 100644 index 000000000..61a55a554 --- /dev/null +++ b/runsc/cmd/resume.go @@ -0,0 +1,69 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Resume implements subcommands.Command for the "resume" command. +type Resume struct{} + +// Name implements subcommands.Command.Name. +func (*Resume) Name() string { + return "resume" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Resume) Synopsis() string { + return "Resume unpauses a paused container" +} + +// Usage implements subcommands.Command.Usage. +func (*Resume) Usage() string { + return `resume <container id> - resume a paused container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Resume) SetFlags(f *flag.FlagSet) { +} + +// Execute implements subcommands.Command.Execute. +func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + cont, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container: %v", err) + } + + if err := cont.Resume(); err != nil { + Fatalf("resume failed: %v", err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go new file mode 100644 index 000000000..cf41581ad --- /dev/null +++ b/runsc/cmd/run.go @@ -0,0 +1,100 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "syscall" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +// Run implements subcommands.Command for the "run" command. +type Run struct { + // Run flags are a super-set of those for Create. + Create + + // detach indicates that runsc has to start a process and exit without waiting it. + detach bool +} + +// Name implements subcommands.Command.Name. +func (*Run) Name() string { + return "run" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Run) Synopsis() string { + return "create and run a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Run) Usage() string { + return `run [flags] <container id> - create and run a secure container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Run) SetFlags(f *flag.FlagSet) { + f.BoolVar(&r.detach, "detach", false, "detach from the container's process") + r.Create.SetFlags(f) +} + +// Execute implements subcommands.Command.Execute. +func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + if conf.Rootless { + return Errorf("Rootless mode not supported with %q", r.Name()) + } + + bundleDir := r.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + return Errorf("reading spec: %v", err) + } + specutils.LogSpec(spec) + + runArgs := container.Args{ + ID: id, + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: r.consoleSocket, + PIDFile: r.pidFile, + UserLog: r.userLog, + Attached: !r.detach, + } + ws, err := container.Run(conf, runArgs) + if err != nil { + return Errorf("running container: %v", err) + } + + *waitStatus = ws + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go new file mode 100644 index 000000000..55194e641 --- /dev/null +++ b/runsc/cmd/spec.go @@ -0,0 +1,206 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "io" + "os" + "path/filepath" + + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/runsc/flag" +) + +func writeSpec(w io.Writer, cwd string, netns string, args []string) error { + spec := &specs.Spec{ + Version: "1.0.0", + Process: &specs.Process{ + Terminal: true, + User: specs.User{ + UID: 0, + GID: 0, + }, + Args: args, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: cwd, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + // TODO(gvisor.dev/issue/3166): support ambient capabilities + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: 1024, + Soft: 1024, + }, + }, + }, + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Hostname: "runsc", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{ + "nosuid", + "noexec", + "nodev", + "ro", + }, + }, + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + Path: netns, + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, + }, + }, + } + + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(spec) +} + +// Spec implements subcommands.Command for the "spec" command. +type Spec struct { + bundle string + cwd string + netns string +} + +// Name implements subcommands.Command.Name. +func (*Spec) Name() string { + return "spec" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Spec) Synopsis() string { + return "create a new OCI bundle specification file" +} + +// Usage implements subcommands.Command.Usage. +func (*Spec) Usage() string { + return `spec [options] [-- args...] - create a new OCI bundle specification file. + +The spec command creates a new specification file (config.json) for a new OCI +bundle. + +The specification file is a starter file that runs the command specified by +'args' in the container. If 'args' is not specified the default is to run the +'sh' program. + +While a number of flags are provided to change values in the specification, you +can examine the file and edit it to suit your needs after this command runs. +You can find out more about the format of the specification file by visiting +the OCI runtime spec repository: +https://github.com/opencontainers/runtime-spec/ + +EXAMPLE: + $ mkdir -p bundle/rootfs + $ cd bundle + $ runsc spec -- /hello + $ docker export $(docker create hello-world) | tar -xf - -C rootfs + $ sudo runsc run hello + +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (s *Spec) SetFlags(f *flag.FlagSet) { + f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle") + f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+ + "this value MUST be an absolute path") + f.StringVar(&s.netns, "netns", "", "network namespace path") +} + +// Execute implements subcommands.Command.Execute. +func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + // Grab the arguments. + containerArgs := f.Args() + if len(containerArgs) == 0 { + containerArgs = []string{"sh"} + } + + confPath := filepath.Join(s.bundle, "config.json") + if _, err := os.Stat(confPath); !os.IsNotExist(err) { + Fatalf("file %q already exists", confPath) + } + + configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664) + if err != nil { + Fatalf("opening file %q: %v", confPath, err) + } + + err = writeSpec(configFile, s.cwd, s.netns, containerArgs) + if err != nil { + Fatalf("writing to %q: %v", confPath, err) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go new file mode 100644 index 000000000..0205fd9f7 --- /dev/null +++ b/runsc/cmd/start.go @@ -0,0 +1,65 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// Start implements subcommands.Command for the "start" command. +type Start struct{} + +// Name implements subcommands.Command.Name. +func (*Start) Name() string { + return "start" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Start) Synopsis() string { + return "start a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Start) Usage() string { + return `start <container id> - start a secure container.` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*Start) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container: %v", err) + } + if err := c.Start(conf); err != nil { + Fatalf("starting container: %v", err) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go new file mode 100644 index 000000000..cf2413deb --- /dev/null +++ b/runsc/cmd/state.go @@ -0,0 +1,76 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "os" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +// State implements subcommands.Command for the "state" command. +type State struct{} + +// Name implements subcommands.Command.Name. +func (*State) Name() string { + return "state" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*State) Synopsis() string { + return "get the state of a container" +} + +// Usage implements subcommands.Command.Usage. +func (*State) Usage() string { + return `state [flags] <container id> - get the state of a container` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*State) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container: %v", err) + } + log.Debugf("Returning state for container %+v", c) + + state := c.State() + log.Debugf("State: %+v", state) + + // Write json-encoded state directly to stdout. + b, err := json.MarshalIndent(state, "", " ") + if err != nil { + Fatalf("marshaling container state: %v", err) + } + os.Stdout.Write(b) + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go new file mode 100644 index 000000000..daed9e728 --- /dev/null +++ b/runsc/cmd/statefile.go @@ -0,0 +1,149 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "fmt" + "os" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/state/pretty" + "gvisor.dev/gvisor/pkg/state/statefile" + "gvisor.dev/gvisor/runsc/flag" +) + +// Statefile implements subcommands.Command for the "statefile" command. +type Statefile struct { + list bool + get string + key string + output string + html bool +} + +// Name implements subcommands.Command. +func (*Statefile) Name() string { + return "state" +} + +// Synopsis implements subcommands.Command. +func (*Statefile) Synopsis() string { + return "shows information about a statefile" +} + +// Usage implements subcommands.Command. +func (*Statefile) Usage() string { + return `statefile [flags] <statefile>` +} + +// SetFlags implements subcommands.Command. +func (s *Statefile) SetFlags(f *flag.FlagSet) { + f.BoolVar(&s.list, "list", false, "lists the metdata in the statefile.") + f.StringVar(&s.get, "get", "", "extracts the given metadata key.") + f.StringVar(&s.key, "key", "", "the integrity key for the file.") + f.StringVar(&s.output, "output", "", "target to write the result.") + f.BoolVar(&s.html, "html", false, "outputs in HTML format.") +} + +// Execute implements subcommands.Command.Execute. +func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + // Check arguments. + if s.list && s.get != "" { + Fatalf("error: can't specify -list and -get simultaneously.") + } + + // Setup output. + var output = os.Stdout // Default. + if s.output != "" { + f, err := os.OpenFile(s.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644) + if err != nil { + Fatalf("error opening output: %v", err) + } + defer func() { + if err := f.Close(); err != nil { + Fatalf("error flushing output: %v", err) + } + }() + output = f + } + + // Open the file. + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + input, err := os.Open(f.Arg(0)) + if err != nil { + Fatalf("error opening input: %v\n", err) + } + + if s.html { + fmt.Fprintf(output, "<html><body>\n") + defer fmt.Fprintf(output, "</body></html>\n") + } + + // Dump the full file? + if !s.list && s.get == "" { + var key []byte + if s.key != "" { + key = []byte(s.key) + } + rc, _, err := statefile.NewReader(input, key) + if err != nil { + Fatalf("error parsing statefile: %v", err) + } + if s.html { + if err := pretty.PrintHTML(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } + } else { + if err := pretty.PrintText(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } + } + return subcommands.ExitSuccess + } + + // Load just the metadata. + metadata, err := statefile.MetadataUnsafe(input) + if err != nil { + Fatalf("error reading metadata: %v", err) + } + + // Is it a single key? + if s.get != "" { + val, ok := metadata[s.get] + if !ok { + Fatalf("metadata key %s: not found", s.get) + } + fmt.Fprintf(output, "%s\n", val) + return subcommands.ExitSuccess + } + + // List all keys. + if s.html { + fmt.Fprintf(output, " <ul>\n") + defer fmt.Fprintf(output, " </ul>\n") + } + for key := range metadata { + if s.html { + fmt.Fprintf(output, " <li>%s</li>\n", key) + } else { + fmt.Fprintf(output, "%s\n", key) + } + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go new file mode 100644 index 000000000..a37d66139 --- /dev/null +++ b/runsc/cmd/syscalls.go @@ -0,0 +1,356 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/csv" + "encoding/json" + "fmt" + "io" + "os" + "sort" + "strconv" + "text/tabwriter" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/runsc/flag" +) + +// Syscalls implements subcommands.Command for the "syscalls" command. +type Syscalls struct { + format string + os string + arch string + filename string +} + +// CompatibilityInfo is a map of system and architecture to compatibility doc. +// Maps operating system to architecture to ArchInfo. +type CompatibilityInfo map[string]map[string]ArchInfo + +// ArchInfo is compatibility doc for an architecture. +type ArchInfo struct { + // Syscalls maps syscall number for the architecture to the doc. + Syscalls map[uintptr]SyscallDoc `json:"syscalls"` +} + +// SyscallDoc represents a single item of syscall documentation. +type SyscallDoc struct { + Name string `json:"name"` + num uintptr + + Support string `json:"support"` + Note string `json:"note,omitempty"` + URLs []string `json:"urls,omitempty"` +} + +type outputFunc func(io.Writer, CompatibilityInfo) error + +var ( + // The string name to use for printing compatibility for all OSes. + osAll = "all" + + // The string name to use for printing compatibility for all architectures. + archAll = "all" + + // A map of OS name to map of architecture name to syscall table. + syscallTableMap = make(map[string]map[string]*kernel.SyscallTable) + + // A map of output type names to output functions. + outputMap = map[string]outputFunc{ + "table": outputTable, + "json": outputJSON, + "csv": outputCSV, + } +) + +// Name implements subcommands.Command.Name. +func (*Syscalls) Name() string { + return "syscalls" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Syscalls) Synopsis() string { + return "Print compatibility information for syscalls." +} + +// Usage implements subcommands.Command.Usage. +func (*Syscalls) Usage() string { + return `syscalls [options] - Print compatibility information for syscalls. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (s *Syscalls) SetFlags(f *flag.FlagSet) { + f.StringVar(&s.format, "format", "table", "Output format (table, csv, json).") + f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)") + f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).") + f.StringVar(&s.filename, "filename", "", "Output filename (otherwise stdout).") +} + +// Execute implements subcommands.Command.Execute. +func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + out, ok := outputMap[s.format] + if !ok { + Fatalf("Unsupported output format %q", s.format) + } + + // Build map of all supported architectures. + tables := kernel.SyscallTables() + for _, t := range tables { + osMap, ok := syscallTableMap[t.OS.String()] + if !ok { + osMap = make(map[string]*kernel.SyscallTable) + syscallTableMap[t.OS.String()] = osMap + } + osMap[t.Arch.String()] = t + } + + // Build a map of the architectures we want to output. + info, err := getCompatibilityInfo(s.os, s.arch) + if err != nil { + Fatalf("%v", err) + } + + w := os.Stdout // Default. + if s.filename != "" { + w, err = os.OpenFile(s.filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + if err != nil { + Fatalf("Error opening %q: %v", s.filename, err) + } + } + if err := out(w, info); err != nil { + Fatalf("Error writing output: %v", err) + } + + return subcommands.ExitSuccess +} + +// getCompatibilityInfo returns compatibility info for the given OS name and +// architecture name. Supports the special name 'all' for OS and architecture that +// specifies that all supported OSes or architectures should be included. +func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) { + info := CompatibilityInfo(make(map[string]map[string]ArchInfo)) + if osName == osAll { + // Special processing for the 'all' OS name. + for osName, _ := range syscallTableMap { + info[osName] = make(map[string]ArchInfo) + // osName is a specific OS name. + if err := addToCompatibilityInfo(info, osName, archName); err != nil { + return info, err + } + } + } else { + // osName is a specific OS name. + info[osName] = make(map[string]ArchInfo) + if err := addToCompatibilityInfo(info, osName, archName); err != nil { + return info, err + } + } + + return info, nil +} + +// addToCompatibilityInfo adds ArchInfo for the given specific OS name and +// architecture name. Supports the special architecture name 'all' to specify +// that all supported architectures for the OS should be included. +func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error { + if archName == archAll { + // Special processing for the 'all' architecture name. + for archName, _ := range syscallTableMap[osName] { + archInfo, err := getArchInfo(osName, archName) + if err != nil { + return err + } + info[osName][archName] = archInfo + } + } else { + // archName is a specific architecture name. + archInfo, err := getArchInfo(osName, archName) + if err != nil { + return err + } + info[osName][archName] = archInfo + } + + return nil +} + +// getArchInfo returns compatibility info for a specific OS and architecture. +func getArchInfo(osName string, archName string) (ArchInfo, error) { + info := ArchInfo{} + info.Syscalls = make(map[uintptr]SyscallDoc) + + t, ok := syscallTableMap[osName][archName] + if !ok { + return info, fmt.Errorf("syscall table for %s/%s not found", osName, archName) + } + + for num, sc := range t.Table { + info.Syscalls[num] = SyscallDoc{ + Name: sc.Name, + num: num, + Support: sc.SupportLevel.String(), + Note: sc.Note, + URLs: sc.URLs, + } + } + + return info, nil +} + +// outputTable outputs the syscall info in tabular format. +func outputTable(w io.Writer, info CompatibilityInfo) error { + tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) + + // Linux + for osName, osInfo := range info { + for archName, archInfo := range osInfo { + // Print the OS/arch + fmt.Fprintf(w, "%s/%s:\n\n", osName, archName) + + // Sort the syscalls for output in the table. + sortedCalls := []SyscallDoc{} + for _, sc := range archInfo.Syscalls { + sortedCalls = append(sortedCalls, sc) + } + sort.Slice(sortedCalls, func(i, j int) bool { + return sortedCalls[i].num < sortedCalls[j].num + }) + + // Write the header + _, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", + "NUM", + "NAME", + "SUPPORT", + "NOTE", + ) + if err != nil { + return err + } + + // Write each syscall entry + for _, sc := range sortedCalls { + _, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", + strconv.FormatInt(int64(sc.num), 10), + sc.Name, + sc.Support, + sc.Note, + ) + if err != nil { + return err + } + // Add issue urls to note. + for _, url := range sc.URLs { + _, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n", + "", + "", + "", + url, + ) + if err != nil { + return err + } + } + } + + err = tw.Flush() + if err != nil { + return err + } + } + } + + return nil +} + +// outputJSON outputs the syscall info in JSON format. +func outputJSON(w io.Writer, info CompatibilityInfo) error { + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(info) +} + +// numberedRow is aCSV row annotated by syscall number (used for sorting) +type numberedRow struct { + num uintptr + row []string +} + +// outputCSV outputs the syscall info in tabular format. +func outputCSV(w io.Writer, info CompatibilityInfo) error { + csvWriter := csv.NewWriter(w) + + // Linux + for osName, osInfo := range info { + for archName, archInfo := range osInfo { + // Sort the syscalls for output in the table. + sortedCalls := []numberedRow{} + for _, sc := range archInfo.Syscalls { + // Add issue urls to note. + note := sc.Note + for _, url := range sc.URLs { + note = fmt.Sprintf("%s\nSee: %s", note, url) + } + + sortedCalls = append(sortedCalls, numberedRow{ + num: sc.num, + row: []string{ + osName, + archName, + strconv.FormatInt(int64(sc.num), 10), + sc.Name, + sc.Support, + note, + }, + }) + } + sort.Slice(sortedCalls, func(i, j int) bool { + return sortedCalls[i].num < sortedCalls[j].num + }) + + // Write the header + err := csvWriter.Write([]string{ + "OS", + "Arch", + "Num", + "Name", + "Support", + "Note", + }) + if err != nil { + return err + } + + // Write each syscall entry + for _, sc := range sortedCalls { + err = csvWriter.Write(sc.row) + if err != nil { + return err + } + } + + csvWriter.Flush() + err = csvWriter.Error() + if err != nil { + return err + } + } + } + + return nil +} diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go new file mode 100644 index 000000000..29c0a15f0 --- /dev/null +++ b/runsc/cmd/wait.go @@ -0,0 +1,127 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "encoding/json" + "os" + "syscall" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/container" + "gvisor.dev/gvisor/runsc/flag" +) + +const ( + unsetPID = -1 +) + +// Wait implements subcommands.Command for the "wait" command. +type Wait struct { + rootPID int + pid int +} + +// Name implements subcommands.Command.Name. +func (*Wait) Name() string { + return "wait" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Wait) Synopsis() string { + return "wait on a process inside a container" +} + +// Usage implements subcommands.Command.Usage. +func (*Wait) Usage() string { + return `wait [flags] <container id>` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (wt *Wait) SetFlags(f *flag.FlagSet) { + f.IntVar(&wt.rootPID, "rootpid", unsetPID, "select a PID in the sandbox root PID namespace to wait on instead of the container's root process") + f.IntVar(&wt.pid, "pid", unsetPID, "select a PID in the container's PID namespace to wait on instead of the container's root process") +} + +// Execute implements subcommands.Command.Execute. It waits for a process in a +// container to exit before returning. +func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + // You can't specify both -pid and -rootpid. + if wt.rootPID != unsetPID && wt.pid != unsetPID { + Fatalf("only one of -pid and -rootPid can be set") + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + c, err := container.Load(conf.RootDir, id) + if err != nil { + Fatalf("loading container: %v", err) + } + + var waitStatus syscall.WaitStatus + switch { + // Wait on the whole container. + case wt.rootPID == unsetPID && wt.pid == unsetPID: + ws, err := c.Wait() + if err != nil { + Fatalf("waiting on container %q: %v", c.ID, err) + } + waitStatus = ws + // Wait on a PID in the root PID namespace. + case wt.rootPID != unsetPID: + ws, err := c.WaitRootPID(int32(wt.rootPID)) + if err != nil { + Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) + } + waitStatus = ws + // Wait on a PID in the container's PID namespace. + case wt.pid != unsetPID: + ws, err := c.WaitPID(int32(wt.pid)) + if err != nil { + Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err) + } + waitStatus = ws + } + result := waitResult{ + ID: id, + ExitStatus: exitStatus(waitStatus), + } + // Write json-encoded wait result directly to stdout. + if err := json.NewEncoder(os.Stdout).Encode(result); err != nil { + Fatalf("marshaling wait result: %v", err) + } + return subcommands.ExitSuccess +} + +type waitResult struct { + ID string `json:"id"` + ExitStatus int `json:"exitStatus"` +} + +// exitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly. +func exitStatus(status syscall.WaitStatus) int { + if status.Signaled() { + return 128 + int(status.Signal()) + } + return status.ExitStatus() +} diff --git a/runsc/console/BUILD b/runsc/console/BUILD new file mode 100644 index 000000000..06924bccd --- /dev/null +++ b/runsc/console/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "console", + srcs = [ + "console.go", + ], + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "@com_github_kr_pty//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/console/console.go b/runsc/console/console.go new file mode 100644 index 000000000..64b23639a --- /dev/null +++ b/runsc/console/console.go @@ -0,0 +1,63 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package console contains utilities for working with pty consols in runsc. +package console + +import ( + "fmt" + "net" + "os" + + "github.com/kr/pty" + "golang.org/x/sys/unix" +) + +// NewWithSocket creates pty master/slave pair, sends the master FD over the given +// socket, and returns the slave. +func NewWithSocket(socketPath string) (*os.File, error) { + // Create a new pty master and slave. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, fmt.Errorf("opening pty: %v", err) + } + defer ptyMaster.Close() + + // Get a connection to the socket path. + conn, err := net.Dial("unix", socketPath) + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err) + } + defer conn.Close() + uc, ok := conn.(*net.UnixConn) + if !ok { + ptySlave.Close() + return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) + } + socket, err := uc.File() + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err) + } + defer socket.Close() + + // Send the master FD over the connection. + msg := unix.UnixRights(int(ptyMaster.Fd())) + if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { + ptySlave.Close() + return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err) + } + return ptySlave, nil +} diff --git a/runsc/container/BUILD b/runsc/container/BUILD new file mode 100644 index 000000000..49cfb0837 --- /dev/null +++ b/runsc/container/BUILD @@ -0,0 +1,74 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "container", + srcs = [ + "container.go", + "hook.go", + "state_file.go", + "status.go", + ], + visibility = [ + "//runsc:__subpackages__", + "//test:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/cleanup", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/sighandling", + "//pkg/sync", + "//runsc/boot", + "//runsc/cgroup", + "//runsc/sandbox", + "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_gofrs_flock//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) + +go_test( + name = "container_test", + size = "large", + srcs = [ + "console_test.go", + "container_norace_test.go", + "container_race_test.go", + "container_test.go", + "multi_container_test.go", + "shared_volume_test.go", + ], + data = [ + "//runsc", + "//test/cmd/test_app", + ], + library = ":container", + shard_count = 10, + tags = [ + "requires-kvm", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/bits", + "//pkg/cleanup", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sync", + "//pkg/test/testutil", + "//pkg/unet", + "//pkg/urpc", + "//runsc/boot", + "//runsc/boot/platforms", + "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_kr_pty//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go new file mode 100644 index 000000000..3813c6b93 --- /dev/null +++ b/runsc/container/console_test.go @@ -0,0 +1,480 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "fmt" + "io" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/kr/pty" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/pkg/urpc" +) + +// socketPath creates a path inside bundleDir and ensures that the returned +// path is under 108 charactors (the unix socket path length limit), +// relativizing the path if necessary. +func socketPath(bundleDir string) (string, error) { + path := filepath.Join(bundleDir, "socket") + cwd, err := os.Getwd() + if err != nil { + return "", fmt.Errorf("error getting cwd: %v", err) + } + relPath, err := filepath.Rel(cwd, path) + if err != nil { + return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err) + } + if len(path) > len(relPath) { + path = relPath + } + const maxPathLen = 108 + if len(path) > maxPathLen { + return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path) + } + return path, nil +} + +// createConsoleSocket creates a socket at the given path that will receive a +// console fd from the sandbox. If an error occurs, t.Fatalf will be called. +// The function returning should be deferred as cleanup. +func createConsoleSocket(t *testing.T, path string) (*unet.ServerSocket, func()) { + t.Helper() + srv, err := unet.BindAndListen(path, false) + if err != nil { + t.Fatalf("error binding and listening to socket %q: %v", path, err) + } + + cleanup := func() { + // Log errors; nothing can be done. + if err := srv.Close(); err != nil { + t.Logf("error closing socket %q: %v", path, err) + } + if err := os.Remove(path); err != nil { + t.Logf("error removing socket %q: %v", path, err) + } + } + + return srv, cleanup +} + +// receiveConsolePTY accepts a connection on the server socket and reads fds. +// It fails if more than one FD is received, or if the FD is not a PTY. It +// returns the PTY master file. +func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) { + sock, err := srv.Accept() + if err != nil { + return nil, fmt.Errorf("error accepting socket connection: %v", err) + } + + // Allow 3 fds to be received. We only expect 1. + r := sock.Reader(true /* blocking */) + r.EnableFDs(1) + + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + return nil, fmt.Errorf("error reading from socket connection: %v", err) + } + + // We should have gotten a control message. + fds, err := r.ExtractFDs() + if err != nil { + return nil, fmt.Errorf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + return nil, fmt.Errorf("got %d fds from socket, wanted 1", len(fds)) + } + + // Verify that the fd is a terminal. + if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + return nil, fmt.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) + } + + return os.NewFile(uintptr(fds[0]), "pty_master"), nil +} + +// Test that an pty FD is sent over the console socket if one is provided. +func TestConsoleSocket(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("true") + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: sock, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + ptyMaster.Close() + }) + } +} + +// Test that job control signals work on a console created with "exec -ti". +func TestJobControlSignalExec(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + conf := testutil.TestConfig(t) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Create a pty master/slave. The slave will be passed to the exec + // process. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + t.Fatalf("error opening pty: %v", err) + } + defer ptyMaster.Close() + defer ptySlave.Close() + + // Exec bash and attach a terminal. Note that occasionally /bin/sh + // may be a different shell or have a different configuration (such + // as disabling interactive mode and job control). Since we want to + // explicitly test interactive mode, use /bin/bash. See b/116981926. + execArgs := &control.ExecArgs{ + Filename: "/bin/bash", + // Don't let bash execute from profile or rc files, otherwise + // our PID counts get messed up. + Argv: []string{"/bin/bash", "--noprofile", "--norc"}, + // Pass the pty slave as FD 0, 1, and 2. + FilePayload: urpc.FilePayload{ + Files: []*os.File{ptySlave, ptySlave, ptySlave}, + }, + StdioIsPty: true, + } + + pid, err := c.Execute(execArgs) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if pid != 2 { + t.Fatalf("exec got pid %d, wanted %d", pid, 2) + } + + // Make sure all the processes are running. + expectedPL := []*control.Process{ + // Root container process. + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + // Bash from exec process. + {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Execute sleep. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for it to start. Sleep's PPID is bash's PID. + expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Send a SIGTERM to the foreground process for the exec PID. Note that + // although we pass in the PID of "bash", it should actually terminate + // "sleep", since that is the foreground process. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + expectedPL = expectedPL[:1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Make sure the process indicates it was killed by a SIGKILL. + ws, err := c.WaitPID(pid) + if err != nil { + t.Errorf("waiting on container failed: %v", err) + } + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + +// Test that job control signals work on a console created with "run -ti". +func TestJobControlSignalRootContainer(t *testing.T) { + conf := testutil.TestConfig(t) + // Don't let bash execute from profile or rc files, otherwise our PID + // counts get messed up. + spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc") + spec.Process.Terminal = true + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: sock, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + + // Get the PTY master. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + defer ptyMaster.Close() + + // Bash output as well as sandbox output will be written to the PTY + // file. Writes after a certain point will block unless we drain the + // PTY, so we must continually copy from it. + // + // We log the output to stderr for debugabilitly, and also to a buffer, + // since we wait on particular output from bash below. We use a custom + // blockingBuffer which is thread-safe and also blocks on Read calls, + // which makes this a suitable Reader for WaitUntilRead. + ptyBuf := newBlockingBuffer() + tee := io.TeeReader(ptyMaster, ptyBuf) + go io.Copy(os.Stderr, tee) + + // Start the container. + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Start waiting for the container to exit in a goroutine. We do this + // very early, otherwise it might exit before we have a chance to call + // Wait. + var ( + ws syscall.WaitStatus + wg sync.WaitGroup + ) + wg.Add(1) + go func() { + var err error + ws, err = c.Wait() + if err != nil { + t.Errorf("error waiting on container: %v", err) + } + wg.Done() + }() + + // Wait for bash to start. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}}, + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + // Execute sleep via the terminal. + ptyMaster.Write([]byte("sleep 100\n")) + + // Wait for sleep to start. + expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}}) + if err := waitForProcessList(c, expectedPL); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + // Reset the pty buffer, so there is less output for us to scan later. + ptyBuf.Reset() + + // Send a SIGTERM to the foreground process. We pass PID=0, indicating + // that the root process should be killed. However, by setting + // fgProcess=true, the signal should actually be sent to sleep. + if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGTERM, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Sleep process should be gone. + expectedPL = expectedPL[:len(expectedPL)-1] + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Sleep is dead, but it may take more time for bash to notice and + // change the foreground process back to itself. We know it is done + // when bash writes "Terminated" to the pty. + if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil { + t.Fatalf("bash did not take over pty: %v", err) + } + + // Send a SIGKILL to the foreground process again. This time "bash" + // should be killed. We use SIGKILL instead of SIGTERM or SIGINT + // because bash ignores those. + if err := c.Sandbox.SignalProcess(c.ID, 0 /* PID */, syscall.SIGKILL, true /* fgProcess */); err != nil { + t.Fatalf("error signaling container: %v", err) + } + + // Wait for the sandbox to exit. It should exit with a SIGKILL status. + wg.Wait() + if !ws.Signaled() { + t.Error("ws.Signaled() got false, want true") + } + if got, want := ws.Signal(), syscall.SIGKILL; got != want { + t.Errorf("ws.Signal() got %v, want %v", got, want) + } +} + +// blockingBuffer is a thread-safe buffer that blocks when reading if the +// buffer is empty. It implements io.ReadWriter. +type blockingBuffer struct { + // A send to readCh indicates that a previously empty buffer now has + // data for reading. + readCh chan struct{} + + // mu protects buf. + mu sync.Mutex + buf bytes.Buffer +} + +func newBlockingBuffer() *blockingBuffer { + return &blockingBuffer{ + readCh: make(chan struct{}, 1), + } +} + +// Write implements Writer.Write. +func (bb *blockingBuffer) Write(p []byte) (int, error) { + bb.mu.Lock() + defer bb.mu.Unlock() + l := bb.buf.Len() + n, err := bb.buf.Write(p) + if l == 0 && n > 0 { + // New data! + bb.readCh <- struct{}{} + } + return n, err +} + +// Read implements Reader.Read. It will block until data is available. +func (bb *blockingBuffer) Read(p []byte) (int, error) { + for { + bb.mu.Lock() + n, err := bb.buf.Read(p) + if n > 0 || err != io.EOF { + if bb.buf.Len() == 0 { + // Reset the readCh. + select { + case <-bb.readCh: + default: + } + } + bb.mu.Unlock() + return n, err + } + bb.mu.Unlock() + + // Wait for new data. + <-bb.readCh + } +} + +// Reset resets the buffer. +func (bb *blockingBuffer) Reset() { + bb.mu.Lock() + defer bb.mu.Unlock() + bb.buf.Reset() + // Reset the readCh. + select { + case <-bb.readCh: + default: + } +} diff --git a/runsc/container/container.go b/runsc/container/container.go new file mode 100644 index 000000000..6d297d0df --- /dev/null +++ b/runsc/container/container.go @@ -0,0 +1,1171 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package container creates and manipulates containers. +package container + +import ( + "context" + "errors" + "fmt" + "io/ioutil" + "os" + "os/exec" + "regexp" + "strconv" + "strings" + "syscall" + "time" + + "github.com/cenkalti/backoff" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/cgroup" + "gvisor.dev/gvisor/runsc/sandbox" + "gvisor.dev/gvisor/runsc/specutils" +) + +// validateID validates the container id. +func validateID(id string) error { + // See libcontainer/factory_linux.go. + idRegex := regexp.MustCompile(`^[\w+-\.]+$`) + if !idRegex.MatchString(id) { + return fmt.Errorf("invalid container id: %v", id) + } + return nil +} + +// Container represents a containerized application. When running, the +// container is associated with a single Sandbox. +// +// Container metadata can be saved and loaded to disk. Within a root directory, +// we maintain subdirectories for each container named with the container id. +// The container metadata is stored as a json within the container directory +// in a file named "meta.json". This metadata format is defined by us and is +// not part of the OCI spec. +// +// Containers must write their metadata files after any change to their internal +// states. The entire container directory is deleted when the container is +// destroyed. +// +// When the container is stopped, all processes that belong to the container +// must be stopped before Destroy() returns. containerd makes roughly the +// following calls to stop a container: +// - First it attempts to kill the container process with +// 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a +// separate thread, it's waiting on the container. As soon as the wait +// returns, it moves on to the next step: +// - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to +// the container. 'kill --all SIGKILL' waits for all processes before +// returning. +// - Containerd waits for stdin, stdout and stderr to drain and be closed. +// - It calls 'runsc delete'. runc implementation kills --all SIGKILL once +// again just to be sure, waits, and then proceeds with remaining teardown. +// +type Container struct { + // ID is the container ID. + ID string `json:"id"` + + // Spec is the OCI runtime spec that configures this container. + Spec *specs.Spec `json:"spec"` + + // BundleDir is the directory containing the container bundle. + BundleDir string `json:"bundleDir"` + + // CreatedAt is the time the container was created. + CreatedAt time.Time `json:"createdAt"` + + // Owner is the container owner. + Owner string `json:"owner"` + + // ConsoleSocket is the path to a unix domain socket that will receive + // the console FD. + ConsoleSocket string `json:"consoleSocket"` + + // Status is the current container Status. + Status Status `json:"status"` + + // GoferPid is the PID of the gofer running along side the sandbox. May + // be 0 if the gofer has been killed. + GoferPid int `json:"goferPid"` + + // Sandbox is the sandbox this container is running in. It's set when the + // container is created and reset when the sandbox is destroyed. + Sandbox *sandbox.Sandbox `json:"sandbox"` + + // Saver handles load from/save to the state file safely from multiple + // processes. + Saver StateFile `json:"saver"` + + // + // Fields below this line are not saved in the state file and will not + // be preserved across commands. + // + + // goferIsChild is set if a gofer process is a child of the current process. + // + // This field isn't saved to json, because only a creator of a gofer + // process will have it as a child process. + goferIsChild bool +} + +// loadSandbox loads all containers that belong to the sandbox with the given +// ID. +func loadSandbox(rootDir, id string) ([]*Container, error) { + cids, err := List(rootDir) + if err != nil { + return nil, err + } + + // Load the container metadata. + var containers []*Container + for _, cid := range cids { + container, err := Load(rootDir, cid) + if err != nil { + // Container file may not exist if it raced with creation/deletion or + // directory was left behind. Load provides a snapshot in time, so it's + // fine to skip it. + if os.IsNotExist(err) { + continue + } + return nil, fmt.Errorf("loading container %q: %v", id, err) + } + if container.Sandbox.ID == id { + containers = append(containers, container) + } + } + return containers, nil +} + +// Load loads a container with the given id from a metadata file. partialID may +// be an abbreviation of the full container id, in which case Load loads the +// container to which id unambiguously refers to. Returns ErrNotExist if +// container doesn't exist. +func Load(rootDir, partialID string) (*Container, error) { + log.Debugf("Load container %q %q", rootDir, partialID) + if err := validateID(partialID); err != nil { + return nil, fmt.Errorf("validating id: %v", err) + } + + id, err := findContainerID(rootDir, partialID) + if err != nil { + // Preserve error so that callers can distinguish 'not found' errors. + return nil, err + } + + state := StateFile{ + RootDir: rootDir, + ID: id, + } + defer state.close() + + c := &Container{} + if err := state.load(c); err != nil { + if os.IsNotExist(err) { + // Preserve error so that callers can distinguish 'not found' errors. + return nil, err + } + return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err) + } + + // If the status is "Running" or "Created", check that the sandbox + // process still exists, and set it to Stopped if it does not. + // + // This is inherently racy. + if c.Status == Running || c.Status == Created { + // Check if the sandbox process is still running. + if !c.isSandboxRunning() { + // Sandbox no longer exists, so this container definitely does not exist. + c.changeStatus(Stopped) + } else if c.Status == Running { + // Container state should reflect the actual state of the application, so + // we don't consider gofer process here. + if err := c.SignalContainer(syscall.Signal(0), false); err != nil { + c.changeStatus(Stopped) + } + } + } + + return c, nil +} + +func findContainerID(rootDir, partialID string) (string, error) { + // Check whether the id fully specifies an existing container. + stateFile := buildStatePath(rootDir, partialID) + if _, err := os.Stat(stateFile); err == nil { + return partialID, nil + } + + // Now see whether id could be an abbreviation of exactly 1 of the + // container ids. If id is ambiguous (it could match more than 1 + // container), it is an error. + ids, err := List(rootDir) + if err != nil { + return "", err + } + rv := "" + for _, id := range ids { + if strings.HasPrefix(id, partialID) { + if rv != "" { + return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id) + } + rv = id + } + } + if rv == "" { + return "", os.ErrNotExist + } + log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv) + return rv, nil +} + +// Args is used to configure a new container. +type Args struct { + // ID is the container unique identifier. + ID string + + // Spec is the OCI spec that describes the container. + Spec *specs.Spec + + // BundleDir is the directory containing the container bundle. + BundleDir string + + // ConsoleSocket is the path to a unix domain socket that will receive + // the console FD. It may be empty. + ConsoleSocket string + + // PIDFile is the filename where the container's root process PID will be + // written to. It may be empty. + PIDFile string + + // UserLog is the filename to send user-visible logs to. It may be empty. + // + // It only applies for the init container. + UserLog string + + // Attached indicates that the sandbox lifecycle is attached with the caller. + // If the caller exits, the sandbox should exit too. + // + // It only applies for the init container. + Attached bool +} + +// New creates the container in a new Sandbox process, unless the metadata +// indicates that an existing Sandbox should be used. The caller must call +// Destroy() on the container. +func New(conf *boot.Config, args Args) (*Container, error) { + log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir) + if err := validateID(args.ID); err != nil { + return nil, err + } + + if err := os.MkdirAll(conf.RootDir, 0711); err != nil { + return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err) + } + + c := &Container{ + ID: args.ID, + Spec: args.Spec, + ConsoleSocket: args.ConsoleSocket, + BundleDir: args.BundleDir, + Status: Creating, + CreatedAt: time.Now(), + Owner: os.Getenv("USER"), + Saver: StateFile{ + RootDir: conf.RootDir, + ID: args.ID, + }, + } + // The Cleanup object cleans up partially created containers when an error + // occurs. Any errors occurring during cleanup itself are ignored. + cu := cleanup.Make(func() { _ = c.Destroy() }) + defer cu.Clean() + + // Lock the container metadata file to prevent concurrent creations of + // containers with the same id. + if err := c.Saver.lockForNew(); err != nil { + return nil, err + } + defer c.Saver.unlock() + + // If the metadata annotations indicate that this container should be + // started in an existing sandbox, we must do so. The metadata will + // indicate the ID of the sandbox, which is the same as the ID of the + // init container in the sandbox. + if isRoot(args.Spec) { + log.Debugf("Creating new sandbox for container %q", args.ID) + + // Create and join cgroup before processes are created to ensure they are + // part of the cgroup from the start (and all their children processes). + cg, err := cgroup.New(args.Spec) + if err != nil { + return nil, err + } + if cg != nil { + // If there is cgroup config, install it before creating sandbox process. + if err := cg.Install(args.Spec.Linux.Resources); err != nil { + return nil, fmt.Errorf("configuring cgroup: %v", err) + } + } + if err := runInCgroup(cg, func() error { + ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir) + if err != nil { + return err + } + + // Start a new sandbox for this container. Any errors after this point + // must destroy the container. + sandArgs := &sandbox.Args{ + ID: args.ID, + Spec: args.Spec, + BundleDir: args.BundleDir, + ConsoleSocket: args.ConsoleSocket, + UserLog: args.UserLog, + IOFiles: ioFiles, + MountsFile: specFile, + Cgroup: cg, + Attached: args.Attached, + } + sand, err := sandbox.New(conf, sandArgs) + if err != nil { + return err + } + c.Sandbox = sand + return nil + + }); err != nil { + return nil, err + } + } else { + // This is sort of confusing. For a sandbox with a root + // container and a child container in it, runsc sees: + // * A container struct whose sandbox ID is equal to the + // container ID. This is the root container that is tied to + // the creation of the sandbox. + // * A container struct whose sandbox ID is equal to the above + // container/sandbox ID, but that has a different container + // ID. This is the child container. + sbid, ok := specutils.SandboxID(args.Spec) + if !ok { + return nil, fmt.Errorf("no sandbox ID found when creating container") + } + log.Debugf("Creating new container %q in sandbox %q", c.ID, sbid) + + // Find the sandbox associated with this ID. + sb, err := Load(conf.RootDir, sbid) + if err != nil { + return nil, err + } + c.Sandbox = sb.Sandbox + if err := c.Sandbox.CreateContainer(c.ID); err != nil { + return nil, err + } + } + c.changeStatus(Created) + + // Save the metadata file. + if err := c.saveLocked(); err != nil { + return nil, err + } + + // Write the PID file. Containerd considers the create complete after + // this file is created, so it must be the last thing we do. + if args.PIDFile != "" { + if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { + return nil, fmt.Errorf("error writing PID file: %v", err) + } + } + + cu.Release() + return c, nil +} + +// Start starts running the containerized process inside the sandbox. +func (c *Container) Start(conf *boot.Config) error { + log.Debugf("Start container %q", c.ID) + + if err := c.Saver.lock(); err != nil { + return err + } + unlock := cleanup.Make(func() { c.Saver.unlock() }) + defer unlock.Clean() + + if err := c.requireStatus("start", Created); err != nil { + return err + } + + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container" -OCI spec. + if c.Spec.Hooks != nil { + if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { + return err + } + } + + if isRoot(c.Spec) { + if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil { + return err + } + } else { + // Join cgroup to start gofer process to ensure it's part of the cgroup from + // the start (and all their children processes). + if err := runInCgroup(c.Sandbox.Cgroup, func() error { + // Create the gofer process. + ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + if err != nil { + return err + } + defer mountsFile.Close() + + cleanMounts, err := specutils.ReadMounts(mountsFile) + if err != nil { + return fmt.Errorf("reading mounts file: %v", err) + } + c.Spec.Mounts = cleanMounts + + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) + }); err != nil { + return err + } + } + + // "If any poststart hook fails, the runtime MUST log a warning, but + // the remaining hooks and lifecycle continue as if the hook had + // succeeded" -OCI spec. + if c.Spec.Hooks != nil { + executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) + } + + c.changeStatus(Running) + if err := c.saveLocked(); err != nil { + return err + } + + // Release lock before adjusting OOM score because the lock is acquired there. + unlock.Clean() + + // Adjust the oom_score_adj for sandbox. This must be done after saveLocked(). + if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil { + return err + } + + // Set container's oom_score_adj to the gofer since it is dedicated to + // the container, in case the gofer uses up too much memory. + return c.adjustGoferOOMScoreAdj() +} + +// Restore takes a container and replaces its kernel and file system +// to restore a container from its state file. +func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error { + log.Debugf("Restore container %q", c.ID) + if err := c.Saver.lock(); err != nil { + return err + } + defer c.Saver.unlock() + + if err := c.requireStatus("restore", Created); err != nil { + return err + } + + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container" -OCI spec. + if c.Spec.Hooks != nil { + if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { + return err + } + } + + if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil { + return err + } + c.changeStatus(Running) + return c.saveLocked() +} + +// Run is a helper that calls Create + Start + Wait. +func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) { + log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir) + c, err := New(conf, args) + if err != nil { + return 0, fmt.Errorf("creating container: %v", err) + } + // Clean up partially created container if an error occurs. + // Any errors returned by Destroy() itself are ignored. + cu := cleanup.Make(func() { + c.Destroy() + }) + defer cu.Clean() + + if conf.RestoreFile != "" { + log.Debugf("Restore: %v", conf.RestoreFile) + if err := c.Restore(args.Spec, conf, conf.RestoreFile); err != nil { + return 0, fmt.Errorf("starting container: %v", err) + } + } else { + if err := c.Start(conf); err != nil { + return 0, fmt.Errorf("starting container: %v", err) + } + } + if args.Attached { + return c.Wait() + } + cu.Release() + return 0, nil +} + +// Execute runs the specified command in the container. It returns the PID of +// the newly created process. +func (c *Container) Execute(args *control.ExecArgs) (int32, error) { + log.Debugf("Execute in container %q, args: %+v", c.ID, args) + if err := c.requireStatus("execute in", Created, Running); err != nil { + return 0, err + } + args.ContainerID = c.ID + return c.Sandbox.Execute(args) +} + +// Event returns events for the container. +func (c *Container) Event() (*boot.Event, error) { + log.Debugf("Getting events for container %q", c.ID) + if err := c.requireStatus("get events for", Created, Running, Paused); err != nil { + return nil, err + } + return c.Sandbox.Event(c.ID) +} + +// SandboxPid returns the Pid of the sandbox the container is running in, or -1 if the +// container is not running. +func (c *Container) SandboxPid() int { + if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { + return -1 + } + return c.Sandbox.Pid +} + +// Wait waits for the container to exit, and returns its WaitStatus. +// Call to wait on a stopped container is needed to retrieve the exit status +// and wait returns immediately. +func (c *Container) Wait() (syscall.WaitStatus, error) { + log.Debugf("Wait on container %q", c.ID) + return c.Sandbox.Wait(c.ID) +} + +// WaitRootPID waits for process 'pid' in the sandbox's PID namespace and +// returns its WaitStatus. +func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { + log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID) + if !c.isSandboxRunning() { + return 0, fmt.Errorf("sandbox is not running") + } + return c.Sandbox.WaitPID(c.Sandbox.ID, pid) +} + +// WaitPID waits for process 'pid' in the container's PID namespace and returns +// its WaitStatus. +func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { + log.Debugf("Wait on PID %d in container %q", pid, c.ID) + if !c.isSandboxRunning() { + return 0, fmt.Errorf("sandbox is not running") + } + return c.Sandbox.WaitPID(c.ID, pid) +} + +// SignalContainer sends the signal to the container. If all is true and signal +// is SIGKILL, then waits for all processes to exit before returning. +// SignalContainer returns an error if the container is already stopped. +// TODO(b/113680494): Distinguish different error types. +func (c *Container) SignalContainer(sig syscall.Signal, all bool) error { + log.Debugf("Signal container %q: %v", c.ID, sig) + // Signaling container in Stopped state is allowed. When all=false, + // an error will be returned anyway; when all=true, this allows + // sending signal to other processes inside the container even + // after the init process exits. This is especially useful for + // container cleanup. + if err := c.requireStatus("signal", Running, Stopped); err != nil { + return err + } + if !c.isSandboxRunning() { + return fmt.Errorf("sandbox is not running") + } + return c.Sandbox.SignalContainer(c.ID, sig, all) +} + +// SignalProcess sends sig to a specific process in the container. +func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error { + log.Debugf("Signal process %d in container %q: %v", pid, c.ID, sig) + if err := c.requireStatus("signal a process inside", Running); err != nil { + return err + } + if !c.isSandboxRunning() { + return fmt.Errorf("sandbox is not running") + } + return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false) +} + +// ForwardSignals forwards all signals received by the current process to the +// container process inside the sandbox. It returns a function that will stop +// forwarding signals. +func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() { + log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) + stop := sighandling.StartSignalForwarding(func(sig linux.Signal) { + log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", sig, c.ID, pid, fgProcess) + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.Signal(sig), fgProcess); err != nil { + log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err) + } + }) + return func() { + log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) + stop() + } +} + +// Checkpoint sends the checkpoint call to the container. +// The statefile will be written to f, the file at the specified image-path. +func (c *Container) Checkpoint(f *os.File) error { + log.Debugf("Checkpoint container %q", c.ID) + if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil { + return err + } + return c.Sandbox.Checkpoint(c.ID, f) +} + +// Pause suspends the container and its kernel. +// The call only succeeds if the container's status is created or running. +func (c *Container) Pause() error { + log.Debugf("Pausing container %q", c.ID) + if err := c.Saver.lock(); err != nil { + return err + } + defer c.Saver.unlock() + + if c.Status != Created && c.Status != Running { + return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status) + } + + if err := c.Sandbox.Pause(c.ID); err != nil { + return fmt.Errorf("pausing container: %v", err) + } + c.changeStatus(Paused) + return c.saveLocked() +} + +// Resume unpauses the container and its kernel. +// The call only succeeds if the container's status is paused. +func (c *Container) Resume() error { + log.Debugf("Resuming container %q", c.ID) + if err := c.Saver.lock(); err != nil { + return err + } + defer c.Saver.unlock() + + if c.Status != Paused { + return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) + } + if err := c.Sandbox.Resume(c.ID); err != nil { + return fmt.Errorf("resuming container: %v", err) + } + c.changeStatus(Running) + return c.saveLocked() +} + +// State returns the metadata of the container. +func (c *Container) State() specs.State { + return specs.State{ + Version: specs.Version, + ID: c.ID, + Status: c.Status.String(), + Pid: c.SandboxPid(), + Bundle: c.BundleDir, + } +} + +// Processes retrieves the list of processes and associated metadata inside a +// container. +func (c *Container) Processes() ([]*control.Process, error) { + if err := c.requireStatus("get processes of", Running, Paused); err != nil { + return nil, err + } + return c.Sandbox.Processes(c.ID) +} + +// Destroy stops all processes and frees all resources associated with the +// container. +func (c *Container) Destroy() error { + log.Debugf("Destroy container %q", c.ID) + + if err := c.Saver.lock(); err != nil { + return err + } + defer func() { + c.Saver.unlock() + c.Saver.close() + }() + + // Stored for later use as stop() sets c.Sandbox to nil. + sb := c.Sandbox + + // We must perform the following cleanup steps: + // * stop the container and gofer processes, + // * remove the container filesystem on the host, and + // * delete the container metadata directory. + // + // It's possible for one or more of these steps to fail, but we should + // do our best to perform all of the cleanups. Hence, we keep a slice + // of errors return their concatenation. + var errs []string + if err := c.stop(); err != nil { + err = fmt.Errorf("stopping container: %v", err) + log.Warningf("%v", err) + errs = append(errs, err.Error()) + } + + if err := c.Saver.destroy(); err != nil { + err = fmt.Errorf("deleting container state files: %v", err) + log.Warningf("%v", err) + errs = append(errs, err.Error()) + } + + c.changeStatus(Stopped) + + // Adjust oom_score_adj for the sandbox. This must be done after the container + // is stopped and the directory at c.Root is removed. Adjustment can be + // skipped if the root container is exiting, because it brings down the entire + // sandbox. + // + // Use 'sb' to tell whether it has been executed before because Destroy must + // be idempotent. + if sb != nil && !isRoot(c.Spec) { + if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil { + errs = append(errs, err.Error()) + } + } + + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had + // succeeded" - OCI spec. + // + // Based on the OCI, "The post-stop hooks MUST be called after the container + // is deleted but before the delete operation returns" + // Run it here to: + // 1) Conform to the OCI. + // 2) Make sure it only runs once, because the root has been deleted, the + // container can't be loaded again. + if c.Spec.Hooks != nil { + executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) + } + + if len(errs) == 0 { + return nil + } + return fmt.Errorf(strings.Join(errs, "\n")) +} + +// saveLocked saves the container metadata to a file. +// +// Precondition: container must be locked with container.lock(). +func (c *Container) saveLocked() error { + log.Debugf("Save container %q", c.ID) + if err := c.Saver.saveLocked(c); err != nil { + return fmt.Errorf("saving container metadata: %v", err) + } + return nil +} + +// stop stops the container (for regular containers) or the sandbox (for +// root containers), and waits for the container or sandbox and the gofer +// to stop. If any of them doesn't stop before timeout, an error is returned. +func (c *Container) stop() error { + var cgroup *cgroup.Cgroup + + if c.Sandbox != nil { + log.Debugf("Destroying container %q", c.ID) + if err := c.Sandbox.DestroyContainer(c.ID); err != nil { + return fmt.Errorf("destroying container %q: %v", c.ID, err) + } + // Only uninstall cgroup for sandbox stop. + if c.Sandbox.IsRootContainer(c.ID) { + cgroup = c.Sandbox.Cgroup + } + // Only set sandbox to nil after it has been told to destroy the container. + c.Sandbox = nil + } + + // Try killing gofer if it does not exit with container. + if c.GoferPid != 0 { + log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) + if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { + // The gofer may already be stopped, log the error. + log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err) + } + } + + if err := c.waitForStopped(); err != nil { + return err + } + + // Gofer is running in cgroups, so Cgroup.Uninstall has to be called after it. + if cgroup != nil { + if err := cgroup.Uninstall(); err != nil { + return err + } + } + return nil +} + +func (c *Container) waitForStopped() error { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + op := func() error { + if c.isSandboxRunning() { + if err := c.SignalContainer(syscall.Signal(0), false); err == nil { + return fmt.Errorf("container is still running") + } + } + if c.GoferPid == 0 { + return nil + } + if c.goferIsChild { + // The gofer process is a child of the current process, + // so we can wait it and collect its zombie. + wpid, err := syscall.Wait4(int(c.GoferPid), nil, syscall.WNOHANG, nil) + if err != nil { + return fmt.Errorf("error waiting the gofer process: %v", err) + } + if wpid == 0 { + return fmt.Errorf("gofer is still running") + } + + } else if err := syscall.Kill(c.GoferPid, 0); err == nil { + return fmt.Errorf("gofer is still running") + } + c.GoferPid = 0 + return nil + } + return backoff.Retry(op, b) +} + +func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) { + // Start with the general config flags. + args := conf.ToFlags() + + var goferEnds []*os.File + + // nextFD is the next available file descriptor for the gofer process. + // It starts at 3 because 0-2 are used by stdin/stdout/stderr. + nextFD := 3 + + if conf.LogFilename != "" { + logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return nil, nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) + } + defer logFile.Close() + goferEnds = append(goferEnds, logFile) + args = append(args, "--log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.DebugLog != "" { + test := "" + if len(conf.TestOnlyTestNameEnv) != 0 { + // Fetch test name if one is provided and the test only flag was set. + if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok { + test = t + } + } + debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer", test) + if err != nil { + return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) + } + defer debugLogFile.Close() + goferEnds = append(goferEnds, debugLogFile) + args = append(args, "--debug-log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + args = append(args, "gofer", "--bundle", bundleDir) + if conf.Overlay { + args = append(args, "--panic-on-write=true") + } + + // Open the spec file to donate to the sandbox. + specFile, err := specutils.OpenSpec(bundleDir) + if err != nil { + return nil, nil, fmt.Errorf("opening spec file: %v", err) + } + defer specFile.Close() + goferEnds = append(goferEnds, specFile) + args = append(args, "--spec-fd="+strconv.Itoa(nextFD)) + nextFD++ + + // Create pipe that allows gofer to send mount list to sandbox after all paths + // have been resolved. + mountsSand, mountsGofer, err := os.Pipe() + if err != nil { + return nil, nil, err + } + defer mountsGofer.Close() + goferEnds = append(goferEnds, mountsGofer) + args = append(args, fmt.Sprintf("--mounts-fd=%d", nextFD)) + nextFD++ + + // Add root mount and then add any other additional mounts. + mountCount := 1 + for _, m := range spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + + sandEnds := make([]*os.File, 0, mountCount) + for i := 0; i < mountCount; i++ { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", nextFD)) + nextFD++ + } + + binPath := specutils.ExePath + cmd := exec.Command(binPath, args...) + cmd.ExtraFiles = goferEnds + cmd.Args[0] = "runsc-gofer" + + // Enter new namespaces to isolate from the rest of the system. Don't unshare + // cgroup because gofer is added to a cgroup in the caller's namespace. + nss := []specs.LinuxNamespace{ + {Type: specs.IPCNamespace}, + {Type: specs.MountNamespace}, + {Type: specs.NetworkNamespace}, + {Type: specs.PIDNamespace}, + {Type: specs.UTSNamespace}, + } + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec) + nss = append(nss, userNS...) + specutils.SetUIDGIDMappings(cmd, spec) + if len(userNS) != 0 { + // We need to set UID and GID to have capabilities in a new user namespace. + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + } + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := specutils.StartInNS(cmd, nss); err != nil { + return nil, nil, fmt.Errorf("Gofer: %v", err) + } + log.Infof("Gofer started, PID: %d", cmd.Process.Pid) + c.GoferPid = cmd.Process.Pid + c.goferIsChild = true + return sandEnds, mountsSand, nil +} + +// changeStatus transitions from one status to another ensuring that the +// transition is valid. +func (c *Container) changeStatus(s Status) { + switch s { + case Creating: + // Initial state, never transitions to it. + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + + case Created: + if c.Status != Creating { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + if c.Sandbox == nil { + panic("sandbox cannot be nil") + } + + case Paused: + if c.Status != Running { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + if c.Sandbox == nil { + panic("sandbox cannot be nil") + } + + case Running: + if c.Status != Created && c.Status != Paused { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + if c.Sandbox == nil { + panic("sandbox cannot be nil") + } + + case Stopped: + if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped { + panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) + } + + default: + panic(fmt.Sprintf("invalid new state: %v", s)) + } + c.Status = s +} + +func (c *Container) isSandboxRunning() bool { + return c.Sandbox != nil && c.Sandbox.IsRunning() +} + +func (c *Container) requireStatus(action string, statuses ...Status) error { + for _, s := range statuses { + if c.Status == s { + return nil + } + } + return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status) +} + +func isRoot(spec *specs.Spec) bool { + return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer +} + +// runInCgroup executes fn inside the specified cgroup. If cg is nil, execute +// it in the current context. +func runInCgroup(cg *cgroup.Cgroup, fn func() error) error { + if cg == nil { + return fn() + } + restore, err := cg.Join() + defer restore() + if err != nil { + return err + } + return fn() +} + +// adjustGoferOOMScoreAdj sets the oom_store_adj for the container's gofer. +func (c *Container) adjustGoferOOMScoreAdj() error { + if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil { + return nil + } + return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj) +} + +// adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox. +// oom_score_adj is set to the lowest oom_score_adj among the containers +// running in the sandbox. +// +// TODO(gvisor.dev/issue/238): This call could race with other containers being +// created at the same time and end up setting the wrong oom_score_adj to the +// sandbox. Use rpc client to synchronize. +func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error { + containers, err := loadSandbox(rootDir, s.ID) + if err != nil { + return fmt.Errorf("loading sandbox containers: %v", err) + } + + // Do nothing if the sandbox has been terminated. + if len(containers) == 0 { + return nil + } + + // Get the lowest score for all containers. + var lowScore int + scoreFound := false + if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified { + // This is a single-container sandbox. Set the oom_score_adj to + // the value specified in the OCI bundle. + if containers[0].Spec.Process.OOMScoreAdj != nil { + scoreFound = true + lowScore = *containers[0].Spec.Process.OOMScoreAdj + } + } else { + for _, container := range containers { + // Special multi-container support for CRI. Ignore the root + // container when calculating oom_score_adj for the sandbox because + // it is the infrastructure (pause) container and always has a very + // low oom_score_adj. + // + // We will use OOMScoreAdj in the single-container case where the + // containerd container-type annotation is not present. + if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { + continue + } + + if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) { + scoreFound = true + lowScore = *container.Spec.Process.OOMScoreAdj + } + } + } + + // If the container is destroyed and remaining containers have no + // oomScoreAdj specified then we must revert to the oom_score_adj of the + // parent process. + if !scoreFound && destroy { + ppid, err := specutils.GetParentPid(s.Pid) + if err != nil { + return fmt.Errorf("getting parent pid of sandbox pid %d: %v", s.Pid, err) + } + pScore, err := specutils.GetOOMScoreAdj(ppid) + if err != nil { + return fmt.Errorf("getting oom_score_adj of parent %d: %v", ppid, err) + } + + scoreFound = true + lowScore = pScore + } + + // Only set oom_score_adj if one of the containers has oom_score_adj set + // in the OCI bundle. If not, we need to inherit the parent process's + // oom_score_adj. + // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process + if !scoreFound { + return nil + } + + // Set the lowest of all containers oom_score_adj to the sandbox. + return setOOMScoreAdj(s.Pid, lowScore) +} + +// setOOMScoreAdj sets oom_score_adj to the given value for the given PID. +// /proc must be available and mounted read-write. scoreAdj should be between +// -1000 and 1000. It's a noop if the process has already exited. +func setOOMScoreAdj(pid int, scoreAdj int) error { + f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644) + if err != nil { + // Ignore NotExist errors because it can race with process exit. + if os.IsNotExist(err) { + log.Warningf("Process (%d) not found setting oom_score_adj", pid) + return nil + } + return err + } + defer f.Close() + if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil { + if errors.Is(err, syscall.ESRCH) { + log.Warningf("Process (%d) exited while setting oom_score_adj", pid) + return nil + } + return fmt.Errorf("setting oom_score_adj to %q: %v", scoreAdj, err) + } + return nil +} diff --git a/runsc/container/container_norace_test.go b/runsc/container/container_norace_test.go new file mode 100644 index 000000000..838c1e20a --- /dev/null +++ b/runsc/container/container_norace_test.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !race + +package container + +// Allow both kvm and ptrace for non-race builds. +var platformOptions = []configOption{ptrace, kvm} diff --git a/runsc/container/container_race_test.go b/runsc/container/container_race_test.go new file mode 100644 index 000000000..9fb4c4fc0 --- /dev/null +++ b/runsc/container/container_race_test.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package container + +// Only enabled ptrace with race builds. +var platformOptions = []configOption{ptrace} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go new file mode 100644 index 000000000..cd76645bd --- /dev/null +++ b/runsc/container/container_test.go @@ -0,0 +1,2348 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "flag" + "fmt" + "io" + "io/ioutil" + "math" + "os" + "path" + "path/filepath" + "reflect" + "strconv" + "strings" + "syscall" + "testing" + "time" + + "github.com/cenkalti/backoff" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" + "gvisor.dev/gvisor/runsc/specutils" +) + +// waitForProcessList waits for the given process list to show up in the container. +func waitForProcessList(cont *Container, want []*control.Process) error { + cb := func() error { + got, err := cont.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + if !procListsEqual(got, want) { + return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) + } + return nil + } + // Gives plenty of time as tests can run slow under --race. + return testutil.Poll(cb, 30*time.Second) +} + +func waitForProcessCount(cont *Container, want int) error { + cb := func() error { + pss, err := cont.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + if got := len(pss); got != want { + log.Infof("Waiting for process count to reach %d. Current: %d", want, got) + return fmt.Errorf("wrong process count, got: %d, want: %d", got, want) + } + return nil + } + // Gives plenty of time as tests can run slow under --race. + return testutil.Poll(cb, 30*time.Second) +} + +func blockUntilWaitable(pid int) error { + _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { + var err error + _, _, err1 := syscall.Syscall6(syscall.SYS_WAITID, 1, uintptr(pid), 0, syscall.WEXITED|syscall.WNOWAIT, 0, 0) + if err1 != 0 { + err = err1 + } + return 0, 0, err + }) + return err +} + +// procListsEqual is used to check whether 2 Process lists are equal. Fields +// set to -1 in wants are ignored. Timestamp and threads fields are always +// ignored. +func procListsEqual(gots, wants []*control.Process) bool { + if len(gots) != len(wants) { + return false + } + for i := range gots { + got := gots[i] + want := wants[i] + + if want.UID != math.MaxUint32 && want.UID != got.UID { + return false + } + if want.PID != -1 && want.PID != got.PID { + return false + } + if want.PPID != -1 && want.PPID != got.PPID { + return false + } + if len(want.TTY) != 0 && want.TTY != got.TTY { + return false + } + if len(want.Cmd) != 0 && want.Cmd != got.Cmd { + return false + } + } + return true +} + +type processBuilder struct { + process control.Process +} + +func newProcessBuilder() *processBuilder { + return &processBuilder{ + process: control.Process{ + UID: math.MaxUint32, + PID: -1, + PPID: -1, + }, + } +} + +func (p *processBuilder) Cmd(cmd string) *processBuilder { + p.process.Cmd = cmd + return p +} + +func (p *processBuilder) PID(pid kernel.ThreadID) *processBuilder { + p.process.PID = pid + return p +} + +func (p *processBuilder) PPID(ppid kernel.ThreadID) *processBuilder { + p.process.PPID = ppid + return p +} + +func (p *processBuilder) UID(uid auth.KUID) *processBuilder { + p.process.UID = uid + return p +} + +func (p *processBuilder) Process() *control.Process { + return &p.process +} + +func procListToString(pl []*control.Process) string { + strs := make([]string, 0, len(pl)) + for _, p := range pl { + strs = append(strs, fmt.Sprintf("%+v", p)) + } + return fmt.Sprintf("[%s]", strings.Join(strs, ",")) +} + +// createWriteableOutputFile creates an output file that can be read and +// written to in the sandbox. +func createWriteableOutputFile(path string) (*os.File, error) { + outputFile, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) + if err != nil { + return nil, fmt.Errorf("error creating file: %q, %v", path, err) + } + + // Chmod to allow writing after umask. + if err := outputFile.Chmod(0666); err != nil { + return nil, fmt.Errorf("error chmoding file: %q, %v", path, err) + } + return outputFile, nil +} + +func waitForFileNotEmpty(f *os.File) error { + op := func() error { + fi, err := f.Stat() + if err != nil { + return err + } + if fi.Size() == 0 { + return fmt.Errorf("file %q is empty", f.Name()) + } + return nil + } + + return testutil.Poll(op, 30*time.Second) +} + +func waitForFileExist(path string) error { + op := func() error { + if _, err := os.Stat(path); os.IsNotExist(err) { + return err + } + return nil + } + + return testutil.Poll(op, 30*time.Second) +} + +// readOutputNum reads a file at given filepath and returns the int at the +// requested position. +func readOutputNum(file string, position int) (int, error) { + f, err := os.Open(file) + if err != nil { + return 0, fmt.Errorf("error opening file: %q, %v", file, err) + } + + // Ensure that there is content in output file. + if err := waitForFileNotEmpty(f); err != nil { + return 0, fmt.Errorf("error waiting for output file: %v", err) + } + + b, err := ioutil.ReadAll(f) + if err != nil { + return 0, fmt.Errorf("error reading file: %v", err) + } + if len(b) == 0 { + return 0, fmt.Errorf("error no content was read") + } + + // Strip leading null bytes caused by file offset not being 0 upon restore. + b = bytes.Trim(b, "\x00") + nums := strings.Split(string(b), "\n") + + if position >= len(nums) { + return 0, fmt.Errorf("position %v is not within the length of content %v", position, nums) + } + if position == -1 { + // Expectation of newline at the end of last position. + position = len(nums) - 2 + } + num, err := strconv.Atoi(nums[position]) + if err != nil { + return 0, fmt.Errorf("error getting number from file: %v", err) + } + return num, nil +} + +// run starts the sandbox and waits for it to exit, checking that the +// application succeeded. +func run(spec *specs.Spec, conf *boot.Config) error { + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + return fmt.Errorf("error setting up container: %v", err) + } + defer cleanup() + + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + Attached: true, + } + ws, err := Run(conf, args) + if err != nil { + return fmt.Errorf("running container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + return fmt.Errorf("container failed, waitStatus: %v", ws) + } + return nil +} + +type configOption int + +const ( + overlay configOption = iota + ptrace + kvm + nonExclusiveFS +) + +var ( + noOverlay = append(platformOptions, nonExclusiveFS) + all = append(noOverlay, overlay) +) + +// configs generates different configurations to run tests. +func configs(t *testing.T, opts ...configOption) map[string]*boot.Config { + // Always load the default config. + cs := make(map[string]*boot.Config) + for _, o := range opts { + switch o { + case overlay: + c := testutil.TestConfig(t) + c.Overlay = true + cs["overlay"] = c + case ptrace: + c := testutil.TestConfig(t) + c.Platform = platforms.Ptrace + cs["ptrace"] = c + case kvm: + c := testutil.TestConfig(t) + c.Platform = platforms.KVM + cs["kvm"] = c + case nonExclusiveFS: + c := testutil.TestConfig(t) + c.FileAccess = boot.FileAccessShared + cs["non-exclusive"] = c + default: + panic(fmt.Sprintf("unknown config option %v", o)) + } + } + return cs +} + +func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config { + vfs1 := configs(t, opts...) + + var optsVFS2 []configOption + for _, opt := range opts { + // TODO(gvisor.dev/issue/1487): Enable overlay tests. + if opt != overlay { + optsVFS2 = append(optsVFS2, opt) + } + } + + for key, value := range configs(t, optsVFS2...) { + value.VFS2 = true + vfs1[key+"VFS2"] = value + } + + return vfs1 +} + +// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. +// It verifies after each step that the container can be loaded from disk, and +// has the correct status. +func TestLifecycle(t *testing.T) { + // Start the child reaper. + childReaper := &testutil.Reaper{} + childReaper.Start() + defer childReaper.Stop() + + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + // The container will just sleep for a long time. We will kill it before + // it finishes sleeping. + spec := testutil.NewSpecWithArgs("sleep", "100") + + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sleep").Process(), + } + // Create the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + + // Load the container from disk and check the status. + c, err = Load(rootDir, args.ID) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := c.Status, Created; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // List should return the container id. + ids, err := List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) { + t.Errorf("container list got %v, want %v", got, want) + } + + // Start the container. + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Load the container from disk and check the status. + c, err = Load(rootDir, args.ID) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := c.Status, Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } + + // Wait on the container. + ch := make(chan error) + go func() { + ws, err := c.Wait() + if err != nil { + ch <- err + } + if got, want := ws.Signal(), syscall.SIGTERM; got != want { + ch <- fmt.Errorf("got signal %v, want %v", got, want) + } + ch <- nil + }() + + // Wait a bit to ensure that we've started waiting on + // the container before we signal. + time.Sleep(time.Second) + + // Send the container a SIGTERM which will cause it to stop. + if err := c.SignalContainer(syscall.SIGTERM, false); err != nil { + t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) + } + + // Wait for it to die. + if err := <-ch; err != nil { + t.Fatalf("error waiting for container: %v", err) + } + + // Load the container from disk and check the status. + c, err = Load(rootDir, args.ID) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := c.Status, Stopped; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Destroy the container. + if err := c.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + + // List should not return the container id. + ids, err = List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected container list to be empty, but got %v", ids) + } + + // Loading the container by id should fail. + if _, err = Load(rootDir, args.ID); err == nil { + t.Errorf("expected loading destroyed container to fail, but it did not") + } + }) + } +} + +// Test the we can execute the application with different path formats. +func TestExePath(t *testing.T) { + // Create two directories that will be prepended to PATH. + firstPath, err := ioutil.TempDir(testutil.TmpDir(), "first") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(firstPath) + secondPath, err := ioutil.TempDir(testutil.TmpDir(), "second") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(secondPath) + + // Create two minimal executables in the second path, two of which + // will be masked by files in first path. + for _, p := range []string{"unmasked", "masked1", "masked2"} { + path := filepath.Join(secondPath, p) + f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0777) + if err != nil { + t.Fatalf("error opening path: %v", err) + } + defer f.Close() + if _, err := io.WriteString(f, "#!/bin/true\n"); err != nil { + t.Fatalf("error writing contents: %v", err) + } + } + + // Create a non-executable file in the first path which masks a healthy + // executable in the second. + nonExecutable := filepath.Join(firstPath, "masked1") + f2, err := os.OpenFile(nonExecutable, os.O_CREATE|os.O_EXCL, 0666) + if err != nil { + t.Fatalf("error opening file: %v", err) + } + f2.Close() + + // Create a non-regular file in the first path which masks a healthy + // executable in the second. + nonRegular := filepath.Join(firstPath, "masked2") + if err := os.Mkdir(nonRegular, 0777); err != nil { + t.Fatalf("error making directory: %v", err) + } + + for name, conf := range configsWithVFS2(t, overlay) { + t.Run(name, func(t *testing.T) { + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + + {path: "unmasked", success: true}, + {path: filepath.Join(firstPath, "unmasked"), success: false}, + {path: filepath.Join(secondPath, "unmasked"), success: true}, + + {path: "masked1", success: true}, + {path: filepath.Join(firstPath, "masked1"), success: false}, + {path: filepath.Join(secondPath, "masked1"), success: true}, + + {path: "masked2", success: true}, + {path: filepath.Join(firstPath, "masked2"), success: false}, + {path: filepath.Join(secondPath, "masked2"), success: true}, + } { + t.Run(fmt.Sprintf("path=%s,success=%t", test.path, test.success), func(t *testing.T) { + spec := testutil.NewSpecWithArgs(test.path) + spec.Process.Env = []string{ + fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")), + } + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("exec: error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + Attached: true, + } + ws, err := Run(conf, args) + + if test.success { + if err != nil { + t.Errorf("exec: error running container: %v", err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: got exit status %v want %v", ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: got: no error, want: error") + } + } + }) + } + }) + } +} + +// Test the we can retrieve the application exit status from the container. +func TestAppExitStatus(t *testing.T) { + doAppExitStatus(t, false) +} + +// This is TestAppExitStatus for VFSv2. +func TestAppExitStatusVFS2(t *testing.T) { + doAppExitStatus(t, true) +} + +func doAppExitStatus(t *testing.T, vfs2 bool) { + // First container will succeed. + succSpec := testutil.NewSpecWithArgs("true") + conf := testutil.TestConfig(t) + conf.VFS2 = vfs2 + _, bundleDir, cleanup, err := testutil.SetupContainer(succSpec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: succSpec, + BundleDir: bundleDir, + Attached: true, + } + ws, err := Run(conf, args) + if err != nil { + t.Fatalf("error running container: %v", err) + } + if ws.ExitStatus() != 0 { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0) + } + + // Second container exits with non-zero status. + wantStatus := 123 + errSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) + + _, bundleDir2, cleanup2, err := testutil.SetupContainer(errSpec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup2() + + args2 := Args{ + ID: testutil.RandomContainerID(), + Spec: errSpec, + BundleDir: bundleDir2, + Attached: true, + } + ws, err = Run(conf, args2) + if err != nil { + t.Fatalf("error running container: %v", err) + } + if ws.ExitStatus() != wantStatus { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus) + } +} + +// TestExec verifies that a container can exec a new program. +func TestExec(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "exec-test") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100", dir) + spec := testutil.NewSpecWithArgs("sh", "-c", cmd) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait until sleep is running to ensure the symlink was created. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sh").Process(), + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatalf("waitForProcessList: %v", err) + } + + for _, tc := range []struct { + name string + args control.ExecArgs + }{ + { + name: "complete", + args: control.ExecArgs{ + Filename: "/bin/true", + Argv: []string{"/bin/true"}, + }, + }, + { + name: "filename", + args: control.ExecArgs{ + Filename: "/bin/true", + }, + }, + { + name: "argv", + args: control.ExecArgs{ + Argv: []string{"/bin/true"}, + }, + }, + { + name: "filename resolution", + args: control.ExecArgs{ + Filename: "true", + Envv: []string{"PATH=/bin"}, + }, + }, + { + name: "argv resolution", + args: control.ExecArgs{ + Argv: []string{"true"}, + Envv: []string{"PATH=/bin"}, + }, + }, + { + name: "argv symlink", + args: control.ExecArgs{ + Argv: []string{filepath.Join(dir, "symlink")}, + }, + }, + { + name: "working dir", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "${PWD}" != "/tmp" ]]; then exit 1; fi`}, + WorkingDirectory: "/tmp", + }, + }, + { + name: "user", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "$(id -u)" != "343" ]]; then exit 1; fi`}, + KUID: 343, + }, + }, + { + name: "group", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "$(id -g)" != "343" ]]; then exit 1; fi`}, + KGID: 343, + }, + }, + { + name: "env", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "${FOO}" != "123" ]]; then exit 1; fi`}, + Envv: []string{"FOO=123"}, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + // t.Parallel() + if ws, err := cont.executeSync(&tc.args); err != nil { + t.Fatalf("executeAsync(%+v): %v", tc.args, err) + } else if ws != 0 { + t.Fatalf("executeAsync(%+v) failed with exit: %v", tc.args, ws) + } + }) + } + }) + } +} + +// TestExecProcList verifies that a container can exec a new program and it +// shows correcly in the process list. +func TestExecProcList(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "100") + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + execArgs := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "5"}, + WorkingDirectory: "/", + KUID: uid, + } + + // Verify that "sleep 100" and "sleep 5" are running after exec. First, + // start running exec (which blocks). + ch := make(chan error) + go func() { + exitStatus, err := cont.executeSync(execArgs) + if err != nil { + ch <- err + } else if exitStatus != 0 { + ch <- fmt.Errorf("failed with exit status: %v", exitStatus) + } else { + ch <- nil + } + }() + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + newProcessBuilder().PID(1).PPID(0).Cmd("sleep").UID(0).Process(), + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").UID(uid).Process(), + } + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + // Ensure that exec finished without error. + select { + case <-time.After(10 * time.Second): + t.Fatalf("container timed out waiting for exec to finish.") + case err := <-ch: + if err != nil { + t.Errorf("container failed to exec %v: %v", args, err) + } + } + }) + } +} + +// TestKillPid verifies that we can signal individual exec'd processes. +func TestKillPid(t *testing.T) { + for name, conf := range configsWithVFS2(t, overlay) { + t.Run(name, func(t *testing.T) { + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + const nProcs = 4 + spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true") + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Verify that all processes are running. + if err := waitForProcessCount(cont, nProcs); err != nil { + t.Fatalf("timed out waiting for processes to start: %v", err) + } + + // Kill the child process with the largest PID. + procs, err := cont.Processes() + if err != nil { + t.Fatalf("failed to get process list: %v", err) + } + var pid int32 + for _, p := range procs { + if pid < int32(p.PID) { + pid = int32(p.PID) + } + } + if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil { + t.Fatalf("failed to signal process %d: %v", pid, err) + } + + // Verify that one process is gone. + if err := waitForProcessCount(cont, nProcs-1); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + procs, err = cont.Processes() + if err != nil { + t.Fatalf("failed to get process list: %v", err) + } + for _, p := range procs { + if pid == int32(p.PID) { + t.Fatalf("pid %d is still alive, which should be killed", pid) + } + } + }) + } +} + +// TestCheckpointRestore creates a container that continuously writes successive integers +// to a file. To test checkpoint and restore functionality, the container is +// checkpointed and the last number printed to the file is recorded. Then, it is restored in two +// new containers and the first number printed from these containers is checked. Both should +// be the next consecutive number after the last number from the checkpointed container. +func TestCheckpointRestore(t *testing.T) { + // Skip overlay because test requires writing to host file. + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + defer os.RemoveAll(dir) + if err := os.Chmod(dir, 0777); err != nil { + t.Fatalf("error chmoding file: %q, %v", dir, err) + } + + outputPath := filepath.Join(dir, "output") + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() + + script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath) + spec := testutil.NewSpecWithArgs("bash", "-c", script) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Set the image path, which is where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") + + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() + + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + defer os.RemoveAll(imagePath) + + lastNum, err := readOutputNum(outputPath, -1) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() + + // Restore into a new container. + args2 := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont2, err := New(conf, args2) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont2.Destroy() + + if err := cont2.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } + + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + firstNum, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. + if lastNum+1 != firstNum { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) + } + cont2.Destroy() + + // Restore into another container! + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile3, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile3.Close() + + // Restore into a new container. + args3 := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont3, err := New(conf, args3) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont3.Destroy() + + if err := cont3.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } + + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile3); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + firstNum2, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. + if lastNum+1 != firstNum2 { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) + } + cont3.Destroy() + }) + } +} + +// TestUnixDomainSockets checks that Checkpoint/Restore works in cases +// with filesystem Unix Domain Socket use. +func TestUnixDomainSockets(t *testing.T) { + // Skip overlay because test requires writing to host file. + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + // UDS path is limited to 108 chars for compatibility with older systems. + // Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is + // not exceeded. Assumes '/tmp' exists in the system. + dir, err := ioutil.TempDir("/tmp", "uds-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + defer os.RemoveAll(dir) + + outputPath := filepath.Join(dir, "uds_output") + outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() + + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + socketPath := filepath.Join(dir, "uds_socket") + defer os.Remove(socketPath) + + spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath) + spec.Process.User = specs.User{ + UID: uint32(os.Getuid()), + GID: uint32(os.Getgid()), + } + spec.Mounts = []specs.Mount{{ + Type: "bind", + Destination: dir, + Source: dir, + }} + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Set the image path, the location where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") + + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() + defer os.RemoveAll(imagePath) + + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + + // Read last number outputted before checkpoint. + lastNum, err := readOutputNum(outputPath, -1) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() + + // Restore into a new container. + argsRestore := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + contRestore, err := New(conf, argsRestore) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer contRestore.Destroy() + + if err := contRestore.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } + + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } + + // Read first number outputted after restore. + firstNum, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } + + // Check that lastNum is one less than firstNum. + if lastNum+1 != firstNum { + t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum) + } + contRestore.Destroy() + }) + } +} + +// TestPauseResume tests that we can successfully pause and resume a container. +// The container will keep touching a file to indicate it's running. The test +// pauses the container, removes the file, and checks that it doesn't get +// recreated. Then it resumes the container, verify that the file gets created +// again. +func TestPauseResume(t *testing.T) { + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock") + if err != nil { + t.Fatalf("error creating temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + running := path.Join(tmpDir, "running") + script := fmt.Sprintf("while [[ true ]]; do touch %q; sleep 0.1; done", running) + spec := testutil.NewSpecWithArgs("/bin/bash", "-c", script) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait until container starts running, observed by the existence of running + // file. + if err := waitForFileExist(running); err != nil { + t.Errorf("error waiting for container to start: %v", err) + } + + // Pause the running container. + if err := cont.Pause(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + if err := os.Remove(running); err != nil { + t.Fatalf("os.Remove(%q) failed: %v", running, err) + } + // Script touches the file every 100ms. Give a bit a time for it to run to + // catch the case that pause didn't work. + time.Sleep(200 * time.Millisecond) + if _, err := os.Stat(running); !os.IsNotExist(err) { + t.Fatalf("container did not pause: file exist check: %v", err) + } + + // Resume the running container. + if err := cont.Resume(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Verify that the file is once again created by container. + if err := waitForFileExist(running); err != nil { + t.Fatalf("error resuming container: file exist check: %v", err) + } + }) + } +} + +// TestPauseResumeStatus makes sure that the statuses are set correctly +// with calls to pause and resume and that pausing and resuming only +// occurs given the correct state. +func TestPauseResumeStatus(t *testing.T) { + spec := testutil.NewSpecWithArgs("sleep", "20") + conf := testutil.TestConfig(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Pause the running container. + if err := cont.Pause(); err != nil { + t.Errorf("error pausing container: %v", err) + } + if got, want := cont.Status, Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Try to Pause again. Should cause error. + if err := cont.Pause(); err == nil { + t.Errorf("error pausing container that was already paused: %v", err) + } + if got, want := cont.Status, Paused; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Resume the running container. + if err := cont.Resume(); err != nil { + t.Errorf("error resuming container: %v", err) + } + if got, want := cont.Status, Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Try to resume again. Should cause error. + if err := cont.Resume(); err == nil { + t.Errorf("error resuming container already running: %v", err) + } + if got, want := cont.Status, Running; got != want { + t.Errorf("container status got %v, want %v", got, want) + } +} + +// TestCapabilities verifies that: +// - Running exec as non-root UID and GID will result in an error (because the +// executable file can't be read). +// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips +// this check. +func TestCapabilities(t *testing.T) { + // Pick uid/gid different than ours. + uid := auth.KUID(os.Getuid() + 1) + gid := auth.KGID(os.Getgid() + 1) + + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("sleep", "100") + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatalf("Failed to wait for sleep to start, err: %v", err) + } + + // Create an executable that can't be run with the specified UID:GID. + // This shouldn't be callable within the container until we add the + // CAP_DAC_OVERRIDE capability to skip the access check. + exePath := filepath.Join(rootDir, "exe") + if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { + t.Fatalf("couldn't create executable: %v", err) + } + defer os.Remove(exePath) + + // Need to traverse the intermediate directory. + os.Chmod(rootDir, 0755) + + execArgs := &control.ExecArgs{ + Filename: exePath, + Argv: []string{exePath}, + WorkingDirectory: "/", + KUID: uid, + KGID: gid, + Capabilities: &auth.TaskCapabilities{}, + } + + // "exe" should fail because we don't have the necessary permissions. + if _, err := cont.executeSync(execArgs); err == nil { + t.Fatalf("container executed without error, but an error was expected") + } + + // Now we run with the capability enabled and should succeed. + execArgs.Capabilities = &auth.TaskCapabilities{ + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + } + // "exe" should not fail this time. + if _, err := cont.executeSync(execArgs); err != nil { + t.Fatalf("container failed to exec %v: %v", args, err) + } + }) + } +} + +// TestRunNonRoot checks that sandbox can be configured when running as +// non-privileged user. +func TestRunNonRoot(t *testing.T) { + for name, conf := range configsWithVFS2(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/true") + + // Set a random user/group with no access to "blocked" dir. + spec.Process.User.UID = 343 + spec.Process.User.GID = 2401 + spec.Process.Capabilities = nil + + // User running inside container can't list '$TMP/blocked' and would fail to + // mount it. + dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + if err := os.Chmod(dir, 0700); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } + dir = path.Join(dir, "test") + if err := os.Mkdir(dir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } + + src, err := ioutil.TempDir(testutil.TmpDir(), "src") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: src, + Type: "bind", + }) + + if err := run(spec, conf); err != nil { + t.Fatalf("error running sandbox: %v", err) + } + }) + } +} + +// TestMountNewDir checks that runsc will create destination directory if it +// doesn't exit. +func TestMountNewDir(t *testing.T) { + for name, conf := range configsWithVFS2(t, overlay) { + t.Run(name, func(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } + + srcDir := path.Join(root, "src", "dir", "anotherdir") + if err := os.MkdirAll(srcDir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) + } + + mountDir := path.Join(root, "dir", "anotherdir") + + spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: mountDir, + Source: srcDir, + Type: "bind", + }) + + if err := run(spec, conf); err != nil { + t.Fatalf("error running sandbox: %v", err) + } + }) + } +} + +func TestReadonlyRoot(t *testing.T) { + for name, conf := range configsWithVFS2(t, overlay) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec.Root.Readonly = true + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } + }) + } +} + +func TestUIDMap(t *testing.T) { + for name, conf := range configsWithVFS2(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + defer os.RemoveAll(testDir) + testFile := path.Join(testDir, "testfile") + + spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile") + uid := os.Getuid() + gid := os.Getgid() + spec.Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + {Type: specs.UserNamespace}, + {Type: specs.PIDNamespace}, + {Type: specs.MountNamespace}, + }, + UIDMappings: []specs.LinuxIDMapping{ + { + ContainerID: 0, + HostID: uint32(uid), + Size: 1, + }, + }, + GIDMappings: []specs.LinuxIDMapping{ + { + ContainerID: 0, + HostID: uint32(gid), + Size: 1, + }, + }, + } + + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: "/tmp", + Source: testDir, + Type: "bind", + }) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("container failed, waitStatus: %v", ws) + } + st := syscall.Stat_t{} + if err := syscall.Stat(testFile, &st); err != nil { + t.Fatalf("error stat /testfile: %v", err) + } + + if st.Uid != uint32(uid) || st.Gid != uint32(gid) { + t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid) + } + }) + } +} + +func TestReadonlyMount(t *testing.T) { + for name, conf := range configsWithVFS2(t, overlay) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") + spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } + }) + } +} + +// TestAbbreviatedIDs checks that runsc supports using abbreviated container +// IDs in place of full IDs. +func TestAbbreviatedIDs(t *testing.T) { + doAbbreviatedIDsTest(t, false) +} + +func TestAbbreviatedIDsVFS2(t *testing.T) { + doAbbreviatedIDsTest(t, true) +} + +func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + conf.VFS2 = vfs2 + + cids := []string{ + "foo-" + testutil.RandomContainerID(), + "bar-" + testutil.RandomContainerID(), + "baz-" + testutil.RandomContainerID(), + } + for _, cid := range cids { + spec := testutil.NewSpecWithArgs("sleep", "100") + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: cid, + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + } + + // These should all be unambigious. + unambiguous := map[string]string{ + "f": cids[0], + cids[0]: cids[0], + "bar": cids[1], + cids[1]: cids[1], + "baz": cids[2], + cids[2]: cids[2], + } + for shortid, longid := range unambiguous { + if _, err := Load(rootDir, shortid); err != nil { + t.Errorf("%q should resolve to %q: %v", shortid, longid, err) + } + } + + // These should be ambiguous. + ambiguous := []string{ + "b", + "ba", + } + for _, shortid := range ambiguous { + if s, err := Load(rootDir, shortid); err == nil { + t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID) + } + } +} + +func TestGoferExits(t *testing.T) { + doGoferExitTest(t, false) +} + +func TestGoferExitsVFS2(t *testing.T) { + doGoferExitTest(t, true) +} + +func doGoferExitTest(t *testing.T, vfs2 bool) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") + conf := testutil.TestConfig(t) + conf.VFS2 = vfs2 + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Kill sandbox and expect gofer to exit on its own. + sandboxProc, err := os.FindProcess(c.Sandbox.Pid) + if err != nil { + t.Fatalf("error finding sandbox process: %v", err) + } + if err := sandboxProc.Kill(); err != nil { + t.Fatalf("error killing sandbox process: %v", err) + } + + err = blockUntilWaitable(c.GoferPid) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } +} + +func TestRootNotMount(t *testing.T) { + appSym, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + app, err := filepath.EvalSymlinks(appSym) + if err != nil { + t.Fatalf("error resolving %q symlink: %v", appSym, err) + } + log.Infof("App path %q is a symlink to %q", appSym, app) + + static, err := testutil.IsStatic(app) + if err != nil { + t.Fatalf("error reading application binary: %v", err) + } + if !static { + // This happens during race builds; we cannot map in shared + // libraries also, so we need to skip the test. + t.Skip() + } + + root := filepath.Dir(app) + exe := "/" + filepath.Base(app) + log.Infof("Executing %q in %q", exe, root) + + spec := testutil.NewSpecWithArgs(exe, "help") + spec.Root.Path = root + spec.Root.Readonly = true + spec.Mounts = nil + + conf := testutil.TestConfig(t) + if err := run(spec, conf); err != nil { + t.Fatalf("error running sandbox: %v", err) + } +} + +func TestUserLog(t *testing.T) { + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // sched_rr_get_interval = 148 - not implemented in gvisor. + spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148") + conf := testutil.TestConfig(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + dir, err := ioutil.TempDir(testutil.TmpDir(), "user_log_test") + if err != nil { + t.Fatalf("error creating tmp dir: %v", err) + } + userLog := filepath.Join(dir, "user.log") + + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + UserLog: userLog, + Attached: true, + } + ws, err := Run(conf, args) + if err != nil { + t.Fatalf("error running container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("container failed, waitStatus: %v", ws) + } + + out, err := ioutil.ReadFile(userLog) + if err != nil { + t.Fatalf("error opening user log file %q: %v", userLog, err) + } + if want := "Unsupported syscall sched_rr_get_interval("; !strings.Contains(string(out), want) { + t.Errorf("user log file doesn't contain %q, out: %s", want, string(out)) + } +} + +func TestWaitOnExitedSandbox(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + // Run a shell that sleeps for 1 second and then exits with a + // non-zero code. + const wantExit = 17 + cmd := fmt.Sprintf("sleep 1; exit %d", wantExit) + spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and Start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait on the sandbox. This will make an RPC to the sandbox + // and get the actual exit status of the application. + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if got := ws.ExitStatus(); got != wantExit { + t.Errorf("got exit status %d, want %d", got, wantExit) + } + + // Now the sandbox has exited, but the zombie sandbox process + // still exists. Calling Wait() now will return the sandbox + // exit status. + ws, err = c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if got := ws.ExitStatus(); got != wantExit { + t.Errorf("got exit status %d, want %d", got, wantExit) + } + }) + } +} + +func TestDestroyNotStarted(t *testing.T) { + doDestroyNotStartedTest(t, false) +} + +func TestDestroyNotStartedVFS2(t *testing.T) { + doDestroyNotStartedTest(t, true) +} + +func doDestroyNotStartedTest(t *testing.T, vfs2 bool) { + spec := testutil.NewSpecWithArgs("/bin/sleep", "100") + conf := testutil.TestConfig(t) + conf.VFS2 = vfs2 + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create the container and check that it can be destroyed. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + if err := c.Destroy(); err != nil { + t.Fatalf("deleting non-started container failed: %v", err) + } +} + +// TestDestroyStarting attempts to force a race between start and destroy. +func TestDestroyStarting(t *testing.T) { + doDestroyNotStartedTest(t, false) +} + +func TestDestroyStartedVFS2(t *testing.T) { + doDestroyNotStartedTest(t, true) +} + +func doDestroyStartingTest(t *testing.T, vfs2 bool) { + for i := 0; i < 10; i++ { + spec := testutil.NewSpecWithArgs("/bin/sleep", "100") + conf := testutil.TestConfig(t) + conf.VFS2 = vfs2 + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create the container and check that it can be destroyed. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + + // Container is not thread safe, so load another instance to run in + // concurrently. + startCont, err := Load(rootDir, args.ID) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + wg := sync.WaitGroup{} + wg.Add(1) + go func() { + defer wg.Done() + // Ignore failures, start can fail if destroy runs first. + startCont.Start(conf) + }() + + wg.Add(1) + go func() { + defer wg.Done() + if err := c.Destroy(); err != nil { + t.Errorf("deleting non-started container failed: %v", err) + } + }() + wg.Wait() + } +} + +func TestCreateWorkingDir(t *testing.T) { + for name, conf := range configsWithVFS2(t, overlay) { + t.Run(name, func(t *testing.T) { + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + dir := path.Join(tmpDir, "new/working/dir") + + // touch will fail if the directory doesn't exist. + spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) + spec.Process.Cwd = dir + spec.Root.Readonly = true + + if err := run(spec, conf); err != nil { + t.Fatalf("Error running container: %v", err) + } + }) + } +} + +// TestMountPropagation verifies that mount propagates to slave but not to +// private mounts. +func TestMountPropagation(t *testing.T) { + // Setup dir structure: + // - src: is mounted as shared and is used as source for both private and + // slave mounts + // - dir: will be bind mounted inside src and should propagate to slave + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + src := filepath.Join(tmpDir, "src") + srcMnt := filepath.Join(src, "mnt") + dir := filepath.Join(tmpDir, "dir") + for _, path := range []string{src, srcMnt, dir} { + if err := os.MkdirAll(path, 0777); err != nil { + t.Fatalf("MkdirAll(%q): %v", path, err) + } + } + dirFile := filepath.Join(dir, "file") + f, err := os.Create(dirFile) + if err != nil { + t.Fatalf("os.Create(%q): %v", dirFile, err) + } + f.Close() + + // Setup src as a shared mount. + if err := syscall.Mount(src, src, "bind", syscall.MS_BIND, ""); err != nil { + t.Fatalf("mount(%q, %q, MS_BIND): %v", dir, srcMnt, err) + } + if err := syscall.Mount("", src, "", syscall.MS_SHARED, ""); err != nil { + t.Fatalf("mount(%q, MS_SHARED): %v", srcMnt, err) + } + + spec := testutil.NewSpecWithArgs("sleep", "1000") + + priv := filepath.Join(tmpDir, "priv") + slave := filepath.Join(tmpDir, "slave") + spec.Mounts = []specs.Mount{ + { + Source: src, + Destination: priv, + Type: "bind", + Options: []string{"private"}, + }, + { + Source: src, + Destination: slave, + Type: "bind", + Options: []string{"slave"}, + }, + } + + conf := testutil.TestConfig(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + // After the container is started, mount dir inside source and check what + // happens to both destinations. + if err := syscall.Mount(dir, srcMnt, "bind", syscall.MS_BIND, ""); err != nil { + t.Fatalf("mount(%q, %q, MS_BIND): %v", dir, srcMnt, err) + } + + // Check that mount didn't propagate to private mount. + privFile := filepath.Join(priv, "mnt", "file") + execArgs := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "!", "-f", privFile}, + } + if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err) + } + + // Check that mount propagated to slave mount. + slaveFile := filepath.Join(slave, "mnt", "file") + execArgs = &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", slaveFile}, + } + if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err) + } +} + +func TestMountSymlink(t *testing.T) { + for name, conf := range configsWithVFS2(t, overlay) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + defer os.RemoveAll(dir) + + source := path.Join(dir, "source") + target := path.Join(dir, "target") + for _, path := range []string{source, target} { + if err := os.MkdirAll(path, 0777); err != nil { + t.Fatalf("os.MkdirAll(): %v", err) + } + } + f, err := os.Create(path.Join(source, "file")) + if err != nil { + t.Fatalf("os.Create(): %v", err) + } + f.Close() + + link := path.Join(dir, "link") + if err := os.Symlink(target, link); err != nil { + t.Fatalf("os.Symlink(%q, %q): %v", target, link, err) + } + + spec := testutil.NewSpecWithArgs("/bin/sleep", "1000") + + // Mount to a symlink to ensure the mount code will follow it and mount + // at the symlink target. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: link, + Source: source, + }) + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } + + // Check that symlink was resolved and mount was created where the symlink + // is pointing to. + file := path.Join(target, "file") + execArgs := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", file}, + } + if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) + } + }) + } +} + +// Check that --net-raw disables the CAP_NET_RAW capability. +func TestNetRaw(t *testing.T) { + capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10) + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + for _, enableRaw := range []bool{true, false} { + conf := testutil.TestConfig(t) + conf.EnableRaw = enableRaw + + test := "--enabled" + if !enableRaw { + test = "--disabled" + } + + spec := testutil.NewSpecWithArgs(app, "capability", test, capNetRaw) + if err := run(spec, conf); err != nil { + t.Fatalf("Error running container: %v", err) + } + } +} + +// TestTTYField checks TTY field returned by container.Processes(). +func TestTTYField(t *testing.T) { + stop := testutil.StartReaper() + defer stop() + + testApp, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + testCases := []struct { + name string + useTTY bool + wantTTYField string + }{ + { + name: "no tty", + useTTY: false, + wantTTYField: "?", + }, + { + name: "tty used", + useTTY: true, + wantTTYField: "pts/0", + }, + } + + for _, test := range testCases { + for _, vfs2 := range []bool{false, true} { + name := test.name + if vfs2 { + name += "-vfs2" + } + t.Run(name, func(t *testing.T) { + conf := testutil.TestConfig(t) + conf.VFS2 = vfs2 + + // We will run /bin/sleep, possibly with an open TTY. + cmd := []string{"/bin/sleep", "10000"} + if test.useTTY { + // Run inside the "pty-runner". + cmd = append([]string{testApp, "pty-runner"}, cmd...) + } + + spec := testutil.NewSpecWithArgs(cmd...) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait for sleep to be running, and check the TTY + // field. + var gotTTYField string + cb := func() error { + ps, err := c.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + for _, p := range ps { + if strings.Contains(p.Cmd, "sleep") { + gotTTYField = p.TTY + return nil + } + } + return fmt.Errorf("sleep not running") + } + if err := testutil.Poll(cb, 30*time.Second); err != nil { + t.Fatalf("error waiting for sleep process: %v", err) + } + + if gotTTYField != test.wantTTYField { + t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) + } + }) + } + } +} + +// executeSync synchronously executes a new process. +func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { + pid, err := cont.Execute(args) + if err != nil { + return 0, fmt.Errorf("error executing: %v", err) + } + ws, err := cont.WaitPID(pid) + if err != nil { + return 0, fmt.Errorf("error waiting: %v", err) + } + return ws, nil +} + +func TestMain(m *testing.M) { + log.SetLevel(log.Debug) + flag.Parse() + if err := testutil.ConfigureExePath(); err != nil { + panic(err.Error()) + } + specutils.MaybeRunAsRoot() + os.Exit(m.Run()) +} diff --git a/runsc/container/hook.go b/runsc/container/hook.go new file mode 100644 index 000000000..901607aee --- /dev/null +++ b/runsc/container/hook.go @@ -0,0 +1,111 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "path/filepath" + "strings" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" +) + +// This file implements hooks as defined in OCI spec: +// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22 +// +// "hooks":{ +// "prestart":[{ +// "path":"/usr/bin/dockerd", +// "args":[ +// "libnetwork-setkey", "arg2", +// ] +// }] +// }, + +// executeHooksBestEffort executes hooks and logs warning in case they fail. +// Runs all hooks, always. +func executeHooksBestEffort(hooks []specs.Hook, s specs.State) { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + log.Warningf("Failure to execute hook %+v, err: %v", h, err) + } + } +} + +// executeHooks executes hooks until the first one fails or they all execute. +func executeHooks(hooks []specs.Hook, s specs.State) error { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + return err + } + } + return nil +} + +func executeHook(h specs.Hook, s specs.State) error { + log.Debugf("Executing hook %+v, state: %+v", h, s) + + if strings.TrimSpace(h.Path) == "" { + return fmt.Errorf("empty path for hook") + } + if !filepath.IsAbs(h.Path) { + return fmt.Errorf("path for hook is not absolute: %q", h.Path) + } + + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + + c := make(chan error, 1) + go func() { + c <- cmd.Wait() + }() + + var timer <-chan time.Time + if h.Timeout != nil { + timer = time.After(time.Duration(*h.Timeout) * time.Second) + } + select { + case err := <-c: + if err != nil { + return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) + } + case <-timer: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) + } + + log.Debugf("Execute hook %q success!", h.Path) + return nil +} diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go new file mode 100644 index 000000000..e189648f4 --- /dev/null +++ b/runsc/container/multi_container_test.go @@ -0,0 +1,1774 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "fmt" + "io/ioutil" + "math" + "os" + "path" + "path/filepath" + "strings" + "syscall" + "testing" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/specutils" +) + +func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { + var specs []*specs.Spec + var ids []string + rootID := testutil.RandomContainerID() + + for i, cmd := range cmds { + spec := testutil.NewSpecWithArgs(cmd...) + if i == 0 { + spec.Annotations = map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox, + } + ids = append(ids, rootID) + } else { + spec.Annotations = map[string]string{ + specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, + specutils.ContainerdSandboxIDAnnotation: rootID, + } + ids = append(ids, testutil.RandomContainerID()) + } + specs = append(specs, spec) + } + return specs, ids +} + +func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) { + if len(conf.RootDir) == 0 { + panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.") + } + + cu := cleanup.Cleanup{} + defer cu.Clean() + + var containers []*Container + for i, spec := range specs { + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) + if err != nil { + return nil, nil, fmt.Errorf("error setting up container: %v", err) + } + cu.Add(cleanup) + + args := Args{ + ID: ids[i], + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + return nil, nil, fmt.Errorf("error creating container: %v", err) + } + cu.Add(func() { cont.Destroy() }) + containers = append(containers, cont) + + if err := cont.Start(conf); err != nil { + return nil, nil, fmt.Errorf("error starting container: %v", err) + } + } + + return containers, cu.Release(), nil +} + +type execDesc struct { + c *Container + cmd []string + want int + name string +} + +func execMany(t *testing.T, execs []execDesc) { + for _, exec := range execs { + t.Run(exec.name, func(t *testing.T) { + args := &control.ExecArgs{Argv: exec.cmd} + if ws, err := exec.c.executeSync(args); err != nil { + t.Errorf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() != exec.want { + t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want) + } + }) + } +} + +func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { + for _, spec := range pod { + spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source + spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type + spec.Annotations[boot.MountPrefix+name+".share"] = "pod" + if len(mount.Options) > 0 { + spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",") + } + } +} + +// TestMultiContainerSanity checks that it is possible to run 2 dead-simple +// containers in the same sandbox. +func TestMultiContainerSanity(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + expectedPL = []*control.Process{ + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + }) + } +} + +// TestMultiPIDNS checks that it is possible to run 2 dead-simple +// containers in the same sandbox with different pidns. +func TestMultiPIDNS(t *testing.T) { + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + testSpecs, ids := createSpecs(sleep, sleep) + testSpecs[1].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + }, + } + + containers, cleanup, err := startContainers(conf, testSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + newProcessBuilder().PID(1).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + expectedPL = []*control.Process{ + newProcessBuilder().PID(1).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + }) + } +} + +// TestMultiPIDNSPath checks the pidns path. +func TestMultiPIDNSPath(t *testing.T) { + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + testSpecs, ids := createSpecs(sleep, sleep, sleep) + testSpecs[0].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + Path: "/proc/1/ns/pid", + }, + }, + } + testSpecs[1].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + Path: "/proc/1/ns/pid", + }, + }, + } + testSpecs[2].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + Path: "/proc/2/ns/pid", + }, + }, + } + + containers, cleanup, err := startContainers(conf, testSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + if err := waitForProcessList(containers[2], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + expectedPL = []*control.Process{ + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + }) + } +} + +func TestMultiContainerWait(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + // The first container should run the entire duration of the test. + cmd1 := []string{"sleep", "100"} + // We'll wait on the second container, which is much shorter lived. + cmd2 := []string{"sleep", "1"} + specs, ids := createSpecs(cmd1, cmd2) + + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Wait on the short lived container from multiple goroutines. + wg := sync.WaitGroup{} + for i := 0; i < 3; i++ { + wg.Add(1) + go func(c *Container) { + defer wg.Done() + if ws, err := c.Wait(); err != nil { + t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) + } + if _, err := c.Wait(); err != nil { + t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err) + } + }(containers[1]) + } + + // Also wait via PID. + for i := 0; i < 3; i++ { + wg.Add(1) + go func(c *Container) { + defer wg.Done() + const pid = 2 + if ws, err := c.WaitPID(pid); err != nil { + t.Errorf("failed to wait for PID %d: %v", pid, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("PID %d exited with non-zero status %d", pid, es) + } + if _, err := c.WaitPID(pid); err == nil { + t.Errorf("wait for stopped PID %d should fail", pid) + } + }(containers[1]) + } + + wg.Wait() + + // After Wait returns, ensure that the root container is running and + // the child has finished. + expectedPL = []*control.Process{ + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) + } +} + +// TestExecWait ensures what we can wait containers and individual processes in the +// sandbox that have already exited. +func TestExecWait(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + // The first container should run the entire duration of the test. + cmd1 := []string{"sleep", "100"} + // We'll wait on the second container, which is much shorter lived. + cmd2 := []string{"sleep", "1"} + specs, ids := createSpecs(cmd1, cmd2) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Check via ps that process is running. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Fatalf("failed to wait for sleep to start: %v", err) + } + + // Wait for the second container to finish. + if err := waitForProcessCount(containers[1], 0); err != nil { + t.Fatalf("failed to wait for second container to stop: %v", err) + } + + // Get the second container exit status. + if ws, err := containers[1].Wait(); err != nil { + t.Fatalf("failed to wait for process %s: %v", containers[1].Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Fatalf("process %s exited with non-zero status %d", containers[1].Spec.Process.Args, es) + } + if _, err := containers[1].Wait(); err != nil { + t.Fatalf("wait for stopped container %s shouldn't fail: %v", containers[1].Spec.Process.Args, err) + } + + // Execute another process in the first container. + args := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "1"}, + WorkingDirectory: "/", + KUID: 0, + } + pid, err := containers[0].Execute(args) + if err != nil { + t.Fatalf("error executing: %v", err) + } + + // Wait for the exec'd process to exit. + expectedPL = []*control.Process{ + newProcessBuilder().PID(1).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Fatalf("failed to wait for second container to stop: %v", err) + } + + // Get the exit status from the exec'd process. + if ws, err := containers[0].WaitPID(pid); err != nil { + t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Fatalf("process %+v exited with non-zero status %d", args, es) + } + if _, err := containers[0].WaitPID(pid); err == nil { + t.Fatalf("wait for stopped process %+v should fail", args) + } +} + +// TestMultiContainerMount tests that bind mounts can be used with multiple +// containers. +func TestMultiContainerMount(t *testing.T) { + cmd1 := []string{"sleep", "100"} + + // 'src != dst' ensures that 'dst' doesn't exist in the host and must be + // properly mapped inside the container to work. + src, err := ioutil.TempDir(testutil.TmpDir(), "container") + if err != nil { + t.Fatal("ioutil.TempDir failed:", err) + } + dst := src + ".dst" + cmd2 := []string{"touch", filepath.Join(dst, "file")} + + sps, ids := createSpecs(cmd1, cmd2) + sps[1].Mounts = append(sps[1].Mounts, specs.Mount{ + Source: src, + Destination: dst, + Type: "bind", + }) + + // Setup the containers. + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + containers, cleanup, err := startContainers(conf, sps, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + ws, err := containers[1].Wait() + if err != nil { + t.Error("error waiting on container:", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Error("container failed, waitStatus:", ws) + } +} + +// TestMultiContainerSignal checks that it is possible to signal individual +// containers without killing the entire sandbox. +func TestMultiContainerSignal(t *testing.T) { + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Check via ps that container 1 process is running. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Kill process 2. + if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil { + t.Errorf("failed to kill process 2: %v", err) + } + + // Make sure process 1 is still running. + expectedPL = []*control.Process{ + newProcessBuilder().PID(1).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // goferPid is reset when container is destroyed. + goferPid := containers[1].GoferPid + + // Destroy container and ensure container's gofer process has exited. + if err := containers[1].Destroy(); err != nil { + t.Errorf("failed to destroy container: %v", err) + } + _, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(goferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } + // Make sure process 1 is still running. + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Now that process 2 is gone, ensure we get an error trying to + // signal it again. + if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil { + t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) + } + + // Kill process 1. + if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil { + t.Errorf("failed to kill process 1: %v", err) + } + + // Ensure that container's gofer and sandbox process are no more. + err = blockUntilWaitable(containers[0].GoferPid) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } + + err = blockUntilWaitable(containers[0].Sandbox.Pid) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for sandbox to exit: %v", err) + } + + // The sentry should be gone, so signaling should yield an error. + if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil { + t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) + } + + if err := containers[0].Destroy(); err != nil { + t.Errorf("failed to destroy container: %v", err) + } + }) + } +} + +// TestMultiContainerDestroy checks that container are properly cleaned-up when +// they are destroyed. +func TestMultiContainerDestroy(t *testing.T) { + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // First container will remain intact while the second container is killed. + podSpecs, ids := createSpecs( + []string{"sleep", "100"}, + []string{app, "fork-bomb"}) + + // Run the fork bomb in a PID namespace to prevent processes to be + // re-parented to PID=1 in the root container. + podSpecs[1].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{{Type: "pid"}}, + } + containers, cleanup, err := startContainers(conf, podSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Exec more processes to ensure signal all works for exec'd processes too. + args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "fork-bomb"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + + // Let it brew... + time.Sleep(500 * time.Millisecond) + + if err := containers[1].Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + + // Check that destroy killed all processes belonging to the container and + // waited for them to exit before returning. + pss, err := containers[0].Sandbox.Processes("") + if err != nil { + t.Fatalf("error getting process data from sandbox: %v", err) + } + expectedPL := []*control.Process{ + newProcessBuilder().PID(1).Cmd("sleep").Process(), + } + if !procListsEqual(pss, expectedPL) { + t.Errorf("container got process list: %s, want: %s: error: %v", + procListToString(pss), procListToString(expectedPL), err) + } + + // Check that cont.Destroy is safe to call multiple times. + if err := containers[1].Destroy(); err != nil { + t.Errorf("error destroying container: %v", err) + } + }) + } +} + +func TestMultiContainerProcesses(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + // Note: use curly braces to keep 'sh' process around. Otherwise, shell + // will just execve into 'sleep' and both containers will look the + // same. + specs, ids := createSpecs( + []string{"sleep", "100"}, + []string{"sh", "-c", "{ sleep 100; }"}) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Check root's container process list doesn't include other containers. + expectedPL0 := []*control.Process{ + newProcessBuilder().PID(1).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL0); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + + // Same for the other container. + expectedPL1 := []*control.Process{ + newProcessBuilder().PID(2).Cmd("sh").Process(), + newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[1], expectedPL1); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + + // Now exec into the second container and verify it shows up in the container. + args := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "100"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + expectedPL1 = append(expectedPL1, newProcessBuilder().PID(4).Cmd("sleep").Process()) + if err := waitForProcessList(containers[1], expectedPL1); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } + // Root container should remain unchanged. + if err := waitForProcessList(containers[0], expectedPL0); err != nil { + t.Errorf("failed to wait for process to start: %v", err) + } +} + +// TestMultiContainerKillAll checks that all process that belong to a container +// are killed when SIGKILL is sent to *all* processes in that container. +func TestMultiContainerKillAll(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + for _, tc := range []struct { + killContainer bool + }{ + {killContainer: true}, + {killContainer: false}, + } { + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // First container will remain intact while the second container is killed. + specs, ids := createSpecs( + []string{app, "task-tree", "--depth=2", "--width=2"}, + []string{app, "task-tree", "--depth=4", "--width=2"}) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Wait until all processes are created. + rootProcCount := int(math.Pow(2, 3) - 1) + if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatalf("error waitting for processes: %v", err) + } + procCount := int(math.Pow(2, 5) - 1) + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + // Exec more processes to ensure signal works for exec'd processes too. + args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "task-tree", "--depth=2", "--width=2"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + // Wait for these new processes to start. + procCount += int(math.Pow(2, 3) - 1) + if err := waitForProcessCount(containers[1], procCount); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + if tc.killContainer { + // First kill the init process to make the container be stopped with + // processes still running inside. + containers[1].SignalContainer(syscall.SIGKILL, false) + op := func() error { + c, err := Load(conf.RootDir, ids[1]) + if err != nil { + return err + } + if c.Status != Stopped { + return fmt.Errorf("container is not stopped") + } + return nil + } + if err := testutil.Poll(op, 5*time.Second); err != nil { + t.Fatalf("container did not stop %q: %v", containers[1].ID, err) + } + } + + c, err := Load(conf.RootDir, ids[1]) + if err != nil { + t.Fatalf("failed to load child container %q: %v", c.ID, err) + } + // Kill'Em All + if err := c.SignalContainer(syscall.SIGKILL, true); err != nil { + t.Fatalf("failed to send SIGKILL to container %q: %v", c.ID, err) + } + + // Check that all processes are gone. + if err := waitForProcessCount(containers[1], 0); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + // Check that root container was not affected. + if err := waitForProcessCount(containers[0], rootProcCount); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + } +} + +func TestMultiContainerDestroyNotStarted(t *testing.T) { + specs, ids := createSpecs( + []string{"/bin/sleep", "100"}, + []string{"/bin/sleep", "100"}) + + conf := testutil.TestConfig(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + rootArgs := Args{ + ID: ids[0], + Spec: specs[0], + BundleDir: bundleDir, + } + root, err := New(conf, rootArgs) + if err != nil { + t.Fatalf("error creating root container: %v", err) + } + defer root.Destroy() + if err := root.Start(conf); err != nil { + t.Fatalf("error starting root container: %v", err) + } + + // Create and destroy sub-container. + bundleDir, cleanupSub, err := testutil.SetupBundleDir(specs[1]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanupSub() + + args := Args{ + ID: ids[1], + Spec: specs[1], + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + + // Check that container can be destroyed. + if err := cont.Destroy(); err != nil { + t.Fatalf("deleting non-started container failed: %v", err) + } +} + +// TestMultiContainerDestroyStarting attempts to force a race between start +// and destroy. +func TestMultiContainerDestroyStarting(t *testing.T) { + cmds := make([][]string, 10) + for i := range cmds { + cmds[i] = []string{"/bin/sleep", "100"} + } + specs, ids := createSpecs(cmds...) + + conf := testutil.TestConfig(t) + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + rootArgs := Args{ + ID: ids[0], + Spec: specs[0], + BundleDir: bundleDir, + } + root, err := New(conf, rootArgs) + if err != nil { + t.Fatalf("error creating root container: %v", err) + } + defer root.Destroy() + if err := root.Start(conf); err != nil { + t.Fatalf("error starting root container: %v", err) + } + + wg := sync.WaitGroup{} + for i := range cmds { + if i == 0 { + continue // skip root container + } + + bundleDir, cleanup, err := testutil.SetupBundleDir(specs[i]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + rootArgs := Args{ + ID: ids[i], + Spec: specs[i], + BundleDir: bundleDir, + } + cont, err := New(conf, rootArgs) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + + // Container is not thread safe, so load another instance to run in + // concurrently. + startCont, err := Load(rootDir, ids[i]) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + wg.Add(1) + go func() { + defer wg.Done() + startCont.Start(conf) // ignore failures, start can fail if destroy runs first. + }() + + wg.Add(1) + go func() { + defer wg.Done() + if err := cont.Destroy(); err != nil { + t.Errorf("deleting non-started container failed: %v", err) + } + }() + } + wg.Wait() +} + +// TestMultiContainerDifferentFilesystems tests that different containers have +// different root filesystems. +func TestMultiContainerDifferentFilesystems(t *testing.T) { + filename := "/foo" + // Root container will create file and then sleep. + cmdRoot := []string{"sh", "-c", fmt.Sprintf("touch %q && sleep 100", filename)} + + // Child containers will assert that the file does not exist, and will + // then create it. + script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename) + cmd := []string{"sh", "-c", script} + + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + // Make sure overlay is enabled, and none of the root filesystems are + // read-only, otherwise we won't be able to create the file. + conf.Overlay = true + specs, ids := createSpecs(cmdRoot, cmd, cmd) + for _, s := range specs { + s.Root.Readonly = false + } + + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Both child containers should exit successfully. + for i, c := range containers { + if i == 0 { + // Don't wait on the root. + continue + } + if ws, err := c.Wait(); err != nil { + t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) + } + } +} + +// TestMultiContainerContainerDestroyStress tests that IO operations continue +// to work after containers have been stopped and gofers killed. +func TestMultiContainerContainerDestroyStress(t *testing.T) { + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // Setup containers. Root container just reaps children, while the others + // perform some IOs. Children are executed in 3 batches of 10. Within the + // batch there is overlap between containers starting and being destroyed. In + // between batches all containers stop before starting another batch. + cmds := [][]string{{app, "reaper"}} + const batchSize = 10 + for i := 0; i < 3*batchSize; i++ { + dir, err := ioutil.TempDir(testutil.TmpDir(), "gofer-stop-test") + if err != nil { + t.Fatal("ioutil.TempDir failed:", err) + } + defer os.RemoveAll(dir) + + cmd := "find /bin -type f | head | xargs -I SRC cp SRC " + dir + cmds = append(cmds, []string{"sh", "-c", cmd}) + } + allSpecs, allIDs := createSpecs(cmds...) + + // Split up the specs and IDs. + rootSpec := allSpecs[0] + rootID := allIDs[0] + childrenSpecs := allSpecs[1:] + childrenIDs := allIDs[1:] + + conf := testutil.TestConfig(t) + _, bundleDir, cleanup, err := testutil.SetupContainer(rootSpec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Start root container. + rootArgs := Args{ + ID: rootID, + Spec: rootSpec, + BundleDir: bundleDir, + } + root, err := New(conf, rootArgs) + if err != nil { + t.Fatalf("error creating root container: %v", err) + } + if err := root.Start(conf); err != nil { + t.Fatalf("error starting root container: %v", err) + } + defer root.Destroy() + + // Run batches. Each batch starts containers in parallel, then wait and + // destroy them before starting another batch. + for i := 0; i < len(childrenSpecs); i += batchSize { + t.Logf("Starting batch from %d to %d", i, i+batchSize) + specs := childrenSpecs[i : i+batchSize] + ids := childrenIDs[i : i+batchSize] + + var children []*Container + for j, spec := range specs { + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: ids[j], + Spec: spec, + BundleDir: bundleDir, + } + child, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + children = append(children, child) + + if err := child.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Give a small gap between containers. + time.Sleep(50 * time.Millisecond) + } + for _, child := range children { + ws, err := child.Wait() + if err != nil { + t.Fatalf("waiting for container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("container failed, waitStatus: %x (%d)", ws, ws.ExitStatus()) + } + if err := child.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + } + } +} + +// Test that pod shared mounts are properly mounted in 2 containers and that +// changes from one container is reflected in the other. +func TestMultiContainerSharedMount(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: nil, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + + createSharedMount(mnt0, "test-mount", podSpec...) + + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + name: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + name: "directory is mounted in container1", + }, + { + c: containers[0], + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + name: "file appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + name: "file appears in container1", + }, + { + c: containers[1], + cmd: []string{"/bin/rm", file1}, + name: "remove file from container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-f", file0}, + name: "file removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-f", file1}, + name: "file removed from container1", + }, + { + c: containers[1], + cmd: []string{"/bin/mkdir", file1}, + name: "create directory in container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", file0}, + name: "dir appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", file1}, + name: "dir appears in container1", + }, + { + c: containers[0], + cmd: []string{"/bin/rmdir", file0}, + name: "remove directory from container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-d", file0}, + name: "dir removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-d", file1}, + name: "dir removed from container1", + }, + } + execMany(t, execs) + }) + } +} + +// Test that pod mounts are mounted as readonly when requested. +func TestMultiContainerSharedMountReadonly(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"ro"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + + createSharedMount(mnt0, "test-mount", podSpec...) + + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + name: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + name: "directory is mounted in container1", + }, + { + c: containers[0], + cmd: []string{"/bin/touch", file0}, + want: 1, + name: "fails to write to container0", + }, + { + c: containers[1], + cmd: []string{"/bin/touch", file1}, + want: 1, + name: "fails to write to container1", + }, + } + execMany(t, execs) + }) + } +} + +// Test that shared pod mounts continue to work after container is restarted. +func TestMultiContainerSharedMountRestart(t *testing.T) { + //TODO(gvisor.dev/issue/1487): This is failing with VFS2. + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: nil, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + + createSharedMount(mnt0, "test-mount", podSpec...) + + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + name: "file appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + name: "file appears in container1", + }, + } + execMany(t, execs) + + containers[1].Destroy() + + bundleDir, cleanup, err := testutil.SetupBundleDir(podSpec[1]) + if err != nil { + t.Fatalf("error restarting container: %v", err) + } + defer cleanup() + + args := Args{ + ID: ids[1], + Spec: podSpec[1], + BundleDir: bundleDir, + } + containers[1], err = New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + if err := containers[1].Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + execs = []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + name: "file is still in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + name: "file is still in container1", + }, + { + c: containers[1], + cmd: []string{"/bin/rm", file1}, + name: "file removed from container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-f", file0}, + name: "file removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-f", file1}, + name: "file removed from container1", + }, + } + execMany(t, execs) + }) + } +} + +// Test that unsupported pod mounts options are ignored when matching master and +// slave mounts. +func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"/bin/sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"rw", "rbind", "relatime"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + mnt1.Options = []string{"rw", "nosuid"} + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + + createSharedMount(mnt0, "test-mount", podSpec...) + + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + name: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + name: "directory is mounted in container1", + }, + } + execMany(t, execs) + }) + } +} + +// Test that one container can send an FD to another container, even though +// they have distinct MountNamespaces. +func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) { + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + + // We set up two containers with one shared mount that is used for a + // shared socket. The first container will send an FD over the socket + // to the second container. The FD corresponds to a file in the first + // container's mount namespace that is not part of the second + // container's mount namespace. However, the second container still + // should be able to read the FD. + + // Create a shared mount where we will put the socket. + sharedMnt := specs.Mount{ + Destination: "/mydir/test", + Type: "tmpfs", + // Shared mounts need a Source, even for tmpfs. It is only used + // to match up different shared mounts inside the pod. + Source: "/some/dir", + } + socketPath := filepath.Join(sharedMnt.Destination, "socket") + + // Create a writeable tmpfs mount where the FD sender app will create + // files to send. This will only be mounted in the FD sender. + writeableMnt := specs.Mount{ + Destination: "/tmp", + Type: "tmpfs", + } + + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + // Create the specs. + specs, ids := createSpecs( + []string{"sleep", "1000"}, + []string{app, "fd_sender", "--socket", socketPath}, + []string{app, "fd_receiver", "--socket", socketPath}, + ) + createSharedMount(sharedMnt, "shared-mount", specs...) + specs[1].Mounts = append(specs[2].Mounts, sharedMnt, writeableMnt) + specs[2].Mounts = append(specs[1].Mounts, sharedMnt) + + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Both containers should exit successfully. + for _, c := range containers[1:] { + if ws, err := c.Wait(); err != nil { + t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err) + } else if es := ws.ExitStatus(); es != 0 { + t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es) + } + } +} + +// Test that container is destroyed when Gofer is killed. +func TestMultiContainerGoferKilled(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep, sleep) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Ensure container is running + c := containers[2] + expectedPL := []*control.Process{ + newProcessBuilder().PID(3).Cmd("sleep").Process(), + } + if err := waitForProcessList(c, expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Kill container's gofer. + if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { + t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err) + } + + // Wait until container stops. + if err := waitForProcessList(c, nil); err != nil { + t.Errorf("Container %q was not stopped after gofer death: %v", c.ID, err) + } + + // Check that container isn't running anymore. + args := &control.ExecArgs{Argv: []string{"/bin/true"}} + if _, err := c.executeSync(args); err == nil { + t.Fatalf("Container %q was not stopped after gofer death", c.ID) + } + + // Check that other containers are unaffected. + for i, c := range containers { + if i == 2 { + continue // container[2] has been killed. + } + pl := []*control.Process{ + newProcessBuilder().PID(kernel.ThreadID(i + 1)).Cmd("sleep").Process(), + } + if err := waitForProcessList(c, pl); err != nil { + t.Errorf("Container %q was affected by another container: %v", c.ID, err) + } + args := &control.ExecArgs{Argv: []string{"/bin/true"}} + if _, err := c.executeSync(args); err != nil { + t.Fatalf("Container %q was affected by another container: %v", c.ID, err) + } + } + + // Kill root container's gofer to bring entire sandbox down. + c = containers[0] + if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { + t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err) + } + + // Wait until sandbox stops. waitForProcessList will loop until sandbox exits + // and RPC errors out. + impossiblePL := []*control.Process{ + newProcessBuilder().Cmd("non-existent-process").Process(), + } + if err := waitForProcessList(c, impossiblePL); err == nil { + t.Fatalf("Sandbox was not killed after gofer death") + } + + // Check that entire sandbox isn't running anymore. + for _, c := range containers { + args := &control.ExecArgs{Argv: []string{"/bin/true"}} + if _, err := c.executeSync(args); err == nil { + t.Fatalf("Container %q was not stopped after gofer death", c.ID) + } + } +} + +func TestMultiContainerLoadSandbox(t *testing.T) { + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep, sleep) + + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + // Create containers for the sandbox. + wants, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Then create unrelated containers. + for i := 0; i < 3; i++ { + specs, ids = createSpecs(sleep, sleep, sleep) + _, cleanup, err = startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + } + + // Create an unrelated directory under root. + dir := filepath.Join(conf.RootDir, "not-a-container") + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatalf("os.MkdirAll(%q)=%v", dir, err) + } + + // Create a valid but empty container directory. + randomCID := testutil.RandomContainerID() + dir = filepath.Join(conf.RootDir, randomCID) + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatalf("os.MkdirAll(%q)=%v", dir, err) + } + + // Load the sandbox and check that the correct containers were returned. + id := wants[0].Sandbox.ID + gots, err := loadSandbox(conf.RootDir, id) + if err != nil { + t.Fatalf("loadSandbox()=%v", err) + } + wantIDs := make(map[string]struct{}) + for _, want := range wants { + wantIDs[want.ID] = struct{}{} + } + for _, got := range gots { + if got.Sandbox.ID != id { + t.Errorf("wrong sandbox ID, got: %v, want: %v", got.Sandbox.ID, id) + } + if _, ok := wantIDs[got.ID]; !ok { + t.Errorf("wrong container ID, got: %v, wants: %v", got.ID, wantIDs) + } + delete(wantIDs, got.ID) + } + if len(wantIDs) != 0 { + t.Errorf("containers not found: %v", wantIDs) + } +} + +// TestMultiContainerRunNonRoot checks that child container can be configured +// when running as non-privileged user. +func TestMultiContainerRunNonRoot(t *testing.T) { + cmdRoot := []string{"/bin/sleep", "100"} + cmdSub := []string{"/bin/true"} + podSpecs, ids := createSpecs(cmdRoot, cmdSub) + + // User running inside container can't list '$TMP/blocked' and would fail to + // mount it. + blocked, err := ioutil.TempDir(testutil.TmpDir(), "blocked") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + if err := os.Chmod(blocked, 0700); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", blocked, err) + } + dir := path.Join(blocked, "test") + if err := os.Mkdir(dir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } + + src, err := ioutil.TempDir(testutil.TmpDir(), "src") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + + // Set a random user/group with no access to "blocked" dir. + podSpecs[1].Process.User.UID = 343 + podSpecs[1].Process.User.GID = 2401 + podSpecs[1].Process.Capabilities = nil + + podSpecs[1].Mounts = append(podSpecs[1].Mounts, specs.Mount{ + Destination: dir, + Source: src, + Type: "bind", + }) + + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + + pod, cleanup, err := startContainers(conf, podSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Once all containers are started, wait for the child container to exit. + // This means that the volume was mounted properly. + ws, err := pod[1].Wait() + if err != nil { + t.Fatalf("running child container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("child container failed, waitStatus: %v", ws) + } +} + +// TestMultiContainerHomeEnvDir tests that the HOME environment variable is set +// for root containers, sub-containers, and execed processes. +func TestMultiContainerHomeEnvDir(t *testing.T) { + // TODO(gvisor.dev/issue/1487): VFSv2 configs failing. + // NOTE: Don't use overlay since we need changes to persist to the temp dir + // outside the sandbox. + for testName, conf := range configs(t, noOverlay...) { + t.Run(testName, func(t *testing.T) { + + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Create temp files we can write the value of $HOME to. + homeDirs := map[string]*os.File{} + for _, name := range []string{"root", "sub", "exec"} { + homeFile, err := ioutil.TempFile(testutil.TmpDir(), name) + if err != nil { + t.Fatalf("creating temp file: %v", err) + } + homeDirs[name] = homeFile + } + + // We will sleep in the root container in order to ensure that + // the root container doesn't terminate before sub containers can be + // created. + rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())} + subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())} + execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())} + + // Setup the containers, a root container and sub container. + specConfig, ids := createSpecs(rootCmd, subCmd) + containers, cleanup, err := startContainers(conf, specConfig, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Exec into the root container synchronously. + args := &control.ExecArgs{Argv: execCmd} + if _, err := containers[0].executeSync(args); err != nil { + t.Errorf("error executing %+v: %v", args, err) + } + + // Wait for the subcontainer to finish. + _, err = containers[1].Wait() + if err != nil { + t.Errorf("wait on child container: %v", err) + } + + // Wait for the root container to run. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sh").Process(), + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Check the written files. + for name, tmpFile := range homeDirs { + dirBytes, err := ioutil.ReadAll(tmpFile) + if err != nil { + t.Fatalf("reading %s temp file: %v", name, err) + } + got := string(dirBytes) + + want := "/" + if got != want { + t.Errorf("%s $HOME incorrect: got: %q, want: %q", name, got, want) + } + } + + }) + } +} diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go new file mode 100644 index 000000000..bac177a88 --- /dev/null +++ b/runsc/container/shared_volume_test.go @@ -0,0 +1,273 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/runsc/boot" +) + +// TestSharedVolume checks that modifications to a volume mount are propagated +// into and out of the sandbox. +func TestSharedVolume(t *testing.T) { + conf := testutil.TestConfig(t) + conf.FileAccess = boot.FileAccessShared + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. + spec := testutil.NewSpecWithArgs("sleep", "1000") + + dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + filename := filepath.Join(dir, "file") + + // File does not exist yet. Reading from the sandbox should fail. + argsTestFile := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", filename}, + } + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", ws.ExitStatus(), err) + } + + // Create the file from outside of the sandbox. + if err := ioutil.WriteFile(filename, []byte("foobar"), 0777); err != nil { + t.Fatalf("error writing to file %q: %v", filename, err) + } + + // Now we should be able to test the file from within the sandbox. + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // Rename the file from outside of the sandbox. + newFilename := filepath.Join(dir, "newfile") + if err := os.Rename(filename, newFilename); err != nil { + t.Fatalf("os.Rename(%q, %q) failed: %v", filename, newFilename, err) + } + + // File should no longer exist at the old path within the sandbox. + if ws, err := c.executeSync(argsTestFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", filename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", filename, ws.ExitStatus()) + } + + // We should be able to test the new filename from within the sandbox. + argsTestNewFile := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", newFilename}, + } + if ws, err := c.executeSync(argsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("test %q exited with code %v, wanted zero", newFilename, ws.ExitStatus()) + } + + // Delete the renamed file from outside of the sandbox. + if err := os.Remove(newFilename); err != nil { + t.Fatalf("error removing file %q: %v", filename, err) + } + + // Renamed file should no longer exist at the old path within the sandbox. + if ws, err := c.executeSync(argsTestNewFile); err != nil { + t.Fatalf("unexpected error testing file %q: %v", newFilename, err) + } else if ws.ExitStatus() == 0 { + t.Errorf("test %q exited with code %v, wanted not zero", newFilename, ws.ExitStatus()) + } + + // Now create the file from WITHIN the sandbox. + argsTouch := &control.ExecArgs{ + Filename: "/usr/bin/touch", + Argv: []string{"touch", filename}, + KUID: auth.KUID(os.Getuid()), + KGID: auth.KGID(os.Getgid()), + } + if ws, err := c.executeSync(argsTouch); err != nil { + t.Fatalf("unexpected error touching file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("touch %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) + } + + // File should exist outside the sandbox. + if _, err := os.Stat(filename); err != nil { + t.Errorf("stat %q got error %v, wanted nil", filename, err) + } + + // Delete the file from within the sandbox. + argsRemove := &control.ExecArgs{ + Filename: "/bin/rm", + Argv: []string{"rm", filename}, + } + if ws, err := c.executeSync(argsRemove); err != nil { + t.Fatalf("unexpected error removing file %q: %v", filename, err) + } else if ws.ExitStatus() != 0 { + t.Errorf("remove %q exited with code %v, wanted zero", filename, ws.ExitStatus()) + } + + // File should not exist outside the sandbox. + if _, err := os.Stat(filename); !os.IsNotExist(err) { + t.Errorf("stat %q got error %v, wanted ErrNotExist", filename, err) + } +} + +func checkFile(c *Container, filename string, want []byte) error { + cpy := filename + ".copy" + argsCp := &control.ExecArgs{ + Filename: "/bin/cp", + Argv: []string{"cp", "-f", filename, cpy}, + } + if _, err := c.executeSync(argsCp); err != nil { + return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err) + } + got, err := ioutil.ReadFile(cpy) + if err != nil { + return fmt.Errorf("Error reading file %q: %v", filename, err) + } + if !bytes.Equal(got, want) { + return fmt.Errorf("file content inside the sandbox is wrong, got: %q, want: %q", got, want) + } + return nil +} + +// TestSharedVolumeFile tests that changes to file content outside the sandbox +// is reflected inside. +func TestSharedVolumeFile(t *testing.T) { + conf := testutil.TestConfig(t) + conf.FileAccess = boot.FileAccessShared + + // Main process just sleeps. We will use "exec" to probe the state of + // the filesystem. + spec := testutil.NewSpecWithArgs("sleep", "1000") + + dir, err := ioutil.TempDir(testutil.TmpDir(), "shared-volume-test") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // File that will be used to check consistency inside/outside sandbox. + filename := filepath.Join(dir, "file") + + // Write file from outside the container and check that the same content is + // read inside. + want := []byte("host-") + if err := ioutil.WriteFile(filename, []byte(want), 0666); err != nil { + t.Fatalf("Error writing to %q: %v", filename, err) + } + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Append to file inside the container and check that content is not lost. + argsAppend := &control.ExecArgs{ + Filename: "/bin/bash", + Argv: []string{"bash", "-c", "echo -n sandbox- >> " + filename}, + } + if _, err := c.executeSync(argsAppend); err != nil { + t.Fatalf("unexpected error appending file %q: %v", filename, err) + } + want = []byte("host-sandbox-") + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Write again from outside the container and check that the same content is + // read inside. + f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0) + if err != nil { + t.Fatalf("Error openning file %q: %v", filename, err) + } + defer f.Close() + if _, err := f.Write([]byte("host")); err != nil { + t.Fatalf("Error writing to file %q: %v", filename, err) + } + want = []byte("host-sandbox-host") + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } + + // Shrink file outside and check that the same content is read inside. + if err := f.Truncate(5); err != nil { + t.Fatalf("Error truncating file %q: %v", filename, err) + } + want = want[:5] + if err := checkFile(c, filename, want); err != nil { + t.Fatal(err.Error()) + } +} diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go new file mode 100644 index 000000000..17a251530 --- /dev/null +++ b/runsc/container/state_file.go @@ -0,0 +1,185 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "github.com/gofrs/flock" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" +) + +const stateFileExtension = ".state" + +// StateFile handles load from/save to container state safely from multiple +// processes. It uses a lock file to provide synchronization between operations. +// +// The lock file is located at: "${s.RootDir}/${s.ID}.lock". +// The state file is located at: "${s.RootDir}/${s.ID}.state". +type StateFile struct { + // RootDir is the directory containing the container metadata file. + RootDir string `json:"rootDir"` + + // ID is the container ID. + ID string `json:"id"` + + // + // Fields below this line are not saved in the state file and will not + // be preserved across commands. + // + + once sync.Once + flock *flock.Flock +} + +// List returns all container ids in the given root directory. +func List(rootDir string) ([]string, error) { + log.Debugf("List containers %q", rootDir) + list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension)) + if err != nil { + return nil, err + } + var out []string + for _, path := range list { + // Filter out files that do no belong to a container. + fileName := filepath.Base(path) + if len(fileName) < len(stateFileExtension) { + panic(fmt.Sprintf("invalid file match %q", path)) + } + // Remove the extension. + cid := fileName[:len(fileName)-len(stateFileExtension)] + if validateID(cid) == nil { + out = append(out, cid) + } + } + return out, nil +} + +// lock globally locks all locking operations for the container. +func (s *StateFile) lock() error { + s.once.Do(func() { + s.flock = flock.NewFlock(s.lockPath()) + }) + + if err := s.flock.Lock(); err != nil { + return fmt.Errorf("acquiring lock on %q: %v", s.flock, err) + } + return nil +} + +// lockForNew acquires the lock and checks if the state file doesn't exist. This +// is done to ensure that more than one creation didn't race to create +// containers with the same ID. +func (s *StateFile) lockForNew() error { + if err := s.lock(); err != nil { + return err + } + + // Checks if the container already exists by looking for the metadata file. + if _, err := os.Stat(s.statePath()); err == nil { + s.unlock() + return fmt.Errorf("container already exists") + } else if !os.IsNotExist(err) { + s.unlock() + return fmt.Errorf("looking for existing container: %v", err) + } + return nil +} + +// unlock globally unlocks all locking operations for the container. +func (s *StateFile) unlock() error { + if !s.flock.Locked() { + panic("unlock called without lock held") + } + + if err := s.flock.Unlock(); err != nil { + log.Warningf("Error to release lock on %q: %v", s.flock, err) + return fmt.Errorf("releasing lock on %q: %v", s.flock, err) + } + return nil +} + +// saveLocked saves 'v' to the state file. +// +// Preconditions: lock() must been called before. +func (s *StateFile) saveLocked(v interface{}) error { + if !s.flock.Locked() { + panic("saveLocked called without lock held") + } + + meta, err := json.Marshal(v) + if err != nil { + return err + } + if err := ioutil.WriteFile(s.statePath(), meta, 0640); err != nil { + return fmt.Errorf("writing json file: %v", err) + } + return nil +} + +func (s *StateFile) load(v interface{}) error { + if err := s.lock(); err != nil { + return err + } + defer s.unlock() + + metaBytes, err := ioutil.ReadFile(s.statePath()) + if err != nil { + return err + } + return json.Unmarshal(metaBytes, &v) +} + +func (s *StateFile) close() error { + if s.flock == nil { + return nil + } + if s.flock.Locked() { + panic("Closing locked file") + } + return s.flock.Close() +} + +func buildStatePath(rootDir, id string) string { + return filepath.Join(rootDir, id+stateFileExtension) +} + +// statePath is the full path to the state file. +func (s *StateFile) statePath() string { + return buildStatePath(s.RootDir, s.ID) +} + +// lockPath is the full path to the lock file. +func (s *StateFile) lockPath() string { + return filepath.Join(s.RootDir, s.ID+".lock") +} + +// destroy deletes all state created by the stateFile. It may be called with the +// lock file held. In that case, the lock file must still be unlocked and +// properly closed after destroy returns. +func (s *StateFile) destroy() error { + if err := os.Remove(s.statePath()); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Remove(s.lockPath()); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} diff --git a/runsc/container/status.go b/runsc/container/status.go new file mode 100644 index 000000000..91d9112f1 --- /dev/null +++ b/runsc/container/status.go @@ -0,0 +1,60 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package container + +// Status enumerates container statuses. The statuses and their semantics are +// part of the runtime CLI spec. +type Status int + +const ( + // Created indicates "the runtime has finished the create operation and + // the container process has neither exited nor executed the + // user-specified program". + Created Status = iota + + // Creating indicates "the container is being created". + Creating + + // Paused indicates that the process within the container has been + // suspended. + Paused + + // Running indicates "the container process has executed the + // user-specified program but has not exited". + Running + + // Stopped indicates "the container process has exited". + Stopped +) + +// String converts a Status to a string. These strings are part of the runtime +// CLI spec and should not be changed. +func (s Status) String() string { + switch s { + case Created: + return "created" + case Creating: + return "creating" + case Paused: + return "paused" + case Running: + return "running" + case Stopped: + return "stopped" + default: + return "unknown" + } + +} diff --git a/runsc/debian/description b/runsc/debian/description new file mode 100644 index 000000000..9e8e08805 --- /dev/null +++ b/runsc/debian/description @@ -0,0 +1 @@ +gVisor container sandbox runtime diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh new file mode 100755 index 000000000..dc7aeee87 --- /dev/null +++ b/runsc/debian/postinst.sh @@ -0,0 +1,24 @@ +#!/bin/sh -e + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [ "$1" != configure ]; then + exit 0 +fi + +if [ -f /etc/docker/daemon.json ]; then + runsc install + systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2 +fi diff --git a/runsc/flag/BUILD b/runsc/flag/BUILD new file mode 100644 index 000000000..5cb7604a8 --- /dev/null +++ b/runsc/flag/BUILD @@ -0,0 +1,9 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "flag", + srcs = ["flag.go"], + visibility = ["//:sandbox"], +) diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go new file mode 100644 index 000000000..0ca4829d7 --- /dev/null +++ b/runsc/flag/flag.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package flag + +import ( + "flag" +) + +type FlagSet = flag.FlagSet + +var ( + NewFlagSet = flag.NewFlagSet + String = flag.String + Bool = flag.Bool + Int = flag.Int + Uint = flag.Uint + CommandLine = flag.CommandLine + Parse = flag.Parse +) + +const ContinueOnError = flag.ContinueOnError diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD new file mode 100644 index 000000000..1036b0630 --- /dev/null +++ b/runsc/fsgofer/BUILD @@ -0,0 +1,35 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "fsgofer", + srcs = [ + "fsgofer.go", + "fsgofer_amd64_unsafe.go", + "fsgofer_arm64_unsafe.go", + "fsgofer_unsafe.go", + ], + visibility = ["//runsc:__subpackages__"], + deps = [ + "//pkg/abi/linux", + "//pkg/cleanup", + "//pkg/fd", + "//pkg/log", + "//pkg/p9", + "//pkg/sync", + "//pkg/syserr", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "fsgofer_test", + size = "small", + srcs = ["fsgofer_test.go"], + library = ":fsgofer", + deps = [ + "//pkg/log", + "//pkg/p9", + ], +) diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD new file mode 100644 index 000000000..82b48ef32 --- /dev/null +++ b/runsc/fsgofer/filter/BUILD @@ -0,0 +1,26 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "filter", + srcs = [ + "config.go", + "config_amd64.go", + "config_arm64.go", + "extra_filters.go", + "extra_filters_msan.go", + "extra_filters_race.go", + "filter.go", + ], + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/flipcall", + "//pkg/log", + "//pkg/seccomp", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go new file mode 100644 index 000000000..88814b83c --- /dev/null +++ b/runsc/fsgofer/filter/config.go @@ -0,0 +1,250 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "os" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" +) + +// allowedSyscalls is the set of syscalls executed by the gofer. +var allowedSyscalls = seccomp.SyscallRules{ + syscall.SYS_ACCEPT: {}, + syscall.SYS_CLOCK_GETTIME: {}, + syscall.SYS_CLONE: []seccomp.Rule{ + { + seccomp.AllowValue( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + }, + }, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EVENTFD2: []seccomp.Rule{ + { + seccomp.AllowValue(0), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FALLOCATE: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCHOWNAT: {}, + syscall.SYS_FCNTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_SETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFD), + }, + // Used by flipcall.PacketWindowAllocator.Init(). + { + seccomp.AllowAny{}, + seccomp.AllowValue(unix.F_ADD_SEALS), + }, + }, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSTATFS: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: { + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + // Non-private futex used for flipcall. + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + seccomp.Rule{ + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + }, + syscall.SYS_GETDENTS64: {}, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_LINKAT: {}, + syscall.SYS_LSEEK: {}, + syscall.SYS_MADVISE: {}, + unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init(). + syscall.SYS_MKDIRAT: {}, + syscall.SYS_MKNODAT: {}, + // Used by the Go runtime as a temporarily workaround for a Linux + // 5.2-5.4 bug. + // + // See src/runtime/os_linux_x86.go. + // + // TODO(b/148688965): Remove once this is gone from Go. + syscall.SYS_MLOCK: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(4096), + }, + }, + syscall.SYS_MMAP: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_SHARED), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + }, + }, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_OPENAT: {}, + syscall.SYS_PPOLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_READ: {}, + syscall.SYS_READLINKAT: {}, + syscall.SYS_RECVMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + }, + }, + syscall.SYS_RENAMEAT: {}, + syscall.SYS_RESTART_SYSCALL: {}, + syscall.SYS_RT_SIGPROCMASK: {}, + syscall.SYS_RT_SIGRETURN: {}, + syscall.SYS_SCHED_YIELD: {}, + syscall.SYS_SENDMSG: []seccomp.Rule{ + // Used by fdchannel.Endpoint.SendFD(). + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + // Used by unet.SocketWriter.WriteVec(). + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + }, + }, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + }, + syscall.SYS_SIGALTSTACK: {}, + // Used by fdchannel.NewConnectedSockets(). + syscall.SYS_SOCKETPAIR: { + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_SYMLINKAT: {}, + syscall.SYS_TGKILL: []seccomp.Rule{ + { + seccomp.AllowValue(uint64(os.Getpid())), + }, + }, + syscall.SYS_UNLINKAT: {}, + syscall.SYS_UTIMENSAT: {}, + syscall.SYS_WRITE: {}, +} + +var udsSyscalls = seccomp.SyscallRules{ + syscall.SYS_SOCKET: []seccomp.Rule{ + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_STREAM), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_DGRAM), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_SEQPACKET), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_CONNECT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + }, + }, +} diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go new file mode 100644 index 000000000..a4b28cb8b --- /dev/null +++ b/runsc/fsgofer/filter/config_amd64.go @@ -0,0 +1,33 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_GET_FS)}, + {seccomp.AllowValue(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{} +} diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go new file mode 100644 index 000000000..d2697deb7 --- /dev/null +++ b/runsc/fsgofer/filter/config_arm64.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{} +} diff --git a/runsc/fsgofer/filter/extra_filters.go b/runsc/fsgofer/filter/extra_filters.go new file mode 100644 index 000000000..e28d4b8d6 --- /dev/null +++ b/runsc/fsgofer/filter/extra_filters.go @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !msan,!race + +package filter + +import ( + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by +// Go instrumentation tools, e.g. -race, -msan. +// Returns empty when disabled. +func instrumentationFilters() seccomp.SyscallRules { + return nil +} diff --git a/runsc/fsgofer/filter/extra_filters_msan.go b/runsc/fsgofer/filter/extra_filters_msan.go new file mode 100644 index 000000000..8c6179c8f --- /dev/null +++ b/runsc/fsgofer/filter/extra_filters_msan.go @@ -0,0 +1,33 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build msan + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by MSAN. +func instrumentationFilters() seccomp.SyscallRules { + log.Warningf("*** SECCOMP WARNING: MSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_SCHED_GETAFFINITY: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + } +} diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go new file mode 100644 index 000000000..885c92f7a --- /dev/null +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -0,0 +1,42 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by TSAN. +func instrumentationFilters() seccomp.SyscallRules { + log.Warningf("*** SECCOMP WARNING: TSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_BRK: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_MADVISE: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_MUNLOCK: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_OPEN: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + // Used within glibc's malloc. + syscall.SYS_TIME: {}, + } +} diff --git a/runsc/fsgofer/filter/filter.go b/runsc/fsgofer/filter/filter.go new file mode 100644 index 000000000..289886720 --- /dev/null +++ b/runsc/fsgofer/filter/filter.go @@ -0,0 +1,38 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filter defines all syscalls the gofer is allowed to make, and +// installs seccomp filters to prevent prohibited syscalls in case it's +// compromised. +package filter + +import ( + "gvisor.dev/gvisor/pkg/seccomp" +) + +// Install installs seccomp filters. +func Install() error { + // Set of additional filters used by -race and -msan. Returns empty + // when not enabled. + allowedSyscalls.Merge(instrumentationFilters()) + + return seccomp.Install(allowedSyscalls) +} + +// InstallUDSFilters extends the allowed syscalls to include those necessary for +// connecting to a host UDS. +func InstallUDSFilters() { + // Add additional filters required for connecting to the host's sockets. + allowedSyscalls.Merge(udsSyscalls) +} diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go new file mode 100644 index 000000000..b7521bda7 --- /dev/null +++ b/runsc/fsgofer/fsgofer.go @@ -0,0 +1,1181 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsgofer implements p9.File giving access to local files using +// a simple mapping from a path prefix that is added to the path requested +// by the sandbox. Ex: +// +// prefix: "/docker/imgs/alpine" +// app path: /bin/ls => /docker/imgs/alpine/bin/ls +package fsgofer + +import ( + "fmt" + "io" + "math" + "os" + "path" + "path/filepath" + "runtime" + "strconv" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" +) + +const ( + // invalidMode is set to a value that doesn't match any other valid + // modes to ensure an unopened/closed file fails all mode checks. + invalidMode = p9.OpenFlags(math.MaxUint32) + + openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC +) + +type fileType int + +const ( + regular fileType = iota + directory + symlink + socket + unknown +) + +// String implements fmt.Stringer. +func (f fileType) String() string { + switch f { + case regular: + return "regular" + case directory: + return "directory" + case symlink: + return "symlink" + case socket: + return "socket" + } + return "unknown" +} + +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-gofer.%s", id) +} + +// Config sets configuration options for each attach point. +type Config struct { + // ROMount is set to true if this is a readonly mount. + ROMount bool + + // PanicOnWrite panics on attempts to write to RO mounts. + PanicOnWrite bool + + // HostUDS signals whether the gofer can mount a host's UDS. + HostUDS bool +} + +type attachPoint struct { + prefix string + conf Config + + // attachedMu protects attached. + attachedMu sync.Mutex + attached bool + + // deviceMu protects devices and nextDevice. + deviceMu sync.Mutex + + // nextDevice is the next device id that will be allocated. + nextDevice uint8 + + // devices is a map from actual host devices to "small" integers that + // can be combined with host inode to form a unique virtual inode id. + devices map[uint64]uint8 +} + +// NewAttachPoint creates a new attacher that gives local file +// access to all files under 'prefix'. 'prefix' must be an absolute path. +func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) { + // Sanity check the prefix. + if !filepath.IsAbs(prefix) { + return nil, fmt.Errorf("attach point prefix must be absolute %q", prefix) + } + return &attachPoint{ + prefix: prefix, + conf: c, + devices: make(map[uint64]uint8), + }, nil +} + +// Attach implements p9.Attacher. +func (a *attachPoint) Attach() (p9.File, error) { + a.attachedMu.Lock() + defer a.attachedMu.Unlock() + + if a.attached { + return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) + } + + f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { + return fd.Open(a.prefix, openFlags|mode, 0) + }) + if err != nil { + return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err) + } + + stat, err := fstat(f.FD()) + if err != nil { + return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) + } + + lf, err := newLocalFile(a, f, a.prefix, stat) + if err != nil { + return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) + } + a.attached = true + return lf, nil +} + +// makeQID returns a unique QID for the given stat buffer. +func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { + a.deviceMu.Lock() + defer a.deviceMu.Unlock() + + // First map the host device id to a unique 8-bit integer. + dev, ok := a.devices[stat.Dev] + if !ok { + a.devices[stat.Dev] = a.nextDevice + dev = a.nextDevice + a.nextDevice++ + if a.nextDevice < dev { + panic(fmt.Sprintf("device id overflow! map: %+v", a.devices)) + } + } + + // Construct a "virtual" inode id with the uint8 device number in the + // first 8 bits, and the rest of the bits from the host inode id. + maskedIno := stat.Ino & 0x00ffffffffffffff + if maskedIno != stat.Ino { + log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino) + } + ino := uint64(dev)<<56 | maskedIno + return p9.QID{ + Type: p9.FileMode(stat.Mode).QIDType(), + Path: ino, + } +} + +// localFile implements p9.File wrapping a local file. The underlying file +// is opened during Walk() and stored in 'file' to be used with other +// operations. The file is opened as readonly, unless it's a symlink or there is +// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is +// called to clone the file. This reduces the number of walks that need to be +// done by the host file system when files are reused. +// +// The file may be reopened if the requested mode in Open() is not a subset of +// current mode. Consequently, 'file' could have a mode wider than requested and +// must be verified before read/write operations. Before the file is opened and +// after it's closed, 'mode' is set to an invalid value to prevent an unopened +// file from being used. +// +// The reason that the file is not opened initially as read-write is for better +// performance with 'overlay2' storage driver. overlay2 eagerly copies the +// entire file up when it's opened in write mode, and would perform badly when +// multiple files are only being opened for read (esp. startup). +type localFile struct { + p9.DefaultWalkGetAttr + + // attachPoint is the attachPoint that serves this localFile. + attachPoint *attachPoint + + // hostPath will be safely updated by the Renamed hook. + hostPath string + + // file is opened when localFile is created and it's never nil. It may be + // reopened if the Open() mode is wider than the mode the file was originally + // opened with. + file *fd.FD + + // mode is the mode in which the file was opened. Set to invalidMode + // if localFile isn't opened. + mode p9.OpenFlags + + // ft is the fileType for this file. + ft fileType + + // readDirMu protects against concurrent Readdir calls. + readDirMu sync.Mutex + + // lastDirentOffset is the last offset returned by Readdir(). If another call + // to Readdir is made at the same offset, the file doesn't need to be + // repositioned. This is an important optimization because the caller must + // always make one extra call to detect EOF (empty result, no error). + lastDirentOffset uint64 +} + +var procSelfFD *fd.FD + +// OpenProcSelfFD opens the /proc/self/fd directory, which will be used to +// reopen file descriptors. +func OpenProcSelfFD() error { + d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0) + if err != nil { + return fmt.Errorf("error opening /proc/self/fd: %v", err) + } + procSelfFD = fd.New(d) + return nil +} + +func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { + d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0) + if err != nil { + return nil, err + } + + return fd.New(d), nil +} + +func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { + path := path.Join(parent.hostPath, name) + f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { + return fd.OpenAt(parent.file, name, openFlags|mode, 0) + }) + return f, path, err +} + +// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback +// to O_PATH. 'path' is used for logging messages only. 'fn' is what does the +// actual file open and is customizable by the caller. +func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { + // Attempt to open file in the following mode in order: + // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. + // Use non-blocking to prevent getting stuck inside open(2) for + // FIFOs. This option has no effect on regular files. + // 2. PATH: for symlinks, sockets. + modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} + + var err error + var file *fd.FD + for i, mode := range modes { + file, err = fn(mode) + if err == nil { + // openat succeeded, we're done. + break + } + switch e := extractErrno(err); e { + case syscall.ENOENT: + // File doesn't exist, no point in retrying. + return nil, e + } + // openat failed. Try again with next mode, preserving 'err' in case this + // was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err) + } + if err != nil { + // All attempts to open file have failed, return the last error. + log.Debugf("Failed to open file, path: %q, err: %v", path, err) + return nil, extractErrno(err) + } + + return file, nil +} + +func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { + var ft fileType + switch stat.Mode & syscall.S_IFMT { + case syscall.S_IFREG: + ft = regular + case syscall.S_IFDIR: + ft = directory + case syscall.S_IFLNK: + ft = symlink + case syscall.S_IFSOCK: + if !permitSocket { + return unknown, syscall.EPERM + } + ft = socket + default: + return unknown, syscall.EPERM + } + return ft, nil +} + +func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { + ft, err := getSupportedFileType(stat, a.conf.HostUDS) + if err != nil { + return nil, err + } + + return &localFile{ + attachPoint: a, + hostPath: path, + file: file, + mode: invalidMode, + ft: ft, + }, nil +} + +// newFDMaybe creates a fd.FD from a file, dup'ing the FD and setting it as +// non-blocking. If anything fails, returns nil. It's better to have a file +// without host FD, than to fail the operation. +func newFDMaybe(file *fd.FD) *fd.FD { + dupFD, err := syscall.Dup(file.FD()) + // Technically, the runtime may call the finalizer on file as soon as + // FD() returns. + runtime.KeepAlive(file) + if err != nil { + return nil + } + dup := fd.New(dupFD) + + // fd is blocking; non-blocking is required. + if err := syscall.SetNonblock(dup.FD(), true); err != nil { + dup.Close() + return nil + } + return dup +} + +func fstat(fd int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Fstat(fd, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func stat(path string) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Stat(path, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func fchown(fd int, uid p9.UID, gid p9.GID) error { + return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) +} + +// Open implements p9.File. +func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + if l.isOpen() { + panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath)) + } + + // Check if control file can be used or if a new open must be created. + var newFile *fd.FD + if flags == p9.ReadOnly { + log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath) + newFile = l.file + } else { + // Ideally reopen would call name_to_handle_at (with empty name) and + // open_by_handle_at to reopen the file without using 'hostPath'. However, + // name_to_handle_at and open_by_handle_at aren't supported by overlay2. + log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath) + var err error + // Constrain open flags to the open mode and O_TRUNC. + newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC))) + if err != nil { + return nil, p9.QID{}, 0, extractErrno(err) + } + } + + stat, err := fstat(newFile.FD()) + if err != nil { + if newFile != l.file { + newFile.Close() + } + return nil, p9.QID{}, 0, extractErrno(err) + } + + var fd *fd.FD + if stat.Mode&syscall.S_IFMT == syscall.S_IFREG { + // Donate FD for regular files only. + fd = newFDMaybe(newFile) + } + + // Close old file in case a new one was created. + if newFile != l.file { + if err := l.file.Close(); err != nil { + log.Warningf("Error closing file %q: %v", l.hostPath, err) + } + l.file = newFile + } + l.mode = flags & p9.OpenFlagsModeMask + return fd, l.attachPoint.makeQID(stat), 0, nil +} + +// Create implements p9.File. +func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return nil, nil, p9.QID{}, 0, syscall.EBADF + } + + // 'file' may be used for other operations (e.g. Walk), so read access is + // always added to flags. Note that resulting file might have a wider mode + // than needed for each particular case. + flags := openFlags | syscall.O_CREAT | syscall.O_EXCL + if mode == p9.WriteOnly { + flags |= syscall.O_RDWR + } else { + flags |= mode.OSFlags() + } + + child, err := fd.OpenAt(l.file, name, flags, uint32(perm.Permissions())) + if err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + cu := cleanup.Make(func() { + child.Close() + // Best effort attempt to remove the file in case of failure. + if err := syscall.Unlinkat(l.file.FD(), name); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) + } + }) + defer cu.Clean() + + if err := fchown(child.FD(), uid, gid); err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + stat, err := fstat(child.FD()) + if err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + + c := &localFile{ + attachPoint: l.attachPoint, + hostPath: path.Join(l.hostPath, name), + file: child, + mode: mode, + } + + cu.Release() + return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil +} + +// Mkdir implements p9.File. +func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EBADF + } + + if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil { + return p9.QID{}, extractErrno(err) + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the dir in case of failure. + if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil { + log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err) + } + }) + defer cu.Clean() + + // Open directory to change ownership and stat it. + flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags + f, err := fd.OpenAt(l.file, name, flags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer f.Close() + + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := fstat(f.FD()) + if err != nil { + return p9.QID{}, extractErrno(err) + } + + cu.Release() + return l.attachPoint.makeQID(stat), nil +} + +// Walk implements p9.File. +func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { + // Duplicate current file if 'names' is empty. + if len(names) == 0 { + newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + return reopenProcFd(l.file, openFlags|mode) + }) + if err != nil { + return nil, nil, extractErrno(err) + } + + stat, err := fstat(newFile.FD()) + if err != nil { + newFile.Close() + return nil, nil, extractErrno(err) + } + + c := &localFile{ + attachPoint: l.attachPoint, + hostPath: l.hostPath, + file: newFile, + mode: invalidMode, + } + return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil + } + + var qids []p9.QID + last := l + for _, name := range names { + f, path, err := openAnyFileFromParent(last, name) + if last != l { + last.Close() + } + if err != nil { + return nil, nil, extractErrno(err) + } + stat, err := fstat(f.FD()) + if err != nil { + f.Close() + return nil, nil, extractErrno(err) + } + c, err := newLocalFile(last.attachPoint, f, path, stat) + if err != nil { + f.Close() + return nil, nil, extractErrno(err) + } + + qids = append(qids, l.attachPoint.makeQID(stat)) + last = c + } + return qids, last, nil +} + +// StatFS implements p9.File. +func (l *localFile) StatFS() (p9.FSStat, error) { + var s syscall.Statfs_t + if err := syscall.Fstatfs(l.file.FD(), &s); err != nil { + return p9.FSStat{}, extractErrno(err) + } + + // Populate with what's available. + return p9.FSStat{ + Type: uint32(s.Type), + BlockSize: uint32(s.Bsize), + Blocks: s.Blocks, + BlocksFree: s.Bfree, + BlocksAvailable: s.Bavail, + Files: s.Files, + FilesFree: s.Ffree, + NameLength: uint32(s.Namelen), + }, nil +} + +// FSync implements p9.File. +func (l *localFile) FSync() error { + if !l.isOpen() { + return syscall.EBADF + } + if err := syscall.Fsync(l.file.FD()); err != nil { + return extractErrno(err) + } + return nil +} + +// GetAttr implements p9.File. +func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + stat, err := fstat(l.file.FD()) + if err != nil { + return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) + } + + attr := p9.Attr{ + Mode: p9.FileMode(stat.Mode), + UID: p9.UID(stat.Uid), + GID: p9.GID(stat.Gid), + NLink: uint64(stat.Nlink), + RDev: stat.Rdev, + Size: uint64(stat.Size), + BlockSize: uint64(stat.Blksize), + Blocks: uint64(stat.Blocks), + ATimeSeconds: uint64(stat.Atim.Sec), + ATimeNanoSeconds: uint64(stat.Atim.Nsec), + MTimeSeconds: uint64(stat.Mtim.Sec), + MTimeNanoSeconds: uint64(stat.Mtim.Nsec), + CTimeSeconds: uint64(stat.Ctim.Sec), + CTimeNanoSeconds: uint64(stat.Ctim.Nsec), + } + valid := p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + NLink: true, + RDev: true, + Size: true, + Blocks: true, + ATime: true, + MTime: true, + CTime: true, + } + + return l.attachPoint.makeQID(stat), valid, attr, nil +} + +// SetAttr implements p9.File. Due to mismatch in file API, options +// cannot be changed atomically and user may see partial changes when +// an error happens. +func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + allowed := p9.SetAttrMask{ + Permissions: true, + UID: true, + GID: true, + Size: true, + ATime: true, + MTime: true, + ATimeNotSystemTime: true, + MTimeNotSystemTime: true, + } + + if valid.Empty() { + // Nothing to do. + return nil + } + + // Handle all the sanity checks up front so that the client gets a + // consistent result that is not attribute dependent. + if !valid.IsSubsetOf(allowed) { + log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid) + return syscall.EPERM + } + + // Check if it's possible to use cached file, or if another one needs to be + // opened for write. + f := l.file + if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + var err error + f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY) + if err != nil { + return extractErrno(err) + } + defer f.Close() + } + + // The semantics are to either return an error if no changes were made, + // or no error if *all* changes were made. Well, this can be impossible + // if the filesystem rejects at least one of the changes, especially + // since some operations are not easy to undo atomically. + // + // This could be made better if SetAttr actually returned the changes + // it did make, so the client can at least know what has changed. So + // we at least attempt to make all of the changes and return a generic + // error if any of them fails, which at least doesn't bias any change + // over another. + var err error + if valid.Permissions { + if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil { + log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr) + err = extractErrno(cerr) + } + } + + if valid.Size { + if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil { + log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + + if valid.ATime || valid.MTime { + utimes := [2]syscall.Timespec{ + {Sec: 0, Nsec: linux.UTIME_OMIT}, + {Sec: 0, Nsec: linux.UTIME_OMIT}, + } + if valid.ATime { + if valid.ATimeNotSystemTime { + utimes[0].Sec = int64(attr.ATimeSeconds) + utimes[0].Nsec = int64(attr.ATimeNanoSeconds) + } else { + utimes[0].Nsec = linux.UTIME_NOW + } + } + if valid.MTime { + if valid.MTimeNotSystemTime { + utimes[1].Sec = int64(attr.MTimeSeconds) + utimes[1].Nsec = int64(attr.MTimeNanoSeconds) + } else { + utimes[1].Nsec = linux.UTIME_NOW + } + } + + if l.ft == symlink { + // utimensat operates different that other syscalls. To operate on a + // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty + // name. + parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) + if err != nil { + return extractErrno(err) + } + defer syscall.Close(parent) + + if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } else { + // Directories and regular files can operate directly on the fd + // using empty name. + if terr := utimensat(f.FD(), "", utimes, 0); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + } + + if valid.UID || valid.GID { + uid := -1 + if valid.UID { + uid = int(attr.UID) + } + gid := -1 + if valid.GID { + gid = int(attr.GID) + } + if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { + log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr) + err = extractErrno(oerr) + } + } + + return err +} + +func (*localFile) GetXattr(string, uint64) (string, error) { + return "", syscall.EOPNOTSUPP +} + +func (*localFile) SetXattr(string, string, uint32) error { + return syscall.EOPNOTSUPP +} + +func (*localFile) ListXattr(uint64) (map[string]struct{}, error) { + return nil, syscall.EOPNOTSUPP +} + +func (*localFile) RemoveXattr(string) error { + return syscall.EOPNOTSUPP +} + +// Allocate implements p9.File. +func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error { + if !l.isOpen() { + return syscall.EBADF + } + + if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil { + return extractErrno(err) + } + return nil +} + +// Rename implements p9.File; this should never be called. +func (*localFile) Rename(p9.File, string) error { + panic("rename called directly") +} + +// RenameAt implements p9.File.RenameAt. +func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + newParent := directory.(*localFile) + if err := renameat(l.file.FD(), oldName, newParent.file.FD(), newName); err != nil { + return extractErrno(err) + } + return nil +} + +// ReadAt implements p9.File. +func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if !l.isOpen() { + return 0, syscall.EBADF + } + + r, err := l.file.ReadAt(p, int64(offset)) + switch err { + case nil, io.EOF: + return r, nil + default: + return r, extractErrno(err) + } +} + +// WriteAt implements p9.File. +func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if !l.isOpen() { + return 0, syscall.EBADF + } + + w, err := l.file.WriteAt(p, int64(offset)) + if err != nil { + return w, extractErrno(err) + } + return w, nil +} + +// Symlink implements p9.File. +func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EBADF + } + + if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil { + return p9.QID{}, extractErrno(err) + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the symlink in case of failure. + if err := syscall.Unlinkat(l.file.FD(), newName); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) + } + }) + defer cu.Clean() + + // Open symlink to change ownership and stat it. + f, err := fd.OpenAt(l.file, newName, unix.O_PATH|openFlags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer f.Close() + + if err := fchown(f.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := fstat(f.FD()) + if err != nil { + return p9.QID{}, extractErrno(err) + } + + cu.Release() + return l.attachPoint.makeQID(stat), nil +} + +// Link implements p9.File. +func (l *localFile) Link(target p9.File, newName string) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + targetFile := target.(*localFile) + if err := unix.Linkat(targetFile.file.FD(), "", l.file.FD(), newName, linux.AT_EMPTY_PATH); err != nil { + return extractErrno(err) + } + return nil +} + +// Mknod implements p9.File. +func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EROFS + } + + hostPath := path.Join(l.hostPath, name) + + // Return EEXIST if the file already exists. + if _, err := stat(hostPath); err == nil { + return p9.QID{}, syscall.EEXIST + } + + // From mknod(2) man page: + // "EPERM: [...] if the filesystem containing pathname does not support + // the type of node requested." + if mode.FileType() != p9.ModeRegular { + return p9.QID{}, syscall.EPERM + } + + // Allow Mknod to create regular files. + if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil { + return p9.QID{}, err + } + + stat, err := stat(hostPath) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return l.attachPoint.makeQID(stat), nil +} + +// UnlinkAt implements p9.File. +func (l *localFile) UnlinkAt(name string, flags uint32) error { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return syscall.EBADF + } + + if err := unix.Unlinkat(l.file.FD(), name, int(flags)); err != nil { + return extractErrno(err) + } + return nil +} + +// Readdir implements p9.File. +func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return nil, syscall.EBADF + } + if !l.isOpen() { + return nil, syscall.EBADF + } + + // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's + // reading all directory contents. Take a lock because this operation is + // stateful. + l.readDirMu.Lock() + defer l.readDirMu.Unlock() + + skip := uint64(0) + + // Check if the file is at the correct position already. If not, seek to the + // beginning and read the entire directory again. + if l.lastDirentOffset != offset { + if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil { + return nil, extractErrno(err) + } + skip = offset + } + + dirents, err := l.readDirent(l.file.FD(), offset, count, skip) + if err == nil { + // On success, remember the offset that was returned at the current + // position. + l.lastDirentOffset = offset + uint64(len(dirents)) + } else { + // On failure, the state is unknown, force call to seek() next time. + l.lastDirentOffset = math.MaxUint64 + } + return dirents, err +} + +func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) { + var dirents []p9.Dirent + + // Limit 'count' to cap the slice size that is returned. + const maxCount = 100000 + if count > maxCount { + count = maxCount + } + + // Pre-allocate buffers that will be reused to get partial results. + direntsBuf := make([]byte, 8192) + names := make([]string, 0, 100) + + end := offset + uint64(count) + for offset < end { + dirSize, err := syscall.ReadDirent(f, direntsBuf) + if err != nil { + return dirents, err + } + if dirSize <= 0 { + return dirents, nil + } + + names := names[:0] + _, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names) + + // Skip over entries that the caller is not interested in. + if skip > 0 { + if skip > uint64(len(names)) { + skip -= uint64(len(names)) + names = names[:0] + } else { + names = names[skip:] + skip = 0 + } + } + for _, name := range names { + stat, err := statAt(l.file.FD(), name) + if err != nil { + log.Warningf("Readdir is skipping file with failed stat %q, err: %v", l.hostPath, err) + continue + } + qid := l.attachPoint.makeQID(stat) + offset++ + dirents = append(dirents, p9.Dirent{ + QID: qid, + Type: qid.Type, + Name: name, + Offset: offset, + }) + } + } + return dirents, nil +} + +// Readlink implements p9.File. +func (l *localFile) Readlink() (string, error) { + // Shamelessly stolen from os.Readlink (added upper bound limit to buffer). + const limit = 1024 * 1024 + for len := 128; len < limit; len *= 2 { + b := make([]byte, len) + n, err := unix.Readlinkat(l.file.FD(), "", b) + if err != nil { + return "", extractErrno(err) + } + if n < len { + return string(b[:n]), nil + } + } + return "", syscall.ENOMEM +} + +// Flush implements p9.File. +func (l *localFile) Flush() error { + return nil +} + +// Connect implements p9.File. +func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { + if !l.attachPoint.conf.HostUDS { + return nil, syscall.ECONNREFUSED + } + + // TODO(gvisor.dev/issue/1003): Due to different app vs replacement + // mappings, the app path may have fit in the sockaddr, but we can't + // fit f.path in our sockaddr. We'd need to redirect through a shorter + // path in order to actually connect to this socket. + if len(l.hostPath) > linux.UnixPathMax { + return nil, syscall.ECONNREFUSED + } + + var stype int + switch flags { + case p9.StreamSocket: + stype = syscall.SOCK_STREAM + case p9.DgramSocket: + stype = syscall.SOCK_DGRAM + case p9.SeqpacketSocket: + stype = syscall.SOCK_SEQPACKET + default: + return nil, syscall.ENXIO + } + + f, err := syscall.Socket(syscall.AF_UNIX, stype, 0) + if err != nil { + return nil, err + } + + if err := syscall.SetNonblock(f, true); err != nil { + syscall.Close(f) + return nil, err + } + + sa := syscall.SockaddrUnix{Name: l.hostPath} + if err := syscall.Connect(f, &sa); err != nil { + syscall.Close(f) + return nil, err + } + + return fd.New(f), nil +} + +// Close implements p9.File. +func (l *localFile) Close() error { + l.mode = invalidMode + err := l.file.Close() + l.file = nil + return err +} + +func (l *localFile) isOpen() bool { + return l.mode != invalidMode +} + +// Renamed implements p9.Renamed. +func (l *localFile) Renamed(newDir p9.File, newName string) { + l.hostPath = path.Join(newDir.(*localFile).hostPath, newName) +} + +// extractErrno tries to determine the errno. +func extractErrno(err error) syscall.Errno { + if err == nil { + // This should never happen. The likely result will be that + // some user gets the frustrating "error: SUCCESS" message. + log.Warningf("extractErrno called with nil error!") + return 0 + } + + switch err { + case os.ErrNotExist: + return syscall.ENOENT + case os.ErrExist: + return syscall.EEXIST + case os.ErrPermission: + return syscall.EACCES + case os.ErrInvalid: + return syscall.EINVAL + } + + // See if it's an errno or a common wrapped error. + switch e := err.(type) { + case syscall.Errno: + return e + case *os.PathError: + return extractErrno(e.Err) + case *os.LinkError: + return extractErrno(e.Err) + case *os.SyscallError: + return extractErrno(e.Err) + } + + // Fall back to EIO. + log.Debugf("Unknown error: %v, defaulting to EIO", err) + return syscall.EIO +} diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go new file mode 100644 index 000000000..5d4aab597 --- /dev/null +++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go @@ -0,0 +1,49 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package fsgofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserr" +) + +func statAt(dirFd int, name string) (syscall.Stat_t, error) { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return syscall.Stat_t{}, err + } + namePtr := unsafe.Pointer(nameBytes) + + var stat syscall.Stat_t + statPtr := unsafe.Pointer(&stat) + + if _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(dirFd), + uintptr(namePtr), + uintptr(statPtr), + linux.AT_SYMLINK_NOFOLLOW, + 0, + 0); errno != 0 { + + return syscall.Stat_t{}, syserr.FromHost(errno).ToError() + } + return stat, nil +} diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go new file mode 100644 index 000000000..8041fd352 --- /dev/null +++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go @@ -0,0 +1,49 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package fsgofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserr" +) + +func statAt(dirFd int, name string) (syscall.Stat_t, error) { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return syscall.Stat_t{}, err + } + namePtr := unsafe.Pointer(nameBytes) + + var stat syscall.Stat_t + statPtr := unsafe.Pointer(&stat) + + if _, _, errno := syscall.Syscall6( + syscall.SYS_FSTATAT, + uintptr(dirFd), + uintptr(namePtr), + uintptr(statPtr), + linux.AT_SYMLINK_NOFOLLOW, + 0, + 0); errno != 0 { + + return syscall.Stat_t{}, syserr.FromHost(errno).ToError() + } + return stat, nil +} diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go new file mode 100644 index 000000000..05af7e397 --- /dev/null +++ b/runsc/fsgofer/fsgofer_test.go @@ -0,0 +1,692 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "fmt" + "io/ioutil" + "net" + "os" + "path" + "path/filepath" + "syscall" + "testing" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" +) + +func init() { + log.SetLevel(log.Debug) + + allConfs = append(allConfs, rwConfs...) + allConfs = append(allConfs, roConfs...) + + if err := OpenProcSelfFD(); err != nil { + panic(err) + } +} + +func assertPanic(t *testing.T, f func()) { + defer func() { + if r := recover(); r == nil { + t.Errorf("function did not panic") + } + }() + f() +} + +func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { + want := make([]byte, len(content)) + copy(want, content) + + b := []byte("test-1-2-3") + w, err := f.WriteAt(b, uint64(len(content))) + if flags == p9.WriteOnly || flags == p9.ReadWrite { + if err != nil { + return fmt.Errorf("WriteAt(): %v", err) + } + if w != len(b) { + return fmt.Errorf("WriteAt() was partial, got: %d, want: %d", w, len(b)) + } + want = append(want, b...) + } else { + if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF { + return fmt.Errorf("WriteAt() should have failed, got: %d, want: EBADFD", err) + } + } + + rBuf := make([]byte, len(want)) + r, err := f.ReadAt(rBuf, 0) + if flags == p9.ReadOnly || flags == p9.ReadWrite { + if err != nil { + return fmt.Errorf("ReadAt(): %v", err) + } + if r != len(rBuf) { + return fmt.Errorf("ReadAt() was partial, got: %d, want: %d", r, len(rBuf)) + } + if string(rBuf) != string(want) { + return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want) + } + } else { + if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF { + return fmt.Errorf("ReadAt() should have failed, got: %d, want: EBADFD", err) + } + } + return nil +} + +var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} + +var ( + allTypes = []fileType{regular, directory, symlink} + + // allConfs is set in init() above. + allConfs []Config + + rwConfs = []Config{{ROMount: false}} + roConfs = []Config{{ROMount: true}} +) + +type state struct { + root *localFile + file *localFile + conf Config + ft fileType +} + +func (s state) String() string { + return fmt.Sprintf("type(%v)", s.ft) +} + +func runAll(t *testing.T, test func(*testing.T, state)) { + runCustom(t, allTypes, allConfs, test) +} + +func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) { + for _, c := range confs { + t.Logf("Config: %+v", c) + + for _, ft := range types { + t.Logf("File type: %v", ft) + + path, name, err := setup(ft) + if err != nil { + t.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a, err := NewAttachPoint(path, c) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + t.Fatalf("Attach failed, err: %v", err) + } + + _, file, err := root.Walk([]string{name}) + if err != nil { + root.Close() + t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) + } + + st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft} + test(t, st) + file.Close() + root.Close() + } + } +} + +func setup(ft fileType) (string, string, error) { + path, err := ioutil.TempDir("", "root-") + if err != nil { + return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) + } + + // First attach with writable configuration to setup tree. + a, err := NewAttachPoint(path, Config{}) + if err != nil { + return "", "", err + } + root, err := a.Attach() + if err != nil { + return "", "", fmt.Errorf("Attach failed, err: %v", err) + } + defer root.Close() + + var name string + switch ft { + case regular: + name = "file" + _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) + } + defer f.Close() + case directory: + name = "dir" + if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err) + } + case symlink: + name = "symlink" + if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err) + } + default: + panic(fmt.Sprintf("unknown file type %v", ft)) + } + return path, name, nil +} + +func createFile(dir *localFile, name string) (*localFile, error) { + _, f, _, _, err := dir.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + return nil, err + } + return f.(*localFile), nil +} + +func TestReadWrite(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + child, err := createFile(s.file, "test") + if err != nil { + t.Fatalf("%v: createFile() failed, err: %v", s, err) + } + defer child.Close() + want := []byte("foobar") + w, err := child.WriteAt(want, 0) + if err != nil { + t.Fatalf("%v: Write() failed, err: %v", s, err) + } + if w != len(want) { + t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(want)) + } + for _, flags := range allOpenFlags { + _, l, err := s.file.Walk([]string{"test"}) + if err != nil { + t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err) + } + if _, _, _, err := l.Open(flags); err != nil { + t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err) + } + if err := testReadWrite(l, flags, want); err != nil { + t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err) + } + } + }) +} + +func TestCreate(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + for i, flags := range allOpenFlags { + _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, flags, err) + } + + if err := testReadWrite(l, flags, []byte{}); err != nil { + t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err) + } + } + }) +} + +// TestReadWriteDup tests that a file opened in any mode can be dup'ed and +// reopened in any other mode. +func TestReadWriteDup(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + child, err := createFile(s.file, "test") + if err != nil { + t.Fatalf("%v: createFile() failed, err: %v", s, err) + } + defer child.Close() + want := []byte("foobar") + w, err := child.WriteAt(want, 0) + if err != nil { + t.Fatalf("%v: Write() failed, err: %v", s, err) + } + if w != len(want) { + t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(want)) + } + for _, flags := range allOpenFlags { + _, l, err := s.file.Walk([]string{"test"}) + if err != nil { + t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err) + } + defer l.Close() + if _, _, _, err := l.Open(flags); err != nil { + t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err) + } + for _, dupFlags := range allOpenFlags { + t.Logf("Original flags: %v, dup flags: %v", flags, dupFlags) + _, dup, err := l.Walk([]string{}) + if err != nil { + t.Fatalf("%v: Walk(<empty>) failed: %v", s, err) + } + defer dup.Close() + if _, _, _, err := dup.Open(dupFlags); err != nil { + t.Fatalf("%v: Open(%v) failed: %v", s, flags, err) + } + if err := testReadWrite(dup, dupFlags, want); err != nil { + t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err) + } + } + } + }) +} + +func TestUnopened(t *testing.T) { + runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) { + b := []byte("foobar") + if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF { + t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF { + t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Readdir(0, 100); err != syscall.EBADF { + t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.FSync(); err != syscall.EBADF { + t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + }) +} + +func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) { + if err := l.SetAttr(valid, attr); err != nil { + return p9.Attr{}, err + } + _, _, a, err := l.GetAttr(p9.AttrMask{}) + if err != nil { + return p9.Attr{}, err + } + return a, nil +} + +func TestSetAttrPerm(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + valid := p9.SetAttrMask{Permissions: true} + attr := p9.SetAttr{Permissions: 0777} + got, err := SetGetAttr(s.file, valid, attr) + if s.ft == symlink { + if err == nil { + t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) + } + } else { + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Permissions, err) + } + if got.Mode.Permissions() != attr.Permissions { + t.Errorf("%v: wrong permission, got: %v, expected: %v", s, got.Mode.Permissions(), attr.Permissions) + } + } + }) +} + +func TestSetAttrSize(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + for _, size := range []uint64{1024, 0, 1024 * 1024} { + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: size} + got, err := SetGetAttr(s.file, valid, attr) + if s.ft == symlink || s.ft == directory { + if err == nil { + t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) + } + // Run for one size only, they will all fail the same way. + return + } + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Size, err) + } + if got.Size != size { + t.Errorf("%v: wrong size, got: %v, expected: %v", s, got.Size, size) + } + } + }) +} + +func TestSetAttrTime(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + valid := p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true} + attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456} + got, err := SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.ATimeSeconds, attr.ATimeNanoSeconds, err) + } + if got.ATimeSeconds != 123 { + t.Errorf("%v: wrong ATimeSeconds, got: %v, expected: %v", s, got.ATimeSeconds, 123) + } + if got.ATimeNanoSeconds != 456 { + t.Errorf("%v: wrong ATimeNanoSeconds, got: %v, expected: %v", s, got.ATimeNanoSeconds, 456) + } + + valid = p9.SetAttrMask{MTime: true, MTimeNotSystemTime: true} + attr = p9.SetAttr{MTimeSeconds: 789, MTimeNanoSeconds: 012} + got, err = SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.MTimeSeconds, attr.MTimeNanoSeconds, err) + } + if got.MTimeSeconds != 789 { + t.Errorf("%v: wrong MTimeSeconds, got: %v, expected: %v", s, got.MTimeSeconds, 789) + } + if got.MTimeNanoSeconds != 012 { + t.Errorf("%v: wrong MTimeNanoSeconds, got: %v, expected: %v", s, got.MTimeNanoSeconds, 012) + } + }) +} + +func TestSetAttrOwner(t *testing.T) { + if os.Getuid() != 0 { + t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", os.Getuid()) + } + + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + newUID := os.Getuid() + 1 + valid := p9.SetAttrMask{UID: true} + attr := p9.SetAttr{UID: p9.UID(newUID)} + got, err := SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.UID, err) + } + if got.UID != p9.UID(newUID) { + t.Errorf("%v: wrong uid, got: %v, expected: %v", s, got.UID, newUID) + } + }) +} + +func TestLink(t *testing.T) { + if os.Getuid() != 0 { + t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid()) + } + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + const dirName = "linkdir" + const linkFile = "link" + if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: MkDir(%s) failed, err: %v", s, dirName, err) + } + _, dir, err := s.root.Walk([]string{dirName}) + if err != nil { + t.Fatalf("%v: Walk({%s}) failed, err: %v", s, dirName, err) + } + + err = dir.Link(s.file, linkFile) + if s.ft == directory { + if err != syscall.EPERM { + t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err) + } + return + } + if err != nil { + t.Errorf("%v: Link(target, %s) failed, err: %v", s, linkFile, err) + } + }) +} + +func TestROMountChecks(t *testing.T) { + runCustom(t, allTypes, roConfs, func(t *testing.T, s state) { + if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.RenameAt("some_file", s.file, "other_file"); err != syscall.EBADF { + t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.UnlinkAt("some_file", 0); err != syscall.EBADF { + t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.Link(s.file, "some_link"); err != syscall.EBADF { + t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: 0} + if err := s.file.SetAttr(valid, attr); err != syscall.EBADF { + t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + }) +} + +func TestROMountPanics(t *testing.T) { + conf := Config{ROMount: true, PanicOnWrite: true} + runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) { + assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") }) + assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) + assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) }) + assertPanic(t, func() { s.file.Link(s.file, "some_link") }) + + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: 0} + assertPanic(t, func() { s.file.SetAttr(valid, attr) }) + }) +} + +func TestWalkNotFound(t *testing.T) { + runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { + if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { + t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err) + } + }) +} + +func TestWalkDup(t *testing.T) { + runAll(t, func(t *testing.T, s state) { + _, dup, err := s.file.Walk([]string{}) + if err != nil { + t.Fatalf("%v: Walk(nil) failed, err: %v", s, err) + } + // Check that 'dup' is usable. + if _, _, _, err := dup.GetAttr(p9.AttrMask{}); err != nil { + t.Errorf("%v: GetAttr() failed, err: %v", s, err) + } + }) +} + +func TestReaddir(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + name := "dir" + if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err) + } + name = "symlink" + if _, err := s.file.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: Symlink(%q) failed, err: %v", s, name, err) + } + name = "file" + _, f, _, _, err := s.file.Create(name, p9.ReadWrite, 0555, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("%v: createFile(root, %q) failed, err: %v", s, name, err) + } + f.Close() + + if _, _, _, err := s.file.Open(p9.ReadOnly); err != nil { + t.Fatalf("%v: Open(ReadOnly) failed, err: %v", s, err) + } + + dirents, err := s.file.Readdir(0, 10) + if err != nil { + t.Fatalf("%v: Readdir(0, 10) failed, err: %v", s, err) + } + if len(dirents) != 3 { + t.Fatalf("%v: Readdir(0, 10) wrong number of items, got: %v, expected: 3", s, len(dirents)) + } + var dir, symlink, file bool + for _, d := range dirents { + switch d.Name { + case "dir": + if d.Type != p9.TypeDir { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeDir) + } + dir = true + case "symlink": + if d.Type != p9.TypeSymlink { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeSymlink) + } + symlink = true + case "file": + if d.Type != p9.TypeRegular { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeRegular) + } + file = true + default: + t.Errorf("%v: dirent.Name got: %v", s, d.Name) + } + + _, f, err := s.file.Walk([]string{d.Name}) + if err != nil { + t.Fatalf("%v: Walk({%s}) failed, err: %v", s, d.Name, err) + } + _, _, a, err := f.GetAttr(p9.AttrMask{}) + if err != nil { + t.Fatalf("%v: GetAttr() failed, err: %v", s, err) + } + if d.Type != a.Mode.QIDType() { + t.Errorf("%v: dirent.Type different than GetAttr().Mode.QIDType(), got: %v, expected: %v", s, d.Type, a.Mode.QIDType()) + } + } + if !dir || !symlink || !file { + t.Errorf("%v: Readdir(0, 10) wrong files returned, dir: %v, symlink: %v, file: %v", s, dir, symlink, file) + } + }) +} + +// Test that attach point can be written to when it points to a file, e.g. +// /etc/hosts. +func TestAttachFile(t *testing.T) { + conf := Config{ROMount: false} + dir, err := ioutil.TempDir("", "root-") + if err != nil { + t.Fatalf("ioutil.TempDir() failed, err: %v", err) + } + defer os.RemoveAll(dir) + + path := path.Join(dir, "test") + if _, err := os.Create(path); err != nil { + t.Fatalf("os.Create(%q) failed, err: %v", path, err) + } + + a, err := NewAttachPoint(path, conf) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + t.Fatalf("Attach failed, err: %v", err) + } + + if _, _, _, err := root.Open(p9.ReadWrite); err != nil { + t.Fatalf("Open(ReadWrite) failed, err: %v", err) + } + defer root.Close() + + b := []byte("foobar") + w, err := root.WriteAt(b, 0) + if err != nil { + t.Fatalf("Write() failed, err: %v", err) + } + if w != len(b) { + t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(b)) + } + rBuf := make([]byte, len(b)) + r, err := root.ReadAt(rBuf, 0) + if err != nil { + t.Fatalf("ReadAt() failed, err: %v", err) + } + if r != len(rBuf) { + t.Fatalf("ReadAt() was partial, got: %d, expected: %d", r, len(rBuf)) + } + if string(rBuf) != "foobar" { + t.Fatalf("ReadAt() wrong data, got: %s, expected: %s", string(rBuf), "foobar") + } +} + +func TestAttachInvalidType(t *testing.T) { + dir, err := ioutil.TempDir("", "attach-") + if err != nil { + t.Fatalf("ioutil.TempDir() failed, err: %v", err) + } + defer os.RemoveAll(dir) + + fifo := filepath.Join(dir, "fifo") + if err := syscall.Mkfifo(fifo, 0755); err != nil { + t.Fatalf("Mkfifo(%q): %v", fifo, err) + } + + dirFile, err := os.Open(dir) + if err != nil { + t.Fatalf("Open(%s): %v", dir, err) + } + defer dirFile.Close() + + // Bind a socket via /proc to be sure that a length of a socket path + // is less than UNIX_PATH_MAX. + socket := filepath.Join(fmt.Sprintf("/proc/self/fd/%d", dirFile.Fd()), "socket") + l, err := net.Listen("unix", socket) + if err != nil { + t.Fatalf("net.Listen(unix, %q): %v", socket, err) + } + defer l.Close() + + for _, tc := range []struct { + name string + path string + }{ + {name: "fifo", path: fifo}, + {name: "socket", path: socket}, + } { + t.Run(tc.name, func(t *testing.T) { + conf := Config{ROMount: false} + a, err := NewAttachPoint(tc.path, conf) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + f, err := a.Attach() + if f != nil || err == nil { + t.Fatalf("Attach should have failed, got (%v, %v)", f, err) + } + }) + } +} + +func TestDoubleAttachError(t *testing.T) { + conf := Config{ROMount: false} + root, err := ioutil.TempDir("", "root-") + if err != nil { + t.Fatalf("ioutil.TempDir() failed, err: %v", err) + } + defer os.RemoveAll(root) + a, err := NewAttachPoint(root, conf) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + + if _, err := a.Attach(); err != nil { + t.Fatalf("Attach failed: %v", err) + } + if _, err := a.Attach(); err == nil { + t.Fatalf("Attach should have failed, got %v want non-nil", err) + } +} diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go new file mode 100644 index 000000000..542b54365 --- /dev/null +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -0,0 +1,82 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/syserr" +) + +func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error { + // utimensat(2) doesn't accept empty name, instead name must be nil to make it + // operate directly on 'dirFd' unlike other *at syscalls. + var namePtr unsafe.Pointer + if name != "" { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + namePtr = unsafe.Pointer(nameBytes) + } + + timesPtr := unsafe.Pointer(×[0]) + + if _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(dirFd), + uintptr(namePtr), + uintptr(timesPtr), + uintptr(flags), + 0, + 0); errno != 0 { + + return syserr.FromHost(errno).ToError() + } + return nil +} + +func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error { + var oldNamePtr unsafe.Pointer + if oldName != "" { + nameBytes, err := syscall.BytePtrFromString(oldName) + if err != nil { + return err + } + oldNamePtr = unsafe.Pointer(nameBytes) + } + var newNamePtr unsafe.Pointer + if newName != "" { + nameBytes, err := syscall.BytePtrFromString(newName) + if err != nil { + return err + } + newNamePtr = unsafe.Pointer(nameBytes) + } + + if _, _, errno := syscall.Syscall6( + syscall.SYS_RENAMEAT, + uintptr(oldDirFD), + uintptr(oldNamePtr), + uintptr(newDirFD), + uintptr(newNamePtr), + 0, + 0); errno != 0 { + + return syserr.FromHost(errno).ToError() + } + return nil +} diff --git a/runsc/main.go b/runsc/main.go new file mode 100644 index 000000000..c9f47c579 --- /dev/null +++ b/runsc/main.go @@ -0,0 +1,372 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary runsc is an implementation of the Open Container Initiative Runtime +// that runs applications inside a sandbox. +package main + +import ( + "context" + "fmt" + "io" + "io/ioutil" + "os" + "os/signal" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/cmd" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +var ( + // Although these flags are not part of the OCI spec, they are used by + // Docker, and thus should not be changed. + rootDir = flag.String("root", "", "root directory for storage of container state.") + logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.") + logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.") + debug = flag.Bool("debug", false, "enable debug logging.") + showVersion = flag.Bool("version", false, "show version and exit.") + // TODO(gvisor.dev/issue/193): support systemd cgroups + systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.") + + // These flags are unique to runsc, and are used to configure parts of the + // system that are not covered by the runtime spec. + + // Debugging flags. + debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") + panicLog = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.") + logPackets = flag.Bool("log-packets", false, "enable network packet logging.") + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") + panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.") + debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") + alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.") + + // Debugging flags: strace related + strace = flag.Bool("strace", false, "enable strace.") + straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") + straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.") + + // Flags that control sandbox runtime behavior. + platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") + softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.") + txChecksumOffload = flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.") + rxChecksumOffload = flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.") + qDisc = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.") + fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem") + watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") + panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") + rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") + referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") + cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") + vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + + // Test flags, not to be used outside tests, ever. + testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") + testOnlyTestNameEnv = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.") +) + +func main() { + // Help and flags commands are generated automatically. + help := cmd.NewHelp(subcommands.DefaultCommander) + help.Register(new(cmd.Syscalls)) + subcommands.Register(help, "") + subcommands.Register(subcommands.FlagsCommand(), "") + + // Installation helpers. + const helperGroup = "helpers" + subcommands.Register(new(cmd.Install), helperGroup) + subcommands.Register(new(cmd.Uninstall), helperGroup) + + // Register user-facing runsc commands. + subcommands.Register(new(cmd.Checkpoint), "") + subcommands.Register(new(cmd.Create), "") + subcommands.Register(new(cmd.Delete), "") + subcommands.Register(new(cmd.Do), "") + subcommands.Register(new(cmd.Events), "") + subcommands.Register(new(cmd.Exec), "") + subcommands.Register(new(cmd.Gofer), "") + subcommands.Register(new(cmd.Kill), "") + subcommands.Register(new(cmd.List), "") + subcommands.Register(new(cmd.Pause), "") + subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Restore), "") + subcommands.Register(new(cmd.Resume), "") + subcommands.Register(new(cmd.Run), "") + subcommands.Register(new(cmd.Spec), "") + subcommands.Register(new(cmd.State), "") + subcommands.Register(new(cmd.Start), "") + subcommands.Register(new(cmd.Wait), "") + + // Register internal commands with the internal group name. This causes + // them to be sorted below the user-facing commands with empty group. + // The string below will be printed above the commands. + const internalGroup = "internal use only" + subcommands.Register(new(cmd.Boot), internalGroup) + subcommands.Register(new(cmd.Debug), internalGroup) + subcommands.Register(new(cmd.Gofer), internalGroup) + subcommands.Register(new(cmd.Statefile), internalGroup) + + // All subcommands must be registered before flag parsing. + flag.Parse() + + // Are we showing the version? + if *showVersion { + // The format here is the same as runc. + fmt.Fprintf(os.Stdout, "runsc version %s\n", version) + fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version) + os.Exit(0) + } + + // TODO(gvisor.dev/issue/193): support systemd cgroups + if *systemdCgroup { + fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193") + os.Exit(1) + } + + var errorLogger io.Writer + if *logFD > -1 { + errorLogger = os.NewFile(uintptr(*logFD), "error log file") + + } else if *logFilename != "" { + // We must set O_APPEND and not O_TRUNC because Docker passes + // the same log file for all commands (and also parses these + // log files), so we can't destroy them on each command. + var err error + errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", *logFilename, err) + } + } + cmd.ErrorLogger = errorLogger + + platformType := *platformName + if _, err := platform.Lookup(platformType); err != nil { + cmd.Fatalf("%v", err) + } + + fsAccess, err := boot.MakeFileAccessType(*fileAccess) + if err != nil { + cmd.Fatalf("%v", err) + } + + if fsAccess == boot.FileAccessShared && *overlay { + cmd.Fatalf("overlay flag is incompatible with shared file access") + } + + netType, err := boot.MakeNetworkType(*network) + if err != nil { + cmd.Fatalf("%v", err) + } + + wa, err := boot.MakeWatchdogAction(*watchdogAction) + if err != nil { + cmd.Fatalf("%v", err) + } + + if *numNetworkChannels <= 0 { + cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels) + } + + refsLeakMode, err := boot.MakeRefsLeakMode(*referenceLeakMode) + if err != nil { + cmd.Fatalf("%v", err) + } + + queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc) + if err != nil { + cmd.Fatalf("%s", err) + } + + // Sets the reference leak check mode. Also set it in config below to + // propagate it to child processes. + refs.SetLeakMode(refsLeakMode) + + // Create a new Config from the flags. + conf := &boot.Config{ + RootDir: *rootDir, + Debug: *debug, + LogFilename: *logFilename, + LogFormat: *logFormat, + DebugLog: *debugLog, + PanicLog: *panicLog, + DebugLogFormat: *debugLogFormat, + FileAccess: fsAccess, + FSGoferHostUDS: *fsGoferHostUDS, + Overlay: *overlay, + Network: netType, + HardwareGSO: *hardwareGSO, + SoftwareGSO: *softwareGSO, + TXChecksumOffload: *txChecksumOffload, + RXChecksumOffload: *rxChecksumOffload, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + WatchdogAction: wa, + PanicSignal: *panicSignal, + ProfileEnable: *profile, + EnableRaw: *netRaw, + NumNetworkChannels: *numNetworkChannels, + Rootless: *rootless, + AlsoLogToStderr: *alsoLogToStderr, + ReferenceLeakMode: refsLeakMode, + OverlayfsStaleRead: *overlayfsStaleRead, + CPUNumFromQuota: *cpuNumFromQuota, + VFS2: *vfs2Enabled, + QDisc: queueingDiscipline, + TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, + TestOnlyTestNameEnv: *testOnlyTestNameEnv, + } + if len(*straceSyscalls) != 0 { + conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") + } + + // Set up logging. + if *debug { + log.SetLevel(log.Debug) + } + + // Logging will include the local date and time via the time package. + // + // On first use, time.Local initializes the local time zone, which + // involves opening tzdata files on the host. Since this requires + // opening host files, it must be done before syscall filter + // installation. + // + // Generally there will be a log message before filter installation + // that will force initialization, but force initialization here in + // case that does not occur. + _ = time.Local.String() + + subcommand := flag.CommandLine.Arg(0) + + var e log.Emitter + if *debugLogFD > -1 { + f := os.NewFile(uintptr(*debugLogFD), "debug log file") + + e = newEmitter(*debugLogFormat, f) + + } else if *debugLog != "" { + f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */) + if err != nil { + cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err) + } + e = newEmitter(*debugLogFormat, f) + + } else { + // Stderr is reserved for the application, just discard the logs if no debug + // log is specified. + e = newEmitter("text", ioutil.Discard) + } + + if *panicLogFD > -1 || *debugLogFD > -1 { + fd := *panicLogFD + if fd < 0 { + fd = *debugLogFD + } + // Quick sanity check to make sure no other commands get passed + // a log fd (they should use log dir instead). + if subcommand != "boot" && subcommand != "gofer" { + cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand) + } + + // If we are the boot process, then we own our stdio FDs and can do what we + // want with them. Since Docker and Containerd both eat boot's stderr, we + // dup our stderr to the provided log FD so that panics will appear in the + // logs, rather than just disappear. + if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil { + cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err) + } + } else if *alsoLogToStderr { + e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)} + } + + log.SetTarget(e) + + log.Infof("***************************") + log.Infof("Args: %s", os.Args) + log.Infof("Version %s", version) + log.Infof("PID: %d", os.Getpid()) + log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) + log.Infof("Configuration:") + log.Infof("\t\tRootDir: %s", conf.RootDir) + log.Infof("\t\tPlatform: %v", conf.Platform) + log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay) + log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets) + log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls) + log.Infof("\t\tVFS2 enabled: %v", conf.VFS2) + log.Infof("***************************") + + if *testOnlyAllowRunAsCurrentUserWithoutChroot { + // SIGTERM is sent to all processes if a test exceeds its + // timeout and this case is handled by syscall_test_runner. + log.Warningf("Block the TERM signal. This is only safe in tests!") + signal.Ignore(syscall.SIGTERM) + } + + // Call the subcommand and pass in the configuration. + var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode == subcommands.ExitSuccess { + log.Infof("Exiting with status: %v", ws) + if ws.Signaled() { + // No good way to return it, emulate what the shell does. Maybe raise + // signal to self? + os.Exit(128 + int(ws.Signal())) + } + os.Exit(ws.ExitStatus()) + } + // Return an error that is unlikely to be used by the application. + log.Warningf("Failure to execute command, err: %v", subcmdCode) + os.Exit(128) +} + +func newEmitter(format string, logFile io.Writer) log.Emitter { + switch format { + case "text": + return log.GoogleEmitter{&log.Writer{Next: logFile}} + case "json": + return log.JSONEmitter{&log.Writer{Next: logFile}} + case "json-k8s": + return log.K8sJSONEmitter{&log.Writer{Next: logFile}} + } + cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format) + panic("unreachable") +} + +func init() { + // Set default root dir to something (hopefully) user-writeable. + *rootDir = "/var/run/runsc" + if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { + *rootDir = filepath.Join(runtimeDir, "runsc") + } +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD new file mode 100644 index 000000000..035dcd3e3 --- /dev/null +++ b/runsc/sandbox/BUILD @@ -0,0 +1,37 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "sandbox", + srcs = [ + "network.go", + "network_unsafe.go", + "sandbox.go", + ], + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/cleanup", + "//pkg/control/client", + "//pkg/control/server", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/platform", + "//pkg/sync", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + "//pkg/urpc", + "//runsc/boot", + "//runsc/boot/platforms", + "//runsc/cgroup", + "//runsc/console", + "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", + "@com_github_vishvananda_netlink//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go new file mode 100644 index 000000000..817a923ad --- /dev/null +++ b/runsc/sandbox/network.go @@ -0,0 +1,411 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "net" + "os" + "path/filepath" + "runtime" + "strconv" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/specutils" +) + +// setupNetwork configures the network stack to mimic the local network +// configuration. Docker uses network namespaces with vnets to configure the +// network for the container. The untrusted app expects to see the same network +// inside the sandbox. Routing and port mapping is handled directly by docker +// with most of network information not even available to the runtime. +// +// Netstack inside the sandbox speaks directly to the device using a raw socket. +// All IP addresses assigned to the NIC, are removed and passed on to netstack's +// device. +// +// If 'conf.Network' is NoNetwork, skips local configuration and creates a +// loopback interface only. +// +// Run the following container to test it: +// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 +func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { + log.Infof("Setting up network") + + switch conf.Network { + case boot.NetworkNone: + log.Infof("Network is disabled, create loopback interface only") + if err := createDefaultLoopbackInterface(conn); err != nil { + return fmt.Errorf("creating default loopback interface: %v", err) + } + case boot.NetworkSandbox: + // Build the path to the net namespace of the sandbox process. + // This is what we will copy. + nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil { + return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) + } + case boot.NetworkHost: + // Nothing to do here. + default: + return fmt.Errorf("invalid network type: %d", conf.Network) + } + return nil +} + +func createDefaultLoopbackInterface(conn *urpc.Client) error { + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ + LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink}, + }, nil); err != nil { + return fmt.Errorf("creating loopback link and routes: %v", err) + } + return nil +} + +func joinNetNS(nsPath string) (func(), error) { + runtime.LockOSThread() + restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: nsPath, + }) + if err != nil { + runtime.UnlockOSThread() + return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) + } + return func() { + restoreNS() + runtime.UnlockOSThread() + }, nil +} + +// isRootNS determines whether we are running in the root net namespace. +// /proc/sys/net/core/rmem_default only exists in root network namespace. +func isRootNS() (bool, error) { + err := syscall.Access("/proc/sys/net/core/rmem_default", syscall.F_OK) + switch err { + case nil: + return true, nil + case syscall.ENOENT: + return false, nil + default: + return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err) + } +} + +// createInterfacesAndRoutesFromNS scrapes the interface and routes from the +// net namespace with the given path, creates them in the sandbox, and removes +// them from the host. +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error { + // Join the network namespace that we will be copying. + restore, err := joinNetNS(nsPath) + if err != nil { + return err + } + defer restore() + + // Get all interfaces in the namespace. + ifaces, err := net.Interfaces() + if err != nil { + return fmt.Errorf("querying interfaces: %v", err) + } + + isRoot, err := isRootNS() + if err != nil { + return err + } + if isRoot { + return fmt.Errorf("cannot run with network enabled in root network namespace") + } + + // Collect addresses and routes from the interfaces. + var args boot.CreateLinksAndRoutesArgs + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + log.Infof("Skipping down interface: %+v", iface) + continue + } + + allAddrs, err := iface.Addrs() + if err != nil { + return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err) + } + + // We build our own loopback device. + if iface.Flags&net.FlagLoopback != 0 { + link, err := loopbackLink(iface, allAddrs) + if err != nil { + return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err) + } + args.LoopbackLinks = append(args.LoopbackLinks, link) + continue + } + + var ipAddrs []*net.IPNet + for _, ifaddr := range allAddrs { + ipNet, ok := ifaddr.(*net.IPNet) + if !ok { + return fmt.Errorf("address is not IPNet: %+v", ifaddr) + } + ipAddrs = append(ipAddrs, ipNet) + } + if len(ipAddrs) == 0 { + log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name) + continue + } + + // Scrape the routes before removing the address, since that + // will remove the routes as well. + routes, defv4, defv6, err := routesForIface(iface) + if err != nil { + return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) + } + if defv4 != nil { + if !args.Defaultv4Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) + } + args.Defaultv4Gateway.Route = *defv4 + args.Defaultv4Gateway.Name = iface.Name + } + + if defv6 != nil { + if !args.Defaultv6Gateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) + } + args.Defaultv6Gateway.Route = *defv6 + args.Defaultv6Gateway.Name = iface.Name + } + + link := boot.FDBasedLink{ + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + TXChecksumOffload: txChecksumOffload, + RXChecksumOffload: rxChecksumOffload, + NumChannels: numNetworkChannels, + QDisc: qDisc, + } + + // Get the link for the interface. + ifaceLink, err := netlink.LinkByName(iface.Name) + if err != nil { + return fmt.Errorf("getting link for interface %q: %v", iface.Name, err) + } + link.LinkAddress = ifaceLink.Attrs().HardwareAddr + + log.Debugf("Setting up network channels") + // Create the socket for the device. + for i := 0; i < link.NumChannels; i++ { + log.Debugf("Creating Channel %d", i) + socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO) + if err != nil { + return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) + } + if i == 0 { + link.GSOMaxSize = socketEntry.gsoMaxSize + } else { + if link.GSOMaxSize != socketEntry.gsoMaxSize { + return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", + link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) + } + } + args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) + } + + if link.GSOMaxSize == 0 && softwareGSO { + // Hardware GSO is disabled. Let's enable software GSO. + link.GSOMaxSize = stack.SoftwareGSOMaxSize + link.SoftwareGSOEnabled = true + } + + // Collect the addresses for the interface, enable forwarding, + // and remove them from the host. + for _, addr := range ipAddrs { + link.Addresses = append(link.Addresses, addr.IP) + + // Steal IP address from NIC. + if err := removeAddress(ifaceLink, addr.String()); err != nil { + return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err) + } + } + + args.FDBasedLinks = append(args.FDBasedLinks, link) + } + + log.Debugf("Setting up network, config: %+v", args) + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { + return fmt.Errorf("creating links and routes: %v", err) + } + return nil +} + +type socketEntry struct { + deviceFile *os.File + gsoMaxSize uint32 +} + +// createSocket creates an underlying AF_PACKET socket and configures it for use by +// the sentry and returns an *os.File that wraps the underlying socket fd. +func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return nil, fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: iface.Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + gsoMaxSize := uint32(0) + if enableGSO { + gso, err := isGSOEnabled(fd, iface.Name) + if err != nil { + return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) + } + if gso { + if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { + return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) + } + gsoMaxSize = ifaceLink.Attrs().GSOMaxSize + } else { + log.Infof("GSO not available in host.") + } + } + + // Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer + // for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max". + // wmem_max/rmem_max default to a unusually low value of 208KB. This is too low + // for gVisor to be able to receive packets at high throughputs without + // incurring packet drops. + const bufSize = 4 << 20 // 4MB. + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil { + return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err) + } + + if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil { + return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err) + } + + return &socketEntry{deviceFile, gsoMaxSize}, nil +} + +// loopbackLink returns the link with addresses and routes for a loopback +// interface. +func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) { + link := boot.LoopbackLink{ + Name: iface.Name, + } + for _, addr := range addrs { + ipNet, ok := addr.(*net.IPNet) + if !ok { + return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr) + } + dst := *ipNet + dst.IP = dst.IP.Mask(dst.Mask) + link.Addresses = append(link.Addresses, ipNet.IP) + link.Routes = append(link.Routes, boot.Route{ + Destination: dst, + }) + } + return link, nil +} + +// routesForIface iterates over all routes for the given interface and converts +// them to boot.Routes. It also returns the a default v4/v6 route if found. +func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) { + link, err := netlink.LinkByIndex(iface.Index) + if err != nil { + return nil, nil, nil, err + } + rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) + if err != nil { + return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) + } + + var defv4, defv6 *boot.Route + var routes []boot.Route + for _, r := range rs { + // Is it a default route? + if r.Dst == nil { + if r.Gw == nil { + return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) + } + // Create a catch all route to the gateway. + switch len(r.Gw) { + case header.IPv4AddressSize: + if defv4 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r) + } + defv4 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv4zero, + Mask: net.IPMask(net.IPv4zero), + }, + Gateway: r.Gw, + } + case header.IPv6AddressSize: + if defv6 != nil { + return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r) + } + + defv6 = &boot.Route{ + Destination: net.IPNet{ + IP: net.IPv6zero, + Mask: net.IPMask(net.IPv6zero), + }, + Gateway: r.Gw, + } + default: + return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r) + } + continue + } + + dst := *r.Dst + dst.IP = dst.IP.Mask(dst.Mask) + routes = append(routes, boot.Route{ + Destination: dst, + Gateway: r.Gw, + }) + } + return routes, defv4, defv6, nil +} + +// removeAddress removes IP address from network device. It's equivalent to: +// ip addr del <ipAndMask> dev <name> +func removeAddress(source netlink.Link, ipAndMask string) error { + addr, err := netlink.ParseAddr(ipAndMask) + if err != nil { + return err + } + return netlink.AddrDel(source, addr) +} diff --git a/runsc/sandbox/network_unsafe.go b/runsc/sandbox/network_unsafe.go new file mode 100644 index 000000000..2a2a0fb7e --- /dev/null +++ b/runsc/sandbox/network_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +type ethtoolValue struct { + cmd uint32 + val uint32 +} + +type ifreq struct { + ifrName [unix.IFNAMSIZ]byte + ifrData *ethtoolValue +} + +const ( + _ETHTOOL_GGSO = 0x00000023 +) + +func isGSOEnabled(fd int, intf string) (bool, error) { + val := ethtoolValue{ + cmd: _ETHTOOL_GGSO, + } + + var name [unix.IFNAMSIZ]byte + copy(name[:], []byte(intf)) + + ifr := ifreq{ + ifrName: name, + ifrData: &val, + } + + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), unix.SIOCETHTOOL, uintptr(unsafe.Pointer(&ifr))); err != 0 { + return false, err + } + + return val.val != 0, nil +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go new file mode 100644 index 000000000..6e1a2af25 --- /dev/null +++ b/runsc/sandbox/sandbox.go @@ -0,0 +1,1228 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sandbox creates and manipulates sandboxes. +package sandbox + +import ( + "context" + "fmt" + "io" + "math" + "os" + "os/exec" + "strconv" + "strings" + "syscall" + "time" + + "github.com/cenkalti/backoff" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" + "gvisor.dev/gvisor/pkg/cleanup" + "gvisor.dev/gvisor/pkg/control/client" + "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/boot/platforms" + "gvisor.dev/gvisor/runsc/cgroup" + "gvisor.dev/gvisor/runsc/console" + "gvisor.dev/gvisor/runsc/specutils" +) + +// Sandbox wraps a sandbox process. +// +// It is used to start/stop sandbox process (and associated processes like +// gofers), as well as for running and manipulating containers inside a running +// sandbox. +// +// Note: Sandbox must be immutable because a copy of it is saved for each +// container and changes would not be synchronized to all of them. +type Sandbox struct { + // ID is the id of the sandbox (immutable). By convention, this is the same + // ID as the first container run in the sandbox. + ID string `json:"id"` + + // Pid is the pid of the running sandbox (immutable). May be 0 if the sandbox + // is not running. + Pid int `json:"pid"` + + // Cgroup has the cgroup configuration for the sandbox. + Cgroup *cgroup.Cgroup `json:"cgroup"` + + // child is set if a sandbox process is a child of the current process. + // + // This field isn't saved to json, because only a creator of sandbox + // will have it as a child process. + child bool + + // status is an exit status of a sandbox process. + status syscall.WaitStatus + + // statusMu protects status. + statusMu sync.Mutex +} + +// Args is used to configure a new sandbox. +type Args struct { + // ID is the sandbox unique identifier. + ID string + + // Spec is the OCI spec that describes the container. + Spec *specs.Spec + + // BundleDir is the directory containing the container bundle. + BundleDir string + + // ConsoleSocket is the path to a unix domain socket that will receive + // the console FD. It may be empty. + ConsoleSocket string + + // UserLog is the filename to send user-visible logs to. It may be empty. + UserLog string + + // IOFiles is the list of files that connect to a 9P endpoint for the mounts + // points using Gofers. They must be in the same order as mounts appear in + // the spec. + IOFiles []*os.File + + // MountsFile is a file container mount information from the spec. It's + // equivalent to the mounts from the spec, except that all paths have been + // resolved to their final absolute location. + MountsFile *os.File + + // Gcgroup is the cgroup that the sandbox is part of. + Cgroup *cgroup.Cgroup + + // Attached indicates that the sandbox lifecycle is attached with the caller. + // If the caller exits, the sandbox should exit too. + Attached bool +} + +// New creates the sandbox process. The caller must call Destroy() on the +// sandbox. +func New(conf *boot.Config, args *Args) (*Sandbox, error) { + s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup} + // The Cleanup object cleans up partially created sandboxes when an error + // occurs. Any errors occurring during cleanup itself are ignored. + c := cleanup.Make(func() { + err := s.destroy() + log.Warningf("error destroying sandbox: %v", err) + }) + defer c.Clean() + + // Create pipe to synchronize when sandbox process has been booted. + clientSyncFile, sandboxSyncFile, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) + } + defer clientSyncFile.Close() + + // Create the sandbox process. + err = s.createSandboxProcess(conf, args, sandboxSyncFile) + // sandboxSyncFile has to be closed to be able to detect when the sandbox + // process exits unexpectedly. + sandboxSyncFile.Close() + if err != nil { + return nil, err + } + + // Wait until the sandbox has booted. + b := make([]byte, 1) + if l, err := clientSyncFile.Read(b); err != nil || l != 1 { + err := fmt.Errorf("waiting for sandbox to start: %v", err) + // If the sandbox failed to start, it may be because the binary + // permissions were incorrect. Check the bits and return a more helpful + // error message. + // + // NOTE: The error message is checked because error types are lost over + // rpc calls. + if strings.Contains(err.Error(), io.EOF.Error()) { + if permsErr := checkBinaryPermissions(conf); permsErr != nil { + return nil, fmt.Errorf("%v: %v", err, permsErr) + } + } + return nil, err + } + + c.Release() + return s, nil +} + +// CreateContainer creates a non-root container inside the sandbox. +func (s *Sandbox) CreateContainer(cid string) error { + log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() + + if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil { + return fmt.Errorf("creating non-root container %q: %v", cid, err) + } + return nil +} + +// StartRoot starts running the root container process inside the sandbox. +func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { + log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + // Configure the network. + if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { + return fmt.Errorf("setting up network: %v", err) + } + + // Send a message to the sandbox control server to start the root + // container. + if err := conn.Call(boot.RootContainerStart, &s.ID, nil); err != nil { + return fmt.Errorf("starting root container: %v", err) + } + + return nil +} + +// StartContainer starts running a non-root container inside the sandbox. +func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error { + for _, f := range goferFiles { + defer f.Close() + } + + log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) + sandboxConn, err := s.sandboxConnect() + if err != nil { + return fmt.Errorf("couldn't connect to sandbox: %v", err) + } + defer sandboxConn.Close() + + // The payload must container stdin/stdout/stderr followed by gofer + // files. + files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...) + // Start running the container. + args := boot.StartArgs{ + Spec: spec, + Conf: conf, + CID: cid, + FilePayload: urpc.FilePayload{Files: files}, + } + if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { + return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err) + } + return nil +} + +// Restore sends the restore call for a container in the sandbox. +func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error { + log.Debugf("Restore sandbox %q", s.ID) + + rf, err := os.Open(filename) + if err != nil { + return fmt.Errorf("opening restore file %q failed: %v", filename, err) + } + defer rf.Close() + + opt := boot.RestoreOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{rf}, + }, + SandboxID: s.ID, + } + + // If the platform needs a device FD we must pass it in. + if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil { + return err + } else if deviceFile != nil { + defer deviceFile.Close() + opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile) + } + + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + // Configure the network. + if err := setupNetwork(conn, s.Pid, spec, conf); err != nil { + return fmt.Errorf("setting up network: %v", err) + } + + // Restore the container and start the root container. + if err := conn.Call(boot.ContainerRestore, &opt, nil); err != nil { + return fmt.Errorf("restoring container %q: %v", cid, err) + } + + return nil +} + +// Processes retrieves the list of processes and associated metadata for a +// given container in this sandbox. +func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { + log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return nil, err + } + defer conn.Close() + + var pl []*control.Process + if err := conn.Call(boot.ContainerProcesses, &cid, &pl); err != nil { + return nil, fmt.Errorf("retrieving process data from sandbox: %v", err) + } + return pl, nil +} + +// Execute runs the specified command in the container. It returns the PID of +// the newly created process. +func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) { + log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return 0, s.connError(err) + } + defer conn.Close() + + // Send a message to the sandbox control server to start the container. + var pid int32 + if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil { + return 0, fmt.Errorf("executing command %q in sandbox: %v", args, err) + } + return pid, nil +} + +// Event retrieves stats about the sandbox such as memory and CPU utilization. +func (s *Sandbox) Event(cid string) (*boot.Event, error) { + log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return nil, err + } + defer conn.Close() + + var e boot.Event + // TODO(b/129292330): Pass in the container id (cid) here. The sandbox + // should return events only for that container. + if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil { + return nil, fmt.Errorf("retrieving event data from sandbox: %v", err) + } + e.ID = cid + return &e, nil +} + +func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { + log.Debugf("Connecting to sandbox %q", s.ID) + conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) + if err != nil { + return nil, s.connError(err) + } + return conn, nil +} + +func (s *Sandbox) connError(err error) error { + return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid, err) +} + +// createSandboxProcess starts the sandbox as a subprocess by running the "boot" +// command, passing in the bundle dir. +func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncFile *os.File) error { + // nextFD is used to get unused FDs that we can pass to the sandbox. It + // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. + nextFD := 3 + + binPath := specutils.ExePath + cmd := exec.Command(binPath, conf.ToFlags()...) + cmd.SysProcAttr = &syscall.SysProcAttr{} + + // Open the log files to pass to the sandbox as FDs. + // + // These flags must come BEFORE the "boot" command in cmd.Args. + if conf.LogFilename != "" { + logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("opening log file %q: %v", conf.LogFilename, err) + } + defer logFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, logFile) + cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + if conf.DebugLog != "" { + test := "" + if len(conf.TestOnlyTestNameEnv) != 0 { + // Fetch test name if one is provided and the test only flag was set. + if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { + test = t + } + } + + debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test) + if err != nil { + return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err) + } + defer debugLogFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile) + cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + if conf.PanicLog != "" { + test := "" + if len(conf.TestOnlyTestNameEnv) != 0 { + // Fetch test name if one is provided and the test only flag was set. + if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { + test = t + } + } + + panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test) + if err != nil { + return fmt.Errorf("opening debug log file in %q: %v", conf.PanicLog, err) + } + defer panicLogFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile) + cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + // Add the "boot" command to the args. + // + // All flags after this must be for the boot command + cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir) + + // Create a socket for the control server and donate it to the sandbox. + addr := boot.ControlSocketAddr(s.ID) + sockFD, err := server.CreateSocket(addr) + log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\00". + if err != nil { + return fmt.Errorf("creating control server socket for sandbox %q: %v", s.ID, err) + } + controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket") + defer controllerFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD)) + nextFD++ + + defer args.MountsFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, args.MountsFile) + cmd.Args = append(cmd.Args, "--mounts-fd="+strconv.Itoa(nextFD)) + nextFD++ + + specFile, err := specutils.OpenSpec(args.BundleDir) + if err != nil { + return err + } + defer specFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, specFile) + cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD)) + nextFD++ + + cmd.ExtraFiles = append(cmd.ExtraFiles, startSyncFile) + cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD)) + nextFD++ + + // If there is a gofer, sends all socket ends to the sandbox. + for _, f := range args.IOFiles { + defer f.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + gPlatform, err := platform.Lookup(conf.Platform) + if err != nil { + return err + } + + if deviceFile, err := gPlatform.OpenDevice(); err != nil { + return fmt.Errorf("opening device file for platform %q: %v", gPlatform, err) + } else if deviceFile != nil { + defer deviceFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile) + cmd.Args = append(cmd.Args, "--device-fd="+strconv.Itoa(nextFD)) + nextFD++ + } + + // TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff + // isn't set. + if conf.Platform == "kvm" { + cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") + } + + // The current process' stdio must be passed to the application via the + // --stdio-fds flag. The stdio of the sandbox process itself must not + // be connected to the same FDs, otherwise we risk leaking sandbox + // errors to the application, so we set the sandbox stdio to nil, + // causing them to read/write from the null device. + cmd.Stdin = nil + cmd.Stdout = nil + cmd.Stderr = nil + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the TTY on the sandbox process. + if args.ConsoleSocket != "" { + cmd.Args = append(cmd.Args, "--console=true") + + // console.NewWithSocket will send the master on the given + // socket, and return the slave. + tty, err := console.NewWithSocket(args.ConsoleSocket) + if err != nil { + return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err) + } + defer tty.Close() + + // Set the TTY as a controlling TTY on the sandbox process. + cmd.SysProcAttr.Setctty = true + // The Ctty FD must be the FD in the child process's FD table, + // which will be nextFD in this case. + // See https://github.com/golang/go/issues/29458. + cmd.SysProcAttr.Ctty = nextFD + + // Pass the tty as all stdio fds to sandbox. + for i := 0; i < 3; i++ { + cmd.ExtraFiles = append(cmd.ExtraFiles, tty) + cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.Debug { + // If debugging, send the boot process stdio to the + // TTY, so that it is easier to find. + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + } + } else { + // If not using a console, pass our current stdio as the + // container stdio via flags. + for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + if conf.Debug { + // If debugging, send the boot process stdio to the + // this process' stdio, so that is is easier to find. + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + } + + // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT + // when re-parented. + cmd.SysProcAttr.Setsid = true + + // nss is the set of namespaces to join or create before starting the sandbox + // process. Mount, IPC and UTS namespaces from the host are not used as they + // are virtualized inside the sandbox. Be paranoid and run inside an empty + // namespace for these. Don't unshare cgroup because sandbox is added to a + // cgroup in the caller's namespace. + log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces") + nss := []specs.LinuxNamespace{ + {Type: specs.IPCNamespace}, + {Type: specs.MountNamespace}, + {Type: specs.UTSNamespace}, + } + + if gPlatform.Requirements().RequiresCurrentPIDNS { + // TODO(b/75837838): Also set a new PID namespace so that we limit + // access to other host processes. + log.Infof("Sandbox will be started in the current PID namespace") + } else { + log.Infof("Sandbox will be started in a new PID namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) + cmd.Args = append(cmd.Args, "--pidns=true") + } + + // Joins the network namespace if network is enabled. the sandbox talks + // directly to the host network, which may have been configured in the + // namespace. + if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != boot.NetworkNone { + log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) + nss = append(nss, ns) + } else if conf.Network == boot.NetworkHost { + log.Infof("Sandbox will be started in the host network namespace") + } else { + log.Infof("Sandbox will be started in new network namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) + } + + // User namespace depends on the network type. Host network requires to run + // inside the user namespace specified in the spec or the current namespace + // if none is configured. + if conf.Network == boot.NetworkHost { + if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok { + log.Infof("Sandbox will be started in container's user namespace: %+v", userns) + nss = append(nss, userns) + specutils.SetUIDGIDMappings(cmd, args.Spec) + } else { + log.Infof("Sandbox will be started in the current user namespace") + } + // When running in the caller's defined user namespace, apply the same + // capabilities to the sandbox process to ensure it abides to the same + // rules. + cmd.Args = append(cmd.Args, "--apply-caps=true") + + // If we have CAP_SYS_ADMIN, we can create an empty chroot and + // bind-mount the executable inside it. + if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") + + } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) { + log.Infof("Sandbox will be started in minimal chroot") + cmd.Args = append(cmd.Args, "--setup-root") + } else { + return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") + } + } else { + // If we have CAP_SETUID and CAP_SETGID, then we can also run + // as user nobody. + if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) + log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") + } else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { + log.Infof("Sandbox will be started in new user namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + cmd.Args = append(cmd.Args, "--setup-root") + + const nobody = 65534 + if conf.Rootless { + log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: os.Getuid(), + Size: 1, + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: os.Getgid(), + Size: 1, + }, + } + + } else { + // Map nobody in the new namespace to nobody in the parent namespace. + // + // A sandbox process will construct an empty + // root for itself, so it has to have + // CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities. + cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: nobody, + Size: 1, + }, + } + cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ + { + ContainerID: nobody, + HostID: nobody, + Size: 1, + }, + } + } + + // Set credentials to run as user and group nobody. + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody} + cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT)) + } else { + return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") + } + } + + cmd.Args[0] = "runsc-sandbox" + + if s.Cgroup != nil { + cpuNum, err := s.Cgroup.NumCPU() + if err != nil { + return fmt.Errorf("getting cpu count from cgroups: %v", err) + } + if conf.CPUNumFromQuota { + // Dropping below 2 CPUs can trigger application to disable + // locks that can lead do hard to debug errors, so just + // leaving two cores as reasonable default. + const minCPUs = 2 + + quota, err := s.Cgroup.CPUQuota() + if err != nil { + return fmt.Errorf("getting cpu qouta from cgroups: %v", err) + } + if n := int(math.Ceil(quota)); n > 0 { + if n < minCPUs { + n = minCPUs + } + if n < cpuNum { + // Only lower the cpu number. + cpuNum = n + } + } + } + cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) + + mem, err := s.Cgroup.MemoryLimit() + if err != nil { + return fmt.Errorf("getting memory limit from cgroups: %v", err) + } + // When memory limit is unset, a "large" number is returned. In that case, + // just stick with the default. + if mem < 0x7ffffffffffff000 { + cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) + } + } + + if args.UserLog != "" { + f, err := os.OpenFile(args.UserLog, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) + if err != nil { + return fmt.Errorf("opening compat log file: %v", err) + } + defer f.Close() + + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--user-log-fd", strconv.Itoa(nextFD)) + nextFD++ + } + + if args.Attached { + // Kill sandbox if parent process exits in attached mode. + cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL + // Tells boot that any process it creates must have pdeathsig set. + cmd.Args = append(cmd.Args, "--attached") + } + + // Add container as the last argument. + cmd.Args = append(cmd.Args, s.ID) + + // Log the FDs we are donating to the sandbox process. + for i, f := range cmd.ExtraFiles { + log.Debugf("Donating FD %d: %q", i+3, f.Name()) + } + + log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) + log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr) + if err := specutils.StartInNS(cmd, nss); err != nil { + err := fmt.Errorf("starting sandbox: %v", err) + // If the sandbox failed to start, it may be because the binary + // permissions were incorrect. Check the bits and return a more helpful + // error message. + // + // NOTE: The error message is checked because error types are lost over + // rpc calls. + if strings.Contains(err.Error(), syscall.EACCES.Error()) { + if permsErr := checkBinaryPermissions(conf); permsErr != nil { + return fmt.Errorf("%v: %v", err, permsErr) + } + } + return err + } + s.child = true + s.Pid = cmd.Process.Pid + log.Infof("Sandbox started, PID: %d", s.Pid) + + return nil +} + +// Wait waits for the containerized process to exit, and returns its WaitStatus. +func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { + log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) + var ws syscall.WaitStatus + + if conn, err := s.sandboxConnect(); err != nil { + // The sandbox may have exited while before we had a chance to + // wait on it. + log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) + } else { + defer conn.Close() + // Try the Wait RPC to the sandbox. + err = conn.Call(boot.ContainerWait, &cid, &ws) + if err == nil { + // It worked! + return ws, nil + } + // The sandbox may have exited after we connected, but before + // or during the Wait RPC. + log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) + } + + // The sandbox may have already exited, or exited while handling the + // Wait RPC. The best we can do is ask Linux what the sandbox exit + // status was, since in most cases that will be the same as the + // container exit status. + if err := s.waitForStopped(); err != nil { + return ws, err + } + if !s.child { + return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable") + } + return s.status, nil +} + +// WaitPID waits for process 'pid' in the container's sandbox and returns its +// WaitStatus. +func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) { + log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) + var ws syscall.WaitStatus + conn, err := s.sandboxConnect() + if err != nil { + return ws, err + } + defer conn.Close() + + args := &boot.WaitPIDArgs{ + PID: pid, + CID: cid, + } + if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { + return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err) + } + return ws, nil +} + +// IsRootContainer returns true if the specified container ID belongs to the +// root container. +func (s *Sandbox) IsRootContainer(cid string) bool { + return s.ID == cid +} + +// Destroy frees all resources associated with the sandbox. It fails fast and +// is idempotent. +func (s *Sandbox) destroy() error { + log.Debugf("Destroy sandbox %q", s.ID) + if s.Pid != 0 { + log.Debugf("Killing sandbox %q", s.ID) + if err := syscall.Kill(s.Pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH { + return fmt.Errorf("killing sandbox %q PID %q: %v", s.ID, s.Pid, err) + } + if err := s.waitForStopped(); err != nil { + return fmt.Errorf("waiting sandbox %q stop: %v", s.ID, err) + } + } + + return nil +} + +// SignalContainer sends the signal to a container in the sandbox. If all is +// true and signal is SIGKILL, then waits for all processes to exit before +// returning. +func (s *Sandbox) SignalContainer(cid string, sig syscall.Signal, all bool) error { + log.Debugf("Signal sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + mode := boot.DeliverToProcess + if all { + mode = boot.DeliverToAllProcesses + } + + args := boot.SignalArgs{ + CID: cid, + Signo: int32(sig), + Mode: mode, + } + if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { + return fmt.Errorf("signaling container %q: %v", cid, err) + } + return nil +} + +// SignalProcess sends the signal to a particular process in the container. If +// fgProcess is true, then the signal is sent to the foreground process group +// in the same session that PID belongs to. This is only valid if the process +// is attached to a host TTY. +func (s *Sandbox) SignalProcess(cid string, pid int32, sig syscall.Signal, fgProcess bool) error { + log.Debugf("Signal sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + mode := boot.DeliverToProcess + if fgProcess { + mode = boot.DeliverToForegroundProcessGroup + } + + args := boot.SignalArgs{ + CID: cid, + Signo: int32(sig), + PID: pid, + Mode: mode, + } + if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil { + return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err) + } + return nil +} + +// Checkpoint sends the checkpoint call for a container in the sandbox. +// The statefile will be written to f. +func (s *Sandbox) Checkpoint(cid string, f *os.File) error { + log.Debugf("Checkpoint sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opt := control.SaveOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + + if err := conn.Call(boot.ContainerCheckpoint, &opt, nil); err != nil { + return fmt.Errorf("checkpointing container %q: %v", cid, err) + } + return nil +} + +// Pause sends the pause call for a container in the sandbox. +func (s *Sandbox) Pause(cid string) error { + log.Debugf("Pause sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ContainerPause, nil, nil); err != nil { + return fmt.Errorf("pausing container %q: %v", cid, err) + } + return nil +} + +// Resume sends the resume call for a container in the sandbox. +func (s *Sandbox) Resume(cid string) error { + log.Debugf("Resume sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ContainerResume, nil, nil); err != nil { + return fmt.Errorf("resuming container %q: %v", cid, err) + } + return nil +} + +// IsRunning returns true if the sandbox or gofer process is running. +func (s *Sandbox) IsRunning() bool { + if s.Pid != 0 { + // Send a signal 0 to the sandbox process. + if err := syscall.Kill(s.Pid, 0); err == nil { + // Succeeded, process is running. + return true + } + } + return false +} + +// Stacks collects and returns all stacks for the sandbox. +func (s *Sandbox) Stacks() (string, error) { + log.Debugf("Stacks sandbox %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return "", err + } + defer conn.Close() + + var stacks string + if err := conn.Call(boot.SandboxStacks, nil, &stacks); err != nil { + return "", fmt.Errorf("getting sandbox %q stacks: %v", s.ID, err) + } + return stacks, nil +} + +// HeapProfile writes a heap profile to the given file. +func (s *Sandbox) HeapProfile(f *os.File) error { + log.Debugf("Heap profile %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.HeapProfile, &opts, nil); err != nil { + return fmt.Errorf("getting sandbox %q heap profile: %v", s.ID, err) + } + return nil +} + +// StartCPUProfile start CPU profile writing to the given file. +func (s *Sandbox) StartCPUProfile(f *os.File) error { + log.Debugf("CPU profile start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.StartCPUProfile, &opts, nil); err != nil { + return fmt.Errorf("starting sandbox %q CPU profile: %v", s.ID, err) + } + return nil +} + +// StopCPUProfile stops a previously started CPU profile. +func (s *Sandbox) StopCPUProfile() error { + log.Debugf("CPU profile stop %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.StopCPUProfile, nil, nil); err != nil { + return fmt.Errorf("stopping sandbox %q CPU profile: %v", s.ID, err) + } + return nil +} + +// GoroutineProfile writes a goroutine profile to the given file. +func (s *Sandbox) GoroutineProfile(f *os.File) error { + log.Debugf("Goroutine profile %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil { + return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err) + } + return nil +} + +// BlockProfile writes a block profile to the given file. +func (s *Sandbox) BlockProfile(f *os.File) error { + log.Debugf("Block profile %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.BlockProfile, &opts, nil); err != nil { + return fmt.Errorf("getting sandbox %q block profile: %v", s.ID, err) + } + return nil +} + +// MutexProfile writes a mutex profile to the given file. +func (s *Sandbox) MutexProfile(f *os.File) error { + log.Debugf("Mutex profile %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.MutexProfile, &opts, nil); err != nil { + return fmt.Errorf("getting sandbox %q mutex profile: %v", s.ID, err) + } + return nil +} + +// StartTrace start trace writing to the given file. +func (s *Sandbox) StartTrace(f *os.File) error { + log.Debugf("Trace start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + opts := control.ProfileOpts{ + FilePayload: urpc.FilePayload{ + Files: []*os.File{f}, + }, + } + if err := conn.Call(boot.StartTrace, &opts, nil); err != nil { + return fmt.Errorf("starting sandbox %q trace: %v", s.ID, err) + } + return nil +} + +// StopTrace stops a previously started trace. +func (s *Sandbox) StopTrace() error { + log.Debugf("Trace stop %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.StopTrace, nil, nil); err != nil { + return fmt.Errorf("stopping sandbox %q trace: %v", s.ID, err) + } + return nil +} + +// ChangeLogging changes logging options. +func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { + log.Debugf("Change logging start %q", s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + + if err := conn.Call(boot.ChangeLogging, &args, nil); err != nil { + return fmt.Errorf("changing sandbox %q logging: %v", s.ID, err) + } + return nil +} + +// DestroyContainer destroys the given container. If it is the root container, +// then the entire sandbox is destroyed. +func (s *Sandbox) DestroyContainer(cid string) error { + if err := s.destroyContainer(cid); err != nil { + // If the sandbox isn't running, the container has already been destroyed, + // ignore the error in this case. + if s.IsRunning() { + return err + } + } + return nil +} + +func (s *Sandbox) destroyContainer(cid string) error { + if s.IsRootContainer(cid) { + log.Debugf("Destroying root container %q by destroying sandbox", cid) + return s.destroy() + } + + log.Debugf("Destroying container %q in sandbox %q", cid, s.ID) + conn, err := s.sandboxConnect() + if err != nil { + return err + } + defer conn.Close() + if err := conn.Call(boot.ContainerDestroy, &cid, nil); err != nil { + return fmt.Errorf("destroying container %q: %v", cid, err) + } + return nil +} + +func (s *Sandbox) waitForStopped() error { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + op := func() error { + if s.child { + s.statusMu.Lock() + defer s.statusMu.Unlock() + if s.Pid == 0 { + return nil + } + // The sandbox process is a child of the current process, + // so we can wait it and collect its zombie. + wpid, err := syscall.Wait4(int(s.Pid), &s.status, syscall.WNOHANG, nil) + if err != nil { + return fmt.Errorf("error waiting the sandbox process: %v", err) + } + if wpid == 0 { + return fmt.Errorf("sandbox is still running") + } + s.Pid = 0 + } else if s.IsRunning() { + return fmt.Errorf("sandbox is still running") + } + return nil + } + return backoff.Retry(op, b) +} + +// deviceFileForPlatform opens the device file for the given platform. If the +// platform does not need a device file, then nil is returned. +func deviceFileForPlatform(name string) (*os.File, error) { + p, err := platform.Lookup(name) + if err != nil { + return nil, err + } + + f, err := p.OpenDevice() + if err != nil { + return nil, fmt.Errorf("opening device file for platform %q: %v", p, err) + } + return f, nil +} + +// checkBinaryPermissions verifies that the required binary bits are set on +// the runsc executable. +func checkBinaryPermissions(conf *boot.Config) error { + // All platforms need the other exe bit + neededBits := os.FileMode(0001) + if conf.Platform == platforms.Ptrace { + // Ptrace needs the other read bit + neededBits |= os.FileMode(0004) + } + + exePath, err := os.Executable() + if err != nil { + return fmt.Errorf("getting exe path: %v", err) + } + + // Check the permissions of the runsc binary and print an error if it + // doesn't match expectations. + info, err := os.Stat(exePath) + if err != nil { + return fmt.Errorf("stat file: %v", err) + } + + if info.Mode().Perm()&neededBits != neededBits { + return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath))) + } + return nil +} diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD new file mode 100644 index 000000000..62d4f5113 --- /dev/null +++ b/runsc/specutils/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "specutils", + srcs = [ + "cri.go", + "fs.go", + "namespace.go", + "specutils.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/bits", + "//pkg/log", + "//pkg/sentry/kernel/auth", + "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_mohae_deepcopy//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "specutils_test", + size = "small", + srcs = ["specutils_test.go"], + library = ":specutils", + deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"], +) diff --git a/runsc/specutils/cri.go b/runsc/specutils/cri.go new file mode 100644 index 000000000..9c5877cd5 --- /dev/null +++ b/runsc/specutils/cri.go @@ -0,0 +1,110 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + // ContainerdContainerTypeAnnotation is the OCI annotation set by + // containerd to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" + // ContainerdContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + ContainerdContainerTypeContainer = "container" + // ContainerdContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + ContainerdContainerTypeSandbox = "sandbox" + + // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" + + // CRIOContainerTypeAnnotation is the OCI annotation set by + // CRI-O to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType" + + // CRIOContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + CRIOContainerTypeContainer = "container" + // CRIOContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + CRIOContainerTypeSandbox = "sandbox" + + // CRIOSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID" +) + +// ContainerType represents the type of container requested by the calling container manager. +type ContainerType int + +const ( + // ContainerTypeUnspecified indicates that no known container type + // annotation was found in the spec. + ContainerTypeUnspecified ContainerType = iota + // ContainerTypeUnknown indicates that a container type was specified + // but is unknown to us. + ContainerTypeUnknown + // ContainerTypeSandbox indicates that the container should be run in a + // new sandbox. + ContainerTypeSandbox + // ContainerTypeContainer indicates that the container should be run in + // an existing sandbox. + ContainerTypeContainer +) + +// SpecContainerType tries to determine the type of container specified by the +// container manager using well-known container annotations. +func SpecContainerType(spec *specs.Spec) ContainerType { + if t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]; ok { + switch t { + case ContainerdContainerTypeSandbox: + return ContainerTypeSandbox + case ContainerdContainerTypeContainer: + return ContainerTypeContainer + default: + return ContainerTypeUnknown + } + } + if t, ok := spec.Annotations[CRIOContainerTypeAnnotation]; ok { + switch t { + case CRIOContainerTypeSandbox: + return ContainerTypeSandbox + case CRIOContainerTypeContainer: + return ContainerTypeContainer + default: + return ContainerTypeUnknown + } + } + return ContainerTypeUnspecified +} + +// SandboxID returns the ID of the sandbox to join and whether an ID was found +// in the spec. +func SandboxID(spec *specs.Spec) (string, bool) { + if id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]; ok { + return id, true + } + if id, ok := spec.Annotations[CRIOSandboxIDAnnotation]; ok { + return id, true + } + return "", false +} diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go new file mode 100644 index 000000000..138aa4dd1 --- /dev/null +++ b/runsc/specutils/fs.go @@ -0,0 +1,155 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "math/bits" + "path" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +type mapping struct { + set bool + val uint32 +} + +// optionsMap maps mount propagation-related OCI filesystem options to mount(2) +// syscall flags. +var optionsMap = map[string]mapping{ + "acl": {set: true, val: syscall.MS_POSIXACL}, + "async": {set: false, val: syscall.MS_SYNCHRONOUS}, + "atime": {set: false, val: syscall.MS_NOATIME}, + "bind": {set: true, val: syscall.MS_BIND}, + "defaults": {set: true, val: 0}, + "dev": {set: false, val: syscall.MS_NODEV}, + "diratime": {set: false, val: syscall.MS_NODIRATIME}, + "dirsync": {set: true, val: syscall.MS_DIRSYNC}, + "exec": {set: false, val: syscall.MS_NOEXEC}, + "noexec": {set: true, val: syscall.MS_NOEXEC}, + "iversion": {set: true, val: syscall.MS_I_VERSION}, + "loud": {set: false, val: syscall.MS_SILENT}, + "mand": {set: true, val: syscall.MS_MANDLOCK}, + "noacl": {set: false, val: syscall.MS_POSIXACL}, + "noatime": {set: true, val: syscall.MS_NOATIME}, + "nodev": {set: true, val: syscall.MS_NODEV}, + "nodiratime": {set: true, val: syscall.MS_NODIRATIME}, + "noiversion": {set: false, val: syscall.MS_I_VERSION}, + "nomand": {set: false, val: syscall.MS_MANDLOCK}, + "norelatime": {set: false, val: syscall.MS_RELATIME}, + "nostrictatime": {set: false, val: syscall.MS_STRICTATIME}, + "nosuid": {set: true, val: syscall.MS_NOSUID}, + "rbind": {set: true, val: syscall.MS_BIND | syscall.MS_REC}, + "relatime": {set: true, val: syscall.MS_RELATIME}, + "remount": {set: true, val: syscall.MS_REMOUNT}, + "ro": {set: true, val: syscall.MS_RDONLY}, + "rw": {set: false, val: syscall.MS_RDONLY}, + "silent": {set: true, val: syscall.MS_SILENT}, + "strictatime": {set: true, val: syscall.MS_STRICTATIME}, + "suid": {set: false, val: syscall.MS_NOSUID}, + "sync": {set: true, val: syscall.MS_SYNCHRONOUS}, +} + +// propOptionsMap is similar to optionsMap, but it lists propagation options +// that cannot be used together with other flags. +var propOptionsMap = map[string]mapping{ + "private": {set: true, val: syscall.MS_PRIVATE}, + "rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC}, + "slave": {set: true, val: syscall.MS_SLAVE}, + "rslave": {set: true, val: syscall.MS_SLAVE | syscall.MS_REC}, + "unbindable": {set: true, val: syscall.MS_UNBINDABLE}, + "runbindable": {set: true, val: syscall.MS_UNBINDABLE | syscall.MS_REC}, +} + +// invalidOptions list options not allowed. +// - shared: sandbox must be isolated from the host. Propagating mount changes +// from the sandbox to the host breaks the isolation. +var invalidOptions = []string{"shared", "rshared"} + +// OptionsToFlags converts mount options to syscall flags. +func OptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, optionsMap) +} + +// PropOptionsToFlags converts propagation mount options to syscall flags. +// Propagation options cannot be set other with other options and must be +// handled separately. +func PropOptionsToFlags(opts []string) uint32 { + return optionsToFlags(opts, propOptionsMap) +} + +func optionsToFlags(opts []string, source map[string]mapping) uint32 { + var rv uint32 + for _, opt := range opts { + if m, ok := source[opt]; ok { + if m.set { + rv |= m.val + } else { + rv ^= m.val + } + } + } + return rv +} + +// validateMount validates that spec mounts are correct. +func validateMount(mnt *specs.Mount) error { + if !path.IsAbs(mnt.Destination) { + return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt) + } + if mnt.Type == "bind" { + return ValidateMountOptions(mnt.Options) + } + return nil +} + +// ValidateMountOptions validates that mount options are correct. +func ValidateMountOptions(opts []string) error { + for _, o := range opts { + if ContainsStr(invalidOptions, o) { + return fmt.Errorf("mount option %q is not supported", o) + } + _, ok1 := optionsMap[o] + _, ok2 := propOptionsMap[o] + if !ok1 && !ok2 { + return fmt.Errorf("unknown mount option %q", o) + } + if err := validatePropagation(o); err != nil { + return err + } + } + return nil +} + +// ValidateRootfsPropagation validates that rootfs propagation options are +// correct. +func validateRootfsPropagation(opt string) error { + flags := PropOptionsToFlags([]string{opt}) + if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 { + return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt) + } + return validatePropagation(opt) +} + +func validatePropagation(opt string) error { + flags := PropOptionsToFlags([]string{opt}) + exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE) + if bits.OnesCount32(exclusive) > 1 { + return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt) + } + return nil +} diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go new file mode 100644 index 000000000..23001d67c --- /dev/null +++ b/runsc/specutils/namespace.go @@ -0,0 +1,289 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "os" + "os/exec" + "os/signal" + "path/filepath" + "runtime" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/log" +) + +// nsCloneFlag returns the clone flag that can be used to set a namespace of +// the given type. +func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { + switch nst { + case specs.IPCNamespace: + return unix.CLONE_NEWIPC + case specs.MountNamespace: + return unix.CLONE_NEWNS + case specs.NetworkNamespace: + return unix.CLONE_NEWNET + case specs.PIDNamespace: + return unix.CLONE_NEWPID + case specs.UTSNamespace: + return unix.CLONE_NEWUTS + case specs.UserNamespace: + return unix.CLONE_NEWUSER + case specs.CgroupNamespace: + return unix.CLONE_NEWCGROUP + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// nsPath returns the path of the namespace for the current process and the +// given namespace. +func nsPath(nst specs.LinuxNamespaceType) string { + base := "/proc/self/ns" + switch nst { + case specs.CgroupNamespace: + return filepath.Join(base, "cgroup") + case specs.IPCNamespace: + return filepath.Join(base, "ipc") + case specs.MountNamespace: + return filepath.Join(base, "mnt") + case specs.NetworkNamespace: + return filepath.Join(base, "net") + case specs.PIDNamespace: + return filepath.Join(base, "pid") + case specs.UserNamespace: + return filepath.Join(base, "user") + case specs.UTSNamespace: + return filepath.Join(base, "uts") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// GetNS returns true and the namespace with the given type from the slice of +// namespaces in the spec. It returns false if the slice does not contain a +// namespace with the type. +func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { + if s.Linux == nil { + return specs.LinuxNamespace{}, false + } + for _, ns := range s.Linux.Namespaces { + if ns.Type == nst { + return ns, true + } + } + return specs.LinuxNamespace{}, false +} + +// FilterNS returns a slice of namespaces from the spec with types that match +// those in the `filter` slice. +func FilterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { + if s.Linux == nil { + return nil + } + var out []specs.LinuxNamespace + for _, nst := range filter { + if ns, ok := GetNS(nst, s); ok { + out = append(out, ns) + } + } + return out +} + +// setNS sets the namespace of the given type. It must be called with +// OSThreadLocked. +func setNS(fd, nsType uintptr) error { + if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { + return err + } + return nil +} + +// ApplyNS applies the namespace on the current thread and returns a function +// that will restore the namespace to the original value. +// +// Preconditions: Must be called with os thread locked. +func ApplyNS(ns specs.LinuxNamespace) (func(), error) { + log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path) + newNS, err := os.Open(ns.Path) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) + } + defer newNS.Close() + + // Store current namespace to restore back. + curPath := nsPath(ns.Type) + oldNS, err := os.Open(curPath) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", curPath, err) + } + + // Set namespace to the one requested and setup function to restore it back. + flag := nsCloneFlag(ns.Type) + if err := setNS(newNS.Fd(), flag); err != nil { + oldNS.Close() + return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) + } + return func() { + log.Infof("Restoring namespace %v", ns.Type) + defer oldNS.Close() + if err := setNS(oldNS.Fd(), flag); err != nil { + panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) + } + }, nil +} + +// StartInNS joins or creates the given namespaces and calls cmd.Start before +// restoring the namespaces to the original values. +func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { + // We are about to setup namespaces, which requires the os thread being + // locked so that Go doesn't change the thread out from under us. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + + for _, ns := range nss { + if ns.Path == "" { + // No path. Just set a flag to create a new namespace. + cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) + continue + } + // Join the given namespace, and restore the current namespace + // before exiting. + restoreNS, err := ApplyNS(ns) + if err != nil { + return err + } + defer restoreNS() + } + + return cmd.Start() +} + +// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. +func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { + if s.Linux == nil { + return + } + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + for _, idMap := range s.Linux.UIDMappings { + log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } + for _, idMap := range s.Linux.GIDMappings { + log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } +} + +// HasCapabilities returns true if the user has all capabilities in 'cs'. +func HasCapabilities(cs ...capability.Cap) bool { + caps, err := capability.NewPid2(os.Getpid()) + if err != nil { + return false + } + if err := caps.Load(); err != nil { + return false + } + for _, c := range cs { + if !caps.Get(capability.EFFECTIVE, c) { + return false + } + } + return true +} + +// MaybeRunAsRoot ensures the process runs with capabilities needed to create a +// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed, +// it will create a new user namespace and re-execute the process as root +// inside the namespace with the same arguments and environment. +// +// This function returns immediately when no new capability is needed. If +// another process is executed, it returns straight from here with the same exit +// code as the child. +func MaybeRunAsRoot() error { + if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) { + return nil + } + + // Current process doesn't have required capabilities, create user namespace + // and run as root inside the namespace to acquire capabilities. + log.Infof("*** Re-running as root in new user namespace ***") + + cmd := exec.Command("/proc/self/exe", os.Args[1:]...) + + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, + // Set current user/group as root inside the namespace. Since we may not + // have CAP_SETUID/CAP_SETGID, just map root to the current user/group. + UidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getuid(), Size: 1}, + }, + GidMappings: []syscall.SysProcIDMap{ + {ContainerID: 0, HostID: os.Getgid(), Size: 1}, + }, + Credential: &syscall.Credential{Uid: 0, Gid: 0}, + GidMappingsEnableSetgroups: false, + + // Make sure child is killed when the parent terminates. + Pdeathsig: syscall.SIGKILL, + } + + cmd.Env = os.Environ() + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + return fmt.Errorf("re-executing self: %w", err) + } + ch := make(chan os.Signal, 1) + signal.Notify(ch) + go func() { + for { + // Forward all signals to child process. + cmd.Process.Signal(<-ch) + } + }() + if err := cmd.Wait(); err != nil { + if exit, ok := err.(*exec.ExitError); ok { + if ws, ok := exit.Sys().(syscall.WaitStatus); ok { + os.Exit(ws.ExitStatus()) + } + log.Warningf("No wait status provided, exiting with -1: %v", err) + os.Exit(-1) + } + return err + } + // Child completed with success. + os.Exit(0) + panic("unreachable") +} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go new file mode 100644 index 000000000..5015c3a84 --- /dev/null +++ b/runsc/specutils/specutils.go @@ -0,0 +1,523 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package specutils contains utility functions for working with OCI runtime +// specs. +package specutils + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + "github.com/cenkalti/backoff" + "github.com/mohae/deepcopy" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// ExePath must point to runsc binary, which is normally the same binary. It's +// changed in tests that aren't linked in the same binary. +var ExePath = "/proc/self/exe" + +// Version is the supported spec version. +var Version = specs.Version + +// LogSpec logs the spec in a human-friendly way. +func LogSpec(orig *specs.Spec) { + if !log.IsLogging(log.Debug) { + return + } + + // Strip down parts of the spec that are not interesting. + spec := deepcopy.Copy(orig).(*specs.Spec) + if spec.Process != nil { + spec.Process.Capabilities = nil + } + if spec.Linux != nil { + spec.Linux.Seccomp = nil + spec.Linux.MaskedPaths = nil + spec.Linux.ReadonlyPaths = nil + if spec.Linux.Resources != nil { + spec.Linux.Resources.Devices = nil + } + } + + out, err := json.MarshalIndent(spec, "", " ") + if err != nil { + log.Debugf("Failed to marshal spec: %v", err) + return + } + log.Debugf("Spec:\n%s", out) +} + +// ValidateSpec validates that the spec is compatible with runsc. +func ValidateSpec(spec *specs.Spec) error { + // Mandatory fields. + if spec.Process == nil { + return fmt.Errorf("Spec.Process must be defined: %+v", spec) + } + if len(spec.Process.Args) == 0 { + return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process) + } + if spec.Root == nil { + return fmt.Errorf("Spec.Root must be defined: %+v", spec) + } + if len(spec.Root.Path) == 0 { + return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root) + } + + // Unsupported fields. + if spec.Solaris != nil { + return fmt.Errorf("Spec.Solaris is not supported: %+v", spec) + } + if spec.Windows != nil { + return fmt.Errorf("Spec.Windows is not supported: %+v", spec) + } + if len(spec.Process.SelinuxLabel) != 0 { + return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) + } + + // Docker uses AppArmor by default, so just log that it's being ignored. + if spec.Process.ApparmorProfile != "" { + log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) + } + + // PR_SET_NO_NEW_PRIVS is assumed to always be set. + // See kernel.Task.updateCredsForExecLocked. + if !spec.Process.NoNewPrivileges { + log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") + } + + // TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox. + if spec.Linux != nil && spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + + if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { + if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { + return err + } + } + for _, m := range spec.Mounts { + if err := validateMount(&m); err != nil { + return err + } + } + + // CRI specifies whether a container should start a new sandbox, or run + // another container in an existing sandbox. + switch SpecContainerType(spec) { + case ContainerTypeContainer: + // When starting a container in an existing sandbox, the + // sandbox ID must be set. + if _, ok := SandboxID(spec); !ok { + return fmt.Errorf("spec has container-type of container, but no sandbox ID set") + } + case ContainerTypeUnknown: + return fmt.Errorf("unknown container-type") + default: + } + + return nil +} + +// absPath turns the given path into an absolute path (if it is not already +// absolute) by prepending the base path. +func absPath(base, rel string) string { + if filepath.IsAbs(rel) { + return rel + } + return filepath.Join(base, rel) +} + +// OpenSpec opens an OCI runtime spec from the given bundle directory. +func OpenSpec(bundleDir string) (*os.File, error) { + // The spec file must be named "config.json" inside the bundle directory. + return os.Open(filepath.Join(bundleDir, "config.json")) +} + +// ReadSpec reads an OCI runtime spec from the given bundle directory. +// ReadSpec also normalizes all potential relative paths into absolute +// path, e.g. spec.Root.Path, mount.Source. +func ReadSpec(bundleDir string) (*specs.Spec, error) { + specFile, err := OpenSpec(bundleDir) + if err != nil { + return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) + } + defer specFile.Close() + return ReadSpecFromFile(bundleDir, specFile) +} + +// ReadSpecFromFile reads an OCI runtime spec from the given File, and +// normalizes all relative paths into absolute by prepending the bundle dir. +func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) { + if _, err := specFile.Seek(0, os.SEEK_SET); err != nil { + return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) + } + specBytes, err := ioutil.ReadAll(specFile) + if err != nil { + return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) + } + var spec specs.Spec + if err := json.Unmarshal(specBytes, &spec); err != nil { + return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) + } + if err := ValidateSpec(&spec); err != nil { + return nil, err + } + // Turn any relative paths in the spec to absolute by prepending the bundleDir. + spec.Root.Path = absPath(bundleDir, spec.Root.Path) + for i := range spec.Mounts { + m := &spec.Mounts[i] + if m.Source != "" { + m.Source = absPath(bundleDir, m.Source) + } + } + return &spec, nil +} + +// ReadMounts reads mount list from a file. +func ReadMounts(f *os.File) ([]specs.Mount, error) { + bytes, err := ioutil.ReadAll(f) + if err != nil { + return nil, fmt.Errorf("error reading mounts: %v", err) + } + var mounts []specs.Mount + if err := json.Unmarshal(bytes, &mounts); err != nil { + return nil, fmt.Errorf("error unmarshaling mounts: %v\n %s", err, string(bytes)) + } + return mounts, nil +} + +// Capabilities takes in spec and returns a TaskCapabilities corresponding to +// the spec. +func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { + // Strip CAP_NET_RAW from all capability sets if necessary. + skipSet := map[linux.Capability]struct{}{} + if !enableRaw { + skipSet[linux.CAP_NET_RAW] = struct{}{} + } + + var caps auth.TaskCapabilities + if specCaps != nil { + var err error + if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { + return nil, err + } + if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { + return nil, err + } + if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { + return nil, err + } + if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { + return nil, err + } + // TODO(nlacasse): Support ambient capabilities. + } + return &caps, nil +} + +// AllCapabilities returns a LinuxCapabilities struct with all capabilities. +func AllCapabilities() *specs.LinuxCapabilities { + var names []string + for n := range capFromName { + names = append(names, n) + } + return &specs.LinuxCapabilities{ + Bounding: names, + Effective: names, + Inheritable: names, + Permitted: names, + Ambient: names, + } +} + +// AllCapabilitiesUint64 returns a bitmask containing all capabilities set. +func AllCapabilitiesUint64() uint64 { + var rv uint64 + for _, cap := range capFromName { + rv |= bits.MaskOf64(int(cap)) + } + return rv +} + +var capFromName = map[string]linux.Capability{ + "CAP_CHOWN": linux.CAP_CHOWN, + "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": linux.CAP_FOWNER, + "CAP_FSETID": linux.CAP_FSETID, + "CAP_KILL": linux.CAP_KILL, + "CAP_SETGID": linux.CAP_SETGID, + "CAP_SETUID": linux.CAP_SETUID, + "CAP_SETPCAP": linux.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, + "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST, + "CAP_NET_ADMIN": linux.CAP_NET_ADMIN, + "CAP_NET_RAW": linux.CAP_NET_RAW, + "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, + "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, + "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, + "CAP_SYS_NICE": linux.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": linux.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": linux.CAP_MKNOD, + "CAP_LEASE": linux.CAP_LEASE, + "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": linux.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, + "CAP_SYSLOG": linux.CAP_SYSLOG, + "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, + "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, +} + +func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { + var caps []linux.Capability + for _, n := range names { + c, ok := capFromName[n] + if !ok { + return 0, fmt.Errorf("unknown capability %q", n) + } + // Should we skip this capabilty? + if _, ok := skipSet[c]; ok { + continue + } + caps = append(caps, c) + } + return auth.CapabilitySetOfMany(caps), nil +} + +// Is9PMount returns true if the given mount can be mounted as an external gofer. +func Is9PMount(m specs.Mount) bool { + return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m) +} + +// IsSupportedDevMount returns true if the mount is a supported /dev mount. +// Only mount that does not conflict with runsc default /dev mount is +// supported. +func IsSupportedDevMount(m specs.Mount) bool { + // These are devices exist inside sentry. See pkg/sentry/fs/dev/dev.go + var existingDevices = []string{ + "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr", + "/dev/null", "/dev/zero", "/dev/full", "/dev/random", + "/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx", + } + dst := filepath.Clean(m.Destination) + if dst == "/dev" { + // OCI spec uses many different mounts for the things inside of '/dev'. We + // have a single mount at '/dev' that is always mounted, regardless of + // whether it was asked for, as the spec says we SHOULD. + return false + } + for _, dev := range existingDevices { + if dst == dev || strings.HasPrefix(dst, dev+"/") { + return false + } + } + return true +} + +// WaitForReady waits for a process to become ready. The process is ready when +// the 'ready' function returns true. It continues to wait if 'ready' returns +// false. It returns error on timeout, if the process stops or if 'ready' fails. +func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { + b := backoff.NewExponentialBackOff() + b.InitialInterval = 1 * time.Millisecond + b.MaxInterval = 1 * time.Second + b.MaxElapsedTime = timeout + + op := func() error { + if ok, err := ready(); err != nil { + return backoff.Permanent(err) + } else if ok { + return nil + } + + // Check if the process is still running. + // If the process is alive, child is 0 because of the NOHANG option. + // If the process has terminated, child equals the process id. + var ws syscall.WaitStatus + var ru syscall.Rusage + child, err := syscall.Wait4(pid, &ws, syscall.WNOHANG, &ru) + if err != nil { + return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err)) + } else if child == pid { + return backoff.Permanent(fmt.Errorf("process %d has terminated", pid)) + } + return fmt.Errorf("process %d not running yet", pid) + } + return backoff.Retry(op, b) +} + +// DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern' +// ends with '/', it's used as a directory with default file name. +// 'logPattern' can contain variables that are substituted: +// - %TIMESTAMP%: is replaced with a timestamp using the following format: +// <yyyymmdd-hhmmss.uuuuuu> +// - %COMMAND%: is replaced with 'command' +// - %TEST%: is replaced with 'test' (omitted by default) +func DebugLogFile(logPattern, command, test string) (*os.File, error) { + if strings.HasSuffix(logPattern, "/") { + // Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command> + logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%" + } + logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1) + logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1) + logPattern = strings.Replace(logPattern, "%TEST%", test, -1) + + dir := filepath.Dir(logPattern) + if err := os.MkdirAll(dir, 0775); err != nil { + return nil, fmt.Errorf("error creating dir %q: %v", dir, err) + } + return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) +} + +// Mount creates the mount point and calls Mount with the given flags. +func Mount(src, dst, typ string, flags uint32) error { + // Create the mount point inside. The type must be the same as the + // source (file or directory). + var isDir bool + if typ == "proc" { + // Special case, as there is no source directory for proc mounts. + isDir = true + } else if fi, err := os.Stat(src); err != nil { + return fmt.Errorf("Stat(%q) failed: %v", src, err) + } else { + isDir = fi.IsDir() + } + + if isDir { + // Create the destination directory. + if err := os.MkdirAll(dst, 0777); err != nil { + return fmt.Errorf("Mkdir(%q) failed: %v", dst, err) + } + } else { + // Create the parent destination directory. + parent := path.Dir(dst) + if err := os.MkdirAll(parent, 0777); err != nil { + return fmt.Errorf("Mkdir(%q) failed: %v", parent, err) + } + // Create the destination file if it does not exist. + f, err := os.OpenFile(dst, syscall.O_CREAT, 0777) + if err != nil { + return fmt.Errorf("Open(%q) failed: %v", dst, err) + } + f.Close() + } + + // Do the mount. + if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil { + return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err) + } + return nil +} + +// ContainsStr returns true if 'str' is inside 'strs'. +func ContainsStr(strs []string, str string) bool { + for _, s := range strs { + if s == str { + return true + } + } + return false +} + +// RetryEintr retries the function until an error different than EINTR is +// returned. +func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { + for { + r1, r2, err := f() + if err != syscall.EINTR { + return r1, r2, err + } + } +} + +// GetOOMScoreAdj reads the given process' oom_score_adj +func GetOOMScoreAdj(pid int) (int, error) { + data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid)) + if err != nil { + return 0, err + } + return strconv.Atoi(strings.TrimSpace(string(data))) +} + +// GetParentPid gets the parent process ID of the specified PID. +func GetParentPid(pid int) (int, error) { + data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/stat", pid)) + if err != nil { + return 0, err + } + + var cpid string + var name string + var state string + var ppid int + // Parse after the binary name. + _, err = fmt.Sscanf(string(data), + "%v %v %v %d", + // cpid is ignored. + &cpid, + // name is ignored. + &name, + // state is ignored. + &state, + &ppid) + + if err != nil { + return 0, err + } + + return ppid, nil +} + +// EnvVar looks for a varible value in the env slice assuming the following +// format: "NAME=VALUE". +func EnvVar(env []string, name string) (string, bool) { + prefix := name + "=" + for _, e := range env { + if strings.HasPrefix(e, prefix) { + return strings.TrimPrefix(e, prefix), true + } + } + return "", false +} + +// FaqErrorMsg returns an error message pointing to the FAQ. +func FaqErrorMsg(anchor, msg string) string { + return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor) +} diff --git a/runsc/specutils/specutils_test.go b/runsc/specutils/specutils_test.go new file mode 100644 index 000000000..2c86fffe8 --- /dev/null +++ b/runsc/specutils/specutils_test.go @@ -0,0 +1,265 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + "fmt" + "os/exec" + "strings" + "testing" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestWaitForReadyHappy(t *testing.T) { + cmd := exec.Command("/bin/sleep", "1000") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + var count int + err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { + if count < 3 { + count++ + return false, nil + } + return true, nil + }) + if err != nil { + t.Errorf("ProcessWaitReady got: %v, expected: nil", err) + } + cmd.Process.Kill() +} + +func TestWaitForReadyFail(t *testing.T) { + cmd := exec.Command("/bin/sleep", "1000") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + var count int + err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { + if count < 3 { + count++ + return false, nil + } + return false, fmt.Errorf("Fake error") + }) + if err == nil { + t.Errorf("ProcessWaitReady got: nil, expected: error") + } + cmd.Process.Kill() +} + +func TestWaitForReadyNotRunning(t *testing.T) { + cmd := exec.Command("/bin/true") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + err := WaitForReady(cmd.Process.Pid, 5*time.Second, func() (bool, error) { + return false, nil + }) + if err != nil && !strings.Contains(err.Error(), "terminated") { + t.Errorf("ProcessWaitReady got: %v, expected: process terminated", err) + } + if err == nil { + t.Errorf("ProcessWaitReady incorrectly succeeded") + } +} + +func TestWaitForReadyTimeout(t *testing.T) { + cmd := exec.Command("/bin/sleep", "1000") + if err := cmd.Start(); err != nil { + t.Fatalf("cmd.Start() failed, err: %v", err) + } + defer cmd.Wait() + + err := WaitForReady(cmd.Process.Pid, 50*time.Millisecond, func() (bool, error) { + return false, nil + }) + if !strings.Contains(err.Error(), "not running yet") { + t.Errorf("ProcessWaitReady got: %v, expected: not running yet", err) + } + cmd.Process.Kill() +} + +func TestSpecInvalid(t *testing.T) { + for _, test := range []struct { + name string + spec specs.Spec + error string + }{ + { + name: "valid", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Mounts: []specs.Mount{ + { + Source: "src", + Destination: "/dst", + }, + }, + }, + error: "", + }, + { + name: "valid+warning", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + // This is normally set by docker and will just cause warnings to be logged. + ApparmorProfile: "someprofile", + }, + // This is normally set by docker and will just cause warnings to be logged. + Linux: &specs.Linux{Seccomp: &specs.LinuxSeccomp{}}, + }, + error: "", + }, + { + name: "no root", + spec: specs.Spec{ + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + }, + error: "must be defined", + }, + { + name: "empty root", + spec: specs.Spec{ + Root: &specs.Root{}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + }, + error: "must be defined", + }, + { + name: "no process", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + }, + error: "must be defined", + }, + { + name: "empty args", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{}, + }, + error: "must be defined", + }, + { + name: "selinux", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + SelinuxLabel: "somelabel", + }, + }, + error: "is not supported", + }, + { + name: "solaris", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Solaris: &specs.Solaris{}, + }, + error: "is not supported", + }, + { + name: "windows", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Windows: &specs.Windows{}, + }, + error: "is not supported", + }, + { + name: "relative mount destination", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Mounts: []specs.Mount{ + { + Source: "src", + Destination: "dst", + }, + }, + }, + error: "must be an absolute path", + }, + { + name: "invalid mount option", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Mounts: []specs.Mount{ + { + Source: "/src", + Destination: "/dst", + Type: "bind", + Options: []string{"shared"}, + }, + }, + }, + error: "is not supported", + }, + { + name: "invalid rootfs propagation", + spec: specs.Spec{ + Root: &specs.Root{Path: "/"}, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + Linux: &specs.Linux{ + RootfsPropagation: "foo", + }, + }, + error: "root mount propagation option must specify private or slave", + }, + } { + err := ValidateSpec(&test.spec) + if len(test.error) == 0 { + if err != nil { + t.Errorf("ValidateSpec(%q) failed, err: %v", test.name, err) + } + } else { + if err == nil || !strings.Contains(err.Error(), test.error) { + t.Errorf("ValidateSpec(%q) wrong error, got: %v, want: .*%s.*", test.name, err, test.error) + } + } + } +} diff --git a/runsc/version.go b/runsc/version.go new file mode 100644 index 000000000..ab9194b9d --- /dev/null +++ b/runsc/version.go @@ -0,0 +1,18 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +// version is set during linking. +var version = "VERSION_MISSING" diff --git a/runsc/version_test.sh b/runsc/version_test.sh new file mode 100755 index 000000000..747350654 --- /dev/null +++ b/runsc/version_test.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright 2018 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euf -x -o pipefail + +readonly runsc="$1" +readonly version=$($runsc --version) + +# Version should should not match VERSION, which is the default and which will +# also appear if something is wrong with workspace_status.sh script. +if [[ $version =~ "VERSION" ]]; then + echo "FAIL: Got bad version $version" + exit 1 +fi + +# Version should contain at least one number. +if [[ ! $version =~ [0-9] ]]; then + echo "FAIL: Got bad version $version" + exit 1 +fi + +echo "PASS: Got OK version $version" +exit 0 |