diff options
Diffstat (limited to 'runsc/boot')
30 files changed, 6781 insertions, 0 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD new file mode 100644 index 000000000..aad2a41de --- /dev/null +++ b/runsc/boot/BUILD @@ -0,0 +1,137 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "boot", + srcs = [ + "compat.go", + "compat_amd64.go", + "compat_arm64.go", + "config.go", + "controller.go", + "debug.go", + "events.go", + "fs.go", + "limits.go", + "loader.go", + "network.go", + "strace.go", + "vfs.go", + ], + visibility = [ + "//pkg/test:__subpackages__", + "//runsc:__subpackages__", + "//test:__subpackages__", + ], + deps = [ + "//pkg/abi", + "//pkg/abi/linux", + "//pkg/context", + "//pkg/control/server", + "//pkg/cpuid", + "//pkg/eventchannel", + "//pkg/fspath", + "//pkg/log", + "//pkg/memutil", + "//pkg/rand", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/arch:registers_go_proto", + "//pkg/sentry/control", + "//pkg/sentry/devices/memdev", + "//pkg/sentry/devices/ttydev", + "//pkg/sentry/devices/tundev", + "//pkg/sentry/fdimport", + "//pkg/sentry/fs", + "//pkg/sentry/fs/dev", + "//pkg/sentry/fs/gofer", + "//pkg/sentry/fs/host", + "//pkg/sentry/fs/proc", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fs/sys", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/fs/tty", + "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/devpts", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/fuse", + "//pkg/sentry/fsimpl/gofer", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/overlay", + "//pkg/sentry/fsimpl/proc", + "//pkg/sentry/fsimpl/sys", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel:uncaught_signal_go_proto", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/sighandling", + "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/netlink", + "//pkg/sentry/socket/netlink/route", + "//pkg/sentry/socket/netlink/uevent", + "//pkg/sentry/socket/netstack", + "//pkg/sentry/socket/unix", + "//pkg/sentry/state", + "//pkg/sentry/strace", + "//pkg/sentry/syscalls/linux/vfs2", + "//pkg/sentry/time", + "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/sentry/watchdog", + "//pkg/sync", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/link/fdbased", + "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/qdisc/fifo", + "//pkg/tcpip/link/sniffer", + "//pkg/tcpip/network/arp", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/raw", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "//pkg/urpc", + "//runsc/boot/filter", + "//runsc/boot/platforms", + "//runsc/boot/pprof", + "//runsc/specutils", + "@com_github_golang_protobuf//proto:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "boot_test", + size = "small", + srcs = [ + "compat_test.go", + "fs_test.go", + "loader_test.go", + ], + library = ":boot", + deps = [ + "//pkg/control/server", + "//pkg/fspath", + "//pkg/log", + "//pkg/p9", + "//pkg/sentry/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/unet", + "//runsc/fsgofer", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go new file mode 100644 index 000000000..84c67cbc2 --- /dev/null +++ b/runsc/boot/compat.go @@ -0,0 +1,202 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "os" + "syscall" + + "github.com/golang/protobuf/proto" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/log" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" + spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sync" +) + +func initCompatLogs(fd int) error { + ce, err := newCompatEmitter(fd) + if err != nil { + return err + } + eventchannel.AddEmitter(ce) + return nil +} + +type compatEmitter struct { + sink *log.BasicLogger + nameMap strace.SyscallMap + + // mu protects the fields below. + mu sync.Mutex + + // trackers map syscall number to the respective tracker instance. + // Protected by 'mu'. + trackers map[uint64]syscallTracker +} + +func newCompatEmitter(logFD int) (*compatEmitter, error) { + nameMap, ok := getSyscallNameMap() + if !ok { + return nil, fmt.Errorf("Linux syscall table not found") + } + + c := &compatEmitter{ + // Always logs to default logger. + sink: log.Log(), + nameMap: nameMap, + trackers: make(map[uint64]syscallTracker), + } + + if logFD > 0 { + f := os.NewFile(uintptr(logFD), "user log file") + target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}} + c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} + } + return c, nil +} + +// Emit implements eventchannel.Emitter. +func (c *compatEmitter) Emit(msg proto.Message) (bool, error) { + switch m := msg.(type) { + case *spb.UnimplementedSyscall: + c.emitUnimplementedSyscall(m) + case *ucspb.UncaughtSignal: + c.emitUncaughtSignal(m) + } + + return false, nil +} + +func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { + regs := us.Registers + + c.mu.Lock() + defer c.mu.Unlock() + + sysnr := syscallNum(regs) + tr := c.trackers[sysnr] + if tr == nil { + switch sysnr { + case syscall.SYS_PRCTL: + // args: cmd, ... + tr = newArgsTracker(0) + + case syscall.SYS_IOCTL, syscall.SYS_EPOLL_CTL, syscall.SYS_SHMCTL, syscall.SYS_FUTEX, syscall.SYS_FALLOCATE: + // args: fd/addr, cmd, ... + tr = newArgsTracker(1) + + case syscall.SYS_GETSOCKOPT, syscall.SYS_SETSOCKOPT: + // args: fd, level, name, ... + tr = newArgsTracker(1, 2) + + case syscall.SYS_SEMCTL: + // args: semid, semnum, cmd, ... + tr = newArgsTracker(2) + + default: + tr = newArchArgsTracker(sysnr) + if tr == nil { + tr = &onceTracker{} + } + } + c.trackers[sysnr] = tr + } + + if tr.shouldReport(regs) { + name := c.nameMap.Name(uintptr(sysnr)) + c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+ + "likely that you can safely ignore this message and that this is not "+ + "the cause of any error. Please, refer to %s/%s for more information.", + name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs), + argVal(4, regs), argVal(5, regs), syscallLink, name) + + tr.onReported(regs) + } +} + +func (c *compatEmitter) emitUncaughtSignal(msg *ucspb.UncaughtSignal) { + sig := syscall.Signal(msg.SignalNumber) + c.sink.Infof( + "Uncaught signal: %q (%d), PID: %d, TID: %d, fault addr: %#x", + sig, msg.SignalNumber, msg.Pid, msg.Tid, msg.FaultAddr) +} + +// Close implements eventchannel.Emitter. +func (c *compatEmitter) Close() error { + c.sink = nil + return nil +} + +// syscallTracker interface allows filters to apply differently depending on +// the syscall and arguments. +type syscallTracker interface { + // shouldReport returns true is the syscall should be reported. + shouldReport(regs *rpb.Registers) bool + + // onReported marks the syscall as reported. + onReported(regs *rpb.Registers) +} + +// onceTracker reports only a single time, used for most syscalls. +type onceTracker struct { + reported bool +} + +func (o *onceTracker) shouldReport(_ *rpb.Registers) bool { + return !o.reported +} + +func (o *onceTracker) onReported(_ *rpb.Registers) { + o.reported = true +} + +// argsTracker reports only once for each different combination of arguments. +// It's used for generic syscalls like ioctl to report once per 'cmd'. +type argsTracker struct { + // argsIdx is the syscall arguments to use as unique ID. + argsIdx []int + reported map[string]struct{} + count int +} + +func newArgsTracker(argIdx ...int) *argsTracker { + return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} +} + +// key returns the command based on the syscall argument index. +func (a *argsTracker) key(regs *rpb.Registers) string { + var rv string + for _, idx := range a.argsIdx { + rv += fmt.Sprintf("%d|", argVal(idx, regs)) + } + return rv +} + +func (a *argsTracker) shouldReport(regs *rpb.Registers) bool { + if a.count >= reportLimit { + return false + } + _, ok := a.reported[a.key(regs)] + return !ok +} + +func (a *argsTracker) onReported(regs *rpb.Registers) { + a.count++ + a.reported[a.key(regs)] = struct{}{} +} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go new file mode 100644 index 000000000..8eb76b2ba --- /dev/null +++ b/runsc/boot/compat_amd64.go @@ -0,0 +1,100 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +const ( + // reportLimit is the max number of events that should be reported per + // tracker. + reportLimit = 100 + syscallLink = "https://gvisor.dev/c/linux/amd64" +) + +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Amd64{ + Amd64: &rpb.AMD64Registers{}, + }, + } +} + +func argVal(argIdx int, regs *rpb.Registers) uint64 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + + switch argIdx { + case 0: + return amd64Regs.Rdi + case 1: + return amd64Regs.Rsi + case 2: + return amd64Regs.Rdx + case 3: + return amd64Regs.R10 + case 4: + return amd64Regs.R8 + case 5: + return amd64Regs.R9 + } + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) +} + +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + + switch argIdx { + case 0: + amd64Regs.Rdi = argVal + case 1: + amd64Regs.Rsi = argVal + case 2: + amd64Regs.Rdx = argVal + case 3: + amd64Regs.R10 = argVal + case 4: + amd64Regs.R8 = argVal + case 5: + amd64Regs.R9 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) + } +} + +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.AMD64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + return amd64Regs.OrigRax +} + +func newArchArgsTracker(sysnr uint64) syscallTracker { + switch sysnr { + case syscall.SYS_ARCH_PRCTL: + // args: cmd, ... + return newArgsTracker(0) + } + return nil +} diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go new file mode 100644 index 000000000..bce9d95b3 --- /dev/null +++ b/runsc/boot/compat_arm64.go @@ -0,0 +1,95 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +const ( + // reportLimit is the max number of events that should be reported per + // tracker. + reportLimit = 100 + syscallLink = "https://gvisor.dev/c/linux/arm64" +) + +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Arm64{ + Arm64: &rpb.ARM64Registers{}, + }, + } +} + +func argVal(argIdx int, regs *rpb.Registers) uint64 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + return arm64Regs.R0 + case 1: + return arm64Regs.R1 + case 2: + return arm64Regs.R2 + case 3: + return arm64Regs.R3 + case 4: + return arm64Regs.R4 + case 5: + return arm64Regs.R5 + } + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) +} + +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + arm64Regs.R0 = argVal + case 1: + arm64Regs.R1 = argVal + case 2: + arm64Regs.R2 = argVal + case 3: + arm64Regs.R3 = argVal + case 4: + arm64Regs.R4 = argVal + case 5: + arm64Regs.R5 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) + } +} + +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.ARM64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + return arm64Regs.R8 +} + +func newArchArgsTracker(sysnr uint64) syscallTracker { + // currently, no arch specific syscalls need to be handled here. + return nil +} diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go new file mode 100644 index 000000000..839c5303b --- /dev/null +++ b/runsc/boot/compat_test.go @@ -0,0 +1,90 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "testing" +) + +func TestOnceTracker(t *testing.T) { + o := onceTracker{} + if !o.shouldReport(nil) { + t.Error("first call to checkAndMark, got: false, want: true") + } + o.onReported(nil) + for i := 0; i < 2; i++ { + if o.shouldReport(nil) { + t.Error("after first call to checkAndMark, got: true, want: false") + } + } +} + +func TestArgsTracker(t *testing.T) { + for _, tc := range []struct { + name string + idx []int + arg1_1 uint64 + arg1_2 uint64 + arg2_1 uint64 + arg2_2 uint64 + want bool + }{ + {name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false}, + {name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false}, + {name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true}, + {name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true}, + {name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false}, + {name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false}, + {name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true}, + } { + t.Run(tc.name, func(t *testing.T) { + c := newArgsTracker(tc.idx...) + regs := newRegs() + setArgVal(0, tc.arg1_1, regs) + setArgVal(1, tc.arg2_1, regs) + if !c.shouldReport(regs) { + t.Error("first call to shouldReport, got: false, want: true") + } + c.onReported(regs) + + setArgVal(0, tc.arg1_2, regs) + setArgVal(1, tc.arg2_2, regs) + if got := c.shouldReport(regs); tc.want != got { + t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want) + } + }) + } +} + +func TestArgsTrackerLimit(t *testing.T) { + c := newArgsTracker(0, 1) + for i := 0; i < reportLimit; i++ { + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, uint64(i), regs) + if !c.shouldReport(regs) { + t.Error("shouldReport before limit was reached, got: false, want: true") + } + c.onReported(regs) + } + + // Should hit the count limit now. + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, 123456, regs) + if c.shouldReport(regs) { + t.Error("shouldReport after limit was reached, got: true, want: false") + } +} diff --git a/runsc/boot/config.go b/runsc/boot/config.go new file mode 100644 index 000000000..bb01b8fb5 --- /dev/null +++ b/runsc/boot/config.go @@ -0,0 +1,329 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "strconv" + "strings" + + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/watchdog" +) + +// FileAccessType tells how the filesystem is accessed. +type FileAccessType int + +const ( + // FileAccessShared sends IO requests to a Gofer process that validates the + // requests and forwards them to the host. + FileAccessShared FileAccessType = iota + + // FileAccessExclusive is the same as FileAccessShared, but enables + // extra caching for improved performance. It should only be used if + // the sandbox has exclusive access to the filesystem. + FileAccessExclusive +) + +// MakeFileAccessType converts type from string. +func MakeFileAccessType(s string) (FileAccessType, error) { + switch s { + case "shared": + return FileAccessShared, nil + case "exclusive": + return FileAccessExclusive, nil + default: + return 0, fmt.Errorf("invalid file access type %q", s) + } +} + +func (f FileAccessType) String() string { + switch f { + case FileAccessShared: + return "shared" + case FileAccessExclusive: + return "exclusive" + default: + return fmt.Sprintf("unknown(%d)", f) + } +} + +// NetworkType tells which network stack to use. +type NetworkType int + +const ( + // NetworkSandbox uses internal network stack, isolated from the host. + NetworkSandbox NetworkType = iota + + // NetworkHost redirects network related syscalls to the host network. + NetworkHost + + // NetworkNone sets up just loopback using netstack. + NetworkNone +) + +// MakeNetworkType converts type from string. +func MakeNetworkType(s string) (NetworkType, error) { + switch s { + case "sandbox": + return NetworkSandbox, nil + case "host": + return NetworkHost, nil + case "none": + return NetworkNone, nil + default: + return 0, fmt.Errorf("invalid network type %q", s) + } +} + +func (n NetworkType) String() string { + switch n { + case NetworkSandbox: + return "sandbox" + case NetworkHost: + return "host" + case NetworkNone: + return "none" + default: + return fmt.Sprintf("unknown(%d)", n) + } +} + +// MakeWatchdogAction converts type from string. +func MakeWatchdogAction(s string) (watchdog.Action, error) { + switch strings.ToLower(s) { + case "log", "logwarning": + return watchdog.LogWarning, nil + case "panic": + return watchdog.Panic, nil + default: + return 0, fmt.Errorf("invalid watchdog action %q", s) + } +} + +// MakeRefsLeakMode converts type from string. +func MakeRefsLeakMode(s string) (refs.LeakMode, error) { + switch strings.ToLower(s) { + case "disabled": + return refs.NoLeakChecking, nil + case "log-names": + return refs.LeaksLogWarning, nil + case "log-traces": + return refs.LeaksLogTraces, nil + default: + return 0, fmt.Errorf("invalid refs leakmode %q", s) + } +} + +func refsLeakModeToString(mode refs.LeakMode) string { + switch mode { + // If not set, default it to disabled. + case refs.UninitializedLeakChecking, refs.NoLeakChecking: + return "disabled" + case refs.LeaksLogWarning: + return "log-names" + case refs.LeaksLogTraces: + return "log-traces" + default: + panic(fmt.Sprintf("Invalid leakmode: %d", mode)) + } +} + +// Config holds configuration that is not part of the runtime spec. +type Config struct { + // RootDir is the runtime root directory. + RootDir string + + // Debug indicates that debug logging should be enabled. + Debug bool + + // LogFilename is the filename to log to, if not empty. + LogFilename string + + // LogFormat is the log format. + LogFormat string + + // DebugLog is the path to log debug information to, if not empty. + DebugLog string + + // PanicLog is the path to log GO's runtime messages, if not empty. + PanicLog string + + // DebugLogFormat is the log format for debug. + DebugLogFormat string + + // FileAccess indicates how the filesystem is accessed. + FileAccess FileAccessType + + // Overlay is whether to wrap the root filesystem in an overlay. + Overlay bool + + // FSGoferHostUDS enables the gofer to mount a host UDS. + FSGoferHostUDS bool + + // Network indicates what type of network to use. + Network NetworkType + + // EnableRaw indicates whether raw sockets should be enabled. Raw + // sockets are disabled by stripping CAP_NET_RAW from the list of + // capabilities. + EnableRaw bool + + // HardwareGSO indicates that hardware segmentation offload is enabled. + HardwareGSO bool + + // SoftwareGSO indicates that software segmentation offload is enabled. + SoftwareGSO bool + + // TXChecksumOffload indicates that TX Checksum Offload is enabled. + TXChecksumOffload bool + + // RXChecksumOffload indicates that RX Checksum Offload is enabled. + RXChecksumOffload bool + + // QDisc indicates the type of queuening discipline to use by default + // for non-loopback interfaces. + QDisc QueueingDiscipline + + // LogPackets indicates that all network packets should be logged. + LogPackets bool + + // Platform is the platform to run on. + Platform string + + // Strace indicates that strace should be enabled. + Strace bool + + // StraceSyscalls is the set of syscalls to trace. If StraceEnable is + // true and this list is empty, then all syscalls will be traced. + StraceSyscalls []string + + // StraceLogSize is the max size of data blobs to display. + StraceLogSize uint + + // DisableSeccomp indicates whether seccomp syscall filters should be + // disabled. Pardon the double negation, but default to enabled is important. + DisableSeccomp bool + + // WatchdogAction sets what action the watchdog takes when triggered. + WatchdogAction watchdog.Action + + // PanicSignal registers signal handling that panics. Usually set to + // SIGUSR2(12) to troubleshoot hangs. -1 disables it. + PanicSignal int + + // ProfileEnable is set to prepare the sandbox to be profiled. + ProfileEnable bool + + // RestoreFile is the path to the saved container image + RestoreFile string + + // NumNetworkChannels controls the number of AF_PACKET sockets that map + // to the same underlying network device. This allows netstack to better + // scale for high throughput use cases. + NumNetworkChannels int + + // Rootless allows the sandbox to be started with a user that is not root. + // Defense is depth measures are weaker with rootless. Specifically, the + // sandbox and Gofer process run as root inside a user namespace with root + // mapped to the caller's user. + Rootless bool + + // AlsoLogToStderr allows to send log messages to stderr. + AlsoLogToStderr bool + + // ReferenceLeakMode sets reference leak check mode + ReferenceLeakMode refs.LeakMode + + // OverlayfsStaleRead instructs the sandbox to assume that the root mount + // is on a Linux overlayfs mount, which does not necessarily preserve + // coherence between read-only and subsequent writable file descriptors + // representing the "same" file. + OverlayfsStaleRead bool + + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in + // tests. It allows runsc to start the sandbox process as the current + // user, and without chrooting the sandbox process. This can be + // necessary in test environments that have limited capabilities. + TestOnlyAllowRunAsCurrentUserWithoutChroot bool + + // TestOnlyTestNameEnv should only be used in tests. It looks up for the + // test name in the container environment variables and adds it to the debug + // log file name. This is done to help identify the log with the test when + // multiple tests are run in parallel, since there is no way to pass + // parameters to the runtime from docker. + TestOnlyTestNameEnv string + + // CPUNumFromQuota sets CPU number count to available CPU quota, using + // least integer value greater than or equal to quota. + // + // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. + CPUNumFromQuota bool + + // Enables VFS2 (not plumbled through yet). + VFS2 bool +} + +// ToFlags returns a slice of flags that correspond to the given Config. +func (c *Config) ToFlags() []string { + f := []string{ + "--root=" + c.RootDir, + "--debug=" + strconv.FormatBool(c.Debug), + "--log=" + c.LogFilename, + "--log-format=" + c.LogFormat, + "--debug-log=" + c.DebugLog, + "--panic-log=" + c.PanicLog, + "--debug-log-format=" + c.DebugLogFormat, + "--file-access=" + c.FileAccess.String(), + "--overlay=" + strconv.FormatBool(c.Overlay), + "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), + "--network=" + c.Network.String(), + "--log-packets=" + strconv.FormatBool(c.LogPackets), + "--platform=" + c.Platform, + "--strace=" + strconv.FormatBool(c.Strace), + "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), + "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), + "--watchdog-action=" + c.WatchdogAction.String(), + "--panic-signal=" + strconv.Itoa(c.PanicSignal), + "--profile=" + strconv.FormatBool(c.ProfileEnable), + "--net-raw=" + strconv.FormatBool(c.EnableRaw), + "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), + "--rootless=" + strconv.FormatBool(c.Rootless), + "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), + "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), + "--gso=" + strconv.FormatBool(c.HardwareGSO), + "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), + "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload), + "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload), + "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), + "--qdisc=" + c.QDisc.String(), + } + if c.CPUNumFromQuota { + f = append(f, "--cpu-num-from-quota") + } + // Only include these if set since it is never to be used by users. + if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { + f = append(f, "--TESTONLY-unsafe-nonroot=true") + } + if len(c.TestOnlyTestNameEnv) != 0 { + f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) + } + + if c.VFS2 { + f = append(f, "--vfs2=true") + } + + return f +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go new file mode 100644 index 000000000..8125d5061 --- /dev/null +++ b/runsc/boot/controller.go @@ -0,0 +1,506 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "errors" + "fmt" + "os" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/state" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/specutils" +) + +const ( + // ContainerCheckpoint checkpoints a container. + ContainerCheckpoint = "containerManager.Checkpoint" + + // ContainerCreate creates a container. + ContainerCreate = "containerManager.Create" + + // ContainerDestroy is used to stop a non-root container and free all + // associated resources in the sandbox. + ContainerDestroy = "containerManager.Destroy" + + // ContainerEvent is the URPC endpoint for getting stats about the + // container used by "runsc events". + ContainerEvent = "containerManager.Event" + + // ContainerExecuteAsync is the URPC endpoint for executing a command in a + // container. + ContainerExecuteAsync = "containerManager.ExecuteAsync" + + // ContainerPause pauses the container. + ContainerPause = "containerManager.Pause" + + // ContainerProcesses is the URPC endpoint for getting the list of + // processes running in a container. + ContainerProcesses = "containerManager.Processes" + + // ContainerRestore restores a container from a statefile. + ContainerRestore = "containerManager.Restore" + + // ContainerResume unpauses the paused container. + ContainerResume = "containerManager.Resume" + + // ContainerSignal is used to send a signal to a container. + ContainerSignal = "containerManager.Signal" + + // ContainerSignalProcess is used to send a signal to a particular + // process in a container. + ContainerSignalProcess = "containerManager.SignalProcess" + + // ContainerStart is the URPC endpoint for running a non-root container + // within a sandbox. + ContainerStart = "containerManager.Start" + + // ContainerWait is used to wait on the init process of the container + // and return its ExitStatus. + ContainerWait = "containerManager.Wait" + + // ContainerWaitPID is used to wait on a process with a certain PID in + // the sandbox and return its ExitStatus. + ContainerWaitPID = "containerManager.WaitPID" + + // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links + // and routes in a network stack. + NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" + + // RootContainerStart is the URPC endpoint for starting a new sandbox + // with root container. + RootContainerStart = "containerManager.StartRoot" + + // SandboxStacks collects sandbox stacks for debugging. + SandboxStacks = "debug.Stacks" +) + +// Profiling related commands (see pprof.go for more details). +const ( + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" + GoroutineProfile = "Profile.GoroutineProfile" + BlockProfile = "Profile.BlockProfile" + MutexProfile = "Profile.MutexProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" +) + +// Logging related commands (see logging.go for more details). +const ( + ChangeLogging = "Logging.Change" +) + +// ControlSocketAddr generates an abstract unix socket name for the given ID. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-sandbox.%s", id) +} + +// controller holds the control server, and is used for communication into the +// sandbox. +type controller struct { + // srv is the control server. + srv *server.Server + + // manager holds the containerManager methods. + manager *containerManager +} + +// newController creates a new controller. The caller must call +// controller.srv.StartServing() to start the controller. +func newController(fd int, l *Loader) (*controller, error) { + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + manager := &containerManager{ + startChan: make(chan struct{}), + startResultChan: make(chan error), + l: l, + } + srv.Register(manager) + + if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { + net := &Network{ + Stack: eps.Stack, + } + srv.Register(net) + } + + srv.Register(&debug{}) + srv.Register(&control.Logging{}) + if l.conf.ProfileEnable { + srv.Register(&control.Profile{ + Kernel: l.k, + }) + } + + return &controller{ + srv: srv, + manager: manager, + }, nil +} + +// containerManager manages sandbox containers. +type containerManager struct { + // startChan is used to signal when the root container process should + // be started. + startChan chan struct{} + + // startResultChan is used to signal when the root container has + // started. Any errors encountered during startup will be sent to the + // channel. A nil value indicates success. + startResultChan chan error + + // l is the loader that creates containers and sandboxes. + l *Loader +} + +// StartRoot will start the root container process. +func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { + log.Debugf("containerManager.StartRoot %q", *cid) + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("starting sandbox: %v", err) + } + return nil +} + +// Processes retrieves information about processes running in the sandbox. +func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error { + log.Debugf("containerManager.Processes: %q", *cid) + return control.Processes(cm.l.k, *cid, out) +} + +// Create creates a container within a sandbox. +func (cm *containerManager) Create(cid *string, _ *struct{}) error { + log.Debugf("containerManager.Create: %q", *cid) + return cm.l.createContainer(*cid) +} + +// StartArgs contains arguments to the Start method. +type StartArgs struct { + // Spec is the spec of the container to start. + Spec *specs.Spec + + // Config is the runsc-specific configuration for the sandbox. + Conf *Config + + // CID is the ID of the container to start. + CID string + + // FilePayload contains, in order: + // * stdin, stdout, and stderr. + // * the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload +} + +// Start runs a created container within a sandbox. +func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { + log.Debugf("containerManager.Start: %+v", args) + + // Validate arguments. + if args == nil { + return errors.New("start missing arguments") + } + if args.Spec == nil { + return errors.New("start arguments missing spec") + } + if args.Conf == nil { + return errors.New("start arguments missing config") + } + if args.CID == "" { + return errors.New("start argument missing container ID") + } + if len(args.FilePayload.Files) < 4 { + return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + } + + // All validation passed, logs the spec for debugging. + specutils.LogSpec(args.Spec) + + err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) + if err != nil { + log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) + return err + } + log.Debugf("Container %q started", args.CID) + + return nil +} + +// Destroy stops a container if it is still running and cleans up its +// filesystem. +func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { + log.Debugf("containerManager.destroy %q", *cid) + return cm.l.destroyContainer(*cid) +} + +// ExecuteAsync starts running a command on a created or running sandbox. It +// returns the PID of the new process. +func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { + log.Debugf("containerManager.ExecuteAsync: %+v", args) + tgid, err := cm.l.executeAsync(args) + if err != nil { + log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err) + return err + } + *pid = int32(tgid) + return nil +} + +// Checkpoint pauses a sandbox and saves its state. +func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { + log.Debugf("containerManager.Checkpoint") + state := control.State{ + Kernel: cm.l.k, + Watchdog: cm.l.watchdog, + } + return state.Save(o, nil) +} + +// Pause suspends a container. +func (cm *containerManager) Pause(_, _ *struct{}) error { + log.Debugf("containerManager.Pause") + cm.l.k.Pause() + return nil +} + +// RestoreOpts contains options related to restoring a container's file system. +type RestoreOpts struct { + // FilePayload contains the state file to be restored, followed by the + // platform device file if necessary. + urpc.FilePayload + + // SandboxID contains the ID of the sandbox. + SandboxID string +} + +// Restore loads a container from a statefile. +// The container's current kernel is destroyed, a restore environment is +// created, and the kernel is recreated with the restore state file. The +// container then sends the signal to start. +func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { + log.Debugf("containerManager.Restore") + + var specFile, deviceFile *os.File + switch numFiles := len(o.FilePayload.Files); numFiles { + case 2: + // The device file is donated to the platform. + // Can't take ownership away from os.File. dup them to get a new FD. + fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) + if err != nil { + return fmt.Errorf("failed to dup file: %v", err) + } + deviceFile = os.NewFile(uintptr(fd), "platform device") + fallthrough + case 1: + specFile = o.FilePayload.Files[0] + case 0: + return fmt.Errorf("at least one file must be passed to Restore") + default: + return fmt.Errorf("at most two files may be passed to Restore") + } + + // Pause the kernel while we build a new one. + cm.l.k.Pause() + + p, err := createPlatform(cm.l.conf, deviceFile) + if err != nil { + return fmt.Errorf("creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + mf, err := createMemoryFile() + if err != nil { + return fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + networkStack := cm.l.k.RootNetworkNamespace().Stack() + cm.l.k = k + + // Set up the restore environment. + mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints) + renv, err := mntr.createRestoreEnvironment(cm.l.conf) + if err != nil { + return fmt.Errorf("creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) + + // Prepare to load from the state file. + if eps, ok := networkStack.(*netstack.Stack); ok { + stack.StackFromEnv = eps.Stack // FIXME(b/36201077) + } + info, err := specFile.Stat() + if err != nil { + return err + } + if info.Size() == 0 { + return fmt.Errorf("file cannot be empty") + } + + if cm.l.conf.ProfileEnable { + // pprof.Initialize opens /proc/self/maps, so has to be called before + // installing seccomp filters. + pprof.Initialize() + } + + // Seccomp filters have to be applied before parsing the state file. + if err := cm.l.installSeccompFilters(); err != nil { + return err + } + + // Load the state. + loadOpts := state.LoadOpts{Source: specFile} + if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil { + return err + } + + // Since we have a new kernel we also must make a new watchdog. + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dog := watchdog.New(k, dogOpts) + + // Change the loader fields to reflect the changes made when restoring. + cm.l.k = k + cm.l.watchdog = dog + cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.restore = true + + // Reinitialize the sandbox ID and processes map. Note that it doesn't + // restore the state of multiple containers, nor exec processes. + cm.l.sandboxID = o.SandboxID + cm.l.mu.Lock() + eid := execID{cid: o.SandboxID} + cm.l.processes = map[execID]*execProcess{ + eid: { + tg: cm.l.k.GlobalInit(), + }, + } + cm.l.mu.Unlock() + + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("starting sandbox: %v", err) + } + + return nil +} + +// Resume unpauses a container. +func (cm *containerManager) Resume(_, _ *struct{}) error { + log.Debugf("containerManager.Resume") + cm.l.k.Unpause() + return nil +} + +// Wait waits for the init process in the given container. +func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") + err := cm.l.waitContainer(*cid, waitStatus) + log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err) + return err +} + +// WaitPIDArgs are arguments to the WaitPID method. +type WaitPIDArgs struct { + // PID is the PID in the container's PID namespace. + PID int32 + + // CID is the container ID. + CID string +} + +// WaitPID waits for the process with PID 'pid' in the sandbox. +func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) +} + +// SignalDeliveryMode enumerates different signal delivery modes. +type SignalDeliveryMode int + +const ( + // DeliverToProcess delivers the signal to the container process with + // the specified PID. If PID is 0, then the container init process is + // signaled. + DeliverToProcess SignalDeliveryMode = iota + + // DeliverToAllProcesses delivers the signal to all processes in the + // container. PID must be 0. + DeliverToAllProcesses + + // DeliverToForegroundProcessGroup delivers the signal to the + // foreground process group in the same TTY session as the specified + // process. If PID is 0, then the signal is delivered to the foreground + // process group for the TTY for the init process. + DeliverToForegroundProcessGroup +) + +func (s SignalDeliveryMode) String() string { + switch s { + case DeliverToProcess: + return "Process" + case DeliverToAllProcesses: + return "All" + case DeliverToForegroundProcessGroup: + return "Foreground Process Group" + } + return fmt.Sprintf("unknown signal delivery mode: %d", s) +} + +// SignalArgs are arguments to the Signal method. +type SignalArgs struct { + // CID is the container ID. + CID string + + // Signo is the signal to send to the process. + Signo int32 + + // PID is the process ID in the given container that will be signaled. + // If 0, the root container will be signalled. + PID int32 + + // Mode is the signal delivery mode. + Mode SignalDeliveryMode +} + +// Signal sends a signal to one or more processes in a container. If args.PID +// is 0, then the container init process is used. Depending on the +// args.SignalDeliveryMode option, the signal may be sent directly to the +// indicated process, to all processes in the container, or to the foreground +// process group. +func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal %+v", args) + return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) +} diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go new file mode 100644 index 000000000..1fb32c527 --- /dev/null +++ b/runsc/boot/debug.go @@ -0,0 +1,29 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.dev/gvisor/pkg/log" +) + +type debug struct { +} + +// Stacks collects all sandbox stacks and copies them to 'stacks'. +func (*debug) Stacks(_ *struct{}, stacks *string) error { + buf := log.Stacks(true) + *stacks = string(buf) + return nil +} diff --git a/runsc/boot/events.go b/runsc/boot/events.go new file mode 100644 index 000000000..422f4da00 --- /dev/null +++ b/runsc/boot/events.go @@ -0,0 +1,81 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usage" +) + +// Event struct for encoding the event data to JSON. Corresponds to runc's +// main.event struct. +type Event struct { + Type string `json:"type"` + ID string `json:"id"` + Data interface{} `json:"data,omitempty"` +} + +// Stats is the runc specific stats structure for stability when encoding and +// decoding stats. +type Stats struct { + Memory Memory `json:"memory"` + Pids Pids `json:"pids"` +} + +// Pids contains stats on processes. +type Pids struct { + Current uint64 `json:"current,omitempty"` + Limit uint64 `json:"limit,omitempty"` +} + +// MemoryEntry contains stats on a kind of memory. +type MemoryEntry struct { + Limit uint64 `json:"limit"` + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +// Memory contains stats on memory. +type Memory struct { + Cache uint64 `json:"cache,omitempty"` + Usage MemoryEntry `json:"usage,omitempty"` + Swap MemoryEntry `json:"swap,omitempty"` + Kernel MemoryEntry `json:"kernel,omitempty"` + KernelTCP MemoryEntry `json:"kernelTCP,omitempty"` + Raw map[string]uint64 `json:"raw,omitempty"` +} + +// Event gets the events from the container. +func (cm *containerManager) Event(_ *struct{}, out *Event) error { + stats := &Stats{} + stats.populateMemory(cm.l.k) + stats.populatePIDs(cm.l.k) + *out = Event{Type: "stats", Data: stats} + return nil +} + +func (s *Stats) populateMemory(k *kernel.Kernel) { + mem := k.MemoryFile() + mem.UpdateUsage() + _, totalUsage := usage.MemoryAccounting.Copy() + s.Memory.Usage = MemoryEntry{ + Usage: totalUsage, + } +} + +func (s *Stats) populatePIDs(k *kernel.Kernel) { + s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups())) +} diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD new file mode 100644 index 000000000..ed18f0047 --- /dev/null +++ b/runsc/boot/filter/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "filter", + srcs = [ + "config.go", + "config_amd64.go", + "config_arm64.go", + "config_profile.go", + "extra_filters.go", + "extra_filters_msan.go", + "extra_filters_race.go", + "filter.go", + ], + visibility = [ + "//runsc/boot:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/seccomp", + "//pkg/sentry/platform", + "//pkg/tcpip/link/fdbased", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go new file mode 100644 index 000000000..60e33425f --- /dev/null +++ b/runsc/boot/filter/config.go @@ -0,0 +1,559 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "os" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" +) + +// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. +var allowedSyscalls = seccomp.SyscallRules{ + syscall.SYS_CLOCK_GETTIME: {}, + syscall.SYS_CLONE: []seccomp.Rule{ + { + seccomp.AllowValue( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + }, + }, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_DUP3: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_CLOEXEC), + }, + }, + syscall.SYS_EPOLL_CREATE1: {}, + syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EVENTFD2: []seccomp.Rule{ + { + seccomp.AllowValue(0), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_EXIT: {}, + syscall.SYS_EXIT_GROUP: {}, + syscall.SYS_FALLOCATE: {}, + syscall.SYS_FCHMOD: {}, + syscall.SYS_FCNTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_SETFL), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.F_GETFD), + }, + }, + syscall.SYS_FSTAT: {}, + syscall.SYS_FSYNC: {}, + syscall.SYS_FTRUNCATE: {}, + syscall.SYS_FUTEX: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.AllowAny{}, + }, + // Non-private variants are included for flipcall support. They are otherwise + // unncessary, as the sentry will use only private futexes internally. + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAIT), + seccomp.AllowAny{}, + seccomp.AllowAny{}, + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(linux.FUTEX_WAKE), + seccomp.AllowAny{}, + }, + }, + syscall.SYS_GETPID: {}, + unix.SYS_GETRANDOM: {}, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_DOMAIN), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_TYPE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_ERROR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + }, + }, + syscall.SYS_GETTID: {}, + syscall.SYS_GETTIMEOFDAY: {}, + // SYS_IOCTL is needed for terminal support, but we only allow + // setting/getting termios and winsize. + syscall.SYS_IOCTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCGETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETS), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSF), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TCSETSW), + seccomp.AllowAny{}, /* termios struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCSWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + { + seccomp.AllowAny{}, /* fd */ + seccomp.AllowValue(linux.TIOCGWINSZ), + seccomp.AllowAny{}, /* winsize struct */ + }, + }, + syscall.SYS_LSEEK: {}, + syscall.SYS_MADVISE: {}, + syscall.SYS_MINCORE: {}, + // Used by the Go runtime as a temporarily workaround for a Linux + // 5.2-5.4 bug. + // + // See src/runtime/os_linux_x86.go. + // + // TODO(b/148688965): Remove once this is gone from Go. + syscall.SYS_MLOCK: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(4096), + }, + }, + syscall.SYS_MMAP: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_SHARED), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ), + seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + }, + }, + syscall.SYS_MPROTECT: {}, + syscall.SYS_MUNMAP: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_PPOLL: {}, + syscall.SYS_PREAD64: {}, + syscall.SYS_PREADV: {}, + unix.SYS_PREADV2: {}, + syscall.SYS_PWRITE64: {}, + syscall.SYS_PWRITEV: {}, + unix.SYS_PWRITEV2: {}, + syscall.SYS_READ: {}, + syscall.SYS_RECVMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + }, + }, + syscall.SYS_RECVMMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(fdbased.MaxMsgsPerRecv), + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), + }, + }, + unix.SYS_SENDMMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_RESTART_SYSCALL: {}, + syscall.SYS_RT_SIGACTION: {}, + syscall.SYS_RT_SIGPROCMASK: {}, + syscall.SYS_RT_SIGRETURN: {}, + syscall.SYS_SCHED_YIELD: {}, + syscall.SYS_SENDMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + }, + }, + syscall.SYS_SETITIMER: {}, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + // Used by fs/host to shutdown host sockets. + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + // Used by unet to shutdown connections. + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + }, + syscall.SYS_SIGALTSTACK: {}, + unix.SYS_STATX: {}, + syscall.SYS_SYNC_FILE_RANGE: {}, + syscall.SYS_TEE: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(1), /* len */ + seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */ + }, + }, + syscall.SYS_TGKILL: []seccomp.Rule{ + { + seccomp.AllowValue(uint64(os.Getpid())), + }, + }, + syscall.SYS_UTIMENSAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* null pathname */ + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* flags */ + }, + }, + syscall.SYS_WRITE: {}, + // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with + // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR + // option is enabled for a packet socket. + syscall.SYS_WRITEV: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(2), + }, + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(3), + }, + }, +} + +// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. +func hostInetFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ACCEPT4: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + }, + }, + syscall.SYS_BIND: {}, + syscall.SYS_CONNECT: {}, + syscall.SYS_GETPEERNAME: {}, + syscall.SYS_GETSOCKNAME: {}, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_V6ONLY), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_ERROR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_KEEPALIVE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_REUSEADDR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_TYPE), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_LINGER), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_NODELAY), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_INFO), + }, + }, + syscall.SYS_IOCTL: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.TIOCOUTQ), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.TIOCINQ), + }, + }, + syscall.SYS_LISTEN: {}, + syscall.SYS_READV: {}, + syscall.SYS_RECVFROM: {}, + syscall.SYS_RECVMSG: {}, + syscall.SYS_SENDMSG: {}, + syscall.SYS_SENDTO: {}, + syscall.SYS_SETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_V6ONLY), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_RCVBUF), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_REUSEADDR), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_TCP), + seccomp.AllowValue(syscall.TCP_NODELAY), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + }, + syscall.SYS_SHUTDOWN: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_RD), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_WR), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SHUT_RDWR), + }, + }, + syscall.SYS_SOCKET: []seccomp.Rule{ + { + seccomp.AllowValue(syscall.AF_INET), + seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET), + seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET6), + seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_INET6), + seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.AllowValue(0), + }, + }, + syscall.SYS_WRITEV: {}, + } +} + +func controlServerFilters(fd int) seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_ACCEPT: []seccomp.Rule{ + { + seccomp.AllowValue(fd), + }, + }, + syscall.SYS_LISTEN: []seccomp.Rule{ + { + seccomp.AllowValue(fd), + seccomp.AllowValue(16 /* unet.backlog */), + }, + }, + syscall.SYS_GETSOCKOPT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_SOCKET), + seccomp.AllowValue(syscall.SO_PEERCRED), + }, + }, + } +} diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go new file mode 100644 index 000000000..5335ff82c --- /dev/null +++ b/runsc/boot/filter/config_amd64.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], + seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, + seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, + ) +} diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go new file mode 100644 index 000000000..7fa9bbda3 --- /dev/null +++ b/runsc/boot/filter/config_arm64.go @@ -0,0 +1,21 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package filter + +// Reserve for future customization. +func init() { +} diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go new file mode 100644 index 000000000..194952a7b --- /dev/null +++ b/runsc/boot/filter/config_profile.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// profileFilters returns extra syscalls made by runtime/pprof package. +func profileFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_OPENAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + }, + }, + } +} diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go new file mode 100644 index 000000000..e28d4b8d6 --- /dev/null +++ b/runsc/boot/filter/extra_filters.go @@ -0,0 +1,28 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !msan,!race + +package filter + +import ( + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by +// Go instrumentation tools, e.g. -race, -msan. +// Returns empty when disabled. +func instrumentationFilters() seccomp.SyscallRules { + return nil +} diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go new file mode 100644 index 000000000..209e646a7 --- /dev/null +++ b/runsc/boot/filter/extra_filters_msan.go @@ -0,0 +1,34 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build msan + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by MSAN. +func instrumentationFilters() seccomp.SyscallRules { + Report("MSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_CLONE: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_SCHED_GETAFFINITY: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + } +} diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go new file mode 100644 index 000000000..9ff80276a --- /dev/null +++ b/runsc/boot/filter/extra_filters_race.go @@ -0,0 +1,41 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// instrumentationFilters returns additional filters for syscalls used by TSAN. +func instrumentationFilters() seccomp.SyscallRules { + Report("TSAN is enabled: syscall filters less restrictive!") + return seccomp.SyscallRules{ + syscall.SYS_BRK: {}, + syscall.SYS_CLONE: {}, + syscall.SYS_FUTEX: {}, + syscall.SYS_MMAP: {}, + syscall.SYS_MUNLOCK: {}, + syscall.SYS_NANOSLEEP: {}, + syscall.SYS_OPEN: {}, + syscall.SYS_OPENAT: {}, + syscall.SYS_SET_ROBUST_LIST: {}, + // Used within glibc's malloc. + syscall.SYS_TIME: {}, + } +} diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go new file mode 100644 index 000000000..e80c171b3 --- /dev/null +++ b/runsc/boot/filter/filter.go @@ -0,0 +1,60 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filter defines all syscalls the sandbox is allowed to make +// to the host, and installs seccomp filters to prevent prohibited +// syscalls in case it's compromised. +package filter + +import ( + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/sentry/platform" +) + +// Options are seccomp filter related options. +type Options struct { + Platform platform.Platform + HostNetwork bool + ProfileEnable bool + ControllerFD int +} + +// Install installs seccomp filters for based on the given platform. +func Install(opt Options) error { + s := allowedSyscalls + s.Merge(controlServerFilters(opt.ControllerFD)) + + // Set of additional filters used by -race and -msan. Returns empty + // when not enabled. + s.Merge(instrumentationFilters()) + + if opt.HostNetwork { + Report("host networking enabled: syscall filters less restrictive!") + s.Merge(hostInetFilters()) + } + if opt.ProfileEnable { + Report("profile enabled: syscall filters less restrictive!") + s.Merge(profileFilters()) + } + + s.Merge(opt.Platform.SyscallFilters()) + + return seccomp.Install(s) +} + +// Report writes a warning message to the log. +func Report(msg string) { + log.Warningf("*** SECCOMP WARNING: %s", msg) +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go new file mode 100644 index 000000000..59639ba19 --- /dev/null +++ b/runsc/boot/fs.go @@ -0,0 +1,1034 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path/filepath" + "sort" + "strconv" + "strings" + "syscall" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + _ "gvisor.dev/gvisor/pkg/sentry/fs/host" + _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" + "gvisor.dev/gvisor/pkg/sentry/vfs" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/gofer" + "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + procvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + sysvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/specutils" +) + +const ( + // Device name for root mount. + rootDevice = "9pfs-/" + + // MountPrefix is the annotation prefix for mount hints. + MountPrefix = "dev.gvisor.spec.mount." + + // Supported filesystems that map to different internal filesystem. + bind = "bind" + nonefs = "none" +) + +// tmpfs has some extra supported options that we must pass through. +var tmpfsAllowedData = []string{"mode", "uid", "gid"} + +func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { + // Upper layer uses the same flags as lower, but it must be read-write. + upperFlags := lowerFlags + upperFlags.ReadOnly = false + + tmpFS := mustFindFilesystem("tmpfs") + if !fs.IsDir(lower.StableAttr) { + // Create overlay on top of mount file, e.g. /etc/hostname. + msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags) + return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags) + } + + // Create overlay on top of mount dir. + upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil) + if err != nil { + return nil, fmt.Errorf("creating tmpfs overlay: %v", err) + } + + // Replicate permissions and owner from lower to upper mount point. + attr, err := lower.UnstableAttr(ctx) + if err != nil { + return nil, fmt.Errorf("reading attributes from lower mount point: %v", err) + } + if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) { + return nil, fmt.Errorf("error setting permission to upper mount point") + } + if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil { + return nil, fmt.Errorf("setting owner to upper mount point: %v", err) + } + + return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) +} + +// compileMounts returns the supported mounts from the mount spec, adding any +// mandatory mounts that are required by the OCI specification. +func compileMounts(spec *specs.Spec) []specs.Mount { + // Keep track of whether proc and sys were mounted. + var procMounted, sysMounted bool + var mounts []specs.Mount + + // Always mount /dev. + mounts = append(mounts, specs.Mount{ + Type: devtmpfs.Name, + Destination: "/dev", + }) + + mounts = append(mounts, specs.Mount{ + Type: devpts.Name, + Destination: "/dev/pts", + }) + + // Mount all submounts from the spec. + for _, m := range spec.Mounts { + if !specutils.IsSupportedDevMount(m) { + log.Warningf("ignoring dev mount at %q", m.Destination) + continue + } + mounts = append(mounts, m) + switch filepath.Clean(m.Destination) { + case "/proc": + procMounted = true + case "/sys": + sysMounted = true + } + } + + // Mount proc and sys even if the user did not ask for it, as the spec + // says we SHOULD. + var mandatoryMounts []specs.Mount + if !procMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: procvfs2.Name, + Destination: "/proc", + }) + } + if !sysMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: sysvfs2.Name, + Destination: "/sys", + }) + } + + // The mandatory mounts should be ordered right after the root, in case + // there are submounts of these mandatory mounts already in the spec. + mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...) + + return mounts +} + +// p9MountData creates a slice of p9 mount data. +func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + } + if !vfs2 { + // privateunixsocket is always enabled in VFS2. VFS1 requires explicit + // enablement. + opts = append(opts, "privateunixsocket=true") + } + if fa == FileAccessShared { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} + +// parseAndFilterOptions parses a MountOptions slice and filters by the allowed +// keys. +func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { + var out []string + for _, o := range opts { + ok, err := parseMountOption(o, allowedKeys...) + if err != nil { + return nil, err + } + if ok { + out = append(out, o) + } + } + return out, nil +} + +func parseMountOption(opt string, allowedKeys ...string) (bool, error) { + kv := strings.SplitN(opt, "=", 3) + if len(kv) > 2 { + return false, fmt.Errorf("invalid option %q", opt) + } + return specutils.ContainsStr(allowedKeys, kv[0]), nil +} + +// mountDevice returns a device string based on the fs type and target +// of the mount. +func mountDevice(m specs.Mount) string { + if m.Type == bind { + // Make a device string that includes the target, which is consistent across + // S/R and uniquely identifies the connection. + return "9pfs-" + m.Destination + } + // All other fs types use device "none". + return "none" +} + +func mountFlags(opts []string) fs.MountSourceFlags { + mf := fs.MountSourceFlags{} + // Note: changes to supported options must be reflected in + // isSupportedMountFlag() as well. + for _, o := range opts { + switch o { + case "rw": + mf.ReadOnly = false + case "ro": + mf.ReadOnly = true + case "noatime": + mf.NoAtime = true + case "noexec": + mf.NoExec = true + default: + log.Warningf("ignoring unknown mount option %q", o) + } + } + return mf +} + +func isSupportedMountFlag(fstype, opt string) bool { + switch opt { + case "rw", "ro", "noatime", "noexec": + return true + } + if fstype == tmpfsvfs2.Name { + ok, err := parseMountOption(opt, tmpfsAllowedData...) + return ok && err == nil + } + return false +} + +func mustFindFilesystem(name string) fs.Filesystem { + fs, ok := fs.FindFilesystem(name) + if !ok { + panic(fmt.Sprintf("could not find filesystem %q", name)) + } + return fs +} + +// addSubmountOverlay overlays the inode over a ramfs tree containing the given +// paths. +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { + // Construct a ramfs tree of mount points. The contents never + // change, so this can be fully caching. There's no real + // filesystem backing this tree, so we set the filesystem to + // nil. + msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{}) + mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts) + if err != nil { + return nil, fmt.Errorf("creating mount tree: %v", err) + } + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + if err != nil { + return nil, fmt.Errorf("adding mount overlay: %v", err) + } + return overlayInode, err +} + +// subtargets takes a set of Mounts and returns only the targets that are +// children of the given root. The returned paths are relative to the root. +func subtargets(root string, mnts []specs.Mount) []string { + var targets []string + for _, mnt := range mnts { + if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath { + targets = append(targets, relPath) + } + } + return targets +} + +func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if conf.VFS2 { + return setupContainerVFS2(ctx, conf, mntr, procArgs) + } + mns, err := mntr.setupFS(conf, procArgs) + if err != nil { + return err + } + + // Set namespace here so that it can be found in ctx. + procArgs.MountNamespace = mns + + // Resolve the executable path from working dir and environment. + resolved, err := user.ResolveExecutablePath(ctx, procArgs) + if err != nil { + return err + } + procArgs.Filename = resolved + return nil +} + +func adjustDirentCache(k *kernel.Kernel) error { + var hl syscall.Rlimit + if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { + return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) + } + if int64(hl.Cur) != syscall.RLIM_INFINITY { + newSize := hl.Cur / 2 + if newSize < gofer.DefaultDirentCacheSize { + log.Infof("Setting gofer dirent cache size to %d", newSize) + gofer.DefaultDirentCacheSize = newSize + k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) + } + } + return nil +} + +type fdDispenser struct { + fds []int +} + +func (f *fdDispenser) remove() int { + if f.empty() { + panic("fdDispenser out of fds") + } + rv := f.fds[0] + f.fds = f.fds[1:] + return rv +} + +func (f *fdDispenser) empty() bool { + return len(f.fds) == 0 +} + +type shareType int + +const ( + invalid shareType = iota + + // container shareType indicates that the mount is used by a single container. + container + + // pod shareType indicates that the mount is used by more than one container + // inside the pod. + pod + + // shared shareType indicates that the mount can also be shared with a process + // outside the pod, e.g. NFS. + shared +) + +func parseShare(val string) (shareType, error) { + switch val { + case "container": + return container, nil + case "pod": + return pod, nil + case "shared": + return shared, nil + default: + return 0, fmt.Errorf("invalid share value %q", val) + } +} + +func (s shareType) String() string { + switch s { + case invalid: + return "invalid" + case container: + return "container" + case pod: + return "pod" + case shared: + return "shared" + default: + return fmt.Sprintf("invalid share value %d", s) + } +} + +// mountHint represents extra information about mounts that are provided via +// annotations. They can override mount type, and provide sharing information +// so that mounts can be correctly shared inside the pod. +type mountHint struct { + name string + share shareType + mount specs.Mount + + // root is the inode where the volume is mounted. For mounts with 'pod' share + // the volume is mounted once and then bind mounted inside the containers. + root *fs.Inode + + // vfsMount is the master mount for the volume. For mounts with 'pod' share + // the master volume is bind mounted inside the containers. + vfsMount *vfs.Mount +} + +func (m *mountHint) setField(key, val string) error { + switch key { + case "source": + if len(val) == 0 { + return fmt.Errorf("source cannot be empty") + } + m.mount.Source = val + case "type": + return m.setType(val) + case "share": + share, err := parseShare(val) + if err != nil { + return err + } + m.share = share + case "options": + return m.setOptions(val) + default: + return fmt.Errorf("invalid mount annotation: %s=%s", key, val) + } + return nil +} + +func (m *mountHint) setType(val string) error { + switch val { + case "tmpfs", "bind": + m.mount.Type = val + default: + return fmt.Errorf("invalid type %q", val) + } + return nil +} + +func (m *mountHint) setOptions(val string) error { + opts := strings.Split(val, ",") + if err := specutils.ValidateMountOptions(opts); err != nil { + return err + } + // Sort options so it can be compared with container mount options later on. + sort.Strings(opts) + m.mount.Options = opts + return nil +} + +func (m *mountHint) isSupported() bool { + return m.mount.Type == tmpfsvfs2.Name && m.share == pod +} + +// checkCompatible verifies that shared mount is compatible with master. +// For now enforce that all options are the same. Once bind mount is properly +// supported, then we should ensure the master is less restrictive than the +// container, e.g. master can be 'rw' while container mounts as 'ro'. +func (m *mountHint) checkCompatible(mount specs.Mount) error { + // Remove options that don't affect to mount's behavior. + masterOpts := filterUnsupportedOptions(m.mount) + slaveOpts := filterUnsupportedOptions(mount) + + if len(masterOpts) != len(slaveOpts) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + } + + sort.Strings(masterOpts) + sort.Strings(slaveOpts) + for i, opt := range masterOpts { + if opt != slaveOpts[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + } + } + return nil +} + +func (m *mountHint) fileAccessType() FileAccessType { + if m.share == container { + return FileAccessExclusive + } + return FileAccessShared +} + +func filterUnsupportedOptions(mount specs.Mount) []string { + rv := make([]string, 0, len(mount.Options)) + for _, o := range mount.Options { + if isSupportedMountFlag(mount.Type, o) { + rv = append(rv, o) + } + } + return rv +} + +// podMountHints contains a collection of mountHints for the pod. +type podMountHints struct { + mounts map[string]*mountHint +} + +func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { + mnts := make(map[string]*mountHint) + for k, v := range spec.Annotations { + // Look for 'dev.gvisor.spec.mount' annotations and parse them. + if strings.HasPrefix(k, MountPrefix) { + // Remove the prefix and split the rest. + parts := strings.Split(k[len(MountPrefix):], ".") + if len(parts) != 2 { + return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) + } + name := parts[0] + if len(name) == 0 { + return nil, fmt.Errorf("invalid mount name: %s", name) + } + mnt := mnts[name] + if mnt == nil { + mnt = &mountHint{name: name} + mnts[name] = mnt + } + if err := mnt.setField(parts[1], v); err != nil { + return nil, err + } + } + } + + // Validate all hints after done parsing. + for name, m := range mnts { + log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share) + if m.share == invalid { + return nil, fmt.Errorf("share field for %q has not been set", m.name) + } + if len(m.mount.Source) == 0 { + return nil, fmt.Errorf("source field for %q has not been set", m.name) + } + if len(m.mount.Type) == 0 { + return nil, fmt.Errorf("type field for %q has not been set", m.name) + } + + // Check for duplicate mount sources. + for name2, m2 := range mnts { + if name != name2 && m.mount.Source == m2.mount.Source { + return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source) + } + } + } + + return &podMountHints{mounts: mnts}, nil +} + +func (p *podMountHints) findMount(mount specs.Mount) *mountHint { + for _, m := range p.mounts { + if m.mount.Source == mount.Source { + return m + } + } + return nil +} + +type containerMounter struct { + root *specs.Root + + // mounts is the set of submounts for the container. It's a copy from the spec + // that may be freely modified without affecting the original spec. + mounts []specs.Mount + + // fds is the list of FDs to be dispensed for mounts that require it. + fds fdDispenser + + k *kernel.Kernel + + hints *podMountHints +} + +func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { + return &containerMounter{ + root: spec.Root, + mounts: compileMounts(spec), + fds: fdDispenser{fds: goferFDs}, + k: k, + hints: hints, + } +} + +// processHints processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error { + if conf.VFS2 { + return c.processHintsVFS2(conf, creds) + } + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfsvfs2.Name { + continue + } + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + inode, err := c.mountSharedMaster(ctx, conf, hint) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.root = inode + } + return nil +} + +// setupFS is used to set up the file system for all containers. This is the +// main entry point method, with most of the other being internal only. It +// returns the mount namespace that is created for the container. +func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { + log.Infof("Configuring container's file system") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := rootProcArgs.NewContext(c.k) + + mns, err := c.createMountNamespace(rootCtx, conf) + if err != nil { + return nil, err + } + + // Set namespace here so that it can be found in rootCtx. + rootProcArgs.MountNamespace = mns + + if err := c.mountSubmounts(rootCtx, conf, mns); err != nil { + return nil, err + } + return mns, nil +} + +func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) { + rootInode, err := c.createRootMount(ctx, conf) + if err != nil { + return nil, fmt.Errorf("creating filesystem for container: %v", err) + } + mns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + return nil, fmt.Errorf("creating new mount namespace for container: %v", err) + } + return mns, nil +} + +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { + root := mns.Root() + defer root.DecRef() + + for _, m := range c.mounts { + log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) + if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err) + } + } else { + if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { + return fmt.Errorf("mount submount %q: %v", m.Destination, err) + } + } + } + + if err := c.mountTmp(ctx, conf, mns, root); err != nil { + return fmt.Errorf("mount submount %q: %v", "tmp", err) + } + + if err := c.checkDispenser(); err != nil { + return err + } + return nil +} + +func (c *containerMounter) checkDispenser() error { + if !c.fds.empty() { + return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds) + } + return nil +} + +// mountSharedMaster mounts the master of a volume that is shared among +// containers in a pod. It returns the root mount's inode. +func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + + // Mount with revalidate because it's shared among containers. + opts = append(opts, "cache=revalidate") + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(hint.mount.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating mount %q: %v", hint.name, err) + } + + if useOverlay { + log.Debugf("Adding overlay on top of shared mount %q", hint.name) + inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf) + if err != nil { + return nil, err + } + } + + return inode, nil +} + +// createRootMount creates the root filesystem. +func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { + // First construct the filesystem from the spec.Root. + mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} + + fd := c.fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + p9FS := mustFindFilesystem("9p") + opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) + + if conf.OverlayfsStaleRead { + // We can't check for overlayfs here because sandbox is chroot'ed and gofer + // can only send mount options for specs.Mounts (specs.Root is missing + // Options field). So assume root is always on top of overlayfs. + opts = append(opts, "overlayfs_stale_read") + } + + rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating root mount point: %v", err) + } + + // We need to overlay the root on top of a ramfs with stub directories + // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always + // mounted even if they are not in the spec. + submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + if err != nil { + return nil, fmt.Errorf("adding submount overlay: %v", err) + } + + if conf.Overlay && !c.root.Readonly { + log.Debugf("Adding overlay on top of root mount") + // Overlay a tmpfs filesystem on top of the root. + rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + if err != nil { + return nil, err + } + } + + log.Infof("Mounted %q to %q type root", c.root.Path, "/") + return rootInode, nil +} + +// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + ) + + switch m.Type { + case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name: + fsName = m.Type + case nonefs: + fsName = sysvfs2.Name + case tmpfsvfs2.Name: + fsName = m.Type + + var err error + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) + if err != nil { + return "", nil, false, err + } + + case bind: + fd := c.fds.remove() + fsName = gofervfs2.Name + opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, nil +} + +func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { + if hint := c.hints.findMount(mount); hint != nil { + return hint.fileAccessType() + } + // Non-root bind mounts are always shared if no hints were provided. + return FileAccessShared +} + +// mountSubmount mounts volumes inside the container's root. Because mounts may +// be readonly, a lower ramfs overlay is added to create the mount point dir. +// Another overlay is added with tmpfs on top if Config.Overlay is true. +// 'm.Destination' must be an absolute path with '..' and symlinks resolved. +func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(m.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) + if err != nil { + err := fmt.Errorf("creating mount with source %q: %v", m.Source, err) + // Check to see if this is a common error due to a Linux bug. + // This error is generated here in order to cause it to be + // printed to the user using Docker via 'runsc create' etc. rather + // than simply printed to the logs for the 'runsc boot' command. + // + // We check the error message string rather than type because the + // actual error types (syscall.EIO, syscall.EPIPE) are lost by file system + // implementation (e.g. p9). + // TODO(gvisor.dev/issue/1765): Remove message when bug is resolved. + if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) { + return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug")) + } + return err + } + + // If there are submounts, we need to overlay the mount on top of a ramfs + // with stub directories for submount paths. + submounts := subtargets(m.Destination, c.mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("adding submount overlay: %v", err) + } + } + + if useOverlay { + log.Debugf("Adding overlay on top of mount %q", m.Destination) + inode, err = addOverlay(ctx, conf, inode, m.Type, mf) + if err != nil { + return err + } + } + + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) + } + defer dirent.DecRef() + if err := mns.Mount(ctx, dirent, inode); err != nil { + return fmt.Errorf("mount %q error: %v", m.Destination, err) + } + + log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts) + return nil +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error { + if err := source.checkCompatible(mount); err != nil { + return err + } + + maxTraversals := uint(0) + target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) + } + defer target.DecRef() + + // Take a ref on the inode that is about to be (re)-mounted. + source.root.IncRef() + if err := mns.Mount(ctx, target, source.root); err != nil { + source.root.DecRef() + return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) + } + + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} + +// addRestoreMount adds a mount to the MountSources map used for restoring a +// checkpointed container. +func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + newMount := fs.MountArgs{ + Dev: mountDevice(m), + Flags: mountFlags(m.Options), + DataString: strings.Join(opts, ","), + } + if useOverlay { + newMount.Flags.ReadOnly = true + } + renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) + log.Infof("Added mount at %q: %+v", fsName, newMount) + return nil +} + +// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding +// the mounts to the environment. +func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { + renv := &fs.RestoreEnvironment{ + MountSources: make(map[string][]fs.MountArgs), + } + + // Add root mount. + fd := c.fds.remove() + opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) + + mf := fs.MountSourceFlags{} + if c.root.Readonly || conf.Overlay { + mf.ReadOnly = true + } + + rootMount := fs.MountArgs{ + Dev: rootDevice, + Flags: mf, + DataString: strings.Join(opts, ","), + } + renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount) + + // Add submounts. + var tmpMounted bool + for _, m := range c.mounts { + if err := c.addRestoreMount(conf, renv, m); err != nil { + return nil, err + } + if filepath.Clean(m.Destination) == "/tmp" { + tmpMounted = true + } + } + + // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). + if !tmpMounted { + tmpMount := specs.Mount{ + Type: tmpfsvfs2.Name, + Destination: "/tmp", + } + if err := c.addRestoreMount(conf, renv, tmpMount); err != nil { + return nil, err + } + } + + return renv, nil +} + +// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. +// Technically we don't have to mount tmpfs at /tmp, as we could just rely on +// the host /tmp, but this is a nice optimization, and fixes some apps that call +// mknod in /tmp. It's unsafe to mount tmpfs if: +// 1. /tmp is mounted explicitly: we should not override user's wish +// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp +// +// Note that when there are submounts inside of '/tmp', directories for the +// mount points must be present, making '/tmp' not empty anymore. +func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { + for _, m := range c.mounts { + if filepath.Clean(m.Destination) == "/tmp" { + log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) + return nil + } + } + + maxTraversals := uint(0) + tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals) + switch err { + case nil: + // Found '/tmp' in filesystem, check if it's empty. + defer tmp.DecRef() + f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) + if err != nil { + return err + } + defer f.DecRef() + serializer := &fs.CollectEntriesSerializer{} + if err := f.Readdir(ctx, serializer); err != nil { + return err + } + // If more than "." and ".." is found, skip internal tmpfs to prevent hiding + // existing files. + if len(serializer.Order) > 2 { + log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp") + return nil + } + log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp") + fallthrough + + case syserror.ENOENT: + // No '/tmp' found (or fallthrough from above). Safe to mount internal + // tmpfs. + tmpMount := specs.Mount{ + Type: tmpfsvfs2.Name, + Destination: "/tmp", + // Sticky bit is added to prevent accidental deletion of files from + // another user. This is normally done for /tmp. + Options: []string{"mode=01777"}, + } + return c.mountSubmount(ctx, conf, mns, root, tmpMount) + + default: + return err + } +} diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go new file mode 100644 index 000000000..912037075 --- /dev/null +++ b/runsc/boot/fs_test.go @@ -0,0 +1,250 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "reflect" + "strings" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestPodMountHintsHappy(t *testing.T) { + spec := &specs.Spec{ + Annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + + MountPrefix + "mount2.source": "bar", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", + MountPrefix + "mount2.options": "rw,private", + }, + } + podHints, err := newPodMountHints(spec) + if err != nil { + t.Fatalf("newPodMountHints failed: %v", err) + } + + // Check that fields were set correctly. + mount1 := podHints.mounts["mount1"] + if want := "mount1"; want != mount1.name { + t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name) + } + if want := "foo"; want != mount1.mount.Source { + t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source) + } + if want := "tmpfs"; want != mount1.mount.Type { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type) + } + if want := pod; want != mount1.share { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share) + } + if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options) + } + + mount2 := podHints.mounts["mount2"] + if want := "mount2"; want != mount2.name { + t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name) + } + if want := "bar"; want != mount2.mount.Source { + t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source) + } + if want := "bind"; want != mount2.mount.Type { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type) + } + if want := container; want != mount2.share { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share) + } + if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options) + } +} + +func TestPodMountHintsErrors(t *testing.T) { + for _, tst := range []struct { + name string + annotations map[string]string + error string + }{ + { + name: "too short", + annotations: map[string]string{ + MountPrefix + "mount1": "foo", + }, + error: "invalid mount annotation", + }, + { + name: "no name", + annotations: map[string]string{ + MountPrefix + ".source": "foo", + }, + error: "invalid mount name", + }, + { + name: "missing source", + annotations: map[string]string{ + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + }, + error: "source field", + }, + { + name: "missing type", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.share": "pod", + }, + error: "type field", + }, + { + name: "missing share", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + }, + error: "share field", + }, + { + name: "invalid field name", + annotations: map[string]string{ + MountPrefix + "mount1.invalid": "foo", + }, + error: "invalid mount annotation", + }, + { + name: "invalid source", + annotations: map[string]string{ + MountPrefix + "mount1.source": "", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + }, + error: "source cannot be empty", + }, + { + name: "invalid type", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "invalid-type", + MountPrefix + "mount1.share": "pod", + }, + error: "invalid type", + }, + { + name: "invalid share", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "invalid-share", + }, + error: "invalid share", + }, + { + name: "invalid options", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + MountPrefix + "mount1.options": "invalid-option", + }, + error: "unknown mount option", + }, + { + name: "duplicate source", + annotations: map[string]string{ + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + + MountPrefix + "mount2.source": "foo", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", + }, + error: "have the same mount source", + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err == nil || !strings.Contains(err.Error(), tst.error) { + t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err) + } + if podHints != nil { + t.Errorf("newPodMountHints must return nil on failure: %+v", podHints) + } + }) + } +} + +func TestGetMountAccessType(t *testing.T) { + const source = "foo" + for _, tst := range []struct { + name string + annotations map[string]string + want FileAccessType + }{ + { + name: "container=exclusive", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessExclusive, + }, + { + name: "pod=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "pod", + }, + want: FileAccessShared, + }, + { + name: "shared=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "shared", + }, + want: FileAccessShared, + }, + { + name: "default=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source + "mismatch", + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessShared, + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err != nil { + t.Fatalf("newPodMountHints failed: %v", err) + } + mounter := containerMounter{hints: podHints} + if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want { + t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got) + } + }) + } +} diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go new file mode 100644 index 000000000..ce62236e5 --- /dev/null +++ b/runsc/boot/limits.go @@ -0,0 +1,154 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" +) + +// Mapping from linux resource names to limits.LimitType. +var fromLinuxResource = map[string]limits.LimitType{ + "RLIMIT_AS": limits.AS, + "RLIMIT_CORE": limits.Core, + "RLIMIT_CPU": limits.CPU, + "RLIMIT_DATA": limits.Data, + "RLIMIT_FSIZE": limits.FileSize, + "RLIMIT_LOCKS": limits.Locks, + "RLIMIT_MEMLOCK": limits.MemoryLocked, + "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, + "RLIMIT_NICE": limits.Nice, + "RLIMIT_NOFILE": limits.NumberOfFiles, + "RLIMIT_NPROC": limits.ProcessCount, + "RLIMIT_RSS": limits.Rss, + "RLIMIT_RTPRIO": limits.RealTimePriority, + "RLIMIT_RTTIME": limits.Rttime, + "RLIMIT_SIGPENDING": limits.SignalsPending, + "RLIMIT_STACK": limits.Stack, +} + +func findName(lt limits.LimitType) string { + for k, v := range fromLinuxResource { + if v == lt { + return k + } + } + return "unknown" +} + +var defaults defs + +type defs struct { + mu sync.Mutex + set *limits.LimitSet + err error +} + +func (d *defs) get() (*limits.LimitSet, error) { + d.mu.Lock() + defer d.mu.Unlock() + + if d.err != nil { + return nil, d.err + } + if d.set == nil { + if err := d.initDefaults(); err != nil { + d.err = err + return nil, err + } + } + return d.set, nil +} + +func (d *defs) initDefaults() error { + ls, err := limits.NewLinuxLimitSet() + if err != nil { + return err + } + + // Set default limits based on what containers get by default, ex: + // $ docker run --rm debian prlimit + ls.SetUnchecked(limits.AS, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Core, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.CPU, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) + ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) + ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) + ls.SetUnchecked(limits.ProcessCount, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.Rss, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.RealTimePriority, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.Rttime, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) + ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0}) + ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}) + + // Read host limits that directly affect the sandbox and adjust the defaults + // based on them. + for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} { + var hl syscall.Rlimit + if err := syscall.Getrlimit(res, &hl); err != nil { + return err + } + + lt, ok := limits.FromLinuxResource[res] + if !ok { + return fmt.Errorf("unknown rlimit type %v", res) + } + hostLimit := limits.Limit{ + Cur: limits.FromLinux(hl.Cur), + Max: limits.FromLinux(hl.Max), + } + + defaultLimit := ls.Get(lt) + if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur { + log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur) + } + if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max { + log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max) + ls.SetUnchecked(lt, hostLimit) + } + } + + d.set = ls + return nil +} + +func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { + ls, err := defaults.get() + if err != nil { + return nil, err + } + + // Then apply overwrites on top of defaults. + for _, rl := range spec.Process.Rlimits { + lt, ok := fromLinuxResource[rl.Type] + if !ok { + return nil, fmt.Errorf("unknown resource %q", rl.Type) + } + ls.SetUnchecked(lt, limits.Limit{ + Cur: rl.Soft, + Max: rl.Hard, + }) + } + return ls, nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go new file mode 100644 index 000000000..0c0423ab2 --- /dev/null +++ b/runsc/boot/loader.go @@ -0,0 +1,1284 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package boot loads the kernel and runs a container. +package boot + +import ( + "fmt" + mrand "math/rand" + "os" + "runtime" + "sync/atomic" + "syscall" + gtime "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fdimport" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/raw" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/runsc/boot/filter" + _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. + "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/specutils" + + // Include supported socket providers. + "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" +) + +// Loader keeps state needed to start the kernel and run the container.. +type Loader struct { + // k is the kernel. + k *kernel.Kernel + + // ctrl is the control server. + ctrl *controller + + conf *Config + + // console is set to true if terminal is enabled. + console bool + + watchdog *watchdog.Watchdog + + // stdioFDs contains stdin, stdout, and stderr. + stdioFDs []int + + // goferFDs are the FDs that attach the sandbox to the gofers. + goferFDs []int + + // spec is the base configuration for the root container. + spec *specs.Spec + + // stopSignalForwarding disables forwarding of signals to the sandboxed + // container. It should be called when a sandbox is destroyed. + stopSignalForwarding func() + + // restore is set to true if we are restoring a container. + restore bool + + // rootProcArgs refers to the root sandbox init task. + rootProcArgs kernel.CreateProcessArgs + + // sandboxID is the ID for the whole sandbox. + sandboxID string + + // mu guards processes. + mu sync.Mutex + + // processes maps containers init process and invocation of exec. Root + // processes are keyed with container ID and pid=0, while exec invocations + // have the corresponding pid set. + // + // processes is guardded by mu. + processes map[execID]*execProcess + + // mountHints provides extra information about mounts for containers that + // apply to the entire pod. + mountHints *podMountHints +} + +// execID uniquely identifies a sentry process that is executed in a container. +type execID struct { + cid string + pid kernel.ThreadID +} + +// execProcess contains the thread group and host TTY of a sentry process. +type execProcess struct { + // tg will be nil for containers that haven't started yet. + tg *kernel.ThreadGroup + + // tty will be nil if the process is not attached to a terminal. + tty *host.TTYFileOperations + + // tty will be nil if the process is not attached to a terminal. + ttyVFS2 *hostvfs2.TTYFileDescription + + // pidnsPath is the pid namespace path in spec + pidnsPath string +} + +func init() { + // Initialize the random number generator. + mrand.Seed(gtime.Now().UnixNano()) +} + +// Args are the arguments for New(). +type Args struct { + // Id is the sandbox ID. + ID string + // Spec is the sandbox specification. + Spec *specs.Spec + // Conf is the system configuration. + Conf *Config + // ControllerFD is the FD to the URPC controller. The Loader takes ownership + // of this FD and may close it at any time. + ControllerFD int + // Device is an optional argument that is passed to the platform. The Loader + // takes ownership of this file and may close it at any time. + Device *os.File + // GoferFDs is an array of FDs used to connect with the Gofer. The Loader + // takes ownership of these FDs and may close them at any time. + GoferFDs []int + // StdioFDs is the stdio for the application. The Loader takes ownership of + // these FDs and may close them at any time. + StdioFDs []int + // Console is set to true if using TTY. + Console bool + // NumCPU is the number of CPUs to create inside the sandbox. + NumCPU int + // TotalMem is the initial amount of total memory to report back to the + // container. + TotalMem uint64 + // UserLogFD is the file descriptor to write user logs to. + UserLogFD int +} + +// make sure stdioFDs are always the same on initial start and on restore +const startingStdioFD = 64 + +// New initializes a new kernel loader configured by spec. +// New also handles setting up a kernel for restoring a container. +func New(args Args) (*Loader, error) { + // We initialize the rand package now to make sure /dev/urandom is pre-opened + // on kernels that do not support getrandom(2). + if err := rand.Init(); err != nil { + return nil, fmt.Errorf("setting up rand: %v", err) + } + + if err := usage.Init(); err != nil { + return nil, fmt.Errorf("setting up memory usage: %v", err) + } + + // Is this a VFSv2 kernel? + if args.Conf.VFS2 { + kernel.VFS2Enabled = true + vfs2.Override() + } + + // Create kernel and platform. + p, err := createPlatform(args.Conf, args.Device) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + + // Create memory file. + mf, err := createMemoryFile() + if err != nil { + return nil, fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + + // Create VDSO. + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + if err := enableStrace(args.Conf); err != nil { + return nil, fmt.Errorf("enabling strace: %v", err) + } + + // Create root network namespace/stack. + netns, err := newRootNetworkNamespace(args.Conf, k, k) + if err != nil { + return nil, fmt.Errorf("creating network: %v", err) + } + + // Create capabilities. + caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("converting capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids)) + for _, GID := range args.Spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. + creds := auth.NewUserCredentials( + auth.KUID(args.Spec.Process.User.UID), + auth.KGID(args.Spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) + + if args.NumCPU == 0 { + args.NumCPU = runtime.NumCPU() + } + log.Infof("CPUs: %d", args.NumCPU) + + if args.TotalMem > 0 { + // Adjust the total memory returned by the Sentry so that applications that + // use /proc/meminfo can make allocations based on this limit. + usage.MinimumTotalMemoryBytes = args.TotalMem + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30)) + } + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + RootNetworkNamespace: netns, + ApplicationCores: uint(args.NumCPU), + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + if kernel.VFS2Enabled { + if err := registerFilesystems(k); err != nil { + return nil, fmt.Errorf("registering filesystems: %w", err) + } + } + + if err := adjustDirentCache(k); err != nil { + return nil, err + } + + // Turn on packet logging if enabled. + if args.Conf.LogPackets { + log.Infof("Packet logging enabled") + atomic.StoreUint32(&sniffer.LogPackets, 1) + } else { + log.Infof("Packet logging disabled") + atomic.StoreUint32(&sniffer.LogPackets, 0) + } + + // Create a watchdog. + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction + dog := watchdog.New(k, dogOpts) + + procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) + if err != nil { + return nil, fmt.Errorf("creating init process for root container: %v", err) + } + + if err := initCompatLogs(args.UserLogFD); err != nil { + return nil, fmt.Errorf("initializing compat logs: %v", err) + } + + mountHints, err := newPodMountHints(args.Spec) + if err != nil { + return nil, fmt.Errorf("creating pod mount hints: %v", err) + } + + if kernel.VFS2Enabled { + // Set up host mount that will be used for imported fds. + hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS()) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) + } + defer hostFilesystem.DecRef() + hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs mount: %v", err) + } + k.SetHostMount(hostMount) + } + + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + var stdioFDs []int + newfd := startingStdioFD + for _, fd := range args.StdioFDs { + err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + } + stdioFDs = append(stdioFDs, newfd) + err = syscall.Close(fd) + if err != nil { + return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + } + newfd++ + } + + eid := execID{cid: args.ID} + l := &Loader{ + k: k, + conf: args.Conf, + console: args.Console, + watchdog: dog, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: stdioFDs, + rootProcArgs: procArgs, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + } + + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("ignore child stop signals failed: %v", err) + } + + // Create the control server using the provided FD. + // + // This must be done *after* we have initialized the kernel since the + // controller is used to configure the kernel's network stack. + ctrl, err := newController(args.ControllerFD, l) + if err != nil { + return nil, fmt.Errorf("creating control server: %v", err) + } + l.ctrl = ctrl + + // Only start serving after Loader is set to controller and controller is set + // to Loader, because they are both used in the urpc methods. + if err := ctrl.srv.StartServing(); err != nil { + return nil, fmt.Errorf("starting control server: %v", err) + } + + return l, nil +} + +// newProcess creates a process that can be run with kernel.CreateProcess. +func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { + // Create initial limits. + ls, err := createLimitSet(spec) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) + } + + wd := spec.Process.Cwd + if wd == "" { + wd = "/" + } + + // Create the process arguments. + procArgs := kernel.CreateProcessArgs{ + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: wd, + Credentials: creds, + Umask: 0022, + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: k.RootUTSNamespace(), + IPCNamespace: k.RootIPCNamespace(), + AbstractSocketNamespace: k.RootAbstractSocketNamespace(), + ContainerID: id, + PIDNamespace: pidns, + } + + return procArgs, nil +} + +// Destroy cleans up all resources used by the loader. +// +// Note that this will block until all open control server connections have +// been closed. For that reason, this should NOT be called in a defer, because +// a panic in a control server rpc would then hang forever. +func (l *Loader) Destroy() { + if l.ctrl != nil { + l.ctrl.srv.Stop() + } + if l.stopSignalForwarding != nil { + l.stopSignalForwarding() + } + l.watchdog.Stop() +} + +func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { + p, err := platform.Lookup(conf.Platform) + if err != nil { + panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) + } + log.Infof("Platform: %s", conf.Platform) + return p.New(deviceFile) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "runsc-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if + // there are memory cgroups specified, because at this point we're already + // in a mount namespace in which the relevant cgroupfs is not visible. + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} + +func (l *Loader) installSeccompFilters() error { + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. Running in less secure mode.") + } else { + opts := filter.Options{ + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), + } + if err := filter.Install(opts); err != nil { + return fmt.Errorf("installing seccomp filters: %v", err) + } + } + return nil +} + +// Run runs the root container. +func (l *Loader) Run() error { + err := l.run() + l.ctrl.manager.startResultChan <- err + if err != nil { + // Give the controller some time to send the error to the + // runtime. If we return too quickly here the process will exit + // and the control connection will be closed before the error + // is returned. + gtime.Sleep(2 * gtime.Second) + return err + } + return nil +} + +func (l *Loader) run() error { + if l.conf.Network == NetworkHost { + // Delay host network configuration to this point because network namespace + // is configured after the loader is created and before Run() is called. + log.Debugf("Configuring host network") + stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) + if err := stack.Configure(); err != nil { + return err + } + } + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: l.sandboxID} + ep, ok := l.processes[eid] + if !ok { + return fmt.Errorf("trying to start deleted container %q", l.sandboxID) + } + + // If we are restoring, we do not want to create a process. + // l.restore is set by the container manager when a restore call is made. + var ttyFile *host.TTYFileOperations + var ttyFileVFS2 *hostvfs2.TTYFileDescription + if !l.restore { + if l.conf.ProfileEnable { + pprof.Initialize() + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if err := l.installSeccompFilters(); err != nil { + return err + } + + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. + ctx := l.rootProcArgs.NewContext(l.k) + var err error + + // CreateProcess takes a reference on FDMap if successful. We won't need + // ours either way. + l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + + // Setup the root container file system. + l.startGoferMonitor(l.sandboxID, l.goferFDs) + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil { + return err + } + if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { + return err + } + + // Add the HOME enviroment variable if it is not already set. + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + + } else { + envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + } + if err != nil { + return err + } + l.rootProcArgs.Envv = envv + + // Create the root container init task. It will begin running + // when the kernel is started. + if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { + return fmt.Errorf("creating init process: %v", err) + } + + // CreateProcess takes a reference on FDTable if successful. + l.rootProcArgs.FDTable.DecRef() + } + + ep.tg = l.k.GlobalInit() + if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok { + ep.pidnsPath = ns.Path + } + if l.console { + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + } + } + + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { + // Panic signal should cause a panic. + if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if l.console { + // Since we are running with a console, we should forward the signal to + // the foreground process group so that job control signals like ^C can + // be handled properly. + deliveryMode = DeliverToForegroundProcessGroup + } + log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) + if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err) + } + }) + + // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again + // either in createFDTable() during initial start or in descriptor.initAfterLoad() + // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the + // passed FDs, so only close for VFS1. + if !kernel.VFS2Enabled { + for _, fd := range l.stdioFDs { + err := syscall.Close(fd) + if err != nil { + return fmt.Errorf("close dup()ed stdioFDs: %v", err) + } + } + } + + log.Infof("Process should have started...") + l.watchdog.Start() + return l.k.Start() +} + +// createContainer creates a new container inside the sandbox. +func (l *Loader) createContainer(cid string) error { + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; ok { + return fmt.Errorf("container %q already exists", cid) + } + l.processes[eid] = &execProcess{} + return nil +} + +// startContainer starts a child container. It returns the thread group ID of +// the newly created process. Caller owns 'files' and may close them after +// this method returns. +func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { + // Create capabilities. + caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) + if err != nil { + return fmt.Errorf("creating capabilities: %v", err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; !ok { + return fmt.Errorf("trying to start a deleted container %q", cid) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. We reuse the root user namespace because the + // sentry currently supports only 1 mount namespace, which is tied to a + // single user namespace. Thus we must run in the same user namespace + // to access mounts. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + l.k.RootUserNamespace()) + + var pidns *kernel.PIDNamespace + if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok { + if ns.Path != "" { + for _, p := range l.processes { + if ns.Path == p.pidnsPath { + pidns = p.tg.PIDNamespace() + break + } + } + } + if pidns == nil { + pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) + } + l.processes[eid].pidnsPath = ns.Path + } else { + pidns = l.k.RootPIDNamespace() + } + procArgs, err := newProcess(cid, spec, creds, l.k, pidns) + if err != nil { + return fmt.Errorf("creating new process: %v", err) + } + + // setupContainerFS() dups stdioFDs, so we don't need to dup them here. + var stdioFDs []int + for _, f := range files[:3] { + stdioFDs = append(stdioFDs, int(f.Fd())) + } + + // Create the FD map, which will set stdin, stdout, and stderr. + ctx := procArgs.NewContext(l.k) + fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on fdTable if successful. We won't + // need ours either way. + procArgs.FDTable = fdTable + + // Can't take ownership away from os.File. dup them to get a new FDs. + var goferFDs []int + for _, f := range files[3:] { + fd, err := syscall.Dup(int(f.Fd())) + if err != nil { + return fmt.Errorf("failed to dup file: %v", err) + } + goferFDs = append(goferFDs, fd) + } + + // Setup the child container file system. + l.startGoferMonitor(cid, goferFDs) + + mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) + if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { + return err + } + + // Add the HOME enviroment variable if it is not already set. + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2, + procArgs.Credentials.RealKUID, procArgs.Envv) + + } else { + envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace, + procArgs.Credentials.RealKUID, procArgs.Envv) + } + if err != nil { + return err + } + procArgs.Envv = envv + + // Create and start the new process. + tg, _, err := l.k.CreateProcess(procArgs) + if err != nil { + return fmt.Errorf("creating process: %v", err) + } + l.k.StartProcess(tg) + + // CreateProcess takes a reference on FDTable if successful. + procArgs.FDTable.DecRef() + + l.processes[eid].tg = tg + return nil +} + +// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on +// the gofer FDs looking for disconnects, and destroys the container if a +// disconnect occurs in any of the gofer FDs. +func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { + go func() { + log.Debugf("Monitoring gofer health for container %q", cid) + var events []unix.PollFd + for _, fd := range goferFDs { + events = append(events, unix.PollFd{ + Fd: int32(fd), + Events: unix.POLLHUP | unix.POLLRDHUP, + }) + } + _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { + // Use ppoll instead of poll because it's already whilelisted in seccomp. + n, err := unix.Ppoll(events, nil, nil) + return uintptr(n), 0, err + }) + if err != nil { + panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err)) + } + + // Check if the gofer has stopped as part of normal container destruction. + // This is done just to avoid sending an annoying error message to the log. + // Note that there is a small race window in between mu.Unlock() and the + // lock being reacquired in destroyContainer(), but it's harmless to call + // destroyContainer() multiple times. + l.mu.Lock() + _, ok := l.processes[execID{cid: cid}] + l.mu.Unlock() + if ok { + log.Infof("Gofer socket disconnected, destroying container %q", cid) + if err := l.destroyContainer(cid); err != nil { + log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err) + } + } + }() +} + +// destroyContainer stops a container if it is still running and cleans up its +// filesystem. +func (l *Loader) destroyContainer(cid string) error { + l.mu.Lock() + defer l.mu.Unlock() + + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) + if err != nil { + // Container doesn't exist. + return err + } + + // The container exists, but has it been started? + if tg != nil { + if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { + return fmt.Errorf("sending SIGKILL to all container processes: %v", err) + } + // Wait for all processes that belong to the container to exit (including + // exec'd processes). + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + + // At this point, all processes inside of the container have exited, + // releasing all references to the container's MountNamespace and + // causing all submounts and overlays to be unmounted. + // + // Since the container's MountNamespace has been released, + // MountNamespace.destroy() will have executed, but that function may + // trigger async close operations. We must wait for those to complete + // before returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + fs.AsyncBarrier() + } + + // No more failure from this point on. Remove all container thread groups + // from the map. + for key := range l.processes { + if key.cid == cid { + delete(l.processes, key) + } + } + + log.Debugf("Container destroyed %q", cid) + return nil +} + +func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { + // Hold the lock for the entire operation to ensure that exec'd process is + // added to 'processes' in case it races with destroyContainer(). + l.mu.Lock() + defer l.mu.Unlock() + + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) + if err != nil { + return 0, err + } + if tg == nil { + return 0, fmt.Errorf("container %q not started", args.ContainerID) + } + + // Get the container MountNamespace from the Task. + if kernel.VFS2Enabled { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() + args.MountNamespaceVFS2.IncRef() + } else { + tg.Leader().WithMuLocked(func(t *kernel.Task) { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespace = t.MountNamespace() + args.MountNamespace.IncRef() + }) + } + + // Add the HOME environment variable if it is not already set. + if kernel.VFS2Enabled { + defer args.MountNamespaceVFS2.DecRef() + + root := args.MountNamespaceVFS2.Root() + defer root.DecRef() + ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } else { + defer args.MountNamespace.DecRef() + + root := args.MountNamespace.Root() + defer root.DecRef() + ctx := fs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } + + // Start the process. + proc := control.Proc{Kernel: l.k} + args.PIDNamespace = tg.PIDNamespace() + newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args) + if err != nil { + return 0, err + } + + eid := execID{cid: args.ContainerID, pid: tgid} + l.processes[eid] = &execProcess{ + tg: newTG, + tty: ttyFile, + ttyVFS2: ttyFileVFS2, + } + log.Debugf("updated processes: %v", l.processes) + + return tgid, nil +} + +// waitContainer waits for the init process of a container to exit. +func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { + // Don't defer unlock, as doing so would make it impossible for + // multiple clients to wait on the same container. + tg, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("can't wait for container %q: %v", cid, err) + } + + // If the thread either has already exited or exits during waiting, + // consider the container exited. + ws := l.wait(tg) + *waitStatus = ws + return nil +} + +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { + if tgid <= 0 { + return fmt.Errorf("PID (%d) must be positive", tgid) + } + + // Try to find a process that was exec'd + eid := execID{cid: cid, pid: tgid} + execTG, err := l.threadGroupFromID(eid) + if err == nil { + ws := l.wait(execTG) + *waitStatus = ws + + l.mu.Lock() + delete(l.processes, eid) + log.Debugf("updated processes (removal): %v", l.processes) + l.mu.Unlock() + return nil + } + + // The caller may be waiting on a process not started directly via exec. + // In this case, find the process in the container's PID namespace. + initTG, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("waiting for PID %d: %v", tgid, err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("waiting for PID %d: no such process", tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) + } + ws := l.wait(tg) + *waitStatus = ws + return nil +} + +// wait waits for the process with TGID 'tgid' in a container's PID namespace +// to exit. +func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { + tg.WaitExited() + return tg.ExitStatus().Status() +} + +// WaitForStartSignal waits for a start signal from the control server. +func (l *Loader) WaitForStartSignal() { + <-l.ctrl.manager.startChan +} + +// WaitExit waits for the root container to exit, and returns its exit status. +func (l *Loader) WaitExit() kernel.ExitStatus { + // Wait for container. + l.k.WaitExited() + + return l.k.GlobalInit().ExitStatus() +} + +func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). + switch conf.Network { + case NetworkHost: + // No network namespacing support for hostinet yet, hence creator is nil. + return inet.NewRootNamespace(hostinet.NewStack(), nil), nil + + case NetworkNone, NetworkSandbox: + s, err := newEmptySandboxNetworkStack(clock, uniqueID) + if err != nil { + return nil, err + } + creator := &sandboxNetstackCreator{ + clock: clock, + uniqueID: uniqueID, + } + return inet.NewRootNamespace(s, creator), nil + + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } + +} + +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := netstack.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: netstack.Metrics, + HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. + RawFactory: raw.EndpointFactory{}, + UniqueID: uniqueID, + })} + + // Enable SACK Recovery. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %s", err) + } + + // Set default TTLs as required by socket/netstack. + s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + + // Enable Receive Buffer Auto-Tuning. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) + } + + return &s, nil +} + +// sandboxNetstackCreator implements kernel.NetworkStackCreator. +// +// +stateify savable +type sandboxNetstackCreator struct { + clock tcpip.Clock + uniqueID stack.UniqueID +} + +// CreateStack implements kernel.NetworkStackCreator.CreateStack. +func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + if err != nil { + return nil, err + } + + // Setup loopback. + n := &Network{Stack: s.(*netstack.Stack).Stack} + nicID := tcpip.NICID(f.uniqueID.UniqueID()) + link := DefaultLoopbackLink + linkEP := loopback.New() + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return nil, err + } + + return s, nil +} + +// signal sends a signal to one or more processes in a container. If PID is 0, +// then the container init process is used. Depending on the SignalDeliveryMode +// option, the signal may be sent directly to the indicated process, to all +// processes in the container, or to the foreground process group. +func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { + if pid < 0 { + return fmt.Errorf("PID (%d) must be positive", pid) + } + + switch mode { + case DeliverToProcess: + if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err) + } + return nil + + case DeliverToForegroundProcessGroup: + if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err) + } + return nil + + case DeliverToAllProcesses: + if pid != 0 { + return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) + } + // Check that the container has actually started before signaling it. + if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil { + return err + } + if err := l.signalAllProcesses(cid, signo); err != nil { + return fmt.Errorf("signaling all processes in container %q: %v", cid, err) + } + return nil + + default: + panic(fmt.Sprintf("unknown signal delivery mode %v", mode)) + } +} + +func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { + execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + if err == nil { + // Send signal directly to the identified process. + return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo}) + } + + // The caller may be signaling a process not started directly via exec. + // In this case, find the process in the container's PID namespace and + // signal it. + initTG, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("no thread group found: %v", err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("no such process with PID %d", tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) + } + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) +} + +// signalForegrondProcessGroup looks up foreground process group from the TTY +// for the given "tgid" inside container "cid", and send the signal to it. +func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { + l.mu.Lock() + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid}) + if err != nil { + l.mu.Unlock() + return fmt.Errorf("no thread group found: %v", err) + } + if tg == nil { + l.mu.Unlock() + return fmt.Errorf("container %q not started", cid) + } + + tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid}) + l.mu.Unlock() + if err != nil { + return fmt.Errorf("no thread group found: %v", err) + } + + var pg *kernel.ProcessGroup + switch { + case ttyVFS2 != nil: + pg = ttyVFS2.ForegroundProcessGroup() + case tty != nil: + pg = tty.ForegroundProcessGroup() + default: + return fmt.Errorf("no TTY attached") + } + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) + } + // Send the signal to all processes in the process group. + var lastErr error + for _, tg := range l.k.TaskSet().Root.ThreadGroups() { + if tg.ProcessGroup() != pg { + continue + } + if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil { + lastErr = err + } + } + return lastErr +} + +// signalAllProcesses that belong to specified container. It's a noop if the +// container hasn't started or has exited. +func (l *Loader) signalAllProcesses(cid string, signo int32) error { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) +} + +// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it +// acquires mutex before calling it and fails in case container hasn't started +// yet. +func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) { + l.mu.Lock() + defer l.mu.Unlock() + tg, err := l.tryThreadGroupFromIDLocked(key) + if err != nil { + return nil, err + } + if tg == nil { + return nil, fmt.Errorf("container %q not started", key.cid) + } + return tg, nil +} + +// tryThreadGroupFromIDLocked returns the thread group for the given execution +// ID. It may return nil in case the container has not started yet. Returns +// error if execution ID is invalid or if the container cannot be found (maybe +// it has been deleted). Caller must hold 'mu'. +func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) { + ep := l.processes[key] + if ep == nil { + return nil, fmt.Errorf("container %q not found", key.cid) + } + return ep.tg, nil +} + +// ttyFromIDLocked returns the TTY files for the given execution ID. It may +// return nil in case the container has not started yet. Returns error if +// execution ID is invalid or if the container cannot be found (maybe it has +// been deleted). Caller must hold 'mu'. +func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + ep := l.processes[key] + if ep == nil { + return nil, nil, fmt.Errorf("container %q not found", key.cid) + } + return ep.tty, ep.ttyVFS2, nil +} + +func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + if len(stdioFDs) != 3 { + return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) + } + + k := kernel.KernelFromContext(ctx) + fdTable := k.NewFDTable() + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) + if err != nil { + fdTable.DecRef() + return nil, nil, nil, err + } + return fdTable, ttyFile, ttyFileVFS2, nil +} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go new file mode 100644 index 000000000..b723e4335 --- /dev/null +++ b/runsc/boot/loader_test.go @@ -0,0 +1,715 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "math/rand" + "os" + "reflect" + "syscall" + "testing" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/fsgofer" +) + +func init() { + log.SetLevel(log.Debug) + rand.Seed(time.Now().UnixNano()) + if err := fsgofer.OpenProcSelfFD(); err != nil { + panic(err) + } +} + +func testConfig() *Config { + return &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + DisableSeccomp: true, + Platform: "ptrace", + } +} + +// testSpec returns a simple spec that can be used in tests. +func testSpec() *specs.Spec { + return &specs.Spec{ + // The host filesystem root is the sandbox root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + } +} + +// startGofer starts a new gofer routine serving 'root' path. It returns the +// sandbox side of the connection, and a function that when called will stop the +// gofer. +func startGofer(root string) (int, func(), error) { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return 0, nil, err + } + sandboxEnd, goferEnd := fds[0], fds[1] + + socket, err := unet.NewSocket(goferEnd) + if err != nil { + syscall.Close(sandboxEnd) + syscall.Close(goferEnd) + return 0, nil, fmt.Errorf("error creating server on FD %d: %v", goferEnd, err) + } + at, err := fsgofer.NewAttachPoint(root, fsgofer.Config{ROMount: true}) + if err != nil { + return 0, nil, err + } + go func() { + s := p9.NewServer(at) + if err := s.Handle(socket); err != nil { + log.Infof("Gofer is stopping. FD: %d, err: %v\n", goferEnd, err) + } + }() + // Closing the gofer socket will stop the gofer and exit goroutine above. + cleanup := func() { + if err := socket.Close(); err != nil { + log.Warningf("Error closing gofer socket: %v", err) + } + } + return sandboxEnd, cleanup, nil +} + +func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) { + fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) + if err != nil { + return nil, nil, err + } + conf := testConfig() + conf.VFS2 = vfsEnabled + + sandEnd, cleanup, err := startGofer(spec.Root.Path) + if err != nil { + return nil, nil, err + } + + // Loader takes ownership of stdio. + var stdio []int + for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { + newFd, err := unix.Dup(int(f.Fd())) + if err != nil { + return nil, nil, err + } + stdio = append(stdio, newFd) + } + + args := Args{ + ID: "foo", + Spec: spec, + Conf: conf, + ControllerFD: fd, + GoferFDs: []int{sandEnd}, + StdioFDs: stdio, + } + l, err := New(args) + if err != nil { + cleanup() + return nil, nil, err + } + return l, cleanup, nil +} + +// TestRun runs a simple application in a sandbox and checks that it succeeds. +func TestRun(t *testing.T) { + doRun(t, false) +} + +// TestRunVFS2 runs TestRun in VFSv2. +func TestRunVFS2(t *testing.T) { + doRun(t, true) +} + +func doRun(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled, testSpec()) + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + + defer l.Destroy() + defer cleanup() + + // Start a goroutine to read the start chan result, otherwise Run will + // block forever. + var resultChanErr error + var wg sync.WaitGroup + wg.Add(1) + go func() { + resultChanErr = <-l.ctrl.manager.startResultChan + wg.Done() + }() + + // Run the container. + if err := l.Run(); err != nil { + t.Errorf("error running container: %v", err) + } + + // We should have not gotten an error on the startResultChan. + wg.Wait() + if resultChanErr != nil { + t.Errorf("error on startResultChan: %v", resultChanErr) + } + + // Wait for the application to exit. It should succeed. + if status := l.WaitExit(); status.Code != 0 || status.Signo != 0 { + t.Errorf("application exited with status %+v, want 0", status) + } +} + +// TestStartSignal tests that the controller Start message will cause +// WaitForStartSignal to return. +func TestStartSignal(t *testing.T) { + doStartSignal(t, false) +} + +// TestStartSignalVFS2 does TestStartSignal with VFS2. +func TestStartSignalVFS2(t *testing.T) { + doStartSignal(t, true) +} + +func doStartSignal(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled, testSpec()) + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + defer l.Destroy() + defer cleanup() + + // We aren't going to wait on this application, so the control server + // needs to be shut down manually. + defer l.ctrl.srv.Stop() + + // Start a goroutine that calls WaitForStartSignal and writes to a + // channel when it returns. + waitFinished := make(chan struct{}) + go func() { + l.WaitForStartSignal() + // Pretend that Run() executed and returned no error. + l.ctrl.manager.startResultChan <- nil + waitFinished <- struct{}{} + }() + + // Nothing has been written to the channel, so waitFinished should not + // return. Give it a little bit of time to make sure the goroutine has + // started. + select { + case <-waitFinished: + t.Errorf("WaitForStartSignal completed but it should not have") + case <-time.After(50 * time.Millisecond): + // OK. + } + + // Trigger the control server StartRoot method. + cid := "foo" + if err := l.ctrl.manager.StartRoot(&cid, nil); err != nil { + t.Errorf("error calling StartRoot: %v", err) + } + + // Now WaitForStartSignal should return (within a short amount of + // time). + select { + case <-waitFinished: + // OK. + case <-time.After(50 * time.Millisecond): + t.Errorf("WaitForStartSignal did not complete but it should have") + } + +} + +type CreateMountTestcase struct { + name string + // Spec that will be used to create the mount manager. Note + // that we can't mount procfs without a kernel, so each spec + // MUST contain something other than procfs mounted at /proc. + spec specs.Spec + // Paths that are expected to exist in the resulting fs. + expectedPaths []string +} + +func createMountTestcases(vfs2 bool) []*CreateMountTestcase { + testCases := []*CreateMountTestcase{ + &CreateMountTestcase{ + // Only proc. + name: "only proc mount", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /proc, /dev, and /sys should always be mounted. + expectedPaths: []string{"/proc", "/dev", "/sys"}, + }, + { + // Mount at a deep path, with many components that do + // not exist in the root. + name: "deep mount path", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/some/very/very/deep/path", + Type: "tmpfs", + }, + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /some/deep/path should be mounted, along with /proc, + // /dev, and /sys. + expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + &CreateMountTestcase{ + // Mounts are nested inside each other. + name: "nested mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/foo", + Type: "tmpfs", + }, + { + Destination: "/foo/qux", + Type: "tmpfs", + }, + { + // File mounts with the same prefix. + Destination: "/foo/qux-quz", + Type: "tmpfs", + }, + { + Destination: "/foo/bar", + Type: "tmpfs", + }, + { + Destination: "/foo/bar/baz", + Type: "tmpfs", + }, + { + // A deep path that is in foo but not the other mounts. + Destination: "/foo/some/very/very/deep/path", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", + "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + &CreateMountTestcase{ + name: "mount inside /dev", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/dev", + Type: "tmpfs", + }, + { + // Mounted by runsc by default. + Destination: "/dev/fd", + Type: "tmpfs", + }, + { + // Mount with the same prefix. + Destination: "/dev/fd-foo", + Type: "tmpfs", + }, + { + // Unsupported fs type. + Destination: "/dev/mqueue", + Type: "mqueue", + }, + { + Destination: "/dev/foo", + Type: "tmpfs", + }, + { + Destination: "/dev/bar", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, + }, + } + + vfsCase := &CreateMountTestcase{ + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports + // MkDirAt in VFS2 (and remove the reduntant append). + // { + // Destination: "/sys/bar", + // Type: "tmpfs", + // }, + // + { + Destination: "/tmp/baz", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, + } + + if !vfs2 { + vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) + vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") + } + return append(testCases, vfsCase) +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespace(t *testing.T) { + for _, tc := range createMountTestcases(false /* vfs2 */) { + t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + ctx := contexttest.Context(t) + + sandEnd, cleanup, err := startGofer(tc.spec.Root.Path) + if err != nil { + t.Fatalf("failed to create gofer: %v", err) + } + defer cleanup() + + mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{}) + mns, err := mntr.createMountNamespace(ctx, conf) + if err != nil { + t.Fatalf("failed to create mount namespace: %v", err) + } + ctx = fs.WithRoot(ctx, mns.Root()) + if err := mntr.mountSubmounts(ctx, conf, mns); err != nil { + t.Fatalf("failed to create mount namespace: %v", err) + } + + root := mns.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + maxTraversals := uint(0) + if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() + } + } + }) + } +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespaceVFS2(t *testing.T) { + for _, tc := range createMountTestcases(true /* vfs2 */) { + t.Run(tc.name, func(t *testing.T) { + spec := testSpec() + spec.Mounts = tc.spec.Mounts + spec.Root = tc.spec.Root + + t.Logf("Using root: %q", spec.Root.Path) + l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec) + if err != nil { + t.Fatalf("failed to create loader: %v", err) + } + defer l.Destroy() + defer loaderCleanup() + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil { + t.Fatalf("failed process hints: %v", err) + } + + ctx := l.k.SupervisorContext() + mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + if err != nil { + t.Fatalf("failed to setupVFS2: %v", err) + } + + root := mns.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(p), + } + + if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() + } + } + }) + } +} + +// TestRestoreEnvironment tests that the correct mounts are collected from the spec and config +// in order to build the environment for restoring. +func TestRestoreEnvironment(t *testing.T) { + testCases := []struct { + name string + spec *specs.Spec + ioFDs []int + errorExpected bool + expectedRenv fs.RestoreEnvironment + }{ + { + name: "basic spec test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/some/very/very/deep/path", + Type: "tmpfs", + }, + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + ioFDs: []int{0}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + }, + }, + "tmpfs": { + { + Dev: "none", + }, + { + Dev: "none", + }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, + }, + }, + }, + { + name: "bind type test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "bind", + }, + }, + }, + ioFDs: []int{0, 1}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + }, + { + Dev: "9pfs-/dev/fd-foo", + DataString: "trans=fd,rfdno=1,wfdno=1,privateunixsocket=true,cache=remote_revalidating", + }, + }, + "tmpfs": { + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, + }, + }, + }, + { + name: "options test", + spec: &specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/dev/fd-foo", + Type: "tmpfs", + Options: []string{"uid=1022", "noatime"}, + }, + }, + }, + ioFDs: []int{0}, + errorExpected: false, + expectedRenv: fs.RestoreEnvironment{ + MountSources: map[string][]fs.MountArgs{ + "9p": { + { + Dev: "9pfs-/", + Flags: fs.MountSourceFlags{ReadOnly: true}, + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + }, + }, + "tmpfs": { + { + Dev: "none", + Flags: fs.MountSourceFlags{NoAtime: true}, + DataString: "uid=1022", + }, + { + Dev: "none", + }, + }, + "devtmpfs": { + { + Dev: "none", + }, + }, + "devpts": { + { + Dev: "none", + }, + }, + "proc": { + { + Dev: "none", + }, + }, + "sysfs": { + { + Dev: "none", + }, + }, + }, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + conf := testConfig() + mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{}) + actualRenv, err := mntr.createRestoreEnvironment(conf) + if !tc.errorExpected && err != nil { + t.Fatalf("could not create restore environment for test:%s", tc.name) + } else if tc.errorExpected { + if err == nil { + t.Errorf("expected an error, but no error occurred.") + } + } else { + if !reflect.DeepEqual(*actualRenv, tc.expectedRenv) { + t.Errorf("restore environments did not match for test:%s\ngot:%+v\nwant:%+v\n", tc.name, *actualRenv, tc.expectedRenv) + } + } + }) + } +} diff --git a/runsc/boot/network.go b/runsc/boot/network.go new file mode 100644 index 000000000..14d2f56a5 --- /dev/null +++ b/runsc/boot/network.go @@ -0,0 +1,341 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "net" + "runtime" + "strings" + "syscall" + + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" +) + +var ( + // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and + // "::1/8" on "lo" interface. + DefaultLoopbackLink = LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []Route{ + { + Destination: net.IPNet{ + IP: net.IPv4(0x7f, 0, 0, 0), + Mask: net.IPv4Mask(0xff, 0, 0, 0), + }, + }, + { + Destination: net.IPNet{ + IP: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), + }, + }, + }, + } +) + +// Network exposes methods that can be used to configure a network stack. +type Network struct { + Stack *stack.Stack +} + +// Route represents a route in the network stack. +type Route struct { + Destination net.IPNet + Gateway net.IP +} + +// DefaultRoute represents a catch all route to the default gateway. +type DefaultRoute struct { + Route Route + Name string +} + +// QueueingDiscipline is used to specify the kind of Queueing Discipline to +// apply for a give FDBasedLink. +type QueueingDiscipline int + +const ( + // QDiscNone disables any queueing for the underlying FD. + QDiscNone QueueingDiscipline = iota + + // QDiscFIFO applies a simple fifo based queue to the underlying + // FD. + QDiscFIFO +) + +// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s +// else returns an error. +func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) { + switch s { + case "none": + return QDiscNone, nil + case "fifo": + return QDiscFIFO, nil + default: + return 0, fmt.Errorf("unsupported qdisc specified: %q", s) + } +} + +// String implements fmt.Stringer. +func (q QueueingDiscipline) String() string { + switch q { + case QDiscNone: + return "none" + case QDiscFIFO: + return "fifo" + default: + panic(fmt.Sprintf("Invalid queueing discipline: %d", q)) + } +} + +// FDBasedLink configures an fd-based link. +type FDBasedLink struct { + Name string + MTU int + Addresses []net.IP + Routes []Route + GSOMaxSize uint32 + SoftwareGSOEnabled bool + TXChecksumOffload bool + RXChecksumOffload bool + LinkAddress net.HardwareAddr + QDisc QueueingDiscipline + + // NumChannels controls how many underlying FD's are to be used to + // create this endpoint. + NumChannels int +} + +// LoopbackLink configures a loopback li nk. +type LoopbackLink struct { + Name string + Addresses []net.IP + Routes []Route +} + +// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. +type CreateLinksAndRoutesArgs struct { + // FilePayload contains the fds associated with the FDBasedLinks. The + // number of fd's should match the sum of the NumChannels field of the + // FDBasedLink entries below. + urpc.FilePayload + + LoopbackLinks []LoopbackLink + FDBasedLinks []FDBasedLink + + Defaultv4Gateway DefaultRoute + Defaultv6Gateway DefaultRoute +} + +// Empty returns true if route hasn't been set. +func (r *Route) Empty() bool { + return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil +} + +func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) { + subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask)) + if err != nil { + return tcpip.Route{}, err + } + return tcpip.Route{ + Destination: subnet, + Gateway: ipToAddress(r.Gateway), + NIC: id, + }, nil +} + +// CreateLinksAndRoutes creates links and routes in a network stack. It should +// only be called once. +func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { + wantFDs := 0 + for _, l := range args.FDBasedLinks { + wantFDs += l.NumChannels + } + if got := len(args.FilePayload.Files); got != wantFDs { + return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs) + } + + var nicID tcpip.NICID + nicids := make(map[string]tcpip.NICID) + + // Collect routes from all links. + var routes []tcpip.Route + + // Loopback normally appear before other interfaces. + for _, link := range args.LoopbackLinks { + nicID++ + nicids[link.Name] = nicID + + linkEP := loopback.New() + + log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + route, err := r.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + } + + fdOffset := 0 + for _, link := range args.FDBasedLinks { + nicID++ + nicids[link.Name] = nicID + + FDs := []int{} + for j := 0; j < link.NumChannels; j++ { + // Copy the underlying FD. + oldFD := args.FilePayload.Files[fdOffset].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + FDs = append(FDs, newFD) + fdOffset++ + } + + mac := tcpip.LinkAddress(link.LinkAddress) + log.Infof("gso max size is: %d", link.GSOMaxSize) + + linkEP, err := fdbased.New(&fdbased.Options{ + FDs: FDs, + MTU: uint32(link.MTU), + EthernetHeader: true, + Address: mac, + PacketDispatchMode: fdbased.RecvMMsg, + GSOMaxSize: link.GSOMaxSize, + SoftwareGSOEnabled: link.SoftwareGSOEnabled, + TXChecksumOffload: link.TXChecksumOffload, + RXChecksumOffload: link.RXChecksumOffload, + }) + if err != nil { + return err + } + + switch link.QDisc { + case QDiscNone: + case QDiscFIFO: + log.Infof("Enabling FIFO QDisc on %q", link.Name) + linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) + } + + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + route, err := r.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + } + + if !args.Defaultv4Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv4Gateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) + } + route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + + if !args.Defaultv6Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv6Gateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) + } + route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + + log.Infof("Setting routes %+v", routes) + n.Stack.SetRouteTable(routes) + return nil +} + +// createNICWithAddrs creates a NIC in the network stack and adds the given +// addresses. +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error { + opts := stack.NICOptions{Name: name} + if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil { + return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) + } + + // Always start with an arp address for the NIC. + if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err) + } + + for _, addr := range addrs { + proto, tcpipAddr := ipToAddressAndProto(addr) + if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil { + return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err) + } + } + return nil +} + +// ipToAddressAndProto converts IP to tcpip.Address and a protocol number. +// +// Note: don't use 'len(ip)' to determine IP version because length is always 16. +func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) { + if i4 := ip.To4(); i4 != nil { + return ipv4.ProtocolNumber, tcpip.Address(i4) + } + return ipv6.ProtocolNumber, tcpip.Address(ip) +} + +// ipToAddress converts IP to tcpip.Address, ignoring the protocol. +func ipToAddress(ip net.IP) tcpip.Address { + _, addr := ipToAddressAndProto(ip) + return addr +} + +// ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the +// protocol. +func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask { + return tcpip.AddressMask(ipToAddress(net.IP(ipMask))) +} diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD new file mode 100644 index 000000000..77774f43c --- /dev/null +++ b/runsc/boot/platforms/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "platforms", + srcs = ["platforms.go"], + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + ], +) diff --git a/runsc/boot/platforms/platforms.go b/runsc/boot/platforms/platforms.go new file mode 100644 index 000000000..056b46ad5 --- /dev/null +++ b/runsc/boot/platforms/platforms.go @@ -0,0 +1,30 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package platforms imports all available platform packages. +package platforms + +import ( + // Import platforms that runsc might use. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +const ( + // Ptrace runs the sandbox with the ptrace platform. + Ptrace = "ptrace" + + // KVM runs the sandbox with the KVM platform. + KVM = "kvm" +) diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD new file mode 100644 index 000000000..29cb42b2f --- /dev/null +++ b/runsc/boot/pprof/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "pprof", + srcs = ["pprof.go"], + visibility = [ + "//runsc:__subpackages__", + ], +) diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go new file mode 100644 index 000000000..1ded20dee --- /dev/null +++ b/runsc/boot/pprof/pprof.go @@ -0,0 +1,20 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pprof provides a stub to initialize custom profilers. +package pprof + +// Initialize will be called at boot for initializing custom profilers. +func Initialize() { +} diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go new file mode 100644 index 000000000..fbfd3b07c --- /dev/null +++ b/runsc/boot/strace.go @@ -0,0 +1,40 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +func enableStrace(conf *Config) error { + // We must initialize even if strace is not enabled. + strace.Initialize() + + if !conf.Strace { + return nil + } + + max := conf.StraceLogSize + if max == 0 { + max = 1024 + } + strace.LogMaximumSize = max + + if len(conf.StraceSyscalls) == 0 { + strace.EnableAll(strace.SinkTypeLog) + return nil + } + return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog) +} diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go new file mode 100644 index 000000000..6ee6fae04 --- /dev/null +++ b/runsc/boot/vfs.go @@ -0,0 +1,482 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path" + "sort" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" + "gvisor.dev/gvisor/pkg/sentry/devices/tundev" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +func registerFilesystems(k *kernel.Kernel) error { + ctx := k.SupervisorContext() + creds := auth.NewRootCredentials(k.RootUserNamespace()) + vfsObj := k.VFS() + + vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + // TODO(b/29356795): Users may mount this once the terminals are in a + // usable state. + AllowUserMount: false, + }) + vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + // Setup files in devtmpfs. + if err := memdev.Register(vfsObj); err != nil { + return fmt.Errorf("registering memdev: %w", err) + } + if err := ttydev.Register(vfsObj); err != nil { + return fmt.Errorf("registering ttydev: %w", err) + } + + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } + if err := tundev.Register(vfsObj); err != nil { + return fmt.Errorf("registering tundev: %v", err) + } + a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name) + if err != nil { + return fmt.Errorf("creating devtmpfs accessor: %w", err) + } + defer a.Release() + + if err := a.UserspaceInit(ctx); err != nil { + return fmt.Errorf("initializing userspace: %w", err) + } + if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating memdev devtmpfs files: %w", err) + } + if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating ttydev devtmpfs files: %w", err) + } + if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating tundev devtmpfs files: %v", err) + } + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } + return nil +} + +func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + mns, err := mntr.setupVFS2(ctx, conf, procArgs) + if err != nil { + return fmt.Errorf("failed to setupFS: %w", err) + } + procArgs.MountNamespaceVFS2 = mns + + // Resolve the executable path from working dir and environment. + resolved, err := user.ResolveExecutablePath(ctx, procArgs) + if err != nil { + return err + } + procArgs.Filename = resolved + return nil +} + +func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { + log.Infof("Configuring container's file system with VFS2") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = rootCreds + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := procArgs.NewContext(c.k) + + mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) + if err != nil { + return nil, fmt.Errorf("creating mount namespace: %w", err) + } + rootProcArgs.MountNamespaceVFS2 = mns + + // Mount submounts. + if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { + return nil, fmt.Errorf("mounting submounts vfs2: %w", err) + } + return mns, nil +} + +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { + fd := c.fds.remove() + opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mount namespace: %w", err) + } + return mns, nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + mounts, err := c.prepareMountsVFS2() + if err != nil { + return err + } + + for i := range mounts { + submount := &mounts[i] + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + } + } else { + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { + return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + } + } + } + + if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil { + return fmt.Errorf(`mount submount "\tmp": %w`, err) + } + return nil +} + +type mountAndFD struct { + specs.Mount + fd int +} + +func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { + // Associate bind mounts with their FDs before sorting since there is an + // undocumented assumption that FDs are dispensed in the order in which + // they are required by mounts. + var mounts []mountAndFD + for _, m := range c.mounts { + fd := -1 + // Only bind mounts use host FDs; see + // containerMounter.getMountNameAndOptionsVFS2. + if m.Type == bind { + fd = c.fds.remove() + } + mounts = append(mounts, mountAndFD{ + Mount: m, + fd: fd, + }) + } + if err := c.checkDispenser(); err != nil { + return nil, err + } + + // Sort the mounts so that we don't place children before parents. + sort.Slice(mounts, func(i, j int) bool { + return len(mounts[i].Destination) < len(mounts[j].Destination) + }) + + return mounts, nil +} + +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { + root := mns.Root() + defer root.DecRef() + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) + if err != nil { + return fmt.Errorf("mountOptions failed: %w", err) + } + if len(fsName) == 0 { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { + return err + } + if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { + return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + } + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) + return nil +} + +// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { + fsName := m.Type + var data []string + + // Find filesystem name and FS specific data field. + switch m.Type { + case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: + // Nothing to do. + + case nonefs: + fsName = sys.Name + + case tmpfs.Name: + var err error + data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) + if err != nil { + return "", nil, err + } + + case bind: + fsName = gofer.Name + if m.fd == 0 { + // Check that an FD was provided to fails fast. Technically FD=0 is valid, + // but unlikely to be correct in this context. + return "", nil, fmt.Errorf("9P mount requires a connection FD") + } + data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + return "", nil, nil + } + + opts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + }, + InternalMount: true, + } + + for _, o := range m.Options { + switch o { + case "rw": + opts.ReadOnly = false + case "ro": + opts.ReadOnly = true + case "noatime": + opts.Flags.NoATime = true + case "noexec": + opts.Flags.NoExec = true + default: + log.Warningf("ignoring unknown mount option %q", o) + } + } + + if conf.Overlay { + // All writes go to upper, be paranoid and make lower readonly. + opts.ReadOnly = true + } + return fsName, opts, nil +} + +func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(currentPath), + } + _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) + if err == nil { + log.Debugf("Mount point %q already exists", currentPath) + return nil + } + if err != syserror.ENOENT { + return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err) + } + + // Recurse to ensure parent is created and then create the mount point. + if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { + return err + } + log.Debugf("Creating dir %q for mount point", currentPath) + mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { + return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err) + } + return nil +} + +// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. +// Technically we don't have to mount tmpfs at /tmp, as we could just rely on +// the host /tmp, but this is a nice optimization, and fixes some apps that call +// mknod in /tmp. It's unsafe to mount tmpfs if: +// 1. /tmp is mounted explicitly: we should not override user's wish +// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp +// +// Note that when there are submounts inside of '/tmp', directories for the +// mount points must be present, making '/tmp' not empty anymore. +func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { + for _, m := range c.mounts { + // m.Destination has been cleaned, so it's to use equality here. + if m.Destination == "/tmp" { + log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m) + return nil + } + } + + root := mns.Root() + defer root.DecRef() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse("/tmp"), + } + // TODO(gvisor.dev/issue/2782): Use O_PATH when available. + statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{}) + switch err { + case nil: + // Found '/tmp' in filesystem, check if it's empty. + if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory { + // Not a dir?! Leave it be. + return nil + } + if statx.Nlink > 2 { + // If more than "." and ".." is found, skip internal tmpfs to prevent + // hiding existing files. + log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) + return nil + } + log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) + fallthrough + + case syserror.ENOENT: + // No '/tmp' found (or fallthrough from above). It's safe to mount internal + // tmpfs. + tmpMount := specs.Mount{ + Type: tmpfs.Name, + Destination: "/tmp", + // Sticky bit is added to prevent accidental deletion of files from + // another user. This is normally done for /tmp. + Options: []string{"mode=01777"}, + } + return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + + default: + return fmt.Errorf(`stating "/tmp" inside container: %w`, err) + } +} + +// processHintsVFS2 processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs.Name { + continue + } + + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.vfsMount = mnt + } + return nil +} + +// mountSharedMasterVFS2 mounts the master of a volume that is shared among +// containers in a pod. +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + mntFD := &mountAndFD{Mount: hint.mount} + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error { + if err := source.checkCompatible(mount); err != nil { + return err + } + + _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + if err != nil { + return err + } + newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) + if err != nil { + return err + } + defer newMnt.DecRef() + + root := mns.Root() + defer root.DecRef() + if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { + return err + } + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mount.Destination), + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { + return err + } + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} |