diff options
Diffstat (limited to 'runsc/boot')
27 files changed, 1473 insertions, 759 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 6fe2b57de..ed3c8f546 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -7,6 +7,7 @@ go_library( srcs = [ "compat.go", "compat_amd64.go", + "compat_arm64.go", "config.go", "controller.go", "debug.go", @@ -15,30 +16,33 @@ go_library( "fs.go", "limits.go", "loader.go", + "loader_amd64.go", + "loader_arm64.go", "network.go", - "pprof.go", "strace.go", - "user.go", + "vfs.go", ], - importpath = "gvisor.dev/gvisor/runsc/boot", visibility = [ + "//pkg/test:__subpackages__", "//runsc:__subpackages__", "//test:__subpackages__", ], deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/context", "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fspath", "//pkg/log", "//pkg/memutil", "//pkg/rand", "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", - "//pkg/sentry/context", "//pkg/sentry/control", + "//pkg/sentry/devices/memdev", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", "//pkg/sentry/fs/gofer", @@ -48,6 +52,13 @@ go_library( "//pkg/sentry/fs/sys", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/fs/tty", + "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/gofer", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/proc", + "//pkg/sentry/fsimpl/sys", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel:uncaught_signal_go_proto", @@ -60,16 +71,19 @@ go_library( "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netlink/route", + "//pkg/sentry/socket/netlink/uevent", "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix", "//pkg/sentry/state", "//pkg/sentry/strace", "//pkg/sentry/syscalls/linux", + "//pkg/sentry/syscalls/linux/vfs2", "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", - "//pkg/sentry/usermem", + "//pkg/sentry/vfs", "//pkg/sentry/watchdog", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/fdbased", @@ -86,6 +100,7 @@ go_library( "//pkg/urpc", "//runsc/boot/filter", "//runsc/boot/platforms", + "//runsc/boot/pprof", "//runsc/specutils", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", @@ -100,19 +115,21 @@ go_test( "compat_test.go", "fs_test.go", "loader_test.go", - "user_test.go", ], - embed = [":boot"], + library = ":boot", deps = [ "//pkg/control/server", + "//pkg/fspath", "//pkg/log", "//pkg/p9", - "//pkg/sentry/arch:registers_go_proto", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel", + "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 07e35ab10..b7cfb35bf 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -17,18 +17,16 @@ package boot import ( "fmt" "os" - "sync" "syscall" "github.com/golang/protobuf/proto" - "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sync" ) func initCompatLogs(fd int) error { @@ -53,9 +51,9 @@ type compatEmitter struct { } func newCompatEmitter(logFD int) (*compatEmitter, error) { - nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64) + nameMap, ok := getSyscallNameMap() if !ok { - return nil, fmt.Errorf("amd64 Linux syscall table not found") + return nil, fmt.Errorf("Linux syscall table not found") } c := &compatEmitter{ @@ -67,7 +65,7 @@ func newCompatEmitter(logFD int) (*compatEmitter, error) { if logFD > 0 { f := os.NewFile(uintptr(logFD), "user log file") - target := log.MultiEmitter{c.sink, log.K8sJSONEmitter{log.Writer{Next: f}}} + target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}} c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} } return c, nil @@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) { } func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { - regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64 + regs := us.Registers c.mu.Lock() defer c.mu.Unlock() - sysnr := regs.OrigRax + sysnr := syscallNum(regs) tr := c.trackers[sysnr] if tr == nil { switch sysnr { - case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL: + case syscall.SYS_PRCTL: // args: cmd, ... tr = newArgsTracker(0) @@ -112,10 +110,14 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { tr = newArgsTracker(2) default: - tr = &onceTracker{} + tr = newArchArgsTracker(sysnr) + if tr == nil { + tr = &onceTracker{} + } } c.trackers[sysnr] = tr } + if tr.shouldReport(regs) { c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs) tr.onReported(regs) @@ -139,10 +141,10 @@ func (c *compatEmitter) Close() error { // the syscall and arguments. type syscallTracker interface { // shouldReport returns true is the syscall should be reported. - shouldReport(regs *rpb.AMD64Registers) bool + shouldReport(regs *rpb.Registers) bool // onReported marks the syscall as reported. - onReported(regs *rpb.AMD64Registers) + onReported(regs *rpb.Registers) } // onceTracker reports only a single time, used for most syscalls. @@ -150,10 +152,45 @@ type onceTracker struct { reported bool } -func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool { +func (o *onceTracker) shouldReport(_ *rpb.Registers) bool { return !o.reported } -func (o *onceTracker) onReported(_ *rpb.AMD64Registers) { +func (o *onceTracker) onReported(_ *rpb.Registers) { o.reported = true } + +// argsTracker reports only once for each different combination of arguments. +// It's used for generic syscalls like ioctl to report once per 'cmd'. +type argsTracker struct { + // argsIdx is the syscall arguments to use as unique ID. + argsIdx []int + reported map[string]struct{} + count int +} + +func newArgsTracker(argIdx ...int) *argsTracker { + return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} +} + +// key returns the command based on the syscall argument index. +func (a *argsTracker) key(regs *rpb.Registers) string { + var rv string + for _, idx := range a.argsIdx { + rv += fmt.Sprintf("%d|", argVal(idx, regs)) + } + return rv +} + +func (a *argsTracker) shouldReport(regs *rpb.Registers) bool { + if a.count >= reportLimit { + return false + } + _, ok := a.reported[a.key(regs)] + return !ok +} + +func (a *argsTracker) onReported(regs *rpb.Registers) { + a.count++ + a.reported[a.key(regs)] = struct{}{} +} diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 43cd0db94..42b0ca8b0 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -16,62 +16,81 @@ package boot import ( "fmt" + "syscall" + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" ) // reportLimit is the max number of events that should be reported per tracker. const reportLimit = 100 -// argsTracker reports only once for each different combination of arguments. -// It's used for generic syscalls like ioctl to report once per 'cmd'. -type argsTracker struct { - // argsIdx is the syscall arguments to use as unique ID. - argsIdx []int - reported map[string]struct{} - count int +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Amd64{ + Amd64: &rpb.AMD64Registers{}, + }, + } } -func newArgsTracker(argIdx ...int) *argsTracker { - return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} -} +func argVal(argIdx int, regs *rpb.Registers) uint32 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 -// cmd returns the command based on the syscall argument index. -func (a *argsTracker) key(regs *rpb.AMD64Registers) string { - var rv string - for _, idx := range a.argsIdx { - rv += fmt.Sprintf("%d|", argVal(idx, regs)) + switch argIdx { + case 0: + return uint32(amd64Regs.Rdi) + case 1: + return uint32(amd64Regs.Rsi) + case 2: + return uint32(amd64Regs.Rdx) + case 3: + return uint32(amd64Regs.R10) + case 4: + return uint32(amd64Regs.R8) + case 5: + return uint32(amd64Regs.R9) } - return rv + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 { +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + switch argIdx { case 0: - return uint32(regs.Rdi) + amd64Regs.Rdi = argVal case 1: - return uint32(regs.Rsi) + amd64Regs.Rsi = argVal case 2: - return uint32(regs.Rdx) + amd64Regs.Rdx = argVal case 3: - return uint32(regs.R10) + amd64Regs.R10 = argVal case 4: - return uint32(regs.R8) + amd64Regs.R8 = argVal case 5: - return uint32(regs.R9) + amd64Regs.R9 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } - panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } -func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool { - if a.count >= reportLimit { - return false - } - _, ok := a.reported[a.key(regs)] - return !ok +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.AMD64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 + return amd64Regs.OrigRax } -func (a *argsTracker) onReported(regs *rpb.AMD64Registers) { - a.count++ - a.reported[a.key(regs)] = struct{}{} +func newArchArgsTracker(sysnr uint64) syscallTracker { + switch sysnr { + case syscall.SYS_ARCH_PRCTL: + // args: cmd, ... + return newArgsTracker(0) + } + return nil } diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go new file mode 100644 index 000000000..f784cd237 --- /dev/null +++ b/runsc/boot/compat_arm64.go @@ -0,0 +1,91 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" +) + +// reportLimit is the max number of events that should be reported per tracker. +const reportLimit = 100 + +// newRegs create a empty Registers instance. +func newRegs() *rpb.Registers { + return &rpb.Registers{ + Arch: &rpb.Registers_Arm64{ + Arm64: &rpb.ARM64Registers{}, + }, + } +} + +func argVal(argIdx int, regs *rpb.Registers) uint32 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + return uint32(arm64Regs.R0) + case 1: + return uint32(arm64Regs.R1) + case 2: + return uint32(arm64Regs.R2) + case 3: + return uint32(arm64Regs.R3) + case 4: + return uint32(arm64Regs.R4) + case 5: + return uint32(arm64Regs.R5) + } + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) +} + +func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + + switch argIdx { + case 0: + arm64Regs.R0 = argVal + case 1: + arm64Regs.R1 = argVal + case 2: + arm64Regs.R2 = argVal + case 3: + arm64Regs.R3 = argVal + case 4: + arm64Regs.R4 = argVal + case 5: + arm64Regs.R5 = argVal + default: + panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) + } +} + +func getSyscallNameMap() (strace.SyscallMap, bool) { + return strace.Lookup(abi.Linux, arch.ARM64) +} + +func syscallNum(regs *rpb.Registers) uint64 { + arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 + return arm64Regs.R8 +} + +func newArchArgsTracker(sysnr uint64) syscallTracker { + // currently, no arch specific syscalls need to be handled here. + return nil +} diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index 388298d8d..839c5303b 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -16,8 +16,6 @@ package boot import ( "testing" - - rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ) func TestOnceTracker(t *testing.T) { @@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) { func TestArgsTracker(t *testing.T) { for _, tc := range []struct { - name string - idx []int - rdi1 uint64 - rdi2 uint64 - rsi1 uint64 - rsi2 uint64 - want bool + name string + idx []int + arg1_1 uint64 + arg1_2 uint64 + arg2_1 uint64 + arg2_2 uint64 + want bool }{ - {name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false}, - {name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false}, - {name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true}, - {name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true}, - {name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false}, - {name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false}, - {name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true}, + {name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false}, + {name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false}, + {name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true}, + {name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true}, + {name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false}, + {name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false}, + {name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true}, } { t.Run(tc.name, func(t *testing.T) { c := newArgsTracker(tc.idx...) - regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1} + regs := newRegs() + setArgVal(0, tc.arg1_1, regs) + setArgVal(1, tc.arg2_1, regs) if !c.shouldReport(regs) { t.Error("first call to shouldReport, got: false, want: true") } c.onReported(regs) - regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2 + setArgVal(0, tc.arg1_2, regs) + setArgVal(1, tc.arg2_2, regs) if got := c.shouldReport(regs); tc.want != got { t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want) } @@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) { func TestArgsTrackerLimit(t *testing.T) { c := newArgsTracker(0, 1) for i := 0; i < reportLimit; i++ { - regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)} + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, uint64(i), regs) if !c.shouldReport(regs) { t.Error("shouldReport before limit was reached, got: false, want: true") } @@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) { } // Should hit the count limit now. - regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456} + regs := newRegs() + setArgVal(0, 123, regs) + setArgVal(1, 123456, regs) if c.shouldReport(regs) { t.Error("shouldReport after limit was reached, got: true, want: false") } diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 72a33534f..715a19112 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -158,6 +158,9 @@ type Config struct { // DebugLog is the path to log debug information to, if not empty. DebugLog string + // PanicLog is the path to log GO's runtime messages, if not empty. + PanicLog string + // DebugLogFormat is the log format for debug. DebugLogFormat string @@ -250,6 +253,15 @@ type Config struct { // multiple tests are run in parallel, since there is no way to pass // parameters to the runtime from docker. TestOnlyTestNameEnv string + + // CPUNumFromQuota sets CPU number count to available CPU quota, using + // least integer value greater than or equal to quota. + // + // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. + CPUNumFromQuota bool + + // Enables VFS2 (not plumbled through yet). + VFS2 bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -260,6 +272,7 @@ func (c *Config) ToFlags() []string { "--log=" + c.LogFilename, "--log-format=" + c.LogFormat, "--debug-log=" + c.DebugLog, + "--panic-log=" + c.PanicLog, "--debug-log-format=" + c.DebugLogFormat, "--file-access=" + c.FileAccess.String(), "--overlay=" + strconv.FormatBool(c.Overlay), @@ -282,6 +295,9 @@ func (c *Config) ToFlags() []string { "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), } + if c.CPUNumFromQuota { + f = append(f, "--cpu-num-from-quota") + } // Only include these if set since it is never to be used by users. if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { f = append(f, "--TESTONLY-unsafe-nonroot=true") @@ -289,5 +305,10 @@ func (c *Config) ToFlags() []string { if len(c.TestOnlyTestNameEnv) != 0 { f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) } + + if c.VFS2 { + f = append(f, "--vfs2=true") + } + return f } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 928285683..8125d5061 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/specutils" ) @@ -100,11 +101,14 @@ const ( // Profiling related commands (see pprof.go for more details). const ( - StartCPUProfile = "Profile.StartCPUProfile" - StopCPUProfile = "Profile.StopCPUProfile" - HeapProfile = "Profile.HeapProfile" - StartTrace = "Profile.StartTrace" - StopTrace = "Profile.StopTrace" + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" + GoroutineProfile = "Profile.GoroutineProfile" + BlockProfile = "Profile.BlockProfile" + MutexProfile = "Profile.MutexProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" ) // Logging related commands (see logging.go for more details). @@ -142,7 +146,7 @@ func newController(fd int, l *Loader) (*controller, error) { } srv.Register(manager) - if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok { + if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { net := &Network{ Stack: eps.Stack, } @@ -152,7 +156,9 @@ func newController(fd int, l *Loader) (*controller, error) { srv.Register(&debug{}) srv.Register(&control.Logging{}) if l.conf.ProfileEnable { - srv.Register(&control.Profile{}) + srv.Register(&control.Profile{ + Kernel: l.k, + }) } return &controller{ @@ -339,7 +345,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("creating memory file: %v", err) } k.SetMemoryFile(mf) - networkStack := cm.l.k.NetworkStack() + networkStack := cm.l.k.RootNetworkNamespace().Stack() cm.l.k = k // Set up the restore environment. @@ -363,9 +369,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { } if cm.l.conf.ProfileEnable { - // initializePProf opens /proc/self/maps, so has to be - // called before installing seccomp filters. - initializePProf() + // pprof.Initialize opens /proc/self/maps, so has to be called before + // installing seccomp filters. + pprof.Initialize() } // Seccomp filters have to be applied before parsing the state file. @@ -380,7 +386,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { } // Since we have a new kernel we also must make a new watchdog. - dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dog := watchdog.New(k, dogOpts) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index e5de1f3d7..7e7a31fbd 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -17,9 +17,10 @@ package boot import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" ) @@ -31,10 +32,13 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } + if kernel.VFS2Enabled { + return createFDTableVFS2(ctx, console, stdioFDs) + } + k := kernel.KernelFromContext(ctx) fdTable := k.NewFDTable() defer fdTable.DecRef() - mounter := fs.FileOwnerFromContext(ctx) var ttyFile *fs.File for appFD, hostFD := range stdioFDs { @@ -44,7 +48,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) if err != nil { return nil, err } @@ -63,7 +67,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F } else { // Import the file as a regular host file. var err error - appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) if err != nil { return nil, err } @@ -79,3 +83,26 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable.IncRef() return fdTable, nil } + +func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { + k := kernel.KernelFromContext(ctx) + fdTable := k.NewFDTable() + defer fdTable.DecRef() + + for appFD, hostFD := range stdioFDs { + // TODO(gvisor.dev/issue/1482): Add TTY support. + appFile, err := vfshost.ImportFD(ctx, k.HostMount(), hostFD, false) + if err != nil { + return nil, err + } + + if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { + appFile.DecRef() + return nil, err + } + appFile.DecRef() + } + + fdTable.IncRef() + return fdTable, nil +} diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index f5509b6b7..ed18f0047 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,12 +6,14 @@ go_library( name = "filter", srcs = [ "config.go", + "config_amd64.go", + "config_arm64.go", + "config_profile.go", "extra_filters.go", "extra_filters_msan.go", "extra_filters_race.go", "filter.go", ], - importpath = "gvisor.dev/gvisor/runsc/boot/filter", visibility = [ "//runsc/boot:__subpackages__", ], diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 5ad108261..1828d116a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -26,10 +26,6 @@ import ( // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. var allowedSyscalls = seccomp.SyscallRules{ - syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_GET_FS)}, - {seccomp.AllowValue(linux.ARCH_SET_FS)}, - }, syscall.SYS_CLOCK_GETTIME: {}, syscall.SYS_CLONE: []seccomp.Rule{ { @@ -42,9 +38,15 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.CLONE_THREAD), }, }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, - syscall.SYS_DUP2: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_DUP3: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_CLOEXEC), + }, + }, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ @@ -132,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(syscall.SOL_SOCKET), seccomp.AllowValue(syscall.SO_SNDBUF), }, - { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), - }, }, syscall.SYS_GETTID: {}, syscall.SYS_GETTIMEOFDAY: {}, @@ -177,6 +174,18 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_LSEEK: {}, syscall.SYS_MADVISE: {}, syscall.SYS_MINCORE: {}, + // Used by the Go runtime as a temporarily workaround for a Linux + // 5.2-5.4 bug. + // + // See src/runtime/os_linux_x86.go. + // + // TODO(b/148688965): Remove once this is gone from Go. + syscall.SYS_MLOCK: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(4096), + }, + }, syscall.SYS_MMAP: []seccomp.Rule{ { seccomp.AllowAny{}, @@ -220,7 +229,9 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_NANOSLEEP: {}, syscall.SYS_PPOLL: {}, syscall.SYS_PREAD64: {}, + syscall.SYS_PREADV: {}, syscall.SYS_PWRITE64: {}, + syscall.SYS_PWRITEV: {}, syscall.SYS_READ: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { @@ -273,12 +284,21 @@ var allowedSyscalls = seccomp.SyscallRules{ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, + unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { seccomp.AllowValue(uint64(os.Getpid())), }, }, + syscall.SYS_UTIMENSAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* null pathname */ + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* flags */ + }, + }, syscall.SYS_WRITE: {}, // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR @@ -315,6 +335,26 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + }, + { + seccomp.AllowAny{}, seccomp.AllowValue(syscall.SOL_IPV6), seccomp.AllowValue(syscall.IPV6_V6ONLY), }, @@ -416,6 +456,34 @@ func hostInetFilters() seccomp.SyscallRules { seccomp.AllowAny{}, seccomp.AllowValue(4), }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_TOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IP), + seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, + { + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.SOL_IPV6), + seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.AllowAny{}, + seccomp.AllowValue(4), + }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { @@ -479,16 +547,3 @@ func controlServerFilters(fd int) seccomp.SyscallRules { }, } } - -// profileFilters returns extra syscalls made by runtime/pprof package. -func profileFilters() seccomp.SyscallRules { - return seccomp.SyscallRules{ - syscall.SYS_OPENAT: []seccomp.Rule{ - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), - }, - }, - } -} diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go new file mode 100644 index 000000000..5335ff82c --- /dev/null +++ b/runsc/boot/filter/config_amd64.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" +) + +func init() { + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], + seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, + seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, + ) +} diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go new file mode 100644 index 000000000..7fa9bbda3 --- /dev/null +++ b/runsc/boot/filter/config_arm64.go @@ -0,0 +1,21 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package filter + +// Reserve for future customization. +func init() { +} diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go new file mode 100644 index 000000000..194952a7b --- /dev/null +++ b/runsc/boot/filter/config_profile.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + +// profileFilters returns extra syscalls made by runtime/pprof package. +func profileFilters() seccomp.SyscallRules { + return seccomp.SyscallRules{ + syscall.SYS_OPENAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + }, + }, + } +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 76036c147..98cce60af 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,7 +16,6 @@ package boot import ( "fmt" - "path" "path/filepath" "sort" "strconv" @@ -33,8 +32,8 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" @@ -52,7 +51,7 @@ const ( rootDevice = "9pfs-/" // MountPrefix is the annotation prefix for mount hints. - MountPrefix = "gvisor.dev/spec/mount" + MountPrefix = "dev.gvisor.spec.mount." // Filesystems that runsc supports. bind = "bind" @@ -279,6 +278,9 @@ func subtargets(root string, mnts []specs.Mount) []string { } func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if conf.VFS2 { + return setupContainerVFS2(ctx, conf, mntr, procArgs) + } mns, err := mntr.setupFS(conf, procArgs) if err != nil { return err @@ -465,6 +467,13 @@ func (m *mountHint) checkCompatible(mount specs.Mount) error { return nil } +func (m *mountHint) fileAccessType() FileAccessType { + if m.share == container { + return FileAccessExclusive + } + return FileAccessShared +} + func filterUnsupportedOptions(mount specs.Mount) []string { rv := make([]string, 0, len(mount.Options)) for _, o := range mount.Options { @@ -483,14 +492,15 @@ type podMountHints struct { func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { mnts := make(map[string]*mountHint) for k, v := range spec.Annotations { - // Look for 'gvisor.dev/spec/mount' annotations and parse them. + // Look for 'dev.gvisor.spec.mount' annotations and parse them. if strings.HasPrefix(k, MountPrefix) { - parts := strings.Split(k, "/") - if len(parts) != 5 { + // Remove the prefix and split the rest. + parts := strings.Split(k[len(MountPrefix):], ".") + if len(parts) != 2 { return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) } - name := parts[3] - if len(name) == 0 || path.Clean(name) != name { + name := parts[0] + if len(name) == 0 { return nil, fmt.Errorf("invalid mount name: %s", name) } mnt := mnts[name] @@ -498,7 +508,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { mnt = &mountHint{name: name} mnts[name] = mnt } - if err := mnt.setField(parts[4], v); err != nil { + if err := mnt.setField(parts[1], v); err != nil { return nil, err } } @@ -566,8 +576,16 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. func (c *containerMounter) processHints(conf *Config) error { + if conf.VFS2 { + return nil + } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs { + continue + } log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) inode, err := c.mountSharedMaster(ctx, conf, hint) if err != nil { @@ -764,20 +782,24 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( case bind: fd := c.fds.remove() fsName = "9p" - // Non-root bind mounts are always shared. - opts = p9MountOptions(fd, FileAccessShared) + opts = p9MountOptions(fd, c.getMountAccessType(m)) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: - // TODO(nlacasse): Support all the mount types and make this a fatal error. - // Most applications will "just work" without them, so this is a warning - // for now. log.Warningf("ignoring unknown filesystem type %q", m.Type) } return fsName, opts, useOverlay, nil } +func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { + if hint := c.hints.findMount(mount); hint != nil { + return hint.fileAccessType() + } + // Non-root bind mounts are always shared if no hints were provided. + return FileAccessShared +} + // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. @@ -805,7 +827,20 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) if err != nil { - return fmt.Errorf("creating mount with source %q: %v", m.Source, err) + err := fmt.Errorf("creating mount with source %q: %v", m.Source, err) + // Check to see if this is a common error due to a Linux bug. + // This error is generated here in order to cause it to be + // printed to the user using Docker via 'runsc create' etc. rather + // than simply printed to the logs for the 'runsc boot' command. + // + // We check the error message string rather than type because the + // actual error types (syscall.EIO, syscall.EPIPE) are lost by file system + // implementation (e.g. p9). + // TODO(gvisor.dev/issue/1765): Remove message when bug is resolved. + if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) { + return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug")) + } + return err } // If there are submounts, we need to overlay the mount on top of a ramfs @@ -837,7 +872,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns return fmt.Errorf("mount %q error: %v", m.Destination, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts) return nil } diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index 49ab34b33..912037075 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -15,7 +15,6 @@ package boot import ( - "path" "reflect" "strings" "testing" @@ -26,19 +25,19 @@ import ( func TestPodMountHintsHappy(t *testing.T) { spec := &specs.Spec{ Annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", - path.Join(MountPrefix, "mount2", "source"): "bar", - path.Join(MountPrefix, "mount2", "type"): "bind", - path.Join(MountPrefix, "mount2", "share"): "container", - path.Join(MountPrefix, "mount2", "options"): "rw,private", + MountPrefix + "mount2.source": "bar", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", + MountPrefix + "mount2.options": "rw,private", }, } podHints, err := newPodMountHints(spec) if err != nil { - t.Errorf("newPodMountHints failed: %v", err) + t.Fatalf("newPodMountHints failed: %v", err) } // Check that fields were set correctly. @@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) { { name: "too short", annotations: map[string]string{ - path.Join(MountPrefix, "mount1"): "foo", + MountPrefix + "mount1": "foo", }, error: "invalid mount annotation", }, { name: "no name", annotations: map[string]string{ - MountPrefix + "//source": "foo", + MountPrefix + ".source": "foo", }, error: "invalid mount name", }, { name: "missing source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", }, error: "source field", }, { name: "missing type", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.share": "pod", }, error: "type field", }, { name: "missing share", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", }, error: "share field", }, { name: "invalid field name", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "invalid"): "foo", + MountPrefix + "mount1.invalid": "foo", }, error: "invalid mount annotation", }, { name: "invalid source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", }, error: "source cannot be empty", }, { name: "invalid type", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "invalid-type", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "invalid-type", + MountPrefix + "mount1.share": "pod", }, error: "invalid type", }, { name: "invalid share", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "invalid-share", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "invalid-share", }, error: "invalid share", }, { name: "invalid options", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", - path.Join(MountPrefix, "mount1", "options"): "invalid-option", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", + MountPrefix + "mount1.options": "invalid-option", }, error: "unknown mount option", }, { name: "duplicate source", annotations: map[string]string{ - path.Join(MountPrefix, "mount1", "source"): "foo", - path.Join(MountPrefix, "mount1", "type"): "tmpfs", - path.Join(MountPrefix, "mount1", "share"): "pod", + MountPrefix + "mount1.source": "foo", + MountPrefix + "mount1.type": "tmpfs", + MountPrefix + "mount1.share": "pod", - path.Join(MountPrefix, "mount2", "source"): "foo", - path.Join(MountPrefix, "mount2", "type"): "bind", - path.Join(MountPrefix, "mount2", "share"): "container", + MountPrefix + "mount2.source": "foo", + MountPrefix + "mount2.type": "bind", + MountPrefix + "mount2.share": "container", }, error: "have the same mount source", }, @@ -191,3 +190,61 @@ func TestPodMountHintsErrors(t *testing.T) { }) } } + +func TestGetMountAccessType(t *testing.T) { + const source = "foo" + for _, tst := range []struct { + name string + annotations map[string]string + want FileAccessType + }{ + { + name: "container=exclusive", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessExclusive, + }, + { + name: "pod=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "pod", + }, + want: FileAccessShared, + }, + { + name: "shared=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source, + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "shared", + }, + want: FileAccessShared, + }, + { + name: "default=shared", + annotations: map[string]string{ + MountPrefix + "mount1.source": source + "mismatch", + MountPrefix + "mount1.type": "bind", + MountPrefix + "mount1.share": "container", + }, + want: FileAccessShared, + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err != nil { + t.Fatalf("newPodMountHints failed: %v", err) + } + mounter := containerMounter{hints: podHints} + if got := mounter.getMountAccessType(specs.Mount{Source: source}); got != tst.want { + t.Errorf("getMountAccessType(), want: %v, got: %v", tst.want, got) + } + }) + } +} diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index d1c0bb9b5..ce62236e5 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -16,12 +16,12 @@ package boot import ( "fmt" - "sync" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" ) // Mapping from linux resource names to limits.LimitType. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0c0eba99e..f6ea4c102 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,7 +20,6 @@ import ( mrand "math/rand" "os" "runtime" - "sync" "sync/atomic" "syscall" gtime "time" @@ -36,6 +35,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -43,11 +44,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/sighandling" - slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -59,16 +63,20 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. + "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) +var syscallTable *kernel.SyscallTable + // Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. @@ -93,10 +101,6 @@ type Loader struct { // spec is the base configuration for the root container. spec *specs.Spec - // startSignalForwarding enables forwarding of signals to the sandboxed - // container. It should be called after the init process is loaded. - startSignalForwarding func() func() - // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -146,9 +150,6 @@ type execProcess struct { func init() { // Initialize the random number generator. mrand.Seed(gtime.Now().UnixNano()) - - // Register the global syscall table. - kernel.RegisterSyscallTable(slinux.AMD64) } // Args are the arguments for New(). @@ -159,13 +160,17 @@ type Args struct { Spec *specs.Spec // Conf is the system configuration. Conf *Config - // ControllerFD is the FD to the URPC controller. + // ControllerFD is the FD to the URPC controller. The Loader takes ownership + // of this FD and may close it at any time. ControllerFD int - // Device is an optional argument that is passed to the platform. + // Device is an optional argument that is passed to the platform. The Loader + // takes ownership of this file and may close it at any time. Device *os.File - // GoferFDs is an array of FDs used to connect with the Gofer. + // GoferFDs is an array of FDs used to connect with the Gofer. The Loader + // takes ownership of these FDs and may close them at any time. GoferFDs []int - // StdioFDs is the stdio for the application. + // StdioFDs is the stdio for the application. The Loader takes ownership of + // these FDs and may close them at any time. StdioFDs []int // Console is set to true if using TTY. Console bool @@ -178,6 +183,9 @@ type Args struct { UserLogFD int } +// make sure stdioFDs are always the same on initial start and on restore +const startingStdioFD = 64 + // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(args Args) (*Loader, error) { @@ -191,6 +199,14 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } + // Patch the syscall table. + kernel.VFS2Enabled = args.Conf.VFS2 + if kernel.VFS2Enabled { + vfs2.Override(syscallTable.Table) + } + + kernel.RegisterSyscallTable(syscallTable) + // Create kernel and platform. p, err := createPlatform(args.Conf, args.Device) if err != nil { @@ -228,11 +244,8 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("enabling strace: %v", err) } - // Create an empty network stack because the network namespace may be empty at - // this point. Netns is configured before Run() is called. Netstack is - // configured using a control uRPC message. Host network is configured inside - // Run(). - networkStack, err := newEmptyNetworkStack(args.Conf, k) + // Create root network namespace/stack. + netns, err := newRootNetworkNamespace(args.Conf, k, k) if err != nil { return nil, fmt.Errorf("creating network: %v", err) } @@ -275,7 +288,7 @@ func New(args Args) (*Loader, error) { FeatureSet: cpuid.HostFeatureSet(), Timekeeper: tk, RootUserNamespace: creds.UserNamespace, - NetworkStack: networkStack, + RootNetworkNamespace: netns, ApplicationCores: uint(args.NumCPU), Vdso: vdso, RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), @@ -300,7 +313,9 @@ func New(args Args) (*Loader, error) { } // Create a watchdog. - dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) + dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction + dog := watchdog.New(k, dogOpts) procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) if err != nil { @@ -316,6 +331,35 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("creating pod mount hints: %v", err) } + if kernel.VFS2Enabled { + // Set up host mount that will be used for imported fds. + hostFilesystem := vfs2host.NewFilesystem(k.VFS()) + defer hostFilesystem.DecRef() + hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs mount: %v", err) + } + k.SetHostMount(hostMount) + } + + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + var stdioFDs []int + newfd := startingStdioFD + for _, fd := range args.StdioFDs { + err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + } + stdioFDs = append(stdioFDs, newfd) + err = syscall.Close(fd) + if err != nil { + return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + } + newfd++ + } + eid := execID{cid: args.ID} l := &Loader{ k: k, @@ -324,7 +368,7 @@ func New(args Args) (*Loader, error) { watchdog: dog, spec: args.Spec, goferFDs: args.GoferFDs, - stdioFDs: args.StdioFDs, + stdioFDs: stdioFDs, rootProcArgs: procArgs, sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, @@ -337,29 +381,6 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("ignore child stop signals failed: %v", err) } - // Handle signals by forwarding them to the root container process - // (except for panic signal, which should cause a panic). - l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) { - // Panic signal should cause a panic. - if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) { - panic("Signal-induced panic") - } - - // Otherwise forward to root container. - deliveryMode := DeliverToProcess - if args.Console { - // Since we are running with a console, we should - // forward the signal to the foreground process group - // so that job control signals like ^C can be handled - // properly. - deliveryMode = DeliverToForegroundProcessGroup - } - log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) - if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil { - log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err) - } - }) - // Create the control server using the provided FD. // // This must be done *after* we have initialized the kernel since the @@ -387,11 +408,16 @@ func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel. return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) } + wd := spec.Process.Cwd + if wd == "" { + wd = "/" + } + // Create the process arguments. procArgs := kernel.CreateProcessArgs{ Argv: spec.Process.Args, Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty. + WorkingDirectory: wd, Credentials: creds, Umask: 0022, Limits: ls, @@ -485,7 +511,7 @@ func (l *Loader) run() error { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") - stack := l.k.NetworkStack().(*hostinet.Stack) + stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) if err := stack.Configure(); err != nil { return err } @@ -504,7 +530,7 @@ func (l *Loader) run() error { // l.restore is set by the container manager when a restore call is made. if !l.restore { if l.conf.ProfileEnable { - initializePProf() + pprof.Initialize() } // Finally done with all configuration. Setup filters before user code @@ -536,7 +562,15 @@ func (l *Loader) run() error { } // Add the HOME enviroment variable if it is not already set. - envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + + } else { + envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + } if err != nil { return err } @@ -567,8 +601,40 @@ func (l *Loader) run() error { ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup()) } - // Start signal forwarding only after an init process is created. - l.stopSignalForwarding = l.startSignalForwarding() + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { + // Panic signal should cause a panic. + if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if l.console { + // Since we are running with a console, we should forward the signal to + // the foreground process group so that job control signals like ^C can + // be handled properly. + deliveryMode = DeliverToForegroundProcessGroup + } + log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) + if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err) + } + }) + + // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again + // either in createFDTable() during initial start or in descriptor.initAfterLoad() + // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the + // passed FDs, so only close for VFS1. + if !kernel.VFS2Enabled { + for _, fd := range l.stdioFDs { + err := syscall.Close(fd) + if err != nil { + return fmt.Errorf("close dup()ed stdioFDs: %v", err) + } + } + } log.Infof("Process should have started...") l.watchdog.Start() @@ -795,20 +861,23 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("container %q not started", args.ContainerID) } + // TODO(gvisor.dev/issue/1623): Add VFS2 support + // Get the container MountNamespace from the Task. tg.Leader().WithMuLocked(func(t *kernel.Task) { - // task.MountNamespace() does not take a ref, so we must do so - // ourselves. + // task.MountNamespace() does not take a ref, so we must do so ourselves. args.MountNamespace = t.MountNamespace() args.MountNamespace.IncRef() }) - defer args.MountNamespace.DecRef() + if args.MountNamespace != nil { + defer args.MountNamespace.DecRef() + } - // Add the HOME enviroment varible if it is not already set. + // Add the HOME environment variable if it is not already set. root := args.MountNamespace.Root() defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) - envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err } @@ -905,47 +974,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { +func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). switch conf.Network { case NetworkHost: - return hostinet.NewStack(), nil + // No network namespacing support for hostinet yet, hence creator is nil. + return inet.NewRootNamespace(hostinet.NewStack(), nil), nil case NetworkNone, NetworkSandbox: - // NetworkNone sets up loopback using netstack. - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} - s := netstack.Stack{stack.New(stack.Options{ - NetworkProtocols: netProtos, - TransportProtocols: transProtos, - Clock: clock, - Stats: netstack.Metrics, - HandleLocal: true, - // Enable raw sockets for users with sufficient - // privileges. - RawFactory: raw.EndpointFactory{}, - })} - - // Enable SACK Recovery. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %v", err) + s, err := newEmptySandboxNetworkStack(clock, uniqueID) + if err != nil { + return nil, err + } + creator := &sandboxNetstackCreator{ + clock: clock, + uniqueID: uniqueID, } + return inet.NewRootNamespace(s, creator), nil - // Set default TTLs as required by socket/netstack. - s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) - s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } - // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) - } +} - s.FillDefaultIPTables() +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := netstack.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: netstack.Metrics, + HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. + RawFactory: raw.EndpointFactory{}, + UniqueID: uniqueID, + })} - return &s, nil + // Enable SACK Recovery. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %v", err) + } - default: - panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + // Set default TTLs as required by socket/netstack. + s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + + // Enable Receive Buffer Auto-Tuning. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) } + + s.FillDefaultIPTables() + + return &s, nil +} + +// sandboxNetstackCreator implements kernel.NetworkStackCreator. +// +// +stateify savable +type sandboxNetstackCreator struct { + clock tcpip.Clock + uniqueID stack.UniqueID +} + +// CreateStack implements kernel.NetworkStackCreator.CreateStack. +func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + if err != nil { + return nil, err + } + + // Setup loopback. + n := &Network{Stack: s.(*netstack.Stack).Stack} + nicID := tcpip.NICID(f.uniqueID.UniqueID()) + link := DefaultLoopbackLink + linkEP := loopback.New() + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return nil, err + } + + return s, nil } // signal sends a signal to one or more processes in a container. If PID is 0, @@ -993,7 +1107,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) if err == nil { // Send signal directly to the identified process. - return execTG.SendSignal(&arch.SignalInfo{Signo: signo}) + return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo}) } // The caller may be signaling a process not started directly via exec. @@ -1010,7 +1124,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er if tg.Leader().ContainerID() != cid { return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) } - return tg.SendSignal(&arch.SignalInfo{Signo: signo}) + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) } func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { @@ -1028,7 +1142,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s // No foreground process group has been set. Signal the // original thread group. log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) - return tg.SendSignal(&arch.SignalInfo{Signo: signo}) + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) } // Send the signal to all processes in the process group. var lastErr error @@ -1036,7 +1150,7 @@ func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, s if tg.ProcessGroup() != pg { continue } - if err := tg.SendSignal(&arch.SignalInfo{Signo: signo}); err != nil { + if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil { lastErr = err } } diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go new file mode 100644 index 000000000..78df86611 --- /dev/null +++ b/runsc/boot/loader_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +func init() { + // Set the global syscall table. + syscallTable = linux.AMD64 +} diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go new file mode 100644 index 000000000..250785010 --- /dev/null +++ b/runsc/boot/loader_arm64.go @@ -0,0 +1,26 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package boot + +import ( + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +func init() { + // Set the global syscall table. + syscallTable = linux.ARM64 +} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 147ff7703..55d27a632 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -19,17 +19,21 @@ import ( "math/rand" "os" "reflect" - "sync" "syscall" "testing" "time" specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/fsgofer" ) @@ -65,6 +69,11 @@ func testSpec() *specs.Spec { } } +func resetSyscallTable() { + kernel.VFS2Enabled = false + kernel.FlushSyscallTablesTestOnly() +} + // startGofer starts a new gofer routine serving 'root' path. It returns the // sandbox side of the connection, and a function that when called will stop the // gofer. @@ -100,20 +109,29 @@ func startGofer(root string) (int, func(), error) { return sandboxEnd, cleanup, nil } -func createLoader() (*Loader, func(), error) { +func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) { fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) if err != nil { return nil, nil, err } conf := testConfig() - spec := testSpec() + conf.VFS2 = vfsEnabled sandEnd, cleanup, err := startGofer(spec.Root.Path) if err != nil { return nil, nil, err } - stdio := []int{int(os.Stdin.Fd()), int(os.Stdout.Fd()), int(os.Stderr.Fd())} + // Loader takes ownership of stdio. + var stdio []int + for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { + newFd, err := unix.Dup(int(f.Fd())) + if err != nil { + return nil, nil, err + } + stdio = append(stdio, newFd) + } + args := Args{ ID: "foo", Spec: spec, @@ -132,10 +150,22 @@ func createLoader() (*Loader, func(), error) { // TestRun runs a simple application in a sandbox and checks that it succeeds. func TestRun(t *testing.T) { - l, cleanup, err := createLoader() + defer resetSyscallTable() + doRun(t, false) +} + +// TestRunVFS2 runs TestRun in VFSv2. +func TestRunVFS2(t *testing.T) { + defer resetSyscallTable() + doRun(t, true) +} + +func doRun(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled, testSpec()) if err != nil { t.Fatalf("error creating loader: %v", err) } + defer l.Destroy() defer cleanup() @@ -169,7 +199,18 @@ func TestRun(t *testing.T) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. func TestStartSignal(t *testing.T) { - l, cleanup, err := createLoader() + defer resetSyscallTable() + doStartSignal(t, false) +} + +// TestStartSignalVFS2 does TestStartSignal with VFS2. +func TestStartSignalVFS2(t *testing.T) { + defer resetSyscallTable() + doStartSignal(t, true) +} + +func doStartSignal(t *testing.T, vfsEnabled bool) { + l, cleanup, err := createLoader(vfsEnabled, testSpec()) if err != nil { t.Fatalf("error creating loader: %v", err) } @@ -217,18 +258,19 @@ func TestStartSignal(t *testing.T) { } -// Test that MountNamespace can be created with various specs. -func TestCreateMountNamespace(t *testing.T) { - testCases := []struct { - name string - // Spec that will be used to create the mount manager. Note - // that we can't mount procfs without a kernel, so each spec - // MUST contain something other than procfs mounted at /proc. - spec specs.Spec - // Paths that are expected to exist in the resulting fs. - expectedPaths []string - }{ - { +type CreateMountTestcase struct { + name string + // Spec that will be used to create the mount manager. Note + // that we can't mount procfs without a kernel, so each spec + // MUST contain something other than procfs mounted at /proc. + spec specs.Spec + // Paths that are expected to exist in the resulting fs. + expectedPaths []string +} + +func createMountTestcases(vfs2 bool) []*CreateMountTestcase { + testCases := []*CreateMountTestcase{ + &CreateMountTestcase{ // Only proc. name: "only proc mount", spec: specs.Spec{ @@ -270,7 +312,7 @@ func TestCreateMountNamespace(t *testing.T) { // /dev, and /sys. expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - { + &CreateMountTestcase{ // Mounts are nested inside each other. name: "nested mounts", spec: specs.Spec{ @@ -314,7 +356,7 @@ func TestCreateMountNamespace(t *testing.T) { expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - { + &CreateMountTestcase{ name: "mount inside /dev", spec: specs.Spec{ Root: &specs.Root{ @@ -357,40 +399,47 @@ func TestCreateMountNamespace(t *testing.T) { }, expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, }, - { - name: "mounts inside mandatory mounts", - spec: specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, + } + + vfsCase := &CreateMountTestcase{ + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", }, - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", - }, - // We don't include /sys, and /tmp in - // the spec, since they will be added - // automatically. - // - // Instead, add submounts inside these - // directories and make sure they are - // visible under the mandatory mounts. - { - Destination: "/sys/bar", - Type: "tmpfs", - }, - { - Destination: "/tmp/baz", - Type: "tmpfs", - }, + // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports + // MkDirAt in VFS2 (and remove the reduntant append). + // { + // Destination: "/sys/bar", + // Type: "tmpfs", + // }, + // + { + Destination: "/tmp/baz", + Type: "tmpfs", }, }, - expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"}, }, + expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, } - for _, tc := range testCases { + if !vfs2 { + vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) + vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") + } + return append(testCases, vfsCase) +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespace(t *testing.T) { + + for _, tc := range createMountTestcases(false /* vfs2 */) { t.Run(tc.name, func(t *testing.T) { conf := testConfig() ctx := contexttest.Context(t) @@ -425,6 +474,56 @@ func TestCreateMountNamespace(t *testing.T) { } } +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespaceVFS2(t *testing.T) { + + for _, tc := range createMountTestcases(true /* vfs2 */) { + t.Run(tc.name, func(t *testing.T) { + defer resetSyscallTable() + + spec := testSpec() + spec.Mounts = tc.spec.Mounts + spec.Root = tc.spec.Root + + l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec) + if err != nil { + t.Fatalf("failed to create loader: %v", err) + } + defer l.Destroy() + defer loaderCleanup() + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf); err != nil { + t.Fatalf("failed process hints: %v", err) + } + + ctx := l.rootProcArgs.NewContext(l.k) + mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + if err != nil { + t.Fatalf("failed to setupVFS2: %v", err) + } + + root := mns.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(p), + } + + if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() + } + + } + }) + } +} + // TestRestoreEnvironment tests that the correct mounts are collected from the spec and config // in order to build the environment for restoring. func TestRestoreEnvironment(t *testing.T) { diff --git a/runsc/boot/network.go b/runsc/boot/network.go index f98c5fd36..bee6ee336 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "net" + "strings" "syscall" "gvisor.dev/gvisor/pkg/log" @@ -31,6 +32,32 @@ import ( "gvisor.dev/gvisor/pkg/urpc" ) +var ( + // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and + // "::1/8" on "lo" interface. + DefaultLoopbackLink = LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []Route{ + { + Destination: net.IPNet{ + IP: net.IPv4(0x7f, 0, 0, 0), + Mask: net.IPv4Mask(0xff, 0, 0, 0), + }, + }, + { + Destination: net.IPNet{ + IP: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), + }, + }, + }, + } +) + // Network exposes methods that can be used to configure a network stack. type Network struct { Stack *stack.Stack @@ -80,7 +107,8 @@ type CreateLinksAndRoutesArgs struct { LoopbackLinks []LoopbackLink FDBasedLinks []FDBasedLink - DefaultGateway DefaultRoute + Defaultv4Gateway DefaultRoute + Defaultv6Gateway DefaultRoute } // Empty returns true if route hasn't been set. @@ -122,10 +150,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct nicID++ nicids[link.Name] = nicID - ep := loopback.New() + linkEP := loopback.New() log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) - if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err } @@ -157,7 +185,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } mac := tcpip.LinkAddress(link.LinkAddress) - ep, err := fdbased.New(&fdbased.Options{ + linkEP, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, @@ -172,7 +200,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) - if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil { + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err } @@ -186,12 +214,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } } - if !args.DefaultGateway.Route.Empty() { - nicID, ok := nicids[args.DefaultGateway.Name] + if !args.Defaultv4Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv4Gateway.Name] if !ok { - return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name) + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) } - route, err := args.DefaultGateway.Route.toTcpipRoute(nicID) + route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) + if err != nil { + return err + } + routes = append(routes, route) + } + + if !args.Defaultv6Gateway.Route.Empty() { + nicID, ok := nicids[args.Defaultv6Gateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) + } + route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) if err != nil { return err } @@ -205,15 +245,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. -func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error { - if loopback { - if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil { - return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err) - } - } else { - if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil { - return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err) - } +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error { + opts := stack.NICOptions{Name: name} + if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil { + return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) } // Always start with an arp address for the NIC. diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD index 03391cdca..77774f43c 100644 --- a/runsc/boot/platforms/BUILD +++ b/runsc/boot/platforms/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "platforms", srcs = ["platforms.go"], - importpath = "gvisor.dev/gvisor/runsc/boot/platforms", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD new file mode 100644 index 000000000..29cb42b2f --- /dev/null +++ b/runsc/boot/pprof/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "pprof", + srcs = ["pprof.go"], + visibility = [ + "//runsc:__subpackages__", + ], +) diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof/pprof.go index 463362f02..1ded20dee 100644 --- a/runsc/boot/pprof.go +++ b/runsc/boot/pprof/pprof.go @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -package boot +// Package pprof provides a stub to initialize custom profilers. +package pprof -func initializePProf() { +// Initialize will be called at boot for initializing custom profilers. +func Initialize() { } diff --git a/runsc/boot/user.go b/runsc/boot/user.go deleted file mode 100644 index 56cc12ee0..000000000 --- a/runsc/boot/user.go +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "bufio" - "fmt" - "io" - "strconv" - "strings" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -type fileReader struct { - // Ctx is the context for the file reader. - Ctx context.Context - - // File is the file to read from. - File *fs.File -} - -// Read implements io.Reader.Read. -func (r *fileReader) Read(buf []byte) (int, error) { - n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf)) - return int(n), err -} - -// getExecUserHome returns the home directory of the executing user read from -// /etc/passwd as read from the container filesystem. -func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) { - // The default user home directory to return if no user matching the user - // if found in the /etc/passwd found in the image. - const defaultHome = "/" - - // Open the /etc/passwd file from the dirent via the root mount namespace. - mnsRoot := rootMns.Root() - maxTraversals := uint(linux.MaxSymlinkTraversals) - dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals) - if err != nil { - // NOTE: Ignore errors opening the passwd file. If the passwd file - // doesn't exist we will return the default home directory. - return defaultHome, nil - } - defer dirent.DecRef() - - // Check read permissions on the file. - if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { - // NOTE: Ignore permissions errors here and return default root dir. - return defaultHome, nil - } - - // Only open regular files. We don't open other files like named pipes as - // they may block and might present some attack surface to the container. - // Note that runc does not seem to do this kind of checking. - if !fs.IsRegular(dirent.Inode.StableAttr) { - return defaultHome, nil - } - - f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false}) - if err != nil { - return "", err - } - defer f.DecRef() - - r := &fileReader{ - Ctx: ctx, - File: f, - } - - homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome) - if err != nil { - return "", err - } - - return homeDir, nil -} - -// maybeAddExecUserHome returns a new slice with the HOME enviroment variable -// set if the slice does not already contain it, otherwise it returns the -// original slice unmodified. -func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { - // Check if the envv already contains HOME. - for _, env := range envv { - if strings.HasPrefix(env, "HOME=") { - // We have it. Return the original slice unmodified. - return envv, nil - } - } - - // Read /etc/passwd for the user's HOME directory and set the HOME - // environment variable as required by POSIX if it is not overridden by - // the user. - homeDir, err := getExecUserHome(ctx, mns, uid) - if err != nil { - return nil, fmt.Errorf("error reading exec user: %v", err) - } - return append(envv, "HOME="+homeDir), nil -} - -// findHomeInPasswd parses a passwd file and returns the given user's home -// directory. This function does it's best to replicate the runc's behavior. -func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) { - s := bufio.NewScanner(passwd) - - for s.Scan() { - if err := s.Err(); err != nil { - return "", err - } - - line := strings.TrimSpace(s.Text()) - if line == "" { - continue - } - - // Pull out part of passwd entry. Loosely parse the passwd entry as some - // passwd files could be poorly written and for compatibility with runc. - // - // Per 'man 5 passwd' - // /etc/passwd contains one line for each user account, with seven - // fields delimited by colons (“:”). These fields are: - // - // - login name - // - optional encrypted password - // - numerical user ID - // - numerical group ID - // - user name or comment field - // - user home directory - // - optional user command interpreter - parts := strings.Split(line, ":") - - found := false - homeDir := "" - for i, p := range parts { - switch i { - case 2: - parsedUID, err := strconv.ParseUint(p, 10, 32) - if err == nil && parsedUID == uint64(uid) { - found = true - } - case 5: - homeDir = p - } - } - if found { - // NOTE: If the uid is present but the home directory is not - // present in the /etc/passwd entry we return an empty string. This - // is, for better or worse, what runc does. - return homeDir, nil - } - } - - return defaultHome, nil -} diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go deleted file mode 100644 index 9aee2ad07..000000000 --- a/runsc/boot/user_test.go +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "io/ioutil" - "os" - "path/filepath" - "strings" - "syscall" - "testing" - - specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" -) - -func setupTempDir() (string, error) { - tmpDir, err := ioutil.TempDir(os.TempDir(), "exec-user-test") - if err != nil { - return "", err - } - return tmpDir, nil -} - -func setupPasswd(contents string, perms os.FileMode) func() (string, error) { - return func() (string, error) { - tmpDir, err := setupTempDir() - if err != nil { - return "", err - } - - if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { - return "", err - } - - f, err := os.Create(filepath.Join(tmpDir, "etc", "passwd")) - if err != nil { - return "", err - } - defer f.Close() - - _, err = f.WriteString(contents) - if err != nil { - return "", err - } - - err = f.Chmod(perms) - if err != nil { - return "", err - } - return tmpDir, nil - } -} - -// TestGetExecUserHome tests the getExecUserHome function. -func TestGetExecUserHome(t *testing.T) { - tests := map[string]struct { - uid auth.KUID - createRoot func() (string, error) - expected string - }{ - "success": { - uid: 1000, - createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0666), - expected: "/home/adin", - }, - "no_passwd": { - uid: 1000, - createRoot: setupTempDir, - expected: "/", - }, - "no_perms": { - uid: 1000, - createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0000), - expected: "/", - }, - "directory": { - uid: 1000, - createRoot: func() (string, error) { - tmpDir, err := setupTempDir() - if err != nil { - return "", err - } - - if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { - return "", err - } - - if err := syscall.Mkdir(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil { - return "", err - } - - return tmpDir, nil - }, - expected: "/", - }, - // Currently we don't allow named pipes. - "named_pipe": { - uid: 1000, - createRoot: func() (string, error) { - tmpDir, err := setupTempDir() - if err != nil { - return "", err - } - - if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { - return "", err - } - - if err := syscall.Mkfifo(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil { - return "", err - } - - return tmpDir, nil - }, - expected: "/", - }, - } - - for name, tc := range tests { - t.Run(name, func(t *testing.T) { - tmpDir, err := tc.createRoot() - if err != nil { - t.Fatalf("failed to create root dir: %v", err) - } - - sandEnd, cleanup, err := startGofer(tmpDir) - if err != nil { - t.Fatalf("failed to create gofer: %v", err) - } - defer cleanup() - - ctx := contexttest.Context(t) - conf := &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - DisableSeccomp: true, - } - - spec := &specs.Spec{ - Root: &specs.Root{ - Path: tmpDir, - Readonly: true, - }, - // Add /proc mount as tmpfs to avoid needing a kernel. - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", - }, - }, - } - - mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{}) - mns, err := mntr.createMountNamespace(ctx, conf) - if err != nil { - t.Fatalf("failed to create mount namespace: %v", err) - } - ctx = fs.WithRoot(ctx, mns.Root()) - if err := mntr.mountSubmounts(ctx, conf, mns); err != nil { - t.Fatalf("failed to create mount namespace: %v", err) - } - - got, err := getExecUserHome(ctx, mns, tc.uid) - if err != nil { - t.Fatalf("failed to get user home: %v", err) - } - - if got != tc.expected { - t.Fatalf("expected %v, got: %v", tc.expected, got) - } - }) - } -} - -// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing. -func TestFindHomeInPasswd(t *testing.T) { - tests := map[string]struct { - uid uint32 - passwd string - expected string - def string - }{ - "empty": { - uid: 1000, - passwd: "", - expected: "/", - def: "/", - }, - "whitespace": { - uid: 1000, - passwd: " ", - expected: "/", - def: "/", - }, - "full": { - uid: 1000, - passwd: "adin::1000:1111::/home/adin:/bin/sh", - expected: "/home/adin", - def: "/", - }, - // For better or worse, this is how runc works. - "partial": { - uid: 1000, - passwd: "adin::1000:1111:", - expected: "", - def: "/", - }, - "multiple": { - uid: 1001, - passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh", - expected: "/home/ian", - def: "/", - }, - "duplicate": { - uid: 1000, - passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh", - expected: "/home/adin", - def: "/", - }, - "empty_lines": { - uid: 1001, - passwd: "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh", - expected: "/home/ian", - def: "/", - }, - } - - for name, tc := range tests { - t.Run(name, func(t *testing.T) { - got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def) - if err != nil { - t.Fatalf("error parsing passwd: %v", err) - } - if tc.expected != got { - t.Fatalf("expected %v, got: %v", tc.expected, got) - } - }) - } -} diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go new file mode 100644 index 000000000..0b9b0b436 --- /dev/null +++ b/runsc/boot/vfs.go @@ -0,0 +1,366 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path" + "sort" + "strconv" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/fs" + devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/syserror" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { + + vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + // Setup files in devtmpfs. + if err := memdev.Register(vfsObj); err != nil { + return fmt.Errorf("registering memdev: %w", err) + } + a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name) + if err != nil { + return fmt.Errorf("creating devtmpfs accessor: %w", err) + } + defer a.Release() + + if err := a.UserspaceInit(ctx); err != nil { + return fmt.Errorf("initializing userspace: %w", err) + } + if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating devtmpfs files: %w", err) + } + return nil +} + +func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if err := mntr.k.VFS().Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %w", err) + } + mns, err := mntr.setupVFS2(ctx, conf, procArgs) + if err != nil { + return fmt.Errorf("failed to setupFS: %w", err) + } + procArgs.MountNamespaceVFS2 = mns + return setExecutablePathVFS2(ctx, procArgs) +} + +func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { + + exe := procArgs.Argv[0] + + // Absolute paths can be used directly. + if path.IsAbs(exe) { + procArgs.Filename = exe + return nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(exe, '/') > 0 { + + if !path.IsAbs(procArgs.WorkingDirectory) { + return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory) + } + + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Paths with a '/' are relative to the CWD. + if strings.IndexByte(exe, '/') > 0 { + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // root directory. + root := procArgs.MountNamespaceVFS2.Root() + defer root.DecRef() + + paths := fs.GetPath(procArgs.Envv) + creds := procArgs.Credentials + + for _, p := range paths { + + binPath := path.Join(p, exe) + + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(binPath), + FollowFinalSymlink: true, + } + + opts := &vfs.OpenOptions{ + FileExec: true, + Flags: linux.O_RDONLY, + } + + dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return err + } + dentry.DecRef() + + procArgs.Filename = binPath + return nil + } + + return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":")) +} + +func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { + log.Infof("Configuring container's file system with VFS2") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := procArgs.NewContext(c.k) + + creds := procArgs.Credentials + if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil { + return nil, fmt.Errorf("register filesystems: %w", err) + } + + mns, err := c.createMountNamespaceVFS2(ctx, conf, creds) + if err != nil { + return nil, fmt.Errorf("creating mount namespace: %w", err) + } + + rootProcArgs.MountNamespaceVFS2 = mns + + // Mount submounts. + if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil { + return nil, fmt.Errorf("mounting submounts vfs2: %w", err) + } + + return mns, nil +} + +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { + + fd := c.fds.remove() + opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mount namespace: %w", err) + } + return mns, nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + + c.prepareMountsVFS2() + + for _, submount := range c.mounts { + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil { + return err + } + } + + // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go. + + return c.checkDispenser() +} + +func (c *containerMounter) prepareMountsVFS2() { + // Sort the mounts so that we don't place children before parents. + sort.Slice(c.mounts, func(i, j int) bool { return len(c.mounts[i].Destination) < len(c.mounts[j].Destination) }) +} + +// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version. +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error { + root := mns.Root() + defer root.DecRef() + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + + fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) + if err != nil { + return fmt.Errorf("mountOptions failed: %w", err) + } + + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { + return err + } + log.Debugf("directory exists or made directory for submount: %s", submount.Destination) + + opts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(options, ","), + }, + InternalMount: true, + } + + // All writes go to upper, be paranoid and make lower readonly. + opts.ReadOnly = useOverlay + + if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { + return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + } + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts) + return nil +} + +// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + ) + + switch m.Type { + case devpts, devtmpfs, proc, sysfs: + fsName = m.Type + case nonefs: + fsName = sysfs + case tmpfs: + fsName = m.Type + + var err error + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + if err != nil { + return "", nil, false, err + } + + case bind: + fd := c.fds.remove() + fsName = "9p" + opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m)) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, nil +} + +// p9MountOptions creates a slice of options for a p9 mount. +// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in +// fs.go when privateunixsocket lands. +func p9MountOptionsVFS2(fd int, fa FileAccessType) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + } + if fa == FileAccessShared { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} + +func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(currentPath), + } + + _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) + switch { + + case err == syserror.ENOENT: + if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { + return err + } + + mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { + return fmt.Errorf("failed to makedir for mount %+v: %w", target, err) + } + return nil + + case err != nil: + return fmt.Errorf("stat failed for mount %+v: %w", target, err) + + default: + return nil + } +} |