summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot')
-rw-r--r--runsc/boot/BUILD12
-rw-r--r--runsc/boot/compat.go16
-rw-r--r--runsc/boot/compat_amd64.go2
-rw-r--r--runsc/boot/compat_test.go2
-rw-r--r--runsc/boot/config.go47
-rw-r--r--runsc/boot/controller.go45
-rw-r--r--runsc/boot/debug.go2
-rw-r--r--runsc/boot/events.go4
-rw-r--r--runsc/boot/fds.go35
-rw-r--r--runsc/boot/filter/BUILD4
-rw-r--r--runsc/boot/filter/config.go29
-rw-r--r--runsc/boot/filter/extra_filters.go4
-rw-r--r--runsc/boot/filter/extra_filters_msan.go2
-rw-r--r--runsc/boot/filter/extra_filters_race.go3
-rw-r--r--runsc/boot/filter/filter.go19
-rw-r--r--runsc/boot/fs.go324
-rw-r--r--runsc/boot/fs_test.go193
-rw-r--r--runsc/boot/limits.go4
-rw-r--r--runsc/boot/loader.go208
-rw-r--r--runsc/boot/loader_test.go22
-rw-r--r--runsc/boot/network.go22
-rw-r--r--runsc/boot/platforms/BUILD16
-rw-r--r--runsc/boot/platforms/platforms.go30
-rw-r--r--runsc/boot/pprof.go18
-rw-r--r--runsc/boot/strace.go2
-rw-r--r--runsc/boot/user.go146
-rw-r--r--runsc/boot/user_test.go253
27 files changed, 1194 insertions, 270 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index ac28c4339..5025401dd 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -16,9 +16,11 @@ go_library(
"limits.go",
"loader.go",
"network.go",
+ "pprof.go",
"strace.go",
+ "user.go",
],
- importpath = "gvisor.googlesource.com/gvisor/runsc/boot",
+ importpath = "gvisor.dev/gvisor/runsc/boot",
visibility = [
"//runsc:__subpackages__",
"//test:__subpackages__",
@@ -32,6 +34,7 @@ go_library(
"//pkg/log",
"//pkg/memutil",
"//pkg/rand",
+ "//pkg/refs",
"//pkg/sentry/arch",
"//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/context",
@@ -49,13 +52,10 @@ go_library(
"//pkg/sentry/kernel",
"//pkg/sentry/kernel:uncaught_signal_go_proto",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/kdefs",
"//pkg/sentry/limits",
"//pkg/sentry/loader",
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
- "//pkg/sentry/platform/kvm",
- "//pkg/sentry/platform/ptrace",
"//pkg/sentry/sighandling",
"//pkg/sentry/socket/epsocket",
"//pkg/sentry/socket/hostinet",
@@ -68,6 +68,7 @@ go_library(
"//pkg/sentry/time",
"//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
"//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
"//pkg/sentry/watchdog",
"//pkg/syserror",
"//pkg/tcpip",
@@ -83,6 +84,7 @@ go_library(
"//pkg/tcpip/transport/udp",
"//pkg/urpc",
"//runsc/boot/filter",
+ "//runsc/boot/platforms",
"//runsc/specutils",
"@com_github_golang_protobuf//proto:go_default_library",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
@@ -94,7 +96,9 @@ go_test(
size = "small",
srcs = [
"compat_test.go",
+ "fs_test.go",
"loader_test.go",
+ "user_test.go",
],
embed = [":boot"],
deps = [
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index c369e4d64..07e35ab10 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -21,14 +21,14 @@ import (
"syscall"
"github.com/golang/protobuf/proto"
- "gvisor.googlesource.com/gvisor/pkg/abi"
- "gvisor.googlesource.com/gvisor/pkg/eventchannel"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
- rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
- ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
- "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
- spb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/eventchannel"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+ ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
+ "gvisor.dev/gvisor/pkg/sentry/strace"
+ spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
)
func initCompatLogs(fd int) error {
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 99df5e614..43cd0db94 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -17,7 +17,7 @@ package boot
import (
"fmt"
- rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+ rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
)
// reportLimit is the max number of events that should be reported per tracker.
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
index ccec3d20c..388298d8d 100644
--- a/runsc/boot/compat_test.go
+++ b/runsc/boot/compat_test.go
@@ -17,7 +17,7 @@ package boot
import (
"testing"
- rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto"
+ rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
)
func TestOnceTracker(t *testing.T) {
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 8564c502d..6f1eb9a41 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -19,43 +19,9 @@ import (
"strconv"
"strings"
- "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
+ "gvisor.dev/gvisor/pkg/sentry/watchdog"
)
-// PlatformType tells which platform to use.
-type PlatformType int
-
-const (
- // PlatformPtrace runs the sandbox with the ptrace platform.
- PlatformPtrace PlatformType = iota
-
- // PlatformKVM runs the sandbox with the KVM platform.
- PlatformKVM
-)
-
-// MakePlatformType converts type from string.
-func MakePlatformType(s string) (PlatformType, error) {
- switch s {
- case "ptrace":
- return PlatformPtrace, nil
- case "kvm":
- return PlatformKVM, nil
- default:
- return 0, fmt.Errorf("invalid platform type %q", s)
- }
-}
-
-func (p PlatformType) String() string {
- switch p {
- case PlatformPtrace:
- return "ptrace"
- case PlatformKVM:
- return "kvm"
- default:
- return fmt.Sprintf("unknown(%d)", p)
- }
-}
-
// FileAccessType tells how the filesystem is accessed.
type FileAccessType int
@@ -187,7 +153,7 @@ type Config struct {
LogPackets bool
// Platform is the platform to run on.
- Platform PlatformType
+ Platform string
// Strace indicates that strace should be enabled.
Strace bool
@@ -226,6 +192,12 @@ type Config struct {
// to the same underlying network device. This allows netstack to better
// scale for high throughput use cases.
NumNetworkChannels int
+
+ // Rootless allows the sandbox to be started with a user that is not root.
+ // Defense is depth measures are weaker with rootless. Specifically, the
+ // sandbox and Gofer process run as root inside a user namespace with root
+ // mapped to the caller's user.
+ Rootless bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -241,7 +213,7 @@ func (c *Config) ToFlags() []string {
"--overlay=" + strconv.FormatBool(c.Overlay),
"--network=" + c.Network.String(),
"--log-packets=" + strconv.FormatBool(c.LogPackets),
- "--platform=" + c.Platform.String(),
+ "--platform=" + c.Platform,
"--strace=" + strconv.FormatBool(c.Strace),
"--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
"--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
@@ -250,6 +222,7 @@ func (c *Config) ToFlags() []string {
"--profile=" + strconv.FormatBool(c.ProfileEnable),
"--net-raw=" + strconv.FormatBool(c.EnableRaw),
"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
+ "--rootless=" + strconv.FormatBool(c.Rootless),
}
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// Only include if set since it is never to be used by users.
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index a277145b1..d79aaff60 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,17 +22,17 @@ import (
"syscall"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.googlesource.com/gvisor/pkg/control/server"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/control"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/state"
- "gvisor.googlesource.com/gvisor/pkg/sentry/time"
- "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
- "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.dev/gvisor/pkg/sentry/state"
+ "gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sentry/watchdog"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/urpc"
)
const (
@@ -96,8 +96,10 @@ const (
// SandboxStacks collects sandbox stacks for debugging.
SandboxStacks = "debug.Stacks"
+)
- // Profiling related commands (see pprof.go for more details).
+// Profiling related commands (see pprof.go for more details).
+const (
StartCPUProfile = "Profile.StartCPUProfile"
StopCPUProfile = "Profile.StopCPUProfile"
HeapProfile = "Profile.HeapProfile"
@@ -105,6 +107,11 @@ const (
StopTrace = "Profile.StopTrace"
)
+// Logging related commands (see logging.go for more details).
+const (
+ ChangeLogging = "Logging.Change"
+)
+
// ControlSocketAddr generates an abstract unix socket name for the given ID.
func ControlSocketAddr(id string) string {
return fmt.Sprintf("\x00runsc-sandbox.%s", id)
@@ -143,6 +150,7 @@ func newController(fd int, l *Loader) (*controller, error) {
}
srv.Register(&debug{})
+ srv.Register(&control.Logging{})
if l.conf.ProfileEnable {
srv.Register(&control.Profile{})
}
@@ -340,7 +348,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k)
+ mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
renv, err := mntr.createRestoreEnvironment(cm.l.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
@@ -359,6 +367,17 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("file cannot be empty")
}
+ if cm.l.conf.ProfileEnable {
+ // initializePProf opens /proc/self/maps, so has to be
+ // called before installing seccomp filters.
+ initializePProf()
+ }
+
+ // Seccomp filters have to be applied before parsing the state file.
+ if err := cm.l.installSeccompFilters(); err != nil {
+ return err
+ }
+
// Load the state.
loadOpts := state.LoadOpts{Source: specFile}
if err := loadOpts.Load(k, networkStack); err != nil {
diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go
index 79f7387ac..1fb32c527 100644
--- a/runsc/boot/debug.go
+++ b/runsc/boot/debug.go
@@ -15,7 +15,7 @@
package boot
import (
- "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/log"
)
type debug struct {
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
index ffd99f5e9..422f4da00 100644
--- a/runsc/boot/events.go
+++ b/runsc/boot/events.go
@@ -15,8 +15,8 @@
package boot
import (
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
- "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
)
// Event struct for encoding the event data to JSON. Corresponds to runc's
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 0811e10f4..e5de1f3d7 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -17,36 +17,27 @@ package boot
import (
"fmt"
- "gvisor.googlesource.com/gvisor/pkg/sentry/context"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
)
-// createFDMap creates an FD map that contains stdin, stdout, and stderr. If
-// console is true, then ioctl calls will be passed through to the host FD.
+// createFDTable creates an FD table that contains stdin, stdout, and stderr.
+// If console is true, then ioctl calls will be passed through to the host FD.
// Upon success, createFDMap dups then closes stdioFDs.
-func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
if len(stdioFDs) != 3 {
return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
k := kernel.KernelFromContext(ctx)
- fdm := k.NewFDMap()
- defer fdm.DecRef()
+ fdTable := k.NewFDTable()
+ defer fdTable.DecRef()
mounter := fs.FileOwnerFromContext(ctx)
- // Maps sandbox FD to host FD.
- fdMap := map[int]int{
- 0: stdioFDs[0],
- 1: stdioFDs[1],
- 2: stdioFDs[2],
- }
-
var ttyFile *fs.File
- for appFD, hostFD := range fdMap {
+ for appFD, hostFD := range stdioFDs {
var appFile *fs.File
if console && appFD < 3 {
@@ -80,11 +71,11 @@ func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs
}
// Add the file to the FD map.
- if err := fdm.NewFDAt(kdefs.FD(appFD), appFile, kernel.FDFlags{}, l); err != nil {
+ if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
return nil, err
}
}
- fdm.IncRef()
- return fdm, nil
+ fdTable.IncRef()
+ return fdTable, nil
}
diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD
index 3b6020cf3..f5509b6b7 100644
--- a/runsc/boot/filter/BUILD
+++ b/runsc/boot/filter/BUILD
@@ -11,7 +11,7 @@ go_library(
"extra_filters_race.go",
"filter.go",
],
- importpath = "gvisor.googlesource.com/gvisor/runsc/boot/filter",
+ importpath = "gvisor.dev/gvisor/runsc/boot/filter",
visibility = [
"//runsc/boot:__subpackages__",
],
@@ -20,8 +20,6 @@ go_library(
"//pkg/log",
"//pkg/seccomp",
"//pkg/sentry/platform",
- "//pkg/sentry/platform/kvm",
- "//pkg/sentry/platform/ptrace",
"//pkg/tcpip/link/fdbased",
"@org_golang_x_sys//unix:go_default_library",
],
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index ef2dbfad2..0ee5b8bbd 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -19,9 +19,9 @@ import (
"syscall"
"golang.org/x/sys/unix"
- "gvisor.googlesource.com/gvisor/pkg/abi/linux"
- "gvisor.googlesource.com/gvisor/pkg/seccomp"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/seccomp"
+ "gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
)
// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
@@ -437,29 +437,6 @@ func hostInetFilters() seccomp.SyscallRules {
}
}
-// ptraceFilters returns syscalls made exclusively by the ptrace platform.
-func ptraceFilters() seccomp.SyscallRules {
- return seccomp.SyscallRules{
- unix.SYS_GETCPU: {},
- unix.SYS_SCHED_SETAFFINITY: {},
- syscall.SYS_PTRACE: {},
- syscall.SYS_TGKILL: {},
- syscall.SYS_WAIT4: {},
- }
-}
-
-// kvmFilters returns syscalls made exclusively by the KVM platform.
-func kvmFilters() seccomp.SyscallRules {
- return seccomp.SyscallRules{
- syscall.SYS_ARCH_PRCTL: {},
- syscall.SYS_IOCTL: {},
- syscall.SYS_MMAP: {},
- syscall.SYS_RT_SIGSUSPEND: {},
- syscall.SYS_RT_SIGTIMEDWAIT: {},
- 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host.
- }
-}
-
func controlServerFilters(fd int) seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT: []seccomp.Rule{
diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go
index 5c5ec4e06..e28d4b8d6 100644
--- a/runsc/boot/filter/extra_filters.go
+++ b/runsc/boot/filter/extra_filters.go
@@ -17,11 +17,11 @@
package filter
import (
- "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.dev/gvisor/pkg/seccomp"
)
// instrumentationFilters returns additional filters for syscalls used by
-// Go intrumentation tools, e.g. -race, -msan.
+// Go instrumentation tools, e.g. -race, -msan.
// Returns empty when disabled.
func instrumentationFilters() seccomp.SyscallRules {
return nil
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
index ac5a0f1aa..5e5a3c998 100644
--- a/runsc/boot/filter/extra_filters_msan.go
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -19,7 +19,7 @@ package filter
import (
"syscall"
- "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.dev/gvisor/pkg/seccomp"
)
// instrumentationFilters returns additional filters for syscalls used by MSAN.
diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go
index ba3c1ce87..9ff80276a 100644
--- a/runsc/boot/filter/extra_filters_race.go
+++ b/runsc/boot/filter/extra_filters_race.go
@@ -19,7 +19,7 @@ package filter
import (
"syscall"
- "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.dev/gvisor/pkg/seccomp"
)
// instrumentationFilters returns additional filters for syscalls used by TSAN.
@@ -33,6 +33,7 @@ func instrumentationFilters() seccomp.SyscallRules {
syscall.SYS_MUNLOCK: {},
syscall.SYS_NANOSLEEP: {},
syscall.SYS_OPEN: {},
+ syscall.SYS_OPENAT: {},
syscall.SYS_SET_ROBUST_LIST: {},
// Used within glibc's malloc.
syscall.SYS_TIME: {},
diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go
index 17479e0dd..e80c171b3 100644
--- a/runsc/boot/filter/filter.go
+++ b/runsc/boot/filter/filter.go
@@ -18,13 +18,9 @@
package filter
import (
- "fmt"
-
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/seccomp"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/seccomp"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
)
// Options are seccomp filter related options.
@@ -53,14 +49,7 @@ func Install(opt Options) error {
s.Merge(profileFilters())
}
- switch p := opt.Platform.(type) {
- case *ptrace.PTrace:
- s.Merge(ptraceFilters())
- case *kvm.KVM:
- s.Merge(kvmFilters())
- default:
- return fmt.Errorf("unknown platform type %T", p)
- }
+ s.Merge(opt.Platform.SyscallFilters())
return seccomp.Install(s)
}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 939f2419c..d3e3196fd 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -18,29 +18,30 @@ import (
"fmt"
"path"
"path/filepath"
+ "sort"
"strconv"
"strings"
"syscall"
// Include filesystem types that OCI spec might mount.
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
+ "gvisor.dev/gvisor/pkg/sentry/fs/gofer"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
+ "gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.googlesource.com/gvisor/pkg/abi/linux"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/context"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
- "gvisor.googlesource.com/gvisor/pkg/syserror"
- "gvisor.googlesource.com/gvisor/runsc/specutils"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/runsc/specutils"
)
const (
@@ -50,6 +51,9 @@ const (
// Device name for root mount.
rootDevice = "9pfs-/"
+ // MountPrefix is the annotation prefix for mount hints.
+ MountPrefix = "gvisor.dev/spec/mount"
+
// ChildContainersDir is the directory where child container root
// filesystems are mounted.
ChildContainersDir = "/__runsc_containers__"
@@ -72,7 +76,7 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string,
tmpFS := mustFindFilesystem("tmpfs")
if !fs.IsDir(lower.StableAttr) {
// Create overlay on top of mount file, e.g. /etc/hostname.
- msrc := fs.NewCachingMountSource(tmpFS, upperFlags)
+ msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags)
return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
}
@@ -222,7 +226,11 @@ func mustFindFilesystem(name string) fs.Filesystem {
// addSubmountOverlay overlays the inode over a ramfs tree containing the given
// paths.
func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
- msrc := fs.NewPseudoMountSource()
+ // Construct a ramfs tree of mount points. The contents never
+ // change, so this can be fully caching. There's no real
+ // filesystem backing this tree, so we set the filesystem to
+ // nil.
+ msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{})
mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
if err != nil {
return nil, fmt.Errorf("creating mount tree: %v", err)
@@ -292,6 +300,174 @@ func (f *fdDispenser) empty() bool {
return len(f.fds) == 0
}
+type shareType int
+
+const (
+ invalid shareType = iota
+
+ // container shareType indicates that the mount is used by a single container.
+ container
+
+ // pod shareType indicates that the mount is used by more than one container
+ // inside the pod.
+ pod
+
+ // shared shareType indicates that the mount can also be shared with a process
+ // outside the pod, e.g. NFS.
+ shared
+)
+
+func parseShare(val string) (shareType, error) {
+ switch val {
+ case "container":
+ return container, nil
+ case "pod":
+ return pod, nil
+ case "shared":
+ return shared, nil
+ default:
+ return 0, fmt.Errorf("invalid share value %q", val)
+ }
+}
+
+func (s shareType) String() string {
+ switch s {
+ case invalid:
+ return "invalid"
+ case container:
+ return "container"
+ case pod:
+ return "pod"
+ case shared:
+ return "shared"
+ default:
+ return fmt.Sprintf("invalid share value %d", s)
+ }
+}
+
+// mountHint represents extra information about mounts that are provided via
+// annotations. They can override mount type, and provide sharing information
+// so that mounts can be correctly shared inside the pod.
+type mountHint struct {
+ name string
+ share shareType
+ mount specs.Mount
+
+ // root is the inode where the volume is mounted. For mounts with 'pod' share
+ // the volume is mounted once and then bind mounted inside the containers.
+ root *fs.Inode
+}
+
+func (m *mountHint) setField(key, val string) error {
+ switch key {
+ case "source":
+ if len(val) == 0 {
+ return fmt.Errorf("source cannot be empty")
+ }
+ m.mount.Source = val
+ case "type":
+ return m.setType(val)
+ case "share":
+ share, err := parseShare(val)
+ if err != nil {
+ return err
+ }
+ m.share = share
+ case "options":
+ return m.setOptions(val)
+ default:
+ return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
+ }
+ return nil
+}
+
+func (m *mountHint) setType(val string) error {
+ switch val {
+ case "tmpfs", "bind":
+ m.mount.Type = val
+ default:
+ return fmt.Errorf("invalid type %q", val)
+ }
+ return nil
+}
+
+func (m *mountHint) setOptions(val string) error {
+ opts := strings.Split(val, ",")
+ if err := specutils.ValidateMountOptions(opts); err != nil {
+ return err
+ }
+ // Sort options so it can be compared with container mount options later on.
+ sort.Strings(opts)
+ m.mount.Options = opts
+ return nil
+}
+
+func (m *mountHint) isSupported() bool {
+ return m.mount.Type == tmpfs && m.share == pod
+}
+
+// podMountHints contains a collection of mountHints for the pod.
+type podMountHints struct {
+ mounts map[string]*mountHint
+}
+
+func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
+ mnts := make(map[string]*mountHint)
+ for k, v := range spec.Annotations {
+ // Look for 'gvisor.dev/spec/mount' annotations and parse them.
+ if strings.HasPrefix(k, MountPrefix) {
+ parts := strings.Split(k, "/")
+ if len(parts) != 5 {
+ return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
+ }
+ name := parts[3]
+ if len(name) == 0 || path.Clean(name) != name {
+ return nil, fmt.Errorf("invalid mount name: %s", name)
+ }
+ mnt := mnts[name]
+ if mnt == nil {
+ mnt = &mountHint{name: name}
+ mnts[name] = mnt
+ }
+ if err := mnt.setField(parts[4], v); err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ // Validate all hints after done parsing.
+ for name, m := range mnts {
+ log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
+ if m.share == invalid {
+ return nil, fmt.Errorf("share field for %q has not been set", m.name)
+ }
+ if len(m.mount.Source) == 0 {
+ return nil, fmt.Errorf("source field for %q has not been set", m.name)
+ }
+ if len(m.mount.Type) == 0 {
+ return nil, fmt.Errorf("type field for %q has not been set", m.name)
+ }
+
+ // Check for duplicate mount sources.
+ for name2, m2 := range mnts {
+ if name != name2 && m.mount.Source == m2.mount.Source {
+ return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
+ }
+ }
+ }
+
+ return &podMountHints{mounts: mnts}, nil
+}
+
+func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
+ for _, m := range p.mounts {
+ if m.mount.Source == mount.Source {
+ return m
+ }
+ }
+ return nil
+}
+
type containerMounter struct {
// cid is the container ID. May be set to empty for the root container.
cid string
@@ -306,15 +482,18 @@ type containerMounter struct {
fds fdDispenser
k *kernel.Kernel
+
+ hints *podMountHints
}
-func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel) *containerMounter {
+func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
return &containerMounter{
cid: cid,
root: spec.Root,
mounts: compileMounts(spec),
fds: fdDispenser{fds: goferFDs},
k: k,
+ hints: hints,
}
}
@@ -476,6 +655,15 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error
// 'setMountNS' is called after namespace is created. It must set the mount NS
// to 'rootCtx'.
func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
+ for _, hint := range c.hints.mounts {
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ inode, err := c.mountSharedMaster(rootCtx, conf, hint)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.root = inode
+ }
+
// Create a tmpfs mount where we create and mount a root filesystem for
// each child container.
c.mounts = append(c.mounts, specs.Mount{
@@ -498,21 +686,57 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c
return c.mountSubmounts(rootCtx, conf, mns, root)
}
+// mountSharedMaster mounts the master of a volume that is shared among
+// containers in a pod. It returns the root mount's inode.
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
+ if err != nil {
+ return nil, err
+ }
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+
+ // Mount with revalidate because it's shared among containers.
+ opts = append(opts, "cache=revalidate")
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(hint.mount.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of shared mount %q", hint.name)
+ inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return inode, nil
+}
+
// createRootMount creates the root filesystem.
func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
// First construct the filesystem from the spec.Root.
mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
- var (
- rootInode *fs.Inode
- err error
- )
-
fd := c.fds.remove()
log.Infof("Mounting root over 9P, ioFD: %d", fd)
p9FS := mustFindFilesystem("9p")
opts := p9MountOptions(fd, conf.FileAccess)
- rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+ rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
if err != nil {
return nil, fmt.Errorf("creating root mount point: %v", err)
}
@@ -579,8 +803,14 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
for _, m := range c.mounts {
- if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
- return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+ if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+ }
+ } else {
+ if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+ return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ }
}
}
@@ -653,6 +883,40 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
return nil
}
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
+ // For now enforce that all options are the same. Once bind mount is properly
+ // supported, then we should ensure the master is less restrictive than the
+ // container, e.g. master can be 'rw' while container mounts as 'ro'.
+ if len(mount.Options) != len(source.mount.Options) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ sort.Strings(mount.Options)
+ for i, opt := range mount.Options {
+ if opt != source.mount.Options[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ }
+
+ maxTraversals := uint(0)
+ target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
+ }
+ defer target.DecRef()
+
+ // Take a ref on the inode that is about to be (re)-mounted.
+ source.root.IncRef()
+ if err := mns.Mount(ctx, target, source.root); err != nil {
+ source.root.DecRef()
+ return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
+ }
+
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return nil
+}
+
// addRestoreMount adds a mount to the MountSources map used for restoring a
// checkpointed container.
func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
@@ -678,8 +942,8 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
return nil
}
-// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
-// to the environment.
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
+// the mounts to the environment.
func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
renv := &fs.RestoreEnvironment{
MountSources: make(map[string][]fs.MountArgs),
@@ -730,7 +994,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
-// 1. /tmp is mounted explictly: we should not override user's wish
+// 1. /tmp is mounted explicitly: we should not override user's wish
// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
new file mode 100644
index 000000000..49ab34b33
--- /dev/null
+++ b/runsc/boot/fs_test.go
@@ -0,0 +1,193 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "path"
+ "reflect"
+ "strings"
+ "testing"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestPodMountHintsHappy(t *testing.T) {
+ spec := &specs.Spec{
+ Annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+
+ path.Join(MountPrefix, "mount2", "source"): "bar",
+ path.Join(MountPrefix, "mount2", "type"): "bind",
+ path.Join(MountPrefix, "mount2", "share"): "container",
+ path.Join(MountPrefix, "mount2", "options"): "rw,private",
+ },
+ }
+ podHints, err := newPodMountHints(spec)
+ if err != nil {
+ t.Errorf("newPodMountHints failed: %v", err)
+ }
+
+ // Check that fields were set correctly.
+ mount1 := podHints.mounts["mount1"]
+ if want := "mount1"; want != mount1.name {
+ t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name)
+ }
+ if want := "foo"; want != mount1.mount.Source {
+ t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source)
+ }
+ if want := "tmpfs"; want != mount1.mount.Type {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type)
+ }
+ if want := pod; want != mount1.share {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share)
+ }
+ if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options)
+ }
+
+ mount2 := podHints.mounts["mount2"]
+ if want := "mount2"; want != mount2.name {
+ t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name)
+ }
+ if want := "bar"; want != mount2.mount.Source {
+ t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source)
+ }
+ if want := "bind"; want != mount2.mount.Type {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type)
+ }
+ if want := container; want != mount2.share {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share)
+ }
+ if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options)
+ }
+}
+
+func TestPodMountHintsErrors(t *testing.T) {
+ for _, tst := range []struct {
+ name string
+ annotations map[string]string
+ error string
+ }{
+ {
+ name: "too short",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1"): "foo",
+ },
+ error: "invalid mount annotation",
+ },
+ {
+ name: "no name",
+ annotations: map[string]string{
+ MountPrefix + "//source": "foo",
+ },
+ error: "invalid mount name",
+ },
+ {
+ name: "missing source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "source field",
+ },
+ {
+ name: "missing type",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "type field",
+ },
+ {
+ name: "missing share",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ },
+ error: "share field",
+ },
+ {
+ name: "invalid field name",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "invalid"): "foo",
+ },
+ error: "invalid mount annotation",
+ },
+ {
+ name: "invalid source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "source cannot be empty",
+ },
+ {
+ name: "invalid type",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "invalid-type",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "invalid type",
+ },
+ {
+ name: "invalid share",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "invalid-share",
+ },
+ error: "invalid share",
+ },
+ {
+ name: "invalid options",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+ },
+ error: "unknown mount option",
+ },
+ {
+ name: "duplicate source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+
+ path.Join(MountPrefix, "mount2", "source"): "foo",
+ path.Join(MountPrefix, "mount2", "type"): "bind",
+ path.Join(MountPrefix, "mount2", "share"): "container",
+ },
+ error: "have the same mount source",
+ },
+ } {
+ t.Run(tst.name, func(t *testing.T) {
+ spec := &specs.Spec{Annotations: tst.annotations}
+ podHints, err := newPodMountHints(spec)
+ if err == nil || !strings.Contains(err.Error(), tst.error) {
+ t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err)
+ }
+ if podHints != nil {
+ t.Errorf("newPodMountHints must return nil on failure: %+v", podHints)
+ }
+ })
+ }
+}
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
index 3364aa5e6..d1c0bb9b5 100644
--- a/runsc/boot/limits.go
+++ b/runsc/boot/limits.go
@@ -20,8 +20,8 @@ import (
"syscall"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
)
// Mapping from linux resource names to limits.LimitType.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 42bddb2e8..8e8c6105b 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -20,51 +20,52 @@ import (
mrand "math/rand"
"os"
"runtime"
+ "strings"
"sync"
"sync/atomic"
"syscall"
gtime "time"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.googlesource.com/gvisor/pkg/abi/linux"
- "gvisor.googlesource.com/gvisor/pkg/cpuid"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/memutil"
- "gvisor.googlesource.com/gvisor/pkg/rand"
- "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
- "gvisor.googlesource.com/gvisor/pkg/sentry/control"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
- "gvisor.googlesource.com/gvisor/pkg/sentry/inet"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
- "gvisor.googlesource.com/gvisor/pkg/sentry/loader"
- "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
- "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling"
- slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux"
- "gvisor.googlesource.com/gvisor/pkg/sentry/time"
- "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
- "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
- "gvisor.googlesource.com/gvisor/pkg/tcpip"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
- "gvisor.googlesource.com/gvisor/runsc/boot/filter"
- "gvisor.googlesource.com/gvisor/runsc/specutils"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/memutil"
+ "gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/loader"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/sighandling"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/watchdog"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+ "gvisor.dev/gvisor/runsc/boot/filter"
+ _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
+ "gvisor.dev/gvisor/runsc/specutils"
// Include supported socket providers.
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route"
- _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
+ "gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
+ _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
+ _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+ _ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
// Loader keeps state needed to start the kernel and run the container..
@@ -117,6 +118,10 @@ type Loader struct {
//
// processes is guardded by mu.
processes map[execID]*execProcess
+
+ // mountHints provides extra information about mounts for containers that
+ // apply to the entire pod.
+ mountHints *podMountHints
}
// execID uniquely identifies a sentry process that is executed in a container.
@@ -201,7 +206,9 @@ func New(args Args) (*Loader, error) {
// Create VDSO.
//
// Pass k as the platform since it is savable, unlike the actual platform.
- vdso, err := loader.PrepareVDSO(k)
+ //
+ // FIXME(b/109889800): Use non-nil context.
+ vdso, err := loader.PrepareVDSO(nil, k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
@@ -255,7 +262,7 @@ func New(args Args) (*Loader, error) {
// Adjust the total memory returned by the Sentry so that applications that
// use /proc/meminfo can make allocations based on this limit.
usage.MinimumTotalMemoryBytes = args.TotalMem
- log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(2^30))
+ log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
}
// Initiate the Kernel object, which is required by the Context passed
@@ -299,6 +306,11 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing compat logs: %v", err)
}
+ mountHints, err := newPodMountHints(args.Spec)
+ if err != nil {
+ return nil, fmt.Errorf("creating pod mount hints: %v", err)
+ }
+
eid := execID{cid: args.ID}
l := &Loader{
k: k,
@@ -311,6 +323,7 @@ func New(args Args) (*Loader, error) {
rootProcArgs: procArgs,
sandboxID: args.ID,
processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
}
// We don't care about child signals; some platforms can generate a
@@ -402,19 +415,12 @@ func (l *Loader) Destroy() {
}
func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
- switch conf.Platform {
- case PlatformPtrace:
- log.Infof("Platform: ptrace")
- return ptrace.New()
- case PlatformKVM:
- log.Infof("Platform: kvm")
- if deviceFile == nil {
- return nil, fmt.Errorf("kvm device file must be provided")
- }
- return kvm.New(deviceFile)
- default:
- return nil, fmt.Errorf("invalid platform %v", conf.Platform)
+ p, err := platform.Lookup(conf.Platform)
+ if err != nil {
+ panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
}
+ log.Infof("Platform: %s", conf.Platform)
+ return p.New(deviceFile)
}
func createMemoryFile() (*pgalloc.MemoryFile, error) {
@@ -435,6 +441,23 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return mf, nil
}
+func (l *Loader) installSeccompFilters() error {
+ if l.conf.DisableSeccomp {
+ filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+ } else {
+ opts := filter.Options{
+ Platform: l.k.Platform,
+ HostNetwork: l.conf.Network == NetworkHost,
+ ProfileEnable: l.conf.ProfileEnable,
+ ControllerFD: l.ctrl.srv.FD(),
+ }
+ if err := filter.Install(opts); err != nil {
+ return fmt.Errorf("installing seccomp filters: %v", err)
+ }
+ }
+ return nil
+}
+
// Run runs the root container.
func (l *Loader) Run() error {
err := l.run()
@@ -470,39 +493,33 @@ func (l *Loader) run() error {
return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
}
- // Finally done with all configuration. Setup filters before user code
- // is loaded.
- if l.conf.DisableSeccomp {
- filter.Report("syscall filter is DISABLED. Running in less secure mode.")
- } else {
- opts := filter.Options{
- Platform: l.k.Platform,
- HostNetwork: l.conf.Network == NetworkHost,
- ProfileEnable: l.conf.ProfileEnable,
- ControllerFD: l.ctrl.srv.FD(),
- }
- if err := filter.Install(opts); err != nil {
- return fmt.Errorf("installing seccomp filters: %v", err)
- }
- }
-
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
if !l.restore {
+ if l.conf.ProfileEnable {
+ initializePProf()
+ }
+
+ // Finally done with all configuration. Setup filters before user code
+ // is loaded.
+ if err := l.installSeccompFilters(); err != nil {
+ return err
+ }
+
// Create the FD map, which will set stdin, stdout, and stderr. If console
// is true, then ioctl calls will be passed through to the host fd.
ctx := l.rootProcArgs.NewContext(l.k)
- fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs)
+ fdTable, err := createFDTable(ctx, l.console, l.stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
// CreateProcess takes a reference on FDMap if successful. We won't need
// ours either way.
- l.rootProcArgs.FDMap = fdm
+ l.rootProcArgs.FDTable = fdTable
// cid for root container can be empty. Only subcontainers need it to set
// the mount location.
- mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k)
+ mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
return err
}
@@ -513,19 +530,37 @@ func (l *Loader) run() error {
return err
}
+ // Read /etc/passwd for the user's HOME directory and set the HOME
+ // environment variable as required by POSIX if it is not overridden by
+ // the user.
+ hasHomeEnvv := false
+ for _, envv := range l.rootProcArgs.Envv {
+ if strings.HasPrefix(envv, "HOME=") {
+ hasHomeEnvv = true
+ }
+ }
+ if !hasHomeEnvv {
+ homeDir, err := getExecUserHome(rootCtx, rootMns, uint32(l.rootProcArgs.Credentials.RealKUID))
+ if err != nil {
+ return fmt.Errorf("error reading exec user: %v", err)
+ }
+
+ l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir)
+ }
+
// Create the root container init task. It will begin running
// when the kernel is started.
if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
return fmt.Errorf("creating init process: %v", err)
}
- // CreateProcess takes a reference on FDMap if successful.
- l.rootProcArgs.FDMap.DecRef()
+ // CreateProcess takes a reference on FDTable if successful.
+ l.rootProcArgs.FDTable.DecRef()
}
ep.tg = l.k.GlobalInit()
if l.console {
- ttyFile := l.rootProcArgs.FDMap.GetFile(0)
+ ttyFile, _ := l.rootProcArgs.FDTable.Get(0)
defer ttyFile.DecRef()
ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations)
@@ -605,13 +640,13 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
// Create the FD map, which will set stdin, stdout, and stderr.
ctx := procArgs.NewContext(l.k)
- fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs)
+ fdTable, err := createFDTable(ctx, false, stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
- // CreateProcess takes a reference on FDMap if successful. We won't need ours
- // either way.
- procArgs.FDMap = fdm
+ // CreateProcess takes a reference on fdTable if successful. We won't
+ // need ours either way.
+ procArgs.FDTable = fdTable
// Can't take ownership away from os.File. dup them to get a new FDs.
var goferFDs []int
@@ -623,7 +658,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
goferFDs = append(goferFDs, fd)
}
- mntr := newContainerMounter(spec, cid, goferFDs, l.k)
+ mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
return fmt.Errorf("configuring container FS: %v", err)
}
@@ -640,8 +675,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
}
l.k.StartProcess(tg)
- // CreateProcess takes a reference on FDMap if successful.
- procArgs.FDMap.DecRef()
+ // CreateProcess takes a reference on FDTable if successful.
+ procArgs.FDTable.DecRef()
l.processes[eid].tg = tg
return nil
@@ -805,9 +840,17 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
// privileges.
Raw: true,
})}
+
+ // Enable SACK Recovery.
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
return nil, fmt.Errorf("failed to enable SACK: %v", err)
}
+
+ // Enable Receive Buffer Auto-Tuning.
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+ }
+
return &s, nil
default:
@@ -956,3 +999,8 @@ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host
}
return ep.tg, ep.tty, nil
}
+
+func init() {
+ // TODO(gvisor.dev/issue/365): Make this configurable.
+ refs.SetLeakMode(refs.NoLeakChecking)
+}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 6393cb3fb..ff713660d 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -25,18 +25,21 @@ import (
"time"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.googlesource.com/gvisor/pkg/control/server"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/p9"
- "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
- "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
- "gvisor.googlesource.com/gvisor/pkg/unet"
- "gvisor.googlesource.com/gvisor/runsc/fsgofer"
+ "gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/unet"
+ "gvisor.dev/gvisor/runsc/fsgofer"
)
func init() {
log.SetLevel(log.Debug)
rand.Seed(time.Now().UnixNano())
+ if err := fsgofer.OpenProcSelfFD(); err != nil {
+ panic(err)
+ }
}
func testConfig() *Config {
@@ -44,6 +47,7 @@ func testConfig() *Config {
RootDir: "unused_root_dir",
Network: NetworkNone,
DisableSeccomp: true,
+ Platform: "ptrace",
}
}
@@ -404,7 +408,7 @@ func TestCreateMountNamespace(t *testing.T) {
mns = m
ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
}
- mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil)
+ mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{})
if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
}
@@ -610,7 +614,7 @@ func TestRestoreEnvironment(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
- mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil)
+ mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{})
actualRenv, err := mntr.createRestoreEnvironment(conf)
if !tc.errorExpected && err != nil {
t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 82c259f47..d3d98243d 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -19,16 +19,16 @@ import (
"net"
"syscall"
- "gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/tcpip"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/link/loopback"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
- "gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
- "gvisor.googlesource.com/gvisor/pkg/urpc"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/urpc"
)
// Network exposes methods that can be used to configure a network stack.
@@ -56,7 +56,7 @@ type FDBasedLink struct {
Addresses []net.IP
Routes []Route
GSOMaxSize uint32
- LinkAddress []byte
+ LinkAddress net.HardwareAddr
// NumChannels controls how many underlying FD's are to be used to
// create this endpoint.
diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD
new file mode 100644
index 000000000..03391cdca
--- /dev/null
+++ b/runsc/boot/platforms/BUILD
@@ -0,0 +1,16 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "platforms",
+ srcs = ["platforms.go"],
+ importpath = "gvisor.dev/gvisor/runsc/boot/platforms",
+ visibility = [
+ "//runsc:__subpackages__",
+ ],
+ deps = [
+ "//pkg/sentry/platform/kvm",
+ "//pkg/sentry/platform/ptrace",
+ ],
+)
diff --git a/runsc/boot/platforms/platforms.go b/runsc/boot/platforms/platforms.go
new file mode 100644
index 000000000..056b46ad5
--- /dev/null
+++ b/runsc/boot/platforms/platforms.go
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package platforms imports all available platform packages.
+package platforms
+
+import (
+ // Import platforms that runsc might use.
+ _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+ _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+const (
+ // Ptrace runs the sandbox with the ptrace platform.
+ Ptrace = "ptrace"
+
+ // KVM runs the sandbox with the KVM platform.
+ KVM = "kvm"
+)
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go
new file mode 100644
index 000000000..463362f02
--- /dev/null
+++ b/runsc/boot/pprof.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+func initializePProf() {
+}
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index 19c7f8fbd..fbfd3b07c 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,7 +15,7 @@
package boot
import (
- "gvisor.googlesource.com/gvisor/pkg/sentry/strace"
+ "gvisor.dev/gvisor/pkg/sentry/strace"
)
func enableStrace(conf *Config) error {
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
new file mode 100644
index 000000000..d1d423a5c
--- /dev/null
+++ b/runsc/boot/user.go
@@ -0,0 +1,146 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "bufio"
+ "io"
+ "strconv"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+type fileReader struct {
+ // Ctx is the context for the file reader.
+ Ctx context.Context
+
+ // File is the file to read from.
+ File *fs.File
+}
+
+// Read implements io.Reader.Read.
+func (r *fileReader) Read(buf []byte) (int, error) {
+ n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf))
+ return int(n), err
+}
+
+// getExecUserHome returns the home directory of the executing user read from
+// /etc/passwd as read from the container filesystem.
+func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32) (string, error) {
+ // The default user home directory to return if no user matching the user
+ // if found in the /etc/passwd found in the image.
+ const defaultHome = "/"
+
+ // Open the /etc/passwd file from the dirent via the root mount namespace.
+ mnsRoot := rootMns.Root()
+ maxTraversals := uint(linux.MaxSymlinkTraversals)
+ dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals)
+ if err != nil {
+ // NOTE: Ignore errors opening the passwd file. If the passwd file
+ // doesn't exist we will return the default home directory.
+ return defaultHome, nil
+ }
+ defer dirent.DecRef()
+
+ // Check read permissions on the file.
+ if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil {
+ // NOTE: Ignore permissions errors here and return default root dir.
+ return defaultHome, nil
+ }
+
+ // Only open regular files. We don't open other files like named pipes as
+ // they may block and might present some attack surface to the container.
+ // Note that runc does not seem to do this kind of checking.
+ if !fs.IsRegular(dirent.Inode.StableAttr) {
+ return defaultHome, nil
+ }
+
+ f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false})
+ if err != nil {
+ return "", err
+ }
+ defer f.DecRef()
+
+ r := &fileReader{
+ Ctx: ctx,
+ File: f,
+ }
+
+ homeDir, err := findHomeInPasswd(uid, r, defaultHome)
+ if err != nil {
+ return "", err
+ }
+
+ return homeDir, nil
+}
+
+// findHomeInPasswd parses a passwd file and returns the given user's home
+// directory. This function does it's best to replicate the runc's behavior.
+func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) {
+ s := bufio.NewScanner(passwd)
+
+ for s.Scan() {
+ if err := s.Err(); err != nil {
+ return "", err
+ }
+
+ line := strings.TrimSpace(s.Text())
+ if line == "" {
+ continue
+ }
+
+ // Pull out part of passwd entry. Loosely parse the passwd entry as some
+ // passwd files could be poorly written and for compatibility with runc.
+ //
+ // Per 'man 5 passwd'
+ // /etc/passwd contains one line for each user account, with seven
+ // fields delimited by colons (“:”). These fields are:
+ //
+ // - login name
+ // - optional encrypted password
+ // - numerical user ID
+ // - numerical group ID
+ // - user name or comment field
+ // - user home directory
+ // - optional user command interpreter
+ parts := strings.Split(line, ":")
+
+ found := false
+ homeDir := ""
+ for i, p := range parts {
+ switch i {
+ case 2:
+ parsedUID, err := strconv.ParseUint(p, 10, 32)
+ if err == nil && parsedUID == uint64(uid) {
+ found = true
+ }
+ case 5:
+ homeDir = p
+ }
+ }
+ if found {
+ // NOTE: If the uid is present but the home directory is not
+ // present in the /etc/passwd entry we return an empty string. This
+ // is, for better or worse, what runc does.
+ return homeDir, nil
+ }
+ }
+
+ return defaultHome, nil
+}
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
new file mode 100644
index 000000000..834003430
--- /dev/null
+++ b/runsc/boot/user_test.go
@@ -0,0 +1,253 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strings"
+ "syscall"
+ "testing"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+)
+
+func setupTempDir() (string, error) {
+ tmpDir, err := ioutil.TempDir(os.TempDir(), "exec-user-test")
+ if err != nil {
+ return "", err
+ }
+ return tmpDir, nil
+}
+
+func setupPasswd(contents string, perms os.FileMode) func() (string, error) {
+ return func() (string, error) {
+ tmpDir, err := setupTempDir()
+ if err != nil {
+ return "", err
+ }
+
+ if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
+ return "", err
+ }
+
+ f, err := os.Create(filepath.Join(tmpDir, "etc", "passwd"))
+ if err != nil {
+ return "", err
+ }
+ defer f.Close()
+
+ _, err = f.WriteString(contents)
+ if err != nil {
+ return "", err
+ }
+
+ err = f.Chmod(perms)
+ if err != nil {
+ return "", err
+ }
+ return tmpDir, nil
+ }
+}
+
+// TestGetExecUserHome tests the getExecUserHome function.
+func TestGetExecUserHome(t *testing.T) {
+ tests := map[string]struct {
+ uid uint32
+ createRoot func() (string, error)
+ expected string
+ }{
+ "success": {
+ uid: 1000,
+ createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0666),
+ expected: "/home/adin",
+ },
+ "no_passwd": {
+ uid: 1000,
+ createRoot: setupTempDir,
+ expected: "/",
+ },
+ "no_perms": {
+ uid: 1000,
+ createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0000),
+ expected: "/",
+ },
+ "directory": {
+ uid: 1000,
+ createRoot: func() (string, error) {
+ tmpDir, err := setupTempDir()
+ if err != nil {
+ return "", err
+ }
+
+ if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
+ return "", err
+ }
+
+ if err := syscall.Mkdir(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
+ return "", err
+ }
+
+ return tmpDir, nil
+ },
+ expected: "/",
+ },
+ // Currently we don't allow named pipes.
+ "named_pipe": {
+ uid: 1000,
+ createRoot: func() (string, error) {
+ tmpDir, err := setupTempDir()
+ if err != nil {
+ return "", err
+ }
+
+ if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil {
+ return "", err
+ }
+
+ if err := syscall.Mkfifo(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil {
+ return "", err
+ }
+
+ return tmpDir, nil
+ },
+ expected: "/",
+ },
+ }
+
+ for name, tc := range tests {
+ t.Run(name, func(t *testing.T) {
+ tmpDir, err := tc.createRoot()
+ if err != nil {
+ t.Fatalf("failed to create root dir: %v", err)
+ }
+
+ sandEnd, cleanup, err := startGofer(tmpDir)
+ if err != nil {
+ t.Fatalf("failed to create gofer: %v", err)
+ }
+ defer cleanup()
+
+ ctx := contexttest.Context(t)
+ conf := &Config{
+ RootDir: "unused_root_dir",
+ Network: NetworkNone,
+ DisableSeccomp: true,
+ }
+
+ spec := &specs.Spec{
+ Root: &specs.Root{
+ Path: tmpDir,
+ Readonly: true,
+ },
+ // Add /proc mount as tmpfs to avoid needing a kernel.
+ Mounts: []specs.Mount{
+ {
+ Destination: "/proc",
+ Type: "tmpfs",
+ },
+ },
+ }
+
+ var mns *fs.MountNamespace
+ setMountNS := func(m *fs.MountNamespace) {
+ mns = m
+ ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
+ }
+ mntr := newContainerMounter(spec, "", []int{sandEnd}, nil, &podMountHints{})
+ if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
+ t.Fatalf("failed to create mount namespace: %v", err)
+ }
+
+ got, err := getExecUserHome(ctx, mns, tc.uid)
+ if err != nil {
+ t.Fatalf("failed to get user home: %v", err)
+ }
+
+ if got != tc.expected {
+ t.Fatalf("expected %v, got: %v", tc.expected, got)
+ }
+ })
+ }
+}
+
+// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing.
+func TestFindHomeInPasswd(t *testing.T) {
+ tests := map[string]struct {
+ uid uint32
+ passwd string
+ expected string
+ def string
+ }{
+ "empty": {
+ uid: 1000,
+ passwd: "",
+ expected: "/",
+ def: "/",
+ },
+ "whitespace": {
+ uid: 1000,
+ passwd: " ",
+ expected: "/",
+ def: "/",
+ },
+ "full": {
+ uid: 1000,
+ passwd: "adin::1000:1111::/home/adin:/bin/sh",
+ expected: "/home/adin",
+ def: "/",
+ },
+ // For better or worse, this is how runc works.
+ "partial": {
+ uid: 1000,
+ passwd: "adin::1000:1111:",
+ expected: "",
+ def: "/",
+ },
+ "multiple": {
+ uid: 1001,
+ passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh",
+ expected: "/home/ian",
+ def: "/",
+ },
+ "duplicate": {
+ uid: 1000,
+ passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh",
+ expected: "/home/adin",
+ def: "/",
+ },
+ "empty_lines": {
+ uid: 1001,
+ passwd: "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh",
+ expected: "/home/ian",
+ def: "/",
+ },
+ }
+
+ for name, tc := range tests {
+ t.Run(name, func(t *testing.T) {
+ got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def)
+ if err != nil {
+ t.Fatalf("error parsing passwd: %v", err)
+ }
+ if tc.expected != got {
+ t.Fatalf("expected %v, got: %v", tc.expected, got)
+ }
+ })
+ }
+}