diff options
Diffstat (limited to 'runsc/boot')
-rw-r--r-- | runsc/boot/BUILD | 9 | ||||
-rw-r--r-- | runsc/boot/compat.go | 16 | ||||
-rw-r--r-- | runsc/boot/compat_amd64.go | 2 | ||||
-rw-r--r-- | runsc/boot/compat_test.go | 2 | ||||
-rw-r--r-- | runsc/boot/config.go | 15 | ||||
-rw-r--r-- | runsc/boot/controller.go | 35 | ||||
-rw-r--r-- | runsc/boot/debug.go | 2 | ||||
-rw-r--r-- | runsc/boot/events.go | 4 | ||||
-rw-r--r-- | runsc/boot/fds.go | 12 | ||||
-rw-r--r-- | runsc/boot/filter/BUILD | 2 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 10 | ||||
-rw-r--r-- | runsc/boot/filter/extra_filters.go | 2 | ||||
-rw-r--r-- | runsc/boot/filter/extra_filters_msan.go | 2 | ||||
-rw-r--r-- | runsc/boot/filter/extra_filters_race.go | 2 | ||||
-rw-r--r-- | runsc/boot/filter/filter.go | 10 | ||||
-rw-r--r-- | runsc/boot/fs.go | 313 | ||||
-rw-r--r-- | runsc/boot/fs_test.go | 193 | ||||
-rw-r--r-- | runsc/boot/limits.go | 4 | ||||
-rw-r--r-- | runsc/boot/loader.go | 153 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 18 | ||||
-rw-r--r-- | runsc/boot/network.go | 61 | ||||
-rw-r--r-- | runsc/boot/pprof.go | 18 | ||||
-rw-r--r-- | runsc/boot/strace.go | 2 | ||||
-rw-r--r-- | runsc/boot/user.go | 146 | ||||
-rw-r--r-- | runsc/boot/user_test.go | 253 |
25 files changed, 1122 insertions, 164 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index df9907e52..7ec15a524 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -16,9 +16,11 @@ go_library( "limits.go", "loader.go", "network.go", + "pprof.go", "strace.go", + "user.go", ], - importpath = "gvisor.googlesource.com/gvisor/runsc/boot", + importpath = "gvisor.dev/gvisor/runsc/boot", visibility = [ "//runsc:__subpackages__", "//test:__subpackages__", @@ -30,6 +32,7 @@ go_library( "//pkg/cpuid", "//pkg/eventchannel", "//pkg/log", + "//pkg/memutil", "//pkg/rand", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", @@ -51,7 +54,6 @@ go_library( "//pkg/sentry/kernel/kdefs", "//pkg/sentry/limits", "//pkg/sentry/loader", - "//pkg/sentry/memutil", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", @@ -68,6 +70,7 @@ go_library( "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", + "//pkg/sentry/usermem", "//pkg/sentry/watchdog", "//pkg/syserror", "//pkg/tcpip", @@ -94,7 +97,9 @@ go_test( size = "small", srcs = [ "compat_test.go", + "fs_test.go", "loader_test.go", + "user_test.go", ], embed = [":boot"], deps = [ diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index c369e4d64..07e35ab10 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -21,14 +21,14 @@ import ( "syscall" "github.com/golang/protobuf/proto" - "gvisor.googlesource.com/gvisor/pkg/abi" - "gvisor.googlesource.com/gvisor/pkg/eventchannel" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" - ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" - "gvisor.googlesource.com/gvisor/pkg/sentry/strace" - spb "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" + ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" + "gvisor.dev/gvisor/pkg/sentry/strace" + spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" ) func initCompatLogs(fd int) error { diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go index 99df5e614..43cd0db94 100644 --- a/runsc/boot/compat_amd64.go +++ b/runsc/boot/compat_amd64.go @@ -17,7 +17,7 @@ package boot import ( "fmt" - rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ) // reportLimit is the max number of events that should be reported per tracker. diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go index ccec3d20c..388298d8d 100644 --- a/runsc/boot/compat_test.go +++ b/runsc/boot/compat_test.go @@ -17,7 +17,7 @@ package boot import ( "testing" - rpb "gvisor.googlesource.com/gvisor/pkg/sentry/arch/registers_go_proto" + rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ) func TestOnceTracker(t *testing.T) { diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 15f624f9b..6d276f207 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -19,7 +19,7 @@ import ( "strconv" "strings" - "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sentry/watchdog" ) // PlatformType tells which platform to use. @@ -221,6 +221,17 @@ type Config struct { // user, and without chrooting the sandbox process. This can be // necessary in test environments that have limited capabilities. TestOnlyAllowRunAsCurrentUserWithoutChroot bool + + // NumNetworkChannels controls the number of AF_PACKET sockets that map + // to the same underlying network device. This allows netstack to better + // scale for high throughput use cases. + NumNetworkChannels int + + // Rootless allows the sandbox to be started with a user that is not root. + // Defense is depth measures are weaker with rootless. Specifically, the + // sandbox and Gofer process run as root inside a user namespace with root + // mapped to the caller's user. + Rootless bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -244,6 +255,8 @@ func (c *Config) ToFlags() []string { "--panic-signal=" + strconv.Itoa(c.PanicSignal), "--profile=" + strconv.FormatBool(c.ProfileEnable), "--net-raw=" + strconv.FormatBool(c.EnableRaw), + "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), + "--rootless=" + strconv.FormatBool(c.Rootless), } if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Only include if set since it is never to be used by users. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index a277145b1..7f41a9c53 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,17 +22,17 @@ import ( "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/control/server" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/control" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" - "gvisor.googlesource.com/gvisor/pkg/sentry/state" - "gvisor.googlesource.com/gvisor/pkg/sentry/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/epsocket" + "gvisor.dev/gvisor/pkg/sentry/state" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" ) const ( @@ -340,7 +340,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k) + mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints) renv, err := mntr.createRestoreEnvironment(cm.l.conf) if err != nil { return fmt.Errorf("creating RestoreEnvironment: %v", err) @@ -359,6 +359,17 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("file cannot be empty") } + if cm.l.conf.ProfileEnable { + // initializePProf opens /proc/self/maps, so has to be + // called before installing seccomp filters. + initializePProf() + } + + // Seccomp filters have to be applied before parsing the state file. + if err := cm.l.installSeccompFilters(); err != nil { + return err + } + // Load the state. loadOpts := state.LoadOpts{Source: specFile} if err := loadOpts.Load(k, networkStack); err != nil { diff --git a/runsc/boot/debug.go b/runsc/boot/debug.go index 79f7387ac..1fb32c527 100644 --- a/runsc/boot/debug.go +++ b/runsc/boot/debug.go @@ -15,7 +15,7 @@ package boot import ( - "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/log" ) type debug struct { diff --git a/runsc/boot/events.go b/runsc/boot/events.go index ffd99f5e9..422f4da00 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -15,8 +15,8 @@ package boot import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/usage" ) // Event struct for encoding the event data to JSON. Corresponds to runc's diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 0811e10f4..59e1b46ec 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -17,12 +17,12 @@ package boot import ( "fmt" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" - "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.dev/gvisor/pkg/sentry/limits" ) // createFDMap creates an FD map that contains stdin, stdout, and stderr. If diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 3b6020cf3..07898f3de 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -11,7 +11,7 @@ go_library( "extra_filters_race.go", "filter.go", ], - importpath = "gvisor.googlesource.com/gvisor/runsc/boot/filter", + importpath = "gvisor.dev/gvisor/runsc/boot/filter", visibility = [ "//runsc/boot:__subpackages__", ], diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 652da1cef..e4ccb40d9 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -19,9 +19,9 @@ import ( "syscall" "golang.org/x/sys/unix" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/seccomp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" ) // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. @@ -246,6 +246,10 @@ var allowedSyscalls = seccomp.SyscallRules{ }, syscall.SYS_SETITIMER: {}, syscall.SYS_SHUTDOWN: []seccomp.Rule{ + // Used by fs/host to shutdown host sockets. + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, + {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + // Used by unet to shutdown connections. {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go index 5c5ec4e06..1056cd314 100644 --- a/runsc/boot/filter/extra_filters.go +++ b/runsc/boot/filter/extra_filters.go @@ -17,7 +17,7 @@ package filter import ( - "gvisor.googlesource.com/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index ac5a0f1aa..5e5a3c998 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -19,7 +19,7 @@ package filter import ( "syscall" - "gvisor.googlesource.com/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by MSAN. diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go index ba3c1ce87..d5bee4453 100644 --- a/runsc/boot/filter/extra_filters_race.go +++ b/runsc/boot/filter/extra_filters_race.go @@ -19,7 +19,7 @@ package filter import ( "syscall" - "gvisor.googlesource.com/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by TSAN. diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go index 17479e0dd..468481f29 100644 --- a/runsc/boot/filter/filter.go +++ b/runsc/boot/filter/filter.go @@ -20,11 +20,11 @@ package filter import ( "fmt" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/seccomp" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" ) // Options are seccomp filter related options. diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 939f2419c..588317f4f 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -18,29 +18,30 @@ import ( "fmt" "path" "path/filepath" + "sort" "strconv" "strings" "syscall" // Include filesystem types that OCI spec might mount. - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty" + _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + _ "gvisor.dev/gvisor/pkg/sentry/fs/host" + _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/syserror" - "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/gofer" + "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/specutils" ) const ( @@ -50,6 +51,9 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" + // MountPrefix is the annotation prefix for mount hints. + MountPrefix = "gvisor.dev/spec/mount" + // ChildContainersDir is the directory where child container root // filesystems are mounted. ChildContainersDir = "/__runsc_containers__" @@ -292,6 +296,174 @@ func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } +type shareType int + +const ( + invalid shareType = iota + + // container shareType indicates that the mount is used by a single container. + container + + // pod shareType indicates that the mount is used by more than one container + // inside the pod. + pod + + // shared shareType indicates that the mount can also be shared with a process + // outside the pod, e.g. NFS. + shared +) + +func parseShare(val string) (shareType, error) { + switch val { + case "container": + return container, nil + case "pod": + return pod, nil + case "shared": + return shared, nil + default: + return 0, fmt.Errorf("invalid share value %q", val) + } +} + +func (s shareType) String() string { + switch s { + case invalid: + return "invalid" + case container: + return "container" + case pod: + return "pod" + case shared: + return "shared" + default: + return fmt.Sprintf("invalid share value %d", s) + } +} + +// mountHint represents extra information about mounts that are provided via +// annotations. They can override mount type, and provide sharing information +// so that mounts can be correctly shared inside the pod. +type mountHint struct { + name string + share shareType + mount specs.Mount + + // root is the inode where the volume is mounted. For mounts with 'pod' share + // the volume is mounted once and then bind mounted inside the containers. + root *fs.Inode +} + +func (m *mountHint) setField(key, val string) error { + switch key { + case "source": + if len(val) == 0 { + return fmt.Errorf("source cannot be empty") + } + m.mount.Source = val + case "type": + return m.setType(val) + case "share": + share, err := parseShare(val) + if err != nil { + return err + } + m.share = share + case "options": + return m.setOptions(val) + default: + return fmt.Errorf("invalid mount annotation: %s=%s", key, val) + } + return nil +} + +func (m *mountHint) setType(val string) error { + switch val { + case "tmpfs", "bind": + m.mount.Type = val + default: + return fmt.Errorf("invalid type %q", val) + } + return nil +} + +func (m *mountHint) setOptions(val string) error { + opts := strings.Split(val, ",") + if err := specutils.ValidateMountOptions(opts); err != nil { + return err + } + // Sort options so it can be compared with container mount options later on. + sort.Strings(opts) + m.mount.Options = opts + return nil +} + +func (m *mountHint) isSupported() bool { + return m.mount.Type == tmpfs && m.share == pod +} + +// podMountHints contains a collection of mountHints for the pod. +type podMountHints struct { + mounts map[string]*mountHint +} + +func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { + mnts := make(map[string]*mountHint) + for k, v := range spec.Annotations { + // Look for 'gvisor.dev/spec/mount' annotations and parse them. + if strings.HasPrefix(k, MountPrefix) { + parts := strings.Split(k, "/") + if len(parts) != 5 { + return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) + } + name := parts[3] + if len(name) == 0 || path.Clean(name) != name { + return nil, fmt.Errorf("invalid mount name: %s", name) + } + mnt := mnts[name] + if mnt == nil { + mnt = &mountHint{name: name} + mnts[name] = mnt + } + if err := mnt.setField(parts[4], v); err != nil { + return nil, err + } + } + } + + // Validate all hints after done parsing. + for name, m := range mnts { + log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share) + if m.share == invalid { + return nil, fmt.Errorf("share field for %q has not been set", m.name) + } + if len(m.mount.Source) == 0 { + return nil, fmt.Errorf("source field for %q has not been set", m.name) + } + if len(m.mount.Type) == 0 { + return nil, fmt.Errorf("type field for %q has not been set", m.name) + } + + // Check for duplicate mount sources. + for name2, m2 := range mnts { + if name != name2 && m.mount.Source == m2.mount.Source { + return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source) + } + } + } + + return &podMountHints{mounts: mnts}, nil +} + +func (p *podMountHints) findMount(mount specs.Mount) *mountHint { + for _, m := range p.mounts { + if m.mount.Source == mount.Source { + return m + } + } + return nil +} + type containerMounter struct { // cid is the container ID. May be set to empty for the root container. cid string @@ -306,15 +478,18 @@ type containerMounter struct { fds fdDispenser k *kernel.Kernel + + hints *podMountHints } -func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel) *containerMounter { +func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { return &containerMounter{ cid: cid, root: spec.Root, mounts: compileMounts(spec), fds: fdDispenser{fds: goferFDs}, k: k, + hints: hints, } } @@ -476,6 +651,15 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // 'setMountNS' is called after namespace is created. It must set the mount NS // to 'rootCtx'. func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error { + for _, hint := range c.hints.mounts { + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + inode, err := c.mountSharedMaster(rootCtx, conf, hint) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.root = inode + } + // Create a tmpfs mount where we create and mount a root filesystem for // each child container. c.mounts = append(c.mounts, specs.Mount{ @@ -498,21 +682,57 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c return c.mountSubmounts(rootCtx, conf, mns, root) } +// mountSharedMaster mounts the master of a volume that is shared among +// containers in a pod. It returns the root mount's inode. +func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + + // Mount with revalidate because it's shared among containers. + opts = append(opts, "cache=revalidate") + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(hint.mount.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating mount %q: %v", hint.name, err) + } + + if useOverlay { + log.Debugf("Adding overlay on top of shared mount %q", hint.name) + inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf) + if err != nil { + return nil, err + } + } + + return inode, nil +} + // createRootMount creates the root filesystem. func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} - var ( - rootInode *fs.Inode - err error - ) - fd := c.fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) p9FS := mustFindFilesystem("9p") opts := p9MountOptions(fd, conf.FileAccess) - rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) + rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating root mount point: %v", err) } @@ -579,8 +799,14 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { for _, m := range c.mounts { - if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { - return fmt.Errorf("mount submount %q: %v", m.Destination, err) + if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err) + } + } else { + if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { + return fmt.Errorf("mount submount %q: %v", m.Destination, err) + } } } @@ -653,6 +879,37 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns return nil } +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error { + // For now enforce that all options are the same. Once bind mount is properly + // supported, then we should ensure the master is less restrictive than the + // container, e.g. master can be 'rw' while container mounts as 'ro'. + if len(mount.Options) != len(source.mount.Options) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options) + } + sort.Strings(mount.Options) + for i, opt := range mount.Options { + if opt != source.mount.Options[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options) + } + } + + maxTraversals := uint(0) + target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) + } + defer target.DecRef() + + if err := mns.Mount(ctx, target, source.root); err != nil { + return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) + } + + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} + // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { @@ -678,8 +935,8 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron return nil } -// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts -// to the environment. +// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding +// the mounts to the environment. func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { renv := &fs.RestoreEnvironment{ MountSources: make(map[string][]fs.MountArgs), @@ -730,7 +987,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn // Technically we don't have to mount tmpfs at /tmp, as we could just rely on // the host /tmp, but this is a nice optimization, and fixes some apps that call // mknod in /tmp. It's unsafe to mount tmpfs if: -// 1. /tmp is mounted explictly: we should not override user's wish +// 1. /tmp is mounted explicitly: we should not override user's wish // 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp // // Note that when there are submounts inside of '/tmp', directories for the diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go new file mode 100644 index 000000000..49ab34b33 --- /dev/null +++ b/runsc/boot/fs_test.go @@ -0,0 +1,193 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "path" + "reflect" + "strings" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestPodMountHintsHappy(t *testing.T) { + spec := &specs.Spec{ + Annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + + path.Join(MountPrefix, "mount2", "source"): "bar", + path.Join(MountPrefix, "mount2", "type"): "bind", + path.Join(MountPrefix, "mount2", "share"): "container", + path.Join(MountPrefix, "mount2", "options"): "rw,private", + }, + } + podHints, err := newPodMountHints(spec) + if err != nil { + t.Errorf("newPodMountHints failed: %v", err) + } + + // Check that fields were set correctly. + mount1 := podHints.mounts["mount1"] + if want := "mount1"; want != mount1.name { + t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name) + } + if want := "foo"; want != mount1.mount.Source { + t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source) + } + if want := "tmpfs"; want != mount1.mount.Type { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type) + } + if want := pod; want != mount1.share { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share) + } + if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) { + t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options) + } + + mount2 := podHints.mounts["mount2"] + if want := "mount2"; want != mount2.name { + t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name) + } + if want := "bar"; want != mount2.mount.Source { + t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source) + } + if want := "bind"; want != mount2.mount.Type { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type) + } + if want := container; want != mount2.share { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share) + } + if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) { + t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options) + } +} + +func TestPodMountHintsErrors(t *testing.T) { + for _, tst := range []struct { + name string + annotations map[string]string + error string + }{ + { + name: "too short", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1"): "foo", + }, + error: "invalid mount annotation", + }, + { + name: "no name", + annotations: map[string]string{ + MountPrefix + "//source": "foo", + }, + error: "invalid mount name", + }, + { + name: "missing source", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + }, + error: "source field", + }, + { + name: "missing type", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "share"): "pod", + }, + error: "type field", + }, + { + name: "missing share", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + }, + error: "share field", + }, + { + name: "invalid field name", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "invalid"): "foo", + }, + error: "invalid mount annotation", + }, + { + name: "invalid source", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + }, + error: "source cannot be empty", + }, + { + name: "invalid type", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "invalid-type", + path.Join(MountPrefix, "mount1", "share"): "pod", + }, + error: "invalid type", + }, + { + name: "invalid share", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "invalid-share", + }, + error: "invalid share", + }, + { + name: "invalid options", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + path.Join(MountPrefix, "mount1", "options"): "invalid-option", + }, + error: "unknown mount option", + }, + { + name: "duplicate source", + annotations: map[string]string{ + path.Join(MountPrefix, "mount1", "source"): "foo", + path.Join(MountPrefix, "mount1", "type"): "tmpfs", + path.Join(MountPrefix, "mount1", "share"): "pod", + + path.Join(MountPrefix, "mount2", "source"): "foo", + path.Join(MountPrefix, "mount2", "type"): "bind", + path.Join(MountPrefix, "mount2", "share"): "container", + }, + error: "have the same mount source", + }, + } { + t.Run(tst.name, func(t *testing.T) { + spec := &specs.Spec{Annotations: tst.annotations} + podHints, err := newPodMountHints(spec) + if err == nil || !strings.Contains(err.Error(), tst.error) { + t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err) + } + if podHints != nil { + t.Errorf("newPodMountHints must return nil on failure: %+v", podHints) + } + }) + } +} diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index 3364aa5e6..d1c0bb9b5 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -20,8 +20,8 @@ import ( "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/limits" ) // Mapping from linux resource names to limits.LimitType. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index a997776f8..38425f97d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,51 +20,52 @@ import ( mrand "math/rand" "os" "runtime" + "strings" "sync" "sync/atomic" "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - "gvisor.googlesource.com/gvisor/pkg/cpuid" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/rand" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/control" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" - "gvisor.googlesource.com/gvisor/pkg/sentry/inet" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/sentry/loader" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" - "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" - "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" - slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" - "gvisor.googlesource.com/gvisor/pkg/sentry/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/usage" - "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/icmp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp" - "gvisor.googlesource.com/gvisor/runsc/boot/filter" - "gvisor.googlesource.com/gvisor/runsc/specutils" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" + "gvisor.dev/gvisor/pkg/sentry/sighandling" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/runsc/boot/filter" + "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/epsocket" + "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) // Loader keeps state needed to start the kernel and run the container.. @@ -117,6 +118,10 @@ type Loader struct { // // processes is guardded by mu. processes map[execID]*execProcess + + // mountHints provides extra information about mounts for containers that + // apply to the entire pod. + mountHints *podMountHints } // execID uniquely identifies a sentry process that is executed in a container. @@ -299,6 +304,11 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing compat logs: %v", err) } + mountHints, err := newPodMountHints(args.Spec) + if err != nil { + return nil, fmt.Errorf("creating pod mount hints: %v", err) + } + eid := execID{cid: args.ID} l := &Loader{ k: k, @@ -311,6 +321,7 @@ func New(args Args) (*Loader, error) { rootProcArgs: procArgs, sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, } // We don't care about child signals; some platforms can generate a @@ -424,6 +435,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) + // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if + // there are memory cgroups specified, because at this point we're already + // in a mount namespace in which the relevant cgroupfs is not visible. mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() @@ -432,6 +446,23 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } +func (l *Loader) installSeccompFilters() error { + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. Running in less secure mode.") + } else { + opts := filter.Options{ + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), + } + if err := filter.Install(opts); err != nil { + return fmt.Errorf("installing seccomp filters: %v", err) + } + } + return nil +} + // Run runs the root container. func (l *Loader) Run() error { err := l.run() @@ -467,25 +498,19 @@ func (l *Loader) run() error { return fmt.Errorf("trying to start deleted container %q", l.sandboxID) } - // Finally done with all configuration. Setup filters before user code - // is loaded. - if l.conf.DisableSeccomp { - filter.Report("syscall filter is DISABLED. Running in less secure mode.") - } else { - opts := filter.Options{ - Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ProfileEnable: l.conf.ProfileEnable, - ControllerFD: l.ctrl.srv.FD(), - } - if err := filter.Install(opts); err != nil { - return fmt.Errorf("installing seccomp filters: %v", err) - } - } - // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { + if l.conf.ProfileEnable { + initializePProf() + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if err := l.installSeccompFilters(); err != nil { + return err + } + // Create the FD map, which will set stdin, stdout, and stderr. If console // is true, then ioctl calls will be passed through to the host fd. ctx := l.rootProcArgs.NewContext(l.k) @@ -499,7 +524,7 @@ func (l *Loader) run() error { // cid for root container can be empty. Only subcontainers need it to set // the mount location. - mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k) + mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints) if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil { return err } @@ -510,6 +535,24 @@ func (l *Loader) run() error { return err } + // Read /etc/passwd for the user's HOME directory and set the HOME + // environment variable as required by POSIX if it is not overridden by + // the user. + hasHomeEnvv := false + for _, envv := range l.rootProcArgs.Envv { + if strings.HasPrefix(envv, "HOME=") { + hasHomeEnvv = true + } + } + if !hasHomeEnvv { + homeDir, err := getExecUserHome(rootCtx, rootMns, uint32(l.rootProcArgs.Credentials.RealKUID)) + if err != nil { + return fmt.Errorf("error reading exec user: %v", err) + } + + l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir) + } + // Create the root container init task. It will begin running // when the kernel is started. if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { @@ -620,7 +663,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file goferFDs = append(goferFDs, fd) } - mntr := newContainerMounter(spec, cid, goferFDs, l.k) + mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints) if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil { return fmt.Errorf("configuring container FS: %v", err) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 6393cb3fb..4af45bfcc 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -25,13 +25,13 @@ import ( "time" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/control/server" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/p9" - "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" - "gvisor.googlesource.com/gvisor/pkg/unet" - "gvisor.googlesource.com/gvisor/runsc/fsgofer" + "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/fsgofer" ) func init() { @@ -404,7 +404,7 @@ func TestCreateMountNamespace(t *testing.T) { mns = m ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root()) } - mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil) + mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{}) if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil { t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) } @@ -610,7 +610,7 @@ func TestRestoreEnvironment(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { conf := testConfig() - mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil) + mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{}) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0a154d90b..d3d98243d 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -19,16 +19,16 @@ import ( "net" "syscall" - "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/tcpip" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/fdbased" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/loopback" - "gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4" - "gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6" - "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" - "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/urpc" ) // Network exposes methods that can be used to configure a network stack. @@ -56,7 +56,11 @@ type FDBasedLink struct { Addresses []net.IP Routes []Route GSOMaxSize uint32 - LinkAddress []byte + LinkAddress net.HardwareAddr + + // NumChannels controls how many underlying FD's are to be used to + // create this endpoint. + NumChannels int } // LoopbackLink configures a loopback li nk. @@ -68,8 +72,9 @@ type LoopbackLink struct { // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. type CreateLinksAndRoutesArgs struct { - // FilePayload contains the fds associated with the FDBasedLinks. The - // two slices must have the same length. + // FilePayload contains the fds associated with the FDBasedLinks. The + // number of fd's should match the sum of the NumChannels field of the + // FDBasedLink entries below. urpc.FilePayload LoopbackLinks []LoopbackLink @@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route { // CreateLinksAndRoutes creates links and routes in a network stack. It should // only be called once. func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { - if len(args.FilePayload.Files) != len(args.FDBasedLinks) { - return fmt.Errorf("FilePayload must be same length at FDBasedLinks") + wantFDs := 0 + for _, l := range args.FDBasedLinks { + wantFDs += l.NumChannels + } + if got := len(args.FilePayload.Files); got != wantFDs { + return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs) } var nicID tcpip.NICID @@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } } - for i, link := range args.FDBasedLinks { + fdOffset := 0 + for _, link := range args.FDBasedLinks { nicID++ nicids[link.Name] = nicID - // Copy the underlying FD. - oldFD := args.FilePayload.Files[i].Fd() - newFD, err := syscall.Dup(int(oldFD)) - if err != nil { - return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + FDs := []int{} + for j := 0; j < link.NumChannels; j++ { + // Copy the underlying FD. + oldFD := args.FilePayload.Files[fdOffset].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + FDs = append(FDs, newFD) + fdOffset++ } mac := tcpip.LinkAddress(link.LinkAddress) linkEP, err := fdbased.New(&fdbased.Options{ - FD: newFD, + FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: true, Address: mac, @@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return err } - log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil { return err } diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go new file mode 100644 index 000000000..463362f02 --- /dev/null +++ b/runsc/boot/pprof.go @@ -0,0 +1,18 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +func initializePProf() { +} diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index 19c7f8fbd..fbfd3b07c 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -15,7 +15,7 @@ package boot import ( - "gvisor.googlesource.com/gvisor/pkg/sentry/strace" + "gvisor.dev/gvisor/pkg/sentry/strace" ) func enableStrace(conf *Config) error { diff --git a/runsc/boot/user.go b/runsc/boot/user.go new file mode 100644 index 000000000..d1d423a5c --- /dev/null +++ b/runsc/boot/user.go @@ -0,0 +1,146 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "bufio" + "io" + "strconv" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +type fileReader struct { + // Ctx is the context for the file reader. + Ctx context.Context + + // File is the file to read from. + File *fs.File +} + +// Read implements io.Reader.Read. +func (r *fileReader) Read(buf []byte) (int, error) { + n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf)) + return int(n), err +} + +// getExecUserHome returns the home directory of the executing user read from +// /etc/passwd as read from the container filesystem. +func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32) (string, error) { + // The default user home directory to return if no user matching the user + // if found in the /etc/passwd found in the image. + const defaultHome = "/" + + // Open the /etc/passwd file from the dirent via the root mount namespace. + mnsRoot := rootMns.Root() + maxTraversals := uint(linux.MaxSymlinkTraversals) + dirent, err := rootMns.FindInode(ctx, mnsRoot, nil, "/etc/passwd", &maxTraversals) + if err != nil { + // NOTE: Ignore errors opening the passwd file. If the passwd file + // doesn't exist we will return the default home directory. + return defaultHome, nil + } + defer dirent.DecRef() + + // Check read permissions on the file. + if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { + // NOTE: Ignore permissions errors here and return default root dir. + return defaultHome, nil + } + + // Only open regular files. We don't open other files like named pipes as + // they may block and might present some attack surface to the container. + // Note that runc does not seem to do this kind of checking. + if !fs.IsRegular(dirent.Inode.StableAttr) { + return defaultHome, nil + } + + f, err := dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Directory: false}) + if err != nil { + return "", err + } + defer f.DecRef() + + r := &fileReader{ + Ctx: ctx, + File: f, + } + + homeDir, err := findHomeInPasswd(uid, r, defaultHome) + if err != nil { + return "", err + } + + return homeDir, nil +} + +// findHomeInPasswd parses a passwd file and returns the given user's home +// directory. This function does it's best to replicate the runc's behavior. +func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) { + s := bufio.NewScanner(passwd) + + for s.Scan() { + if err := s.Err(); err != nil { + return "", err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // Pull out part of passwd entry. Loosely parse the passwd entry as some + // passwd files could be poorly written and for compatibility with runc. + // + // Per 'man 5 passwd' + // /etc/passwd contains one line for each user account, with seven + // fields delimited by colons (“:”). These fields are: + // + // - login name + // - optional encrypted password + // - numerical user ID + // - numerical group ID + // - user name or comment field + // - user home directory + // - optional user command interpreter + parts := strings.Split(line, ":") + + found := false + homeDir := "" + for i, p := range parts { + switch i { + case 2: + parsedUID, err := strconv.ParseUint(p, 10, 32) + if err == nil && parsedUID == uint64(uid) { + found = true + } + case 5: + homeDir = p + } + } + if found { + // NOTE: If the uid is present but the home directory is not + // present in the /etc/passwd entry we return an empty string. This + // is, for better or worse, what runc does. + return homeDir, nil + } + } + + return defaultHome, nil +} diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go new file mode 100644 index 000000000..834003430 --- /dev/null +++ b/runsc/boot/user_test.go @@ -0,0 +1,253 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "io/ioutil" + "os" + "path/filepath" + "strings" + "syscall" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +func setupTempDir() (string, error) { + tmpDir, err := ioutil.TempDir(os.TempDir(), "exec-user-test") + if err != nil { + return "", err + } + return tmpDir, nil +} + +func setupPasswd(contents string, perms os.FileMode) func() (string, error) { + return func() (string, error) { + tmpDir, err := setupTempDir() + if err != nil { + return "", err + } + + if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { + return "", err + } + + f, err := os.Create(filepath.Join(tmpDir, "etc", "passwd")) + if err != nil { + return "", err + } + defer f.Close() + + _, err = f.WriteString(contents) + if err != nil { + return "", err + } + + err = f.Chmod(perms) + if err != nil { + return "", err + } + return tmpDir, nil + } +} + +// TestGetExecUserHome tests the getExecUserHome function. +func TestGetExecUserHome(t *testing.T) { + tests := map[string]struct { + uid uint32 + createRoot func() (string, error) + expected string + }{ + "success": { + uid: 1000, + createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0666), + expected: "/home/adin", + }, + "no_passwd": { + uid: 1000, + createRoot: setupTempDir, + expected: "/", + }, + "no_perms": { + uid: 1000, + createRoot: setupPasswd("adin::1000:1111::/home/adin:/bin/sh", 0000), + expected: "/", + }, + "directory": { + uid: 1000, + createRoot: func() (string, error) { + tmpDir, err := setupTempDir() + if err != nil { + return "", err + } + + if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { + return "", err + } + + if err := syscall.Mkdir(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil { + return "", err + } + + return tmpDir, nil + }, + expected: "/", + }, + // Currently we don't allow named pipes. + "named_pipe": { + uid: 1000, + createRoot: func() (string, error) { + tmpDir, err := setupTempDir() + if err != nil { + return "", err + } + + if err := os.Mkdir(filepath.Join(tmpDir, "etc"), 0777); err != nil { + return "", err + } + + if err := syscall.Mkfifo(filepath.Join(tmpDir, "etc", "passwd"), 0666); err != nil { + return "", err + } + + return tmpDir, nil + }, + expected: "/", + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + tmpDir, err := tc.createRoot() + if err != nil { + t.Fatalf("failed to create root dir: %v", err) + } + + sandEnd, cleanup, err := startGofer(tmpDir) + if err != nil { + t.Fatalf("failed to create gofer: %v", err) + } + defer cleanup() + + ctx := contexttest.Context(t) + conf := &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + DisableSeccomp: true, + } + + spec := &specs.Spec{ + Root: &specs.Root{ + Path: tmpDir, + Readonly: true, + }, + // Add /proc mount as tmpfs to avoid needing a kernel. + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + } + + var mns *fs.MountNamespace + setMountNS := func(m *fs.MountNamespace) { + mns = m + ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root()) + } + mntr := newContainerMounter(spec, "", []int{sandEnd}, nil, &podMountHints{}) + if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil { + t.Fatalf("failed to create mount namespace: %v", err) + } + + got, err := getExecUserHome(ctx, mns, tc.uid) + if err != nil { + t.Fatalf("failed to get user home: %v", err) + } + + if got != tc.expected { + t.Fatalf("expected %v, got: %v", tc.expected, got) + } + }) + } +} + +// TestFindHomeInPasswd tests the findHomeInPasswd function's passwd file parsing. +func TestFindHomeInPasswd(t *testing.T) { + tests := map[string]struct { + uid uint32 + passwd string + expected string + def string + }{ + "empty": { + uid: 1000, + passwd: "", + expected: "/", + def: "/", + }, + "whitespace": { + uid: 1000, + passwd: " ", + expected: "/", + def: "/", + }, + "full": { + uid: 1000, + passwd: "adin::1000:1111::/home/adin:/bin/sh", + expected: "/home/adin", + def: "/", + }, + // For better or worse, this is how runc works. + "partial": { + uid: 1000, + passwd: "adin::1000:1111:", + expected: "", + def: "/", + }, + "multiple": { + uid: 1001, + passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1001:1111::/home/ian:/bin/sh", + expected: "/home/ian", + def: "/", + }, + "duplicate": { + uid: 1000, + passwd: "adin::1000:1111::/home/adin:/bin/sh\nian::1000:1111::/home/ian:/bin/sh", + expected: "/home/adin", + def: "/", + }, + "empty_lines": { + uid: 1001, + passwd: "adin::1000:1111::/home/adin:/bin/sh\n\n\nian::1001:1111::/home/ian:/bin/sh", + expected: "/home/ian", + def: "/", + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + got, err := findHomeInPasswd(tc.uid, strings.NewReader(tc.passwd), tc.def) + if err != nil { + t.Fatalf("error parsing passwd: %v", err) + } + if tc.expected != got { + t.Fatalf("expected %v, got: %v", tc.expected, got) + } + }) + } +} |