diff options
Diffstat (limited to 'runsc')
80 files changed, 5387 insertions, 2610 deletions
diff --git a/runsc/BUILD b/runsc/BUILD index 757f6d44c..3b91b984a 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) @@ -13,16 +13,7 @@ go_binary( "//visibility:public", ], x_defs = {"main.version": "{STABLE_VERSION}"}, - deps = [ - "//pkg/log", - "//pkg/refs", - "//pkg/sentry/platform", - "//runsc/boot", - "//runsc/cmd", - "//runsc/flag", - "//runsc/specutils", - "@com_github_google_subcommands//:go_default_library", - ], + deps = ["//runsc/cli"], ) # The runsc-race target is a race-compatible BUILD target. This must be built @@ -49,68 +40,7 @@ go_binary( "//visibility:public", ], x_defs = {"main.version": "{STABLE_VERSION}"}, - deps = [ - "//pkg/log", - "//pkg/refs", - "//pkg/sentry/platform", - "//runsc/boot", - "//runsc/cmd", - "//runsc/flag", - "//runsc/specutils", - "@com_github_google_subcommands//:go_default_library", - ], -) - -pkg_tar( - name = "runsc-bin", - srcs = [":runsc"], - mode = "0755", - package_dir = "/usr/bin", - strip_prefix = "/runsc/linux_amd64_pure_stripped", -) - -pkg_tar( - name = "debian-data", - extension = "tar.gz", - deps = [ - ":runsc-bin", - ], -) - -genrule( - name = "deb-version", - # Note that runsc must appear in the srcs parameter and not the tools - # parameter, otherwise it will not be stamped. This is reasonable, as tools - # may be encoded differently in the build graph (cached more aggressively - # because they are assumes to be hermetic). - srcs = [":runsc"], - outs = ["version.txt"], - # Note that the little dance here is necessary because files in the $(SRCS) - # attribute are not executable by default, and we can't touch in place. - cmd = "cp $(location :runsc) $(@D)/runsc && \ - chmod a+x $(@D)/runsc && \ - $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \ - rm -f $(@D)/runsc", - stamp = 1, -) - -pkg_deb( - name = "runsc-debian", - architecture = "amd64", - data = ":debian-data", - # Note that the description_file will be flatten (all newlines removed), - # and therefore it is kept to a simple one-line description. The expected - # format for debian packages is "short summary\nLonger explanation of - # tool." and this is impossible with the flattening. - description_file = "debian/description", - homepage = "https://gvisor.dev/", - maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>", - package = "runsc", - postinst = "debian/postinst.sh", - version_file = ":version.txt", - visibility = [ - "//visibility:public", - ], + deps = ["//runsc/cli"], ) sh_test( diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index a907c103b..b97dc3c47 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -8,7 +8,6 @@ go_library( "compat.go", "compat_amd64.go", "compat_arm64.go", - "config.go", "controller.go", "debug.go", "events.go", @@ -27,19 +26,25 @@ go_library( deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/memutil", "//pkg/rand", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", "//pkg/sentry/devices/memdev", + "//pkg/sentry/devices/ttydev", + "//pkg/sentry/devices/tundev", "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", @@ -53,8 +58,10 @@ go_library( "//pkg/sentry/fs/user", "//pkg/sentry/fsimpl/devpts", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/fuse", "//pkg/sentry/fsimpl/gofer", "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/overlay", "//pkg/sentry/fsimpl/proc", "//pkg/sentry/fsimpl/sys", "//pkg/sentry/fsimpl/tmpfs", @@ -86,6 +93,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", @@ -100,9 +108,11 @@ go_library( "//runsc/boot/filter", "//runsc/boot/platforms", "//runsc/boot/pprof", + "//runsc/config", "//runsc/specutils", - "@com_github_golang_protobuf//proto:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "//runsc/specutils/seccomp", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + "@org_golang_google_protobuf//proto:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -118,6 +128,7 @@ go_test( library = ":boot", deps = [ "//pkg/control/server", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/p9", @@ -126,8 +137,9 @@ go_test( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/unet", + "//runsc/config", "//runsc/fsgofer", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 84c67cbc2..7076ae2e2 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -19,7 +19,7 @@ import ( "os" "syscall" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 8125d5061..4e0f0d57a 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,6 +22,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -29,10 +30,12 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/state" "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -101,14 +104,13 @@ const ( // Profiling related commands (see pprof.go for more details). const ( - StartCPUProfile = "Profile.StartCPUProfile" - StopCPUProfile = "Profile.StopCPUProfile" - HeapProfile = "Profile.HeapProfile" - GoroutineProfile = "Profile.GoroutineProfile" - BlockProfile = "Profile.BlockProfile" - MutexProfile = "Profile.MutexProfile" - StartTrace = "Profile.StartTrace" - StopTrace = "Profile.StopTrace" + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" + BlockProfile = "Profile.BlockProfile" + MutexProfile = "Profile.MutexProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" ) // Logging related commands (see logging.go for more details). @@ -129,42 +131,52 @@ type controller struct { // manager holds the containerManager methods. manager *containerManager + + // pprop holds the profile instance if enabled. It may be nil. + pprof *control.Profile } // newController creates a new controller. The caller must call // controller.srv.StartServing() to start the controller. func newController(fd int, l *Loader) (*controller, error) { - srv, err := server.CreateFromFD(fd) + ctrl := &controller{} + var err error + ctrl.srv, err = server.CreateFromFD(fd) if err != nil { return nil, err } - manager := &containerManager{ + ctrl.manager = &containerManager{ startChan: make(chan struct{}), startResultChan: make(chan error), l: l, } - srv.Register(manager) + ctrl.srv.Register(ctrl.manager) if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { net := &Network{ Stack: eps.Stack, } - srv.Register(net) + ctrl.srv.Register(net) } - srv.Register(&debug{}) - srv.Register(&control.Logging{}) - if l.conf.ProfileEnable { - srv.Register(&control.Profile{ - Kernel: l.k, - }) + ctrl.srv.Register(&debug{}) + ctrl.srv.Register(&control.Logging{}) + + if l.root.conf.ProfileEnable { + ctrl.pprof = &control.Profile{Kernel: l.k} + ctrl.srv.Register(ctrl.pprof) } - return &controller{ - srv: srv, - manager: manager, - }, nil + return ctrl, nil +} + +func (c *controller) stop() { + if c.pprof != nil { + // These are noop if there is nothing being profiled. + _ = c.pprof.StopCPUProfile(nil, nil) + _ = c.pprof.StopTrace(nil, nil) + } } // containerManager manages sandbox containers. @@ -211,7 +223,7 @@ type StartArgs struct { Spec *specs.Spec // Config is the runsc-specific configuration for the sandbox. - Conf *Config + Conf *config.Config // CID is the ID of the container to start. CID string @@ -247,13 +259,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) + fds, err := fd.NewFromFiles(args.FilePayload.Files) if err != nil { + return err + } + defer func() { + for _, fd := range fds { + _ = fd.Close() + } + }() + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) return err } log.Debugf("Container %q started", args.CID) - return nil } @@ -333,7 +352,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Pause the kernel while we build a new one. cm.l.k.Pause() - p, err := createPlatform(cm.l.conf, deviceFile) + p, err := createPlatform(cm.l.root.conf, deviceFile) if err != nil { return fmt.Errorf("creating platform: %v", err) } @@ -349,12 +368,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints) - renv, err := mntr.createRestoreEnvironment(cm.l.conf) - if err != nil { - return fmt.Errorf("creating RestoreEnvironment: %v", err) + ctx := k.SupervisorContext() + mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints) + if kernel.VFS2Enabled { + ctx, err = mntr.configureRestore(ctx, cm.l.root.conf) + if err != nil { + return fmt.Errorf("configuring filesystem restore: %v", err) + } + } else { + renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) + if err != nil { + return fmt.Errorf("creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) } - fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. if eps, ok := networkStack.(*netstack.Stack); ok { @@ -368,7 +395,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("file cannot be empty") } - if cm.l.conf.ProfileEnable { + if cm.l.root.conf.ProfileEnable { // pprof.Initialize opens /proc/self/maps, so has to be called before // installing seccomp filters. pprof.Initialize() @@ -381,19 +408,19 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Load the state. loadOpts := state.LoadOpts{Source: specFile} - if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil { + if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil { return err } // Since we have a new kernel we also must make a new watchdog. dogOpts := watchdog.DefaultOpts - dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction dog := watchdog.New(k, dogOpts) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k cm.l.watchdog = dog - cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.root.procArgs = kernel.CreateProcessArgs{} cm.l.restore = true // Reinitialize the sandbox ID and processes map. Note that it doesn't diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 60e33425f..a7c4ebb0c 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -27,41 +27,30 @@ import ( // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: []seccomp.Rule{ - { - seccomp.AllowValue( - syscall.CLONE_VM | - syscall.CLONE_FS | - syscall.CLONE_FILES | - syscall.CLONE_SIGHAND | - syscall.CLONE_SYSVSEM | - syscall.CLONE_THREAD), - }, - }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, syscall.SYS_DUP3: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_CLOEXEC), }, }, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_EVENTFD2: []seccomp.Rule{ { - seccomp.AllowValue(0), - seccomp.AllowValue(0), + seccomp.EqualTo(0), + seccomp.EqualTo(0), }, }, syscall.SYS_EXIT: {}, @@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FCHMOD: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_SETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_SETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFD), }, }, syscall.SYS_FSTAT: {}, @@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FTRUNCATE: {}, syscall.SYS_FUTEX: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, }, // Non-private variants are included for flipcall support. They are otherwise // unncessary, as the sentry will use only private futexes internally. { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE), + seccomp.MatchAny{}, }, }, syscall.SYS_GETPID: {}, unix.SYS_GETRANDOM: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_DOMAIN), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_DOMAIN), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, }, syscall.SYS_GETTID: {}, @@ -141,38 +130,44 @@ var allowedSyscalls = seccomp.SyscallRules{ // setting/getting termios and winsize. syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCGETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCGETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSF), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSF), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSW), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSW), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCSWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCSWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCGWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCGWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, }, syscall.SYS_LSEEK: {}, syscall.SYS_MADVISE: {}, + unix.SYS_MEMBARRIER: []seccomp.Rule{ + { + seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL), + seccomp.EqualTo(0), + }, + }, syscall.SYS_MINCORE: {}, // Used by the Go runtime as a temporarily workaround for a Linux // 5.2-5.4 bug. @@ -182,46 +177,46 @@ var allowedSyscalls = seccomp.SyscallRules{ // TODO(b/148688965): Remove once this is gone from Go. syscall.SYS_MLOCK: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(4096), + seccomp.MatchAny{}, + seccomp.EqualTo(4096), }, }, syscall.SYS_MMAP: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_SHARED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_SHARED), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ), - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ), + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, syscall.SYS_MPROTECT: {}, @@ -237,32 +232,32 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_READ: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), }, }, syscall.SYS_RECVMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(fdbased.MaxMsgsPerRecv), - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(fdbased.MaxMsgsPerRecv), + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, unix.SYS_SENDMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, syscall.SYS_RESTART_SYSCALL: {}, @@ -272,57 +267,50 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SCHED_YIELD: {}, syscall.SYS_SENDMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), }, }, syscall.SYS_SETITIMER: {}, syscall.SYS_SHUTDOWN: []seccomp.Rule{ // Used by fs/host to shutdown host sockets. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)}, // Used by unet to shutdown connections. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TEE: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(1), /* len */ - seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */ + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(1), /* len */ + seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, }, syscall.SYS_TGKILL: []seccomp.Rule{ { - seccomp.AllowValue(uint64(os.Getpid())), + seccomp.EqualTo(uint64(os.Getpid())), }, }, syscall.SYS_UTIMENSAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* null pathname */ - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* flags */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* null pathname */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* flags */ }, }, syscall.SYS_WRITE: {}, - // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with - // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR - // option is enabled for a packet socket. + // For rawfile.NonBlockingWriteIovec. syscall.SYS_WRITEV: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(2), - }, - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(3), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.GreaterThan(0), }, }, } @@ -332,10 +320,10 @@ func hostInetFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT4: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), }, }, syscall.SYS_BIND: {}, @@ -344,84 +332,84 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_GETSOCKNAME: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_KEEPALIVE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_KEEPALIVE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_LINGER), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_LINGER), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_INFO), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_INFO), }, }, syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCOUTQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCOUTQ), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCINQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCINQ), }, }, syscall.SYS_LISTEN: {}, @@ -432,103 +420,103 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_SENDTO: {}, syscall.SYS_SETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RD), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_WR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_WR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RDWR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RDWR), }, }, syscall.SYS_SOCKET: []seccomp.Rule{ { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, }, syscall.SYS_WRITEV: {}, @@ -539,20 +527,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT: []seccomp.Rule{ { - seccomp.AllowValue(fd), + seccomp.EqualTo(fd), }, }, syscall.SYS_LISTEN: []seccomp.Rule{ { - seccomp.AllowValue(fd), - seccomp.AllowValue(16 /* unet.backlog */), + seccomp.EqualTo(fd), + seccomp.EqualTo(16 /* unet.backlog */), }, }, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_PEERCRED), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_PEERCRED), }, }, } diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go index 5335ff82c..cea5613b8 100644 --- a/runsc/boot/filter/config_amd64.go +++ b/runsc/boot/filter/config_amd64.go @@ -24,8 +24,41 @@ import ( ) func init() { - allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], - seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, - seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, - ) + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ + // TODO(b/168828518): No longer used in Go 1.16+. + {seccomp.EqualTo(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SETTLS | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + { + // TODO(b/168828518): No longer used in Go 1.16+ (on amd64). + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + } } diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go index 7fa9bbda3..37313f97f 100644 --- a/runsc/boot/filter/config_arm64.go +++ b/runsc/boot/filter/config_arm64.go @@ -16,6 +16,29 @@ package filter -// Reserve for future customization. +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + func init() { + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + // These arguments are left uninitialized by the Go + // runtime, so they may be anything (and are unused by + // the host). + seccomp.MatchAny{}, // parent_tidptr + seccomp.MatchAny{}, // tls + seccomp.MatchAny{}, // child_tidptr + }, + } } diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go index 194952a7b..7b8669595 100644 --- a/runsc/boot/filter/config_profile.go +++ b/runsc/boot/filter/config_profile.go @@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_OPENAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), }, }, } diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go index 5e5a3c998..209e646a7 100644 --- a/runsc/boot/filter/extra_filters_msan.go +++ b/runsc/boot/filter/extra_filters_msan.go @@ -26,6 +26,8 @@ import ( func instrumentationFilters() seccomp.SyscallRules { Report("MSAN is enabled: syscall filters less restrictive!") return seccomp.SyscallRules{ + syscall.SYS_CLONE: {}, + syscall.SYS_MMAP: {}, syscall.SYS_SCHED_GETAFFINITY: {}, syscall.SYS_SET_ROBUST_LIST: {}, } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index b98a1eb50..6b6ae98d7 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -29,10 +29,12 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" + "gvisor.dev/gvisor/pkg/sentry/vfs" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" @@ -47,6 +49,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -65,7 +68,7 @@ const ( // tmpfs has some extra supported options that we must pass through. var tmpfsAllowedData = []string{"mode", "uid", "gid"} -func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { +func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. upperFlags := lowerFlags upperFlags.ReadOnly = false @@ -102,33 +105,28 @@ func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, // mandatory mounts that are required by the OCI specification. func compileMounts(spec *specs.Spec) []specs.Mount { // Keep track of whether proc and sys were mounted. - var procMounted, sysMounted bool + var procMounted, sysMounted, devMounted, devptsMounted bool var mounts []specs.Mount - // Always mount /dev. - mounts = append(mounts, specs.Mount{ - Type: devtmpfs.Name, - Destination: "/dev", - }) - - mounts = append(mounts, specs.Mount{ - Type: devpts.Name, - Destination: "/dev/pts", - }) - // Mount all submounts from the spec. for _, m := range spec.Mounts { if !specutils.IsSupportedDevMount(m) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } - mounts = append(mounts, m) switch filepath.Clean(m.Destination) { case "/proc": procMounted = true case "/sys": sysMounted = true + case "/dev": + m.Type = devtmpfs.Name + devMounted = true + case "/dev/pts": + m.Type = devpts.Name + devptsMounted = true } + mounts = append(mounts, m) } // Mount proc and sys even if the user did not ask for it, as the spec @@ -146,6 +144,18 @@ func compileMounts(spec *specs.Spec) []specs.Mount { Destination: "/sys", }) } + if !devMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: devtmpfs.Name, + Destination: "/dev", + }) + } + if !devptsMounted { + mandatoryMounts = append(mandatoryMounts, specs.Mount{ + Type: devpts.Name, + Destination: "/dev/pts", + }) + } // The mandatory mounts should be ordered right after the root, in case // there are submounts of these mandatory mounts already in the spec. @@ -155,7 +165,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { } // p9MountData creates a slice of p9 mount data. -func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { +func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), @@ -166,7 +176,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { // enablement. opts = append(opts, "privateunixsocket=true") } - if fa == FileAccessShared { + if fa == config.FileAccessShared { opts = append(opts, "cache=remote_revalidating") } return opts @@ -251,7 +261,7 @@ func mustFindFilesystem(name string) fs.Filesystem { // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. -func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) { // Construct a ramfs tree of mount points. The contents never // change, so this can be fully caching. There's no real // filesystem backing this tree, so we set the filesystem to @@ -261,7 +271,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string if err != nil { return nil, fmt.Errorf("creating mount tree: %v", err) } - overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf) if err != nil { return nil, fmt.Errorf("adding mount overlay: %v", err) } @@ -280,7 +290,7 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { +func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { if conf.VFS2 { return setupContainerVFS2(ctx, conf, mntr, procArgs) } @@ -293,11 +303,11 @@ func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs.MountNamespace = mns // Resolve the executable path from working dir and environment. - f, err := user.ResolveExecutablePath(ctx, procArgs.Credentials, procArgs.MountNamespace, procArgs.Envv, procArgs.WorkingDirectory, procArgs.Argv[0]) + resolved, err := user.ResolveExecutablePath(ctx, procArgs) if err != nil { - return fmt.Errorf("searching for executable %q, cwd: %q, envv: %q: %v", procArgs.Argv[0], procArgs.WorkingDirectory, procArgs.Envv, err) + return err } - procArgs.Filename = f + procArgs.Filename = resolved return nil } @@ -318,14 +328,14 @@ func adjustDirentCache(k *kernel.Kernel) error { } type fdDispenser struct { - fds []int + fds []*fd.FD } func (f *fdDispenser) remove() int { if f.empty() { panic("fdDispenser out of fds") } - rv := f.fds[0] + rv := f.fds[0].Release() f.fds = f.fds[1:] return rv } @@ -390,6 +400,10 @@ type mountHint struct { // root is the inode where the volume is mounted. For mounts with 'pod' share // the volume is mounted once and then bind mounted inside the containers. root *fs.Inode + + // vfsMount is the master mount for the volume. For mounts with 'pod' share + // the master volume is bind mounted inside the containers. + vfsMount *vfs.Mount } func (m *mountHint) setField(key, val string) error { @@ -447,27 +461,27 @@ func (m *mountHint) isSupported() bool { func (m *mountHint) checkCompatible(mount specs.Mount) error { // Remove options that don't affect to mount's behavior. masterOpts := filterUnsupportedOptions(m.mount) - slaveOpts := filterUnsupportedOptions(mount) + replicaOpts := filterUnsupportedOptions(mount) - if len(masterOpts) != len(slaveOpts) { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if len(masterOpts) != len(replicaOpts) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } sort.Strings(masterOpts) - sort.Strings(slaveOpts) + sort.Strings(replicaOpts) for i, opt := range masterOpts { - if opt != slaveOpts[i] { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if opt != replicaOpts[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } } return nil } -func (m *mountHint) fileAccessType() FileAccessType { +func (m *mountHint) fileAccessType() config.FileAccessType { if m.share == container { - return FileAccessExclusive + return config.FileAccessExclusive } - return FileAccessShared + return config.FileAccessShared } func filterUnsupportedOptions(mount specs.Mount) []string { @@ -558,7 +572,7 @@ type containerMounter struct { hints *podMountHints } -func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { +func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter { return &containerMounter{ root: spec.Root, mounts: compileMounts(spec), @@ -571,9 +585,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // processHints processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. -func (c *containerMounter) processHints(conf *Config) error { +func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error { if conf.VFS2 { - return nil + return c.processHintsVFS2(conf, creds) } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { @@ -595,7 +609,7 @@ func (c *containerMounter) processHints(conf *Config) error { // setupFS is used to set up the file system for all containers. This is the // main entry point method, with most of the other being internal only. It // returns the mount namespace that is created for the container. -func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { +func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { log.Infof("Configuring container's file system") // Create context with root credentials to mount the filesystem (the current @@ -621,7 +635,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA return mns, nil } -func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) { +func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) { rootInode, err := c.createRootMount(ctx, conf) if err != nil { return nil, fmt.Errorf("creating filesystem for container: %v", err) @@ -633,9 +647,9 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi return mns, nil } -func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, m := range c.mounts { log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) @@ -669,7 +683,7 @@ func (c *containerMounter) checkDispenser() error { // mountSharedMaster mounts the master of a volume that is shared among // containers in a pod. It returns the root mount's inode. -func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) { +func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) @@ -709,7 +723,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, } // createRootMount creates the root filesystem. -func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { +func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} @@ -734,7 +748,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") - rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf) if err != nil { return nil, fmt.Errorf("adding submount overlay: %v", err) } @@ -754,7 +768,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) { +func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) { var ( fsName string opts []string @@ -788,19 +802,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( return fsName, opts, useOverlay, nil } -func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { +func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType { if hint := c.hints.findMount(mount); hint != nil { return hint.fileAccessType() } // Non-root bind mounts are always shared if no hints were provided. - return FileAccessShared + return config.FileAccessShared } // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. // 'm.Destination' must be an absolute path with '..' and symlinks resolved. -func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { +func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) @@ -844,7 +858,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns submounts := subtargets(m.Destination, c.mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) + inode, err = addSubmountOverlay(ctx, inode, submounts, mf) if err != nil { return fmt.Errorf("adding submount overlay: %v", err) } @@ -863,7 +877,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns if err != nil { return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) } - defer dirent.DecRef() + defer dirent.DecRef(ctx) if err := mns.Mount(ctx, dirent, inode); err != nil { return fmt.Errorf("mount %q error: %v", m.Destination, err) } @@ -884,12 +898,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun if err != nil { return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) } - defer target.DecRef() + defer target.DecRef(ctx) // Take a ref on the inode that is about to be (re)-mounted. source.root.IncRef() if err := mns.Mount(ctx, target, source.root); err != nil { - source.root.DecRef() + source.root.DecRef(ctx) return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) } @@ -899,7 +913,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. -func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { +func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error { fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) if err != nil { return err @@ -924,7 +938,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding // the mounts to the environment. -func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { +func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) { renv := &fs.RestoreEnvironment{ MountSources: make(map[string][]fs.MountArgs), } @@ -979,7 +993,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { +func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error { for _, m := range c.mounts { if filepath.Clean(m.Destination) == "/tmp" { log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) @@ -992,12 +1006,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M switch err { case nil: // Found '/tmp' in filesystem, check if it's empty. - defer tmp.DecRef() + defer tmp.DecRef(ctx) f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) if err != nil { return err } - defer f.DecRef() + defer f.DecRef(ctx) serializer := &fs.CollectEntriesSerializer{} if err := f.Readdir(ctx, serializer); err != nil { return err diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index 912037075..e986231e5 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -20,6 +20,7 @@ import ( "testing" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/runsc/config" ) func TestPodMountHintsHappy(t *testing.T) { @@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) { for _, tst := range []struct { name string annotations map[string]string - want FileAccessType + want config.FileAccessType }{ { name: "container=exclusive", @@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "container", }, - want: FileAccessExclusive, + want: config.FileAccessExclusive, }, { name: "pod=shared", @@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "pod", }, - want: FileAccessShared, + want: config.FileAccessShared, }, { name: "shared=shared", @@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "shared", }, - want: FileAccessShared, + want: config.FileAccessShared, }, { name: "default=shared", @@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "container", }, - want: FileAccessShared, + want: config.FileAccessShared, }, } { t.Run(tst.name, func(t *testing.T) { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f802bc9fb..8c6ab213d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -16,22 +16,26 @@ package boot import ( + "errors" "fmt" mrand "math/rand" "os" "runtime" "sync/atomic" - "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fdimport" @@ -66,7 +70,9 @@ import ( "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + "gvisor.dev/gvisor/runsc/specutils/seccomp" // Include supported socket providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" @@ -77,6 +83,22 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) +type containerInfo struct { + conf *config.Config + + // spec is the base configuration for the root container. + spec *specs.Spec + + // procArgs refers to the container's init task. + procArgs kernel.CreateProcessArgs + + // stdioFDs contains stdin, stdout, and stderr. + stdioFDs []*fd.FD + + // goferFDs are the FDs that attach the sandbox to the gofers. + goferFDs []*fd.FD +} + // Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. @@ -85,22 +107,11 @@ type Loader struct { // ctrl is the control server. ctrl *controller - conf *Config - - // console is set to true if terminal is enabled. - console bool + // root contains information about the root container in the sandbox. + root containerInfo watchdog *watchdog.Watchdog - // stdioFDs contains stdin, stdout, and stderr. - stdioFDs []int - - // goferFDs are the FDs that attach the sandbox to the gofers. - goferFDs []int - - // spec is the base configuration for the root container. - spec *specs.Spec - // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -108,9 +119,6 @@ type Loader struct { // restore is set to true if we are restoring a container. restore bool - // rootProcArgs refers to the root sandbox init task. - rootProcArgs kernel.CreateProcessArgs - // sandboxID is the ID for the whole sandbox. sandboxID string @@ -162,7 +170,7 @@ type Args struct { // Spec is the sandbox specification. Spec *specs.Spec // Conf is the system configuration. - Conf *Config + Conf *config.Config // ControllerFD is the FD to the URPC controller. The Loader takes ownership // of this FD and may close it at any time. ControllerFD int @@ -175,8 +183,6 @@ type Args struct { // StdioFDs is the stdio for the application. The Loader takes ownership of // these FDs and may close them at any time. StdioFDs []int - // Console is set to true if using TTY. - Console bool // NumCPU is the number of CPUs to create inside the sandbox. NumCPU int // TotalMem is the initial amount of total memory to report back to the @@ -187,7 +193,7 @@ type Args struct { } // make sure stdioFDs are always the same on initial start and on restore -const startingStdioFD = 64 +const startingStdioFD = 256 // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. @@ -205,6 +211,10 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true + if args.Conf.FUSE { + kernel.FUSEEnabled = true + } + vfs2.Override() } @@ -227,9 +237,7 @@ func New(args Args) (*Loader, error) { // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. - // - // FIXME(b/109889800): Use non-nil context. - vdso, err := loader.PrepareVDSO(nil, k) + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } @@ -275,6 +283,7 @@ func New(args Args) (*Loader, error) { args.NumCPU = runtime.NumCPU() } log.Infof("CPUs: %d", args.NumCPU) + runtime.GOMAXPROCS(args.NumCPU) if args.TotalMem > 0 { // Adjust the total memory returned by the Sentry so that applications that @@ -300,6 +309,12 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if kernel.VFS2Enabled { + if err := registerFilesystems(k); err != nil { + return nil, fmt.Errorf("registering filesystems: %w", err) + } + } + if err := adjustDirentCache(k); err != nil { return nil, err } @@ -318,7 +333,7 @@ func New(args Args) (*Loader, error) { dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction dog := watchdog.New(k, dogOpts) - procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) + procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) if err != nil { return nil, fmt.Errorf("creating init process for root container: %v", err) } @@ -338,7 +353,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) } - defer hostFilesystem.DecRef() + defer hostFilesystem.DecRef(k.SupervisorContext()) hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { return nil, fmt.Errorf("failed to create hostfs mount: %v", err) @@ -346,37 +361,45 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } + info := containerInfo{ + conf: args.Conf, + spec: args.Spec, + procArgs: procArgs, + } + // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. - var stdioFDs []int newfd := startingStdioFD - for _, fd := range args.StdioFDs { - err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) - if err != nil { - return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + for _, stdioFD := range args.StdioFDs { + // Check that newfd is unused to avoid clobbering over it. + if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { + if err != nil { + return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) + } + return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) } - stdioFDs = append(stdioFDs, newfd) - err = syscall.Close(fd) + + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) if err != nil { - return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) } + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) newfd++ } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } eid := execID{cid: args.ID} l := &Loader{ - k: k, - conf: args.Conf, - console: args.Console, - watchdog: dog, - spec: args.Spec, - goferFDs: args.GoferFDs, - stdioFDs: stdioFDs, - rootProcArgs: procArgs, - sandboxID: args.ID, - processes: map[execID]*execProcess{eid: {}}, - mountHints: mountHints, + k: k, + watchdog: dog, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + root: info, } // We don't care about child signals; some platforms can generate a @@ -404,8 +427,8 @@ func New(args Args) (*Loader, error) { return l, nil } -// newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { +// createProcessArgs creates args that can be used with kernel.CreateProcess. +func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -449,9 +472,29 @@ func (l *Loader) Destroy() { l.stopSignalForwarding() } l.watchdog.Stop() + + // Release all kernel resources. This is only safe after we can no longer + // save/restore. + l.k.Release() + + // All sentry-created resources should have been released at this point; + // check for reference leaks. + if refsvfs2.LeakCheckEnabled() { + refsvfs2.DoLeakCheck() + } + + // In the success case, stdioFDs and goferFDs will only contain + // released/closed FDs that ownership has been passed over to host FDs and + // gofer sessions. Close them here in case of failure. + for _, fd := range l.root.stdioFDs { + _ = fd.Close() + } + for _, fd := range l.root.goferFDs { + _ = fd.Close() + } } -func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { +func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) { p, err := platform.Lookup(conf.Platform) if err != nil { panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) @@ -478,14 +521,15 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } +// installSeccompFilters installs sandbox seccomp filters with the host. func (l *Loader) installSeccompFilters() error { - if l.conf.DisableSeccomp { + if l.root.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. Running in less secure mode.") } else { opts := filter.Options{ Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ProfileEnable: l.conf.ProfileEnable, + HostNetwork: l.root.conf.Network == config.NetworkHost, + ProfileEnable: l.root.conf.ProfileEnable, ControllerFD: l.ctrl.srv.FD(), } if err := filter.Install(opts); err != nil { @@ -511,7 +555,7 @@ func (l *Loader) Run() error { } func (l *Loader) run() error { - if l.conf.Network == NetworkHost { + if l.root.conf.Network == config.NetworkHost { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") @@ -532,10 +576,8 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. - var ttyFile *host.TTYFileOperations - var ttyFileVFS2 *hostvfs2.TTYFileDescription if !l.restore { - if l.conf.ProfileEnable { + if l.root.conf.ProfileEnable { pprof.Initialize() } @@ -545,82 +587,30 @@ func (l *Loader) run() error { return err } - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. - ctx := l.rootProcArgs.NewContext(l.k) - var err error - - // CreateProcess takes a reference on FDMap if successful. We won't need - // ours either way. - l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) - if err != nil { - return fmt.Errorf("importing fds: %v", err) - } - - // Setup the root container file system. - l.startGoferMonitor(l.sandboxID, l.goferFDs) - - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { - return err - } - if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { - return err - } - - // Add the HOME enviroment variable if it is not already set. - var envv []string - if kernel.VFS2Enabled { - envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - - } else { - envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - } - if err != nil { - return err - } - l.rootProcArgs.Envv = envv - // Create the root container init task. It will begin running // when the kernel is started. - if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { - return fmt.Errorf("creating init process: %v", err) + if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + return err } - // CreateProcess takes a reference on FDTable if successful. - l.rootProcArgs.FDTable.DecRef() } ep.tg = l.k.GlobalInit() - if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok { + if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok { ep.pidnsPath = ns.Path } - if l.console { - // Set the foreground process group on the TTY to the global init process - // group, since that is what we are about to start running. - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - } - } // Handle signals by forwarding them to the root container process // (except for panic signal, which should cause a panic). l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { // Panic signal should cause a panic. - if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) { panic("Signal-induced panic") } // Otherwise forward to root container. deliveryMode := DeliverToProcess - if l.console { + if l.root.spec.Process.Terminal { // Since we are running with a console, we should forward the signal to // the foreground process group so that job control signals like ^C can // be handled properly. @@ -632,19 +622,6 @@ func (l *Loader) run() error { } }) - // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again - // either in createFDTable() during initial start or in descriptor.initAfterLoad() - // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the - // passed FDs, so only close for VFS1. - if !kernel.VFS2Enabled { - for _, fd := range l.stdioFDs { - err := syscall.Close(fd) - if err != nil { - return fmt.Errorf("close dup()ed stdioFDs: %v", err) - } - } - } - log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() @@ -664,9 +641,9 @@ func (l *Loader) createContainer(cid string) error { } // startContainer starts a child container. It returns the thread group ID of -// the newly created process. Caller owns 'files' and may close them after -// this method returns. -func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { +// the newly created process. Used FDs are either closed or released. It's safe +// for the caller to close any remaining files upon return. +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -676,8 +653,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file l.mu.Lock() defer l.mu.Unlock() - eid := execID{cid: cid} - if _, ok := l.processes[eid]; !ok { + ep := l.processes[execID{cid: cid}] + if ep == nil { return fmt.Errorf("trying to start a deleted container %q", cid) } @@ -711,73 +688,136 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file if pidns == nil { pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) } - l.processes[eid].pidnsPath = ns.Path + ep.pidnsPath = ns.Path } else { pidns = l.k.RootPIDNamespace() } - procArgs, err := newProcess(cid, spec, creds, l.k, pidns) + + info := &containerInfo{ + conf: conf, + spec: spec, + stdioFDs: files[:3], + goferFDs: files[3:], + } + info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } + tg, err := l.createContainerProcess(false, cid, info, ep) + if err != nil { + return err + } + + // Success! + l.k.StartProcess(tg) + ep.tg = tg + return nil +} - // setupContainerFS() dups stdioFDs, so we don't need to dup them here. - var stdioFDs []int - for _, f := range files[:3] { - stdioFDs = append(stdioFDs, int(f.Fd())) +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { + console := false + if root { + // Only root container supports terminal for now. + console = info.spec.Process.Terminal } // Create the FD map, which will set stdin, stdout, and stderr. - ctx := procArgs.NewContext(l.k) - fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) + ctx := info.procArgs.NewContext(l.k) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) if err != nil { - return fmt.Errorf("importing fds: %v", err) + return nil, fmt.Errorf("importing fds: %v", err) } - // CreateProcess takes a reference on fdTable if successful. We won't - // need ours either way. - procArgs.FDTable = fdTable + // CreateProcess takes a reference on fdTable if successful. We won't need + // ours either way. + info.procArgs.FDTable = fdTable - // Can't take ownership away from os.File. dup them to get a new FDs. - var goferFDs []int - for _, f := range files[3:] { - fd, err := syscall.Dup(int(f.Fd())) - if err != nil { - return fmt.Errorf("failed to dup file: %v", err) + // Setup the child container file system. + l.startGoferMonitor(cid, info.goferFDs) + + mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) + if root { + if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { + return nil, err } - goferFDs = append(goferFDs, fd) + } + if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { + return nil, err } - // Setup the child container file system. - l.startGoferMonitor(cid, goferFDs) + // Add the HOME environment variable if it is not already set. + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) - mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) - if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { - return err + } else { + envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) + } + if err != nil { + return nil, err } + info.procArgs.Envv = envv // Create and start the new process. - tg, _, err := l.k.CreateProcess(procArgs) + tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return fmt.Errorf("creating process: %v", err) + return nil, fmt.Errorf("creating process: %v", err) } - l.k.StartProcess(tg) - // CreateProcess takes a reference on FDTable if successful. - procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef(ctx) - l.processes[eid].tg = tg - return nil + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + if root { + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) + } + } + + // Install seccomp filters with the new task if there are any. + if info.conf.OCISeccomp { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) + if err != nil { + return nil, fmt.Errorf("building seccomp program: %v", err) + } + + if log.IsLogging(log.Debug) { + out, _ := bpf.DecodeProgram(program) + log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) + } + + task := tg.Leader() + // NOTE: It seems Flags are ignored by runc so we ignore them too. + if err := task.AppendSyscallFilter(program, true); err != nil { + return nil, fmt.Errorf("appending seccomp filters: %v", err) + } + } + } else { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + } + + return tg, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on -// the gofer FDs looking for disconnects, and destroys the container if a +// the gofer FDs looking for disconnects, and kills the container processes if a // disconnect occurs in any of the gofer FDs. -func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { +func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) { go func() { log.Debugf("Monitoring gofer health for container %q", cid) var events []unix.PollFd - for _, fd := range goferFDs { + for _, goferFD := range goferFDs { events = append(events, unix.PollFd{ - Fd: int32(fd), + Fd: int32(goferFD.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }) } @@ -790,18 +830,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err)) } - // Check if the gofer has stopped as part of normal container destruction. - // This is done just to avoid sending an annoying error message to the log. - // Note that there is a small race window in between mu.Unlock() and the - // lock being reacquired in destroyContainer(), but it's harmless to call - // destroyContainer() multiple times. l.mu.Lock() - _, ok := l.processes[execID{cid: cid}] - l.mu.Unlock() - if ok { - log.Infof("Gofer socket disconnected, destroying container %q", cid) - if err := l.destroyContainer(cid); err != nil { - log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err) + defer l.mu.Unlock() + + // The gofer could have been stopped due to a normal container shutdown. + // Check if the container has not stopped yet. + if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil { + log.Infof("Gofer socket disconnected, killing container %q", cid) + if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { + log.Warningf("Error killing container %q after gofer stopped: %v", cid, err) } } }() @@ -870,37 +907,41 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("container %q not started", args.ContainerID) } - // Get the container MountNamespace from the Task. + // Get the container MountNamespace from the Task. Try to acquire ref may fail + // in case it raced with task exit. if kernel.VFS2Enabled { - // task.MountNamespace() does not take a ref, so we must do so ourselves. + // task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves. args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() - args.MountNamespaceVFS2.IncRef() + if !args.MountNamespaceVFS2.TryIncRef() { + return 0, fmt.Errorf("container %q has stopped", args.ContainerID) + } } else { + var reffed bool tg.Leader().WithMuLocked(func(t *kernel.Task) { // task.MountNamespace() does not take a ref, so we must do so ourselves. args.MountNamespace = t.MountNamespace() - args.MountNamespace.IncRef() + reffed = args.MountNamespace.TryIncRef() }) + if !reffed { + return 0, fmt.Errorf("container %q has stopped", args.ContainerID) + } } // Add the HOME environment variable if it is not already set. if kernel.VFS2Enabled { - defer args.MountNamespaceVFS2.DecRef() - root := args.MountNamespaceVFS2.Root() - defer root.DecRef() ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespaceVFS2.DecRef(ctx) envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) if err != nil { return 0, err } args.Envv = envv } else { - defer args.MountNamespace.DecRef() - root := args.MountNamespace.Root() - defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespace.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err @@ -997,20 +1038,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. l.k.WaitExited() + // Cleanup + l.ctrl.stop() + + refs.OnExit() + return l.k.GlobalInit().ExitStatus() } -func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { +func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). switch conf.Network { - case NetworkHost: + case config.NetworkHost: // No network namespacing support for hostinet yet, hence creator is nil. return inet.NewRootNamespace(hostinet.NewStack(), nil), nil - case NetworkNone, NetworkSandbox: + case config.NetworkNone, config.NetworkSandbox: s, err := newEmptySandboxNetworkStack(clock, uniqueID) if err != nil { return nil, err @@ -1028,8 +1074,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni } func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} + transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, @@ -1043,21 +1089,32 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in })} // Enable SACK Recovery. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %v", err) + { + opt := tcpip.TCPSACKEnabled(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } // Set default TTLs as required by socket/netstack. - s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) - s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + { + opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) + if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) + } + if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) + } + } // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) + { + opt := tcpip.TCPModerateReceiveBufferOption(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } - s.FillDefaultIPTables() - return &s, nil } @@ -1251,7 +1308,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2 return ep.tty, ep.ttyVFS2, nil } -func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if len(stdioFDs) != 3 { return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } @@ -1260,7 +1317,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable := k.NewFDTable() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) if err != nil { - fdTable.DecRef() + fdTable.DecRef(ctx) return nil, nil, nil, err } return fdTable, ttyFile, ttyFileVFS2, nil diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e448fd773..b77b4762e 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" @@ -34,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/fsgofer" ) @@ -43,15 +45,19 @@ func init() { if err := fsgofer.OpenProcSelfFD(); err != nil { panic(err) } + config.RegisterFlags() } -func testConfig() *Config { - return &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - DisableSeccomp: true, - Platform: "ptrace", +func testConfig() *config.Config { + conf, err := config.NewFromFlags() + if err != nil { + panic(err) } + // Change test defaults. + conf.RootDir = "unused_root_dir" + conf.Network = config.NetworkNone + conf.DisableSeccomp = true + return conf } // testSpec returns a simple spec that can be used in tests. @@ -258,9 +264,9 @@ type CreateMountTestcase struct { expectedPaths []string } -func createMountTestcases(vfs2 bool) []*CreateMountTestcase { +func createMountTestcases() []*CreateMountTestcase { testCases := []*CreateMountTestcase{ - &CreateMountTestcase{ + { // Only proc. name: "only proc mount", spec: specs.Spec{ @@ -298,11 +304,10 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { }, }, }, - // /some/deep/path should be mounted, along with /proc, - // /dev, and /sys. + // /some/deep/path should be mounted, along with /proc, /dev, and /sys. expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - &CreateMountTestcase{ + { // Mounts are nested inside each other. name: "nested mounts", spec: specs.Spec{ @@ -346,7 +351,7 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - &CreateMountTestcase{ + { name: "mount inside /dev", spec: specs.Spec{ Root: &specs.Root{ @@ -389,46 +394,42 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { }, expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, }, - } - - vfsCase := &CreateMountTestcase{ - name: "mounts inside mandatory mounts", - spec: specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, - }, - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", + { + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, }, - // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports - // MkDirAt in VFS2 (and remove the reduntant append). - // { - // Destination: "/sys/bar", - // Type: "tmpfs", - // }, - // - { - Destination: "/tmp/baz", - Type: "tmpfs", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/sys/bar", + Type: "tmpfs", + }, + { + Destination: "/tmp/baz", + Type: "tmpfs", + }, + { + Destination: "/dev/goo", + Type: "tmpfs", + }, }, }, + expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz", "/dev/goo"}, }, - expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, } - if !vfs2 { - vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) - vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") - } - return append(testCases, vfsCase) + return testCases } // Test that MountNamespace can be created with various specs. func TestCreateMountNamespace(t *testing.T) { - for _, tc := range createMountTestcases(false /* vfs2 */) { + for _, tc := range createMountTestcases() { t.Run(tc.name, func(t *testing.T) { conf := testConfig() ctx := contexttest.Context(t) @@ -439,7 +440,7 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{}) + mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}) mns, err := mntr.createMountNamespace(ctx, conf) if err != nil { t.Fatalf("failed to create mount namespace: %v", err) @@ -450,13 +451,13 @@ func TestCreateMountNamespace(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { maxTraversals := uint(0) if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) @@ -465,7 +466,7 @@ func TestCreateMountNamespace(t *testing.T) { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespaceVFS2(t *testing.T) { - for _, tc := range createMountTestcases(true /* vfs2 */) { + for _, tc := range createMountTestcases() { t.Run(tc.name, func(t *testing.T) { spec := testSpec() spec.Mounts = tc.spec.Mounts @@ -479,19 +480,20 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer l.Destroy() defer loaderCleanup() - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { + mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } ctx := l.k.SupervisorContext() - mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs) if err != nil { - t.Fatalf("failed to setupVFS2: %v", err) + t.Fatalf("mountAll: %v", err) } root := mns.Root() - defer root.DecRef() + root.IncRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { target := &vfs.PathOperation{ Root: root, @@ -499,10 +501,10 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { Path: fspath.Parse(p), } - if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) @@ -545,7 +547,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, }, "tmpfs": { @@ -599,7 +601,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, { Dev: "9pfs-/dev/fd-foo", @@ -657,7 +659,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, }, "tmpfs": { @@ -697,7 +699,11 @@ func TestRestoreEnvironment(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { conf := testConfig() - mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{}) + var ioFDs []*fd.FD + for _, ioFD := range tc.ioFDs { + ioFDs = append(ioFDs, fd.New(ioFD)) + } + mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 0af30456e..988573640 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" @@ -32,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/config" ) var ( @@ -77,44 +79,6 @@ type DefaultRoute struct { Name string } -// QueueingDiscipline is used to specify the kind of Queueing Discipline to -// apply for a give FDBasedLink. -type QueueingDiscipline int - -const ( - // QDiscNone disables any queueing for the underlying FD. - QDiscNone QueueingDiscipline = iota - - // QDiscFIFO applies a simple fifo based queue to the underlying - // FD. - QDiscFIFO -) - -// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s -// else returns an error. -func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) { - switch s { - case "none": - return QDiscNone, nil - case "fifo": - return QDiscFIFO, nil - default: - return 0, fmt.Errorf("unsupported qdisc specified: %q", s) - } -} - -// String implements fmt.Stringer. -func (q QueueingDiscipline) String() string { - switch q { - case QDiscNone: - return "none" - case QDiscFIFO: - return "fifo" - default: - panic(fmt.Sprintf("Invalid queueing discipline: %d", q)) - } -} - // FDBasedLink configures an fd-based link. type FDBasedLink struct { Name string @@ -123,8 +87,10 @@ type FDBasedLink struct { Routes []Route GSOMaxSize uint32 SoftwareGSOEnabled bool + TXChecksumOffload bool + RXChecksumOffload bool LinkAddress net.HardwareAddr - QDisc QueueingDiscipline + QDisc config.QueueingDiscipline // NumChannels controls how many underlying FD's are to be used to // create this endpoint. @@ -236,19 +202,23 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, SoftwareGSOEnabled: link.SoftwareGSOEnabled, - RXChecksumOffload: true, + TXChecksumOffload: link.TXChecksumOffload, + RXChecksumOffload: link.RXChecksumOffload, }) if err != nil { return err } switch link.QDisc { - case QDiscNone: - case QDiscFIFO: + case config.QDiscNone: + case config.QDiscFIFO: log.Infof("Enabling FIFO QDisc on %q", link.Name) linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } + // Enable support for AF_PACKET sockets to receive outgoing packets. + linkEP = packetsocket.New(linkEP) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index fbfd3b07c..c21648a32 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -15,10 +15,13 @@ package boot import ( + "strings" + "gvisor.dev/gvisor/pkg/sentry/strace" + "gvisor.dev/gvisor/runsc/config" ) -func enableStrace(conf *Config) error { +func enableStrace(conf *config.Config) error { // We must initialize even if strace is not enabled. strace.Initialize() @@ -36,5 +39,5 @@ func enableStrace(conf *Config) error { strace.EnableAll(strace.SinkTypeLog) return nil } - return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog) + return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 6c84f0794..b157387ef 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -16,30 +16,40 @@ package boot import ( "fmt" - "path" "sort" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" + "gvisor.dev/gvisor/pkg/sentry/devices/tundev" "gvisor.dev/gvisor/pkg/sentry/fs/user" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/config" ) -func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { +func registerFilesystems(k *kernel.Kernel) error { + ctx := k.SupervisorContext() + creds := auth.NewRootCredentials(k.RootUserNamespace()) + vfsObj := k.VFS() + vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a @@ -53,6 +63,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, @@ -65,46 +79,78 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { return fmt.Errorf("registering memdev: %w", err) } + if err := ttydev.Register(vfsObj); err != nil { + return fmt.Errorf("registering ttydev: %w", err) + } + tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx)) + if tunSupported { + if err := tundev.Register(vfsObj); err != nil { + return fmt.Errorf("registering tundev: %v", err) + } + } + + if kernel.FUSEEnabled { + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } + } + a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name) if err != nil { return fmt.Errorf("creating devtmpfs accessor: %w", err) } - defer a.Release() + defer a.Release(ctx) if err := a.UserspaceInit(ctx); err != nil { return fmt.Errorf("initializing userspace: %w", err) } if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { - return fmt.Errorf("creating devtmpfs files: %w", err) + return fmt.Errorf("creating memdev devtmpfs files: %w", err) + } + if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating ttydev devtmpfs files: %w", err) + } + if tunSupported { + if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating tundev devtmpfs files: %v", err) + } } + + if kernel.FUSEEnabled { + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } + } + return nil } -func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - if err := mntr.k.VFS().Init(); err != nil { - return fmt.Errorf("failed to initialize VFS: %w", err) - } - mns, err := mntr.setupVFS2(ctx, conf, procArgs) +func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + mns, err := mntr.mountAll(conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) } procArgs.MountNamespaceVFS2 = mns // Resolve the executable path from working dir and environment. - f, err := user.ResolveExecutablePathVFS2(ctx, procArgs.Credentials, procArgs.MountNamespaceVFS2, procArgs.Envv, procArgs.WorkingDirectory, procArgs.Argv[0]) + resolved, err := user.ResolveExecutablePath(ctx, procArgs) if err != nil { - return fmt.Errorf("searching for executable %q, cwd: %q, envv: %q: %v", procArgs.Argv[0], procArgs.WorkingDirectory, procArgs.Envv, err) + return err } - procArgs.Filename = f + procArgs.Filename = resolved return nil } -func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { +func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { log.Infof("Configuring container's file system with VFS2") // Create context with root credentials to mount the filesystem (the current @@ -117,36 +163,147 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := procArgs.NewContext(c.k) - if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil { - return nil, fmt.Errorf("register filesystems: %w", err) - } - mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { return nil, fmt.Errorf("creating mount namespace: %w", err) } rootProcArgs.MountNamespaceVFS2 = mns + root := mns.Root() + root.IncRef() + defer root.DecRef(rootCtx) + if root.Mount().ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil { + return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { + panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err)) + } + }() + } + // Mount submounts. if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts vfs2: %w", err) } + return mns, nil } -func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { +// createMountNamespaceVFS2 creates the container's root mount and namespace. +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",") + data := p9MountData(fd, conf.FileAccess, true /* vfs2 */) + + if conf.OverlayfsStaleRead { + // We can't check for overlayfs here because sandbox is chroot'ed and gofer + // can only send mount options for specs.Mounts (specs.Root is missing + // Options field). So assume root is always on top of overlayfs. + data = append(data, "overlayfs_stale_read") + } log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts}) + opts := &vfs.MountOptions{ + ReadOnly: c.root.Readonly, + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + InternalData: gofer.InternalFilesystemOptions{ + UniqueID: "/", + }, + }, + InternalMount: true, + } + + fsName := gofer.Name + if conf.Overlay && !c.root.Readonly { + log.Infof("Adding overlay on top of root") + var err error + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting root with overlay: %w", err) + } + defer cleanup() + fsName = overlay.Name + } + + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } return mns, nil } -func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { +// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper +// layer using tmpfs, and return overlay mount options. "cleanup" must be called +// after the options have been used to mount the overlay, to release refs on +// lower and upper mounts. +func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) { + // First copy options from lower layer to upper layer and overlay. Clear + // filesystem specific options. + upperOpts := *lowerOpts + upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + overlayOpts := *lowerOpts + overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + // Next mount upper and lower. Upper is a tmpfs mount to keep all + // modifications inside the sandbox. + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu := cleanup.Make(func() { upper.DecRef(ctx) }) + defer cu.Clean() + + // All writes go to the upper layer, be paranoid and make lower readonly. + lowerOpts.ReadOnly = true + lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) + if err != nil { + return nil, nil, err + } + cu.Add(func() { lower.DecRef(ctx) }) + + // Propagate the lower layer's root's owner, group, and mode to the upper + // layer's root for consistency with VFS1. + upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root()) + stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRootVD, + Start: lowerRootVD, + }, &vfs.StatOptions{ + Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE, + }) + if err != nil { + return nil, nil, err + } + err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{ + Root: upperRootVD, + Start: upperRootVD, + }, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask, + UID: stat.UID, + GID: stat.GID, + Mode: stat.Mode, + }, + }) + if err != nil { + return nil, nil, err + } + + // Configure overlay with both layers. + overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{ + UpperRoot: upperRootVD, + LowerRoots: []vfs.VirtualDentry{lowerRootVD}, + } + return &overlayOpts, cu.Release(), nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { mounts, err := c.prepareMountsVFS2() if err != nil { return err @@ -155,8 +312,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { - return err + var ( + mnt *vfs.Mount + err error + ) + + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { + mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint) + if err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + } + } else { + mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount) + if err != nil { + return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + } + } + + if mnt != nil && mnt.ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { + return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { + panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) + } + }() } } @@ -200,67 +383,94 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { return mounts, nil } -func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { - root := mns.Root() - defer root.DecRef() - target := &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(submount.Destination), - } - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) { + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount) if err != nil { - return fmt.Errorf("mountOptions failed: %w", err) + return nil, fmt.Errorf("mountOptions failed: %w", err) } if len(fsName) == 0 { // Filesystem is not supported (e.g. cgroup), just skip it. - return nil + return nil, nil } - if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { - return err + if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err) } - if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { - return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + + if useOverlay { + log.Infof("Adding overlay on top of mount %q", submount.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + + root := mns.Root() + root.IncRef() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) + if err != nil { + return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) - return nil + return mnt, nil } // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { - var ( - fsName string - data []string - ) +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) { + fsName := m.Type + useOverlay := false + var data []string + var iopts interface{} // Find filesystem name and FS specific data field. switch m.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: - fsName = m.Type + // Nothing to do. + case nonefs: fsName = sys.Name - case tmpfs.Name: - fsName = m.Type + case tmpfs.Name: var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { - return "", nil, err + return "", nil, false, err } case bind: fsName = gofer.Name + if m.fd == 0 { + // Check that an FD was provided to fails fast. Technically FD=0 is valid, + // but unlikely to be correct in this context. + return "", nil, false, fmt.Errorf("9P mount requires a connection FD") + } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + iopts = gofer.InternalFilesystemOptions{ + UniqueID: m.Destination, + } + + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: log.Warningf("ignoring unknown filesystem type %q", m.Type) + return "", nil, false, nil } opts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ - Data: strings.Join(data, ","), + Data: strings.Join(data, ","), + InternalData: iopts, }, InternalMount: true, } @@ -272,7 +482,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF case "ro": opts.ReadOnly = true case "noatime": - // TODO(gvisor.dev/issue/1193): Implement MS_NOATIME. + opts.Flags.NoATime = true case "noexec": opts.Flags.NoExec = true default: @@ -280,38 +490,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF } } - if conf.Overlay { - // All writes go to upper, be paranoid and make lower readonly. - opts.ReadOnly = true - } - return fsName, opts, nil -} - -func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { - target := &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(currentPath), - } - _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) - if err == nil { - // Mount point exists, nothing else to do. - return nil - } - if err != syserror.ENOENT { - return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err) - } - - // Recurse to ensure parent is created and then create the mount point. - if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { - return err - } - log.Debugf("Creating dir %q for mount point", currentPath) - mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} - if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { - return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err) - } - return nil + return fsName, opts, useOverlay, nil } // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. @@ -323,7 +502,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { +func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { for _, m := range c.mounts { // m.Destination has been cleaned, so it's to use equality here. if m.Destination == "/tmp" { @@ -333,28 +512,36 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds } root := mns.Root() - defer root.DecRef() + root.IncRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse("/tmp"), } // TODO(gvisor.dev/issue/2782): Use O_PATH when available. - statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{}) + fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) switch err { case nil: - // Found '/tmp' in filesystem, check if it's empty. - if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory { - // Not a dir?! Leave it be. + defer fd.DecRef(ctx) + + err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name != "." && dirent.Name != ".." { + return syserror.ENOTEMPTY + } return nil - } - if statx.Nlink > 2 { + })) + switch err { + case nil: + log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) + case syserror.ENOTEMPTY: // If more than "." and ".." is found, skip internal tmpfs to prevent // hiding existing files. log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) return nil + default: + return err } - log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) fallthrough case syserror.ENOENT: @@ -367,9 +554,142 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + return err + + case syserror.ENOTDIR: + // Not a dir?! Let it be. + return nil default: - return fmt.Errorf(`stating "/tmp" inside container: %w`, err) + return fmt.Errorf(`opening "/tmp" inside container: %w`, err) + } +} + +// processHintsVFS2 processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs.Name { + continue + } + + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.vfsMount = mnt + } + return nil +} + +// mountSharedMasterVFS2 mounts the master of a volume that is shared among +// containers in a pod. +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + mntFD := &mountAndFD{Mount: hint.mount} + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + + if useOverlay { + log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) { + if err := source.checkCompatible(mount); err != nil { + return nil, err + } + + // Ignore data and useOverlay because these were already applied to + // the master mount. + _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + if err != nil { + return nil, err + } + newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) + if err != nil { + return nil, err + } + defer newMnt.DecRef(ctx) + + root := mns.Root() + root.IncRef() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mount.Destination), + } + + if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err) + } + + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { + return nil, err + } + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return newMnt, nil +} + +func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error { + root := mns.Root() + root.IncRef() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dest), + } + // First check if mount point exists. When overlay is enabled, gofer doesn't + // allow changes to the FS, making MakeSytheticMountpoint() ineffective + // because MkdirAt fails with EROFS even if file exists. + vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{}) + if err == nil { + // File exists, we're done. + vd.DecRef(ctx) + return nil + } + return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) +} + +// configureRestore returns an updated context.Context including filesystem +// state used by restore defined by conf. +func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) { + fdmap := make(map[string]int) + fdmap["/"] = c.fds.remove() + mounts, err := c.prepareMountsVFS2() + if err != nil { + return ctx, err + } + for i := range c.mounts { + submount := &mounts[i] + if submount.fd >= 0 { + fdmap[submount.Destination] = submount.fd + } } + return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil } diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index c087e1a3c..37f4253ba 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -10,7 +10,7 @@ go_library( "//pkg/cleanup", "//pkg/log", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) @@ -20,4 +20,8 @@ go_test( srcs = ["cgroup_test.go"], library = ":cgroup", tags = ["local"], + deps = [ + "//pkg/test/testutil", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + ], ) diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index ef01820ef..5bd0afc52 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -21,6 +21,7 @@ import ( "context" "errors" "fmt" + "io" "io/ioutil" "os" "path/filepath" @@ -43,6 +44,7 @@ var controllers = map[string]config{ "blkio": config{ctrlr: &blockIO{}}, "cpu": config{ctrlr: &cpu{}}, "cpuset": config{ctrlr: &cpuSet{}}, + "hugetlb": config{ctrlr: &hugeTLB{}, optional: true}, "memory": config{ctrlr: &memory{}}, "net_cls": config{ctrlr: &networkClass{}}, "net_prio": config{ctrlr: &networkPrio{}}, @@ -52,7 +54,6 @@ var controllers = map[string]config{ // irrelevant for a sandbox. "devices": config{ctrlr: &noop{}}, "freezer": config{ctrlr: &noop{}}, - "hugetlb": config{ctrlr: &noop{}, optional: true}, "perf_event": config{ctrlr: &noop{}}, "rdma": config{ctrlr: &noop{}, optional: true}, "systemd": config{ctrlr: &noop{}}, @@ -92,7 +93,17 @@ func setOptionalValueUint16(path, name string, val *uint16) error { func setValue(path, name, data string) error { fullpath := filepath.Join(path, name) - return ioutil.WriteFile(fullpath, []byte(data), 0700) + + // Retry writes on EINTR; see: + // https://github.com/golang/go/issues/38033 + for { + err := ioutil.WriteFile(fullpath, []byte(data), 0700) + if err == nil { + return nil + } else if !errors.Is(err, syscall.EINTR) { + return err + } + } } func getValue(path, name string) (string, error) { @@ -125,15 +136,23 @@ func fillFromAncestor(path string) (string, error) { return val, nil } - // File is not set, recurse to parent and then set here. + // File is not set, recurse to parent and then set here. name := filepath.Base(path) parent := filepath.Dir(filepath.Dir(path)) val, err = fillFromAncestor(filepath.Join(parent, name)) if err != nil { return "", err } - if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil { - return "", err + + // Retry writes on EINTR; see: + // https://github.com/golang/go/issues/38033 + for { + err := ioutil.WriteFile(path, []byte(val), 0700) + if err == nil { + break + } else if !errors.Is(err, syscall.EINTR) { + return "", err + } } return val, nil } @@ -180,16 +199,26 @@ func LoadPaths(pid string) (map[string]string, error) { } defer f.Close() + return loadPathsHelper(f) +} + +func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { paths := make(map[string]string) - scanner := bufio.NewScanner(f) + + scanner := bufio.NewScanner(cgroup) for scanner.Scan() { - // Format: ID:controller1,controller2:path + // Format: ID:[name=]controller1,controller2:path // Example: 2:cpu,cpuacct:/user.slice tokens := strings.Split(scanner.Text(), ":") if len(tokens) != 3 { return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text()) } + if len(tokens[1]) == 0 { + continue + } for _, ctrlr := range strings.Split(tokens[1], ",") { + // Remove prefix for cgroups with no controller, eg. systemd. + ctrlr = strings.TrimPrefix(ctrlr, "name=") paths[ctrlr] = tokens[2] } } @@ -219,7 +248,7 @@ func New(spec *specs.Spec) (*Cgroup, error) { var err error parents, err = LoadPaths("self") if err != nil { - return nil, fmt.Errorf("finding current cgroups: %v", err) + return nil, fmt.Errorf("finding current cgroups: %w", err) } } return &Cgroup{ @@ -258,10 +287,8 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error { } return err } - if res != nil { - if err := cfg.ctrlr.set(res, path); err != nil { - return err - } + if err := cfg.ctrlr.set(res, path); err != nil { + return err } } clean.Release() @@ -286,14 +313,15 @@ func (c *Cgroup) Uninstall() error { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) - if err := backoff.Retry(func() error { + fn := func() error { err := syscall.Rmdir(path) if os.IsNotExist(err) { return nil } return err - }, b); err != nil { - return fmt.Errorf("removing cgroup path %q: %v", path, err) + } + if err := backoff.Retry(fn, b); err != nil { + return fmt.Errorf("removing cgroup path %q: %w", path, err) } } return nil @@ -314,7 +342,6 @@ func (c *Cgroup) Join() (func(), error) { if _, ok := controllers[ctrlr]; ok { fullPath := filepath.Join(cgroupRoot, ctrlr, path) undoPaths = append(undoPaths, fullPath) - break } } @@ -404,7 +431,7 @@ func (*noop) set(*specs.LinuxResources, string) error { type memory struct{} func (*memory) set(spec *specs.LinuxResources, path string) error { - if spec.Memory == nil { + if spec == nil || spec.Memory == nil { return nil } if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil { @@ -437,7 +464,7 @@ func (*memory) set(spec *specs.LinuxResources, path string) error { type cpu struct{} func (*cpu) set(spec *specs.LinuxResources, path string) error { - if spec.CPU == nil { + if spec == nil || spec.CPU == nil { return nil } if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil { @@ -446,7 +473,13 @@ func (*cpu) set(spec *specs.LinuxResources, path string) error { if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil { return err } - return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period) + if err := setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period); err != nil { + return err + } + if err := setOptionalValueUint(path, "cpu.rt_period_us", spec.CPU.RealtimePeriod); err != nil { + return err + } + return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime) } type cpuSet struct{} @@ -454,7 +487,7 @@ type cpuSet struct{} func (*cpuSet) set(spec *specs.LinuxResources, path string) error { // cpuset.cpus and mems are required fields, but are not set on a new cgroup. // If not set in the spec, get it from one of the ancestors cgroup. - if spec.CPU == nil || spec.CPU.Cpus == "" { + if spec == nil || spec.CPU == nil || spec.CPU.Cpus == "" { if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil { return err } @@ -464,18 +497,17 @@ func (*cpuSet) set(spec *specs.LinuxResources, path string) error { } } - if spec.CPU == nil || spec.CPU.Mems == "" { + if spec == nil || spec.CPU == nil || spec.CPU.Mems == "" { _, err := fillFromAncestor(filepath.Join(path, "cpuset.mems")) return err } - mems := spec.CPU.Mems - return setValue(path, "cpuset.mems", mems) + return setValue(path, "cpuset.mems", spec.CPU.Mems) } type blockIO struct{} func (*blockIO) set(spec *specs.LinuxResources, path string) error { - if spec.BlockIO == nil { + if spec == nil || spec.BlockIO == nil { return nil } @@ -487,13 +519,17 @@ func (*blockIO) set(spec *specs.LinuxResources, path string) error { } for _, dev := range spec.BlockIO.WeightDevice { - val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Weight) - if err := setValue(path, "blkio.weight_device", val); err != nil { - return err + if dev.Weight != nil { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight) + if err := setValue(path, "blkio.weight_device", val); err != nil { + return err + } } - val = fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.LeafWeight) - if err := setValue(path, "blkio.leaf_weight_device", val); err != nil { - return err + if dev.LeafWeight != nil { + val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight) + if err := setValue(path, "blkio.leaf_weight_device", val); err != nil { + return err + } } } if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil { @@ -521,7 +557,7 @@ func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error { type networkClass struct{} func (*networkClass) set(spec *specs.LinuxResources, path string) error { - if spec.Network == nil { + if spec == nil || spec.Network == nil { return nil } return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) @@ -530,7 +566,7 @@ func (*networkClass) set(spec *specs.LinuxResources, path string) error { type networkPrio struct{} func (*networkPrio) set(spec *specs.LinuxResources, path string) error { - if spec.Network == nil { + if spec == nil || spec.Network == nil { return nil } for _, prio := range spec.Network.Priorities { @@ -545,9 +581,25 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error { type pids struct{} func (*pids) set(spec *specs.LinuxResources, path string) error { - if spec.Pids == nil { + if spec == nil || spec.Pids == nil || spec.Pids.Limit <= 0 { return nil } val := strconv.FormatInt(spec.Pids.Limit, 10) return setValue(path, "pids.max", val) } + +type hugeTLB struct{} + +func (*hugeTLB) set(spec *specs.LinuxResources, path string) error { + if spec == nil { + return nil + } + for _, limit := range spec.HugepageLimits { + name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize) + val := strconv.FormatUint(limit.Limit, 10) + if err := setValue(path, name, val); err != nil { + return err + } + } + return nil +} diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 548c80e9a..9794517a7 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -15,7 +15,14 @@ package cgroup import ( + "io/ioutil" + "os" + "path/filepath" + "strings" "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/test/testutil" ) func TestUninstallEnoent(t *testing.T) { @@ -65,3 +72,658 @@ func TestCountCpuset(t *testing.T) { }) } } + +func uint16Ptr(v uint16) *uint16 { + return &v +} + +func uint32Ptr(v uint32) *uint32 { + return &v +} + +func int64Ptr(v int64) *int64 { + return &v +} + +func uint64Ptr(v uint64) *uint64 { + return &v +} + +func boolPtr(v bool) *bool { + return &v +} + +func checkDir(t *testing.T, dir string, contents map[string]string) { + all, err := ioutil.ReadDir(dir) + if err != nil { + t.Fatalf("ReadDir(%q): %v", dir, err) + } + fileCount := 0 + for _, file := range all { + if file.IsDir() { + // Only want to compare files. + continue + } + fileCount++ + + want, ok := contents[file.Name()] + if !ok { + t.Errorf("file not expected: %q", file.Name()) + continue + } + gotBytes, err := ioutil.ReadFile(filepath.Join(dir, file.Name())) + if err != nil { + t.Fatal(err.Error()) + } + got := strings.TrimSuffix(string(gotBytes), "\n") + if got != want { + t.Errorf("wrong file content, file: %q, want: %q, got: %q", file.Name(), want, got) + } + } + if fileCount != len(contents) { + t.Errorf("file is missing, want: %v, got: %v", contents, all) + } +} + +func makeLinuxWeightDevice(major, minor int64, weight, leafWeight *uint16) specs.LinuxWeightDevice { + rv := specs.LinuxWeightDevice{ + Weight: weight, + LeafWeight: leafWeight, + } + rv.Major = major + rv.Minor = minor + return rv +} + +func makeLinuxThrottleDevice(major, minor int64, rate uint64) specs.LinuxThrottleDevice { + rv := specs.LinuxThrottleDevice{ + Rate: rate, + } + rv.Major = major + rv.Minor = minor + return rv +} + +func TestBlockIO(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxBlockIO + wants map[string]string + }{ + { + name: "simple", + spec: &specs.LinuxBlockIO{ + Weight: uint16Ptr(1), + LeafWeight: uint16Ptr(2), + }, + wants: map[string]string{ + "blkio.weight": "1", + "blkio.leaf_weight": "2", + }, + }, + { + name: "weight_device", + spec: &specs.LinuxBlockIO{ + WeightDevice: []specs.LinuxWeightDevice{ + makeLinuxWeightDevice(1, 2, uint16Ptr(3), uint16Ptr(4)), + }, + }, + wants: map[string]string{ + "blkio.weight_device": "1:2 3", + "blkio.leaf_weight_device": "1:2 4", + }, + }, + { + name: "weight_device_nil_values", + spec: &specs.LinuxBlockIO{ + WeightDevice: []specs.LinuxWeightDevice{ + makeLinuxWeightDevice(1, 2, nil, nil), + }, + }, + }, + { + name: "throttle", + spec: &specs.LinuxBlockIO{ + ThrottleReadBpsDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(1, 2, 3), + }, + ThrottleReadIOPSDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(4, 5, 6), + }, + ThrottleWriteBpsDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(7, 8, 9), + }, + ThrottleWriteIOPSDevice: []specs.LinuxThrottleDevice{ + makeLinuxThrottleDevice(10, 11, 12), + }, + }, + wants: map[string]string{ + "blkio.throttle.read_bps_device": "1:2 3", + "blkio.throttle.read_iops_device": "4:5 6", + "blkio.throttle.write_bps_device": "7:8 9", + "blkio.throttle.write_iops_device": "10:11 12", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxBlockIO{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + BlockIO: tc.spec, + } + ctrlr := blockIO{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestCPU(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxCPU + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxCPU{ + Shares: uint64Ptr(1), + Quota: int64Ptr(2), + Period: uint64Ptr(3), + RealtimeRuntime: int64Ptr(4), + RealtimePeriod: uint64Ptr(5), + }, + wants: map[string]string{ + "cpu.shares": "1", + "cpu.cfs_quota_us": "2", + "cpu.cfs_period_us": "3", + "cpu.rt_runtime_us": "4", + "cpu.rt_period_us": "5", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxCPU{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + CPU: tc.spec, + } + ctrlr := cpu{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestCPUSet(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxCPU + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxCPU{ + Cpus: "foo", + Mems: "bar", + }, + wants: map[string]string{ + "cpuset.cpus": "foo", + "cpuset.mems": "bar", + }, + }, + // Don't test nil values because they are copied from the parent. + // See TestCPUSetAncestor(). + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + CPU: tc.spec, + } + ctrlr := cpuSet{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +// TestCPUSetAncestor checks that, when not available, value is read from +// parent directory. +func TestCPUSetAncestor(t *testing.T) { + // Prepare master directory with cgroup files that will be propagated to + // children. + grandpa, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(grandpa) + + if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.cpus"), []byte("parent-cpus"), 0666); err != nil { + t.Fatalf("ioutil.WriteFile(): %v", err) + } + if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.mems"), []byte("parent-mems"), 0666); err != nil { + t.Fatalf("ioutil.WriteFile(): %v", err) + } + + for _, tc := range []struct { + name string + spec *specs.LinuxCPU + }{ + { + name: "nil_values", + spec: &specs.LinuxCPU{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + // Create empty files in intermediate directory. They should be ignored + // when reading, and then populated from parent. + parent, err := ioutil.TempDir(grandpa, "parent") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(parent) + if _, err := os.Create(filepath.Join(parent, "cpuset.cpus")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + if _, err := os.Create(filepath.Join(parent, "cpuset.mems")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + + // cgroup files mmust exist. + dir, err := ioutil.TempDir(parent, "child") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + if _, err := os.Create(filepath.Join(dir, "cpuset.cpus")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + if _, err := os.Create(filepath.Join(dir, "cpuset.mems")); err != nil { + t.Fatalf("os.Create(): %v", err) + } + + spec := &specs.LinuxResources{ + CPU: tc.spec, + } + ctrlr := cpuSet{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + want := map[string]string{ + "cpuset.cpus": "parent-cpus", + "cpuset.mems": "parent-mems", + } + // Both path and dir must have been populated from grandpa. + checkDir(t, parent, want) + checkDir(t, dir, want) + }) + } +} + +func TestHugeTlb(t *testing.T) { + for _, tc := range []struct { + name string + spec []specs.LinuxHugepageLimit + wants map[string]string + }{ + { + name: "single", + spec: []specs.LinuxHugepageLimit{ + { + Pagesize: "1G", + Limit: 123, + }, + }, + wants: map[string]string{ + "hugetlb.1G.limit_in_bytes": "123", + }, + }, + { + name: "multiple", + spec: []specs.LinuxHugepageLimit{ + { + Pagesize: "1G", + Limit: 123, + }, + { + Pagesize: "2G", + Limit: 456, + }, + { + Pagesize: "1P", + Limit: 789, + }, + }, + wants: map[string]string{ + "hugetlb.1G.limit_in_bytes": "123", + "hugetlb.2G.limit_in_bytes": "456", + "hugetlb.1P.limit_in_bytes": "789", + }, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + HugepageLimits: tc.spec, + } + ctrlr := hugeTLB{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestMemory(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxMemory + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxMemory{ + Limit: int64Ptr(1), + Reservation: int64Ptr(2), + Swap: int64Ptr(3), + Kernel: int64Ptr(4), + KernelTCP: int64Ptr(5), + Swappiness: uint64Ptr(6), + DisableOOMKiller: boolPtr(true), + }, + wants: map[string]string{ + "memory.limit_in_bytes": "1", + "memory.soft_limit_in_bytes": "2", + "memory.memsw.limit_in_bytes": "3", + "memory.kmem.limit_in_bytes": "4", + "memory.kmem.tcp.limit_in_bytes": "5", + "memory.swappiness": "6", + "memory.oom_control": "1", + }, + }, + { + // Disable OOM killer should only write when set to true. + name: "oomkiller", + spec: &specs.LinuxMemory{ + DisableOOMKiller: boolPtr(false), + }, + }, + { + name: "nil_values", + spec: &specs.LinuxMemory{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Memory: tc.spec, + } + ctrlr := memory{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestNetworkClass(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxNetwork + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxNetwork{ + ClassID: uint32Ptr(1), + }, + wants: map[string]string{ + "net_cls.classid": "1", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxNetwork{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Network: tc.spec, + } + ctrlr := networkClass{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestNetworkPriority(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxNetwork + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxNetwork{ + Priorities: []specs.LinuxInterfacePriority{ + { + Name: "foo", + Priority: 1, + }, + }, + }, + wants: map[string]string{ + "net_prio.ifpriomap": "foo 1", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxNetwork{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Network: tc.spec, + } + ctrlr := networkPrio{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestPids(t *testing.T) { + for _, tc := range []struct { + name string + spec *specs.LinuxPids + wants map[string]string + }{ + { + name: "all", + spec: &specs.LinuxPids{Limit: 1}, + wants: map[string]string{ + "pids.max": "1", + }, + }, + { + name: "nil_values", + spec: &specs.LinuxPids{}, + }, + { + name: "nil", + }, + } { + t.Run(tc.name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + defer os.RemoveAll(dir) + + spec := &specs.LinuxResources{ + Pids: tc.spec, + } + ctrlr := pids{} + if err := ctrlr.set(spec, dir); err != nil { + t.Fatalf("ctrlr.set(): %v", err) + } + checkDir(t, dir, tc.wants) + }) + } +} + +func TestLoadPaths(t *testing.T) { + for _, tc := range []struct { + name string + cgroups string + want map[string]string + err string + }{ + { + name: "abs-path", + cgroups: "0:ctr:/path", + want: map[string]string{"ctr": "/path"}, + }, + { + name: "rel-path", + cgroups: "0:ctr:rel-path", + want: map[string]string{"ctr": "rel-path"}, + }, + { + name: "non-controller", + cgroups: "0:name=systemd:/path", + want: map[string]string{"systemd": "/path"}, + }, + { + name: "empty", + }, + { + name: "multiple", + cgroups: "0:ctr0:/path0\n" + + "1:ctr1:/path1\n" + + "2::/empty\n", + want: map[string]string{ + "ctr0": "/path0", + "ctr1": "/path1", + }, + }, + { + name: "missing-field", + cgroups: "0:nopath\n", + err: "invalid cgroups file", + }, + { + name: "too-many-fields", + cgroups: "0:ctr:/path:extra\n", + err: "invalid cgroups file", + }, + { + name: "multiple-malformed", + cgroups: "0:ctr0:/path0\n" + + "1:ctr1:/path1\n" + + "2:\n", + err: "invalid cgroups file", + }, + } { + t.Run(tc.name, func(t *testing.T) { + r := strings.NewReader(tc.cgroups) + got, err := loadPathsHelper(r) + if len(tc.err) == 0 { + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + } else if !strings.Contains(err.Error(), tc.err) { + t.Fatalf("Wrong error message, want: *%s*, got: %v", tc.err, err) + } + for key, vWant := range tc.want { + vGot, ok := got[key] + if !ok { + t.Errorf("Missing controller %q", key) + } + if vWant != vGot { + t.Errorf("Wrong controller %q value, want: %q, got: %q", key, vWant, vGot) + } + delete(got, key) + } + for k, v := range got { + t.Errorf("Unexpected controller %q: %q", k, v) + } + }) + } +} diff --git a/runsc/cli/BUILD b/runsc/cli/BUILD new file mode 100644 index 000000000..32cce2a18 --- /dev/null +++ b/runsc/cli/BUILD @@ -0,0 +1,22 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "cli", + srcs = ["main.go"], + visibility = [ + "//:__pkg__", + "//runsc:__pkg__", + ], + deps = [ + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/platform", + "//runsc/cmd", + "//runsc/config", + "//runsc/flag", + "//runsc/specutils", + "@com_github_google_subcommands//:go_default_library", + ], +) diff --git a/runsc/cli/main.go b/runsc/cli/main.go new file mode 100644 index 000000000..bca015db5 --- /dev/null +++ b/runsc/cli/main.go @@ -0,0 +1,256 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cli is the main entrypoint for runsc. +package cli + +import ( + "context" + "fmt" + "io" + "io/ioutil" + "os" + "os/signal" + "syscall" + "time" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/runsc/cmd" + "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" +) + +var ( + // Although these flags are not part of the OCI spec, they are used by + // Docker, and thus should not be changed. + // TODO(gvisor.dev/issue/193): support systemd cgroups + systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.") + showVersion = flag.Bool("version", false, "show version and exit.") + + // These flags are unique to runsc, and are used to configure parts of the + // system that are not covered by the runtime spec. + + // Debugging flags. + logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") + debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") + panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.") +) + +// Main is the main entrypoint. +func Main(version string) { + // Help and flags commands are generated automatically. + help := cmd.NewHelp(subcommands.DefaultCommander) + help.Register(new(cmd.Syscalls)) + subcommands.Register(help, "") + subcommands.Register(subcommands.FlagsCommand(), "") + + // Installation helpers. + const helperGroup = "helpers" + subcommands.Register(new(cmd.Install), helperGroup) + subcommands.Register(new(cmd.Uninstall), helperGroup) + + // Register user-facing runsc commands. + subcommands.Register(new(cmd.Checkpoint), "") + subcommands.Register(new(cmd.Create), "") + subcommands.Register(new(cmd.Delete), "") + subcommands.Register(new(cmd.Do), "") + subcommands.Register(new(cmd.Events), "") + subcommands.Register(new(cmd.Exec), "") + subcommands.Register(new(cmd.Gofer), "") + subcommands.Register(new(cmd.Kill), "") + subcommands.Register(new(cmd.List), "") + subcommands.Register(new(cmd.Pause), "") + subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Restore), "") + subcommands.Register(new(cmd.Resume), "") + subcommands.Register(new(cmd.Run), "") + subcommands.Register(new(cmd.Spec), "") + subcommands.Register(new(cmd.State), "") + subcommands.Register(new(cmd.Start), "") + subcommands.Register(new(cmd.Wait), "") + + // Register internal commands with the internal group name. This causes + // them to be sorted below the user-facing commands with empty group. + // The string below will be printed above the commands. + const internalGroup = "internal use only" + subcommands.Register(new(cmd.Boot), internalGroup) + subcommands.Register(new(cmd.Debug), internalGroup) + subcommands.Register(new(cmd.Gofer), internalGroup) + subcommands.Register(new(cmd.Statefile), internalGroup) + + config.RegisterFlags() + + // All subcommands must be registered before flag parsing. + flag.Parse() + + // Are we showing the version? + if *showVersion { + // The format here is the same as runc. + fmt.Fprintf(os.Stdout, "runsc version %s\n", version) + fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version) + os.Exit(0) + } + + // Create a new Config from the flags. + conf, err := config.NewFromFlags() + if err != nil { + cmd.Fatalf(err.Error()) + } + + // TODO(gvisor.dev/issue/193): support systemd cgroups + if *systemdCgroup { + fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193") + os.Exit(1) + } + + var errorLogger io.Writer + if *logFD > -1 { + errorLogger = os.NewFile(uintptr(*logFD), "error log file") + + } else if conf.LogFilename != "" { + // We must set O_APPEND and not O_TRUNC because Docker passes + // the same log file for all commands (and also parses these + // log files), so we can't destroy them on each command. + var err error + errorLogger, err = os.OpenFile(conf.LogFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", conf.LogFilename, err) + } + } + cmd.ErrorLogger = errorLogger + + if _, err := platform.Lookup(conf.Platform); err != nil { + cmd.Fatalf("%v", err) + } + + // Sets the reference leak check mode. Also set it in config below to + // propagate it to child processes. + refs.SetLeakMode(conf.ReferenceLeak) + + // Set up logging. + if conf.Debug { + log.SetLevel(log.Debug) + } + + // Logging will include the local date and time via the time package. + // + // On first use, time.Local initializes the local time zone, which + // involves opening tzdata files on the host. Since this requires + // opening host files, it must be done before syscall filter + // installation. + // + // Generally there will be a log message before filter installation + // that will force initialization, but force initialization here in + // case that does not occur. + _ = time.Local.String() + + subcommand := flag.CommandLine.Arg(0) + + var e log.Emitter + if *debugLogFD > -1 { + f := os.NewFile(uintptr(*debugLogFD), "debug log file") + + e = newEmitter(conf.DebugLogFormat, f) + + } else if conf.DebugLog != "" { + f, err := specutils.DebugLogFile(conf.DebugLog, subcommand, "" /* name */) + if err != nil { + cmd.Fatalf("error opening debug log file in %q: %v", conf.DebugLog, err) + } + e = newEmitter(conf.DebugLogFormat, f) + + } else { + // Stderr is reserved for the application, just discard the logs if no debug + // log is specified. + e = newEmitter("text", ioutil.Discard) + } + + if *panicLogFD > -1 || *debugLogFD > -1 { + fd := *panicLogFD + if fd < 0 { + fd = *debugLogFD + } + // Quick sanity check to make sure no other commands get passed + // a log fd (they should use log dir instead). + if subcommand != "boot" && subcommand != "gofer" { + cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand) + } + + // If we are the boot process, then we own our stdio FDs and can do what we + // want with them. Since Docker and Containerd both eat boot's stderr, we + // dup our stderr to the provided log FD so that panics will appear in the + // logs, rather than just disappear. + if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil { + cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err) + } + } else if conf.AlsoLogToStderr { + e = &log.MultiEmitter{e, newEmitter(conf.DebugLogFormat, os.Stderr)} + } + + log.SetTarget(e) + + log.Infof("***************************") + log.Infof("Args: %s", os.Args) + log.Infof("Version %s", version) + log.Infof("PID: %d", os.Getpid()) + log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) + log.Infof("Configuration:") + log.Infof("\t\tRootDir: %s", conf.RootDir) + log.Infof("\t\tPlatform: %v", conf.Platform) + log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay) + log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets) + log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls) + log.Infof("\t\tVFS2 enabled: %v", conf.VFS2) + log.Infof("***************************") + + if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + // SIGTERM is sent to all processes if a test exceeds its + // timeout and this case is handled by syscall_test_runner. + log.Warningf("Block the TERM signal. This is only safe in tests!") + signal.Ignore(syscall.SIGTERM) + } + + // Call the subcommand and pass in the configuration. + var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode == subcommands.ExitSuccess { + log.Infof("Exiting with status: %v", ws) + if ws.Signaled() { + // No good way to return it, emulate what the shell does. Maybe raise + // signal to self? + os.Exit(128 + int(ws.Signal())) + } + os.Exit(ws.ExitStatus()) + } + // Return an error that is unlikely to be used by the application. + log.Warningf("Failure to execute command, err: %v", subcmdCode) + os.Exit(128) +} + +func newEmitter(format string, logFile io.Writer) log.Emitter { + switch format { + case "text": + return log.GoogleEmitter{&log.Writer{Next: logFile}} + case "json": + return log.JSONEmitter{&log.Writer{Next: logFile}} + case "json-k8s": + return log.K8sJSONEmitter{&log.Writer{Next: logFile}} + } + cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format) + panic("unreachable") +} diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index af3538ef0..2556f6d9e 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -45,12 +45,13 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/platform", - "//pkg/state", + "//pkg/state/pretty", "//pkg/state/statefile", "//pkg/sync", "//pkg/unet", "//pkg/urpc", "//runsc/boot", + "//runsc/config", "//runsc/console", "//runsc/container", "//runsc/flag", @@ -58,7 +59,7 @@ go_library( "//runsc/fsgofer/filter", "//runsc/specutils", "@com_github_google_subcommands//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -84,12 +85,12 @@ go_test( "//pkg/sentry/kernel/auth", "//pkg/test/testutil", "//pkg/urpc", - "//runsc/boot", + "//runsc/config", "//runsc/container", "//runsc/specutils", - "@com_github_google_go-cmp//cmp:go_default_library", - "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", ], ) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 01204ab4d..2c92e3067 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) @@ -54,10 +55,6 @@ type Boot struct { // provided in that order. stdioFDs intFlags - // console is set to true if the sandbox should allow terminal ioctl(2) - // syscalls. - console bool - // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool @@ -115,7 +112,6 @@ func (b *Boot) SetFlags(f *flag.FlagSet) { f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") - f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") @@ -135,10 +131,10 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitUsageError } - // Ensure that if there is a panic, all goroutine stacks are printed. - debug.SetTraceback("system") + conf := args[0].(*config.Config) - conf := args[0].(*boot.Config) + // Set traceback level + debug.SetTraceback(conf.Traceback) if b.attached { // Ensure this process is killed after parent process terminates when @@ -172,7 +168,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Get the spec from the specFD. specFile := os.NewFile(uintptr(b.specFD), "spec file") defer specFile.Close() - spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile) + spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf) if err != nil { Fatalf("reading spec: %v", err) } @@ -229,7 +225,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) Device: os.NewFile(uintptr(b.deviceFD), "platform device"), GoferFDs: b.ioFDs.GetArray(), StdioFDs: b.stdioFDs.GetArray(), - Console: b.console, NumCPU: b.cpuNum, TotalMem: b.totalMem, UserLogFD: b.userLogFD, diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index a84067112..e13a94486 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -24,7 +24,7 @@ import ( "github.com/syndtr/gocapability/capability" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/test/testutil" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/specutils" ) @@ -88,7 +88,7 @@ func TestCapabilities(t *testing.T) { conf := testutil.TestConfig(t) // Use --network=host to make sandbox use spec's capabilities. - conf.Network = boot.NetworkHost + conf.Network = config.NetworkHost _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 8a29e521e..8fe0c427a 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -22,7 +22,7 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" @@ -72,7 +72,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) waitStatus := args[1].(*syscall.WaitStatus) cont, err := container.Load(conf.RootDir, id) @@ -118,7 +118,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("setting bundleDir") } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { Fatalf("reading spec: %v", err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 910e97577..e76f7ba1d 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -18,7 +18,7 @@ import ( "context" "github.com/google/subcommands" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" @@ -81,7 +81,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) if conf.Rootless { return Errorf("Rootless mode not supported with %q", c.Name()) @@ -91,7 +91,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} if bundleDir == "" { bundleDir = getwdOrDie() } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return Errorf("reading spec: %v", err) } diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index b5de2588b..132198222 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -25,27 +25,26 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Debug implements subcommands.Command for the "debug" command. type Debug struct { - pid int - stacks bool - signal int - profileHeap string - profileCPU string - profileGoroutine string - profileBlock string - profileMutex string - trace string - strace string - logLevel string - logPackets string - duration time.Duration - ps bool + pid int + stacks bool + signal int + profileHeap string + profileCPU string + profileBlock string + profileMutex string + trace string + strace string + logLevel string + logPackets string + duration time.Duration + ps bool } // Name implements subcommands.Command. @@ -69,7 +68,6 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") - f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.") f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.") f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.") f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles") @@ -84,7 +82,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { var c *container.Container - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) if d.pid == 0 { // No pid, container ID must have been provided. @@ -153,18 +151,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } log.Infof("Heap profile written to %q", d.profileHeap) } - if d.profileGoroutine != "" { - f, err := os.Create(d.profileGoroutine) - if err != nil { - return Errorf(err.Error()) - } - defer f.Close() - - if err := c.Sandbox.GoroutineProfile(f); err != nil { - return Errorf(err.Error()) - } - log.Infof("Goroutine profile written to %q", d.profileGoroutine) - } if d.profileBlock != "" { f, err := os.Create(d.profileBlock) if err != nil { diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go index 0e4863f50..4e49deff8 100644 --- a/runsc/cmd/delete.go +++ b/runsc/cmd/delete.go @@ -21,7 +21,7 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -59,14 +59,14 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} return subcommands.ExitUsageError } - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) if err := d.execute(f.Args(), conf); err != nil { Fatalf("%v", err) } return subcommands.ExitSuccess } -func (d *Delete) execute(ids []string, conf *boot.Config) error { +func (d *Delete) execute(ids []string, conf *config.Config) error { for _, id := range ids { c, err := container.Load(conf.RootDir, id) if err != nil { diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go index cb59516a3..e2d994a05 100644 --- a/runsc/cmd/delete_test.go +++ b/runsc/cmd/delete_test.go @@ -18,7 +18,7 @@ import ( "io/ioutil" "testing" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" ) func TestNotFound(t *testing.T) { @@ -27,7 +27,7 @@ func TestNotFound(t *testing.T) { if err != nil { t.Fatalf("error creating dir: %v", err) } - conf := &boot.Config{RootDir: dir} + conf := &config.Config{RootDir: dir} d := Delete{} if err := d.execute(ids, conf); err == nil { diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go index 7d1310c96..640de4c47 100644 --- a/runsc/cmd/do.go +++ b/runsc/cmd/do.go @@ -17,6 +17,7 @@ package cmd import ( "context" "encoding/json" + "errors" "fmt" "io/ioutil" "math/rand" @@ -30,12 +31,14 @@ import ( "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) +var errNoDefaultInterface = errors.New("no default interface found") + // Do implements subcommands.Command for the "do" command. It sets up a simple // sandbox and executes the command inside it. See Usage() for more details. type Do struct { @@ -82,7 +85,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su return subcommands.ExitUsageError } - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) waitStatus := args[1].(*syscall.WaitStatus) if conf.Rootless { @@ -125,27 +128,29 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su specutils.LogSpec(spec) cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) - if conf.Network == boot.NetworkNone { - netns := specs.LinuxNamespace{ - Type: specs.NetworkNamespace, - } - if spec.Linux != nil { - panic("spec.Linux is not nil") - } - spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}} + if conf.Network == config.NetworkNone { + addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) } else if conf.Rootless { - if conf.Network == boot.NetworkSandbox { - c.notifyUser("*** Warning: using host network due to --rootless ***") - conf.Network = boot.NetworkHost + if conf.Network == config.NetworkSandbox { + c.notifyUser("*** Warning: sandbox network isn't supported with --rootless, switching to host ***") + conf.Network = config.NetworkHost } } else { - clean, err := c.setupNet(cid, spec) - if err != nil { + switch clean, err := c.setupNet(cid, spec); err { + case errNoDefaultInterface: + log.Warningf("Network interface not found, using internal network") + addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) + conf.Network = config.NetworkHost + + case nil: + // Setup successfull. + defer clean() + + default: return Errorf("Error setting up network: %v", err) } - defer clean() } out, err := json.Marshal(spec) @@ -199,6 +204,13 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su return subcommands.ExitSuccess } +func addNamespace(spec *specs.Spec, ns specs.LinuxNamespace) { + if spec.Linux == nil { + spec.Linux = &specs.Linux{} + } + spec.Linux.Namespaces = append(spec.Linux.Namespaces, ns) +} + func (c *Do) notifyUser(format string, v ...interface{}) { if !c.quiet { fmt.Printf(format+"\n", v...) @@ -219,10 +231,14 @@ func resolvePath(path string) (string, error) { return path, nil } +// setupNet setups up the sandbox network, including the creation of a network +// namespace, and iptable rules to redirect the traffic. Returns a cleanup +// function to tear down the network. Returns errNoDefaultInterface when there +// is no network interface available to setup the network. func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { dev, err := defaultDevice() if err != nil { - return nil, err + return nil, errNoDefaultInterface } peerIP, err := calculatePeerIP(c.ip) if err != nil { @@ -279,14 +295,11 @@ func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { return nil, err } - if spec.Linux == nil { - spec.Linux = &specs.Linux{} - } netns := specs.LinuxNamespace{ Type: specs.NetworkNamespace, Path: filepath.Join("/var/run/netns", cid), } - spec.Linux.Namespaces = append(spec.Linux.Namespaces, netns) + addNamespace(spec, netns) return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil } diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go index 51f6a98ed..25fe2cf1c 100644 --- a/runsc/cmd/events.go +++ b/runsc/cmd/events.go @@ -22,7 +22,7 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -72,7 +72,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, id) if err != nil { diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index d9a94903e..775ed4b43 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -33,7 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/urpc" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" @@ -105,7 +105,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { // Execute implements subcommands.Command.Execute. It starts a process in an // already created container. func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) e, id, err := ex.parseArgs(f, conf.EnableRaw) if err != nil { Fatalf("parsing process spec: %v", err) @@ -220,7 +220,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi cmd.Stderr = os.Stderr // If the console control socket file is provided, then create a new - // pty master/slave pair and set the TTY on the sandbox process. + // pty master/replica pair and set the TTY on the sandbox process. if ex.consoleSocket != "" { // Create a new TTY pair and send the master on the provided socket. tty, err := console.NewWithSocket(ex.consoleSocket) @@ -229,7 +229,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi } defer tty.Close() - // Set stdio to the new TTY slave. + // Set stdio to the new TTY replica. cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 10448a759..371fcc0ae 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -30,7 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/fsgofer" "gvisor.dev/gvisor/runsc/fsgofer/filter" @@ -62,9 +62,8 @@ type Gofer struct { applyCaps bool setUpRoot bool - panicOnWrite bool - specFD int - mountsFD int + specFD int + mountsFD int } // Name implements subcommands.Command. @@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") - f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") @@ -100,15 +98,15 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitUsageError } + conf := args[0].(*config.Config) + specFile := os.NewFile(uintptr(g.specFD), "spec file") defer specFile.Close() - spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile) + spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf) if err != nil { Fatalf("reading spec: %v", err) } - conf := args[0].(*boot.Config) - if g.setUpRoot { if err := setupRootFS(spec, conf); err != nil { Fatalf("Error setting up root FS: %v", err) @@ -168,8 +166,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Start with root mount, then add any other additional mount as needed. ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{ - ROMount: spec.Root.Readonly || conf.Overlay, - PanicOnWrite: g.panicOnWrite, + ROMount: spec.Root.Readonly || conf.Overlay, }) if err != nil { Fatalf("creating attach point: %v", err) @@ -181,9 +178,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, m := range spec.Mounts { if specutils.Is9PMount(m) { cfg := fsgofer.Config{ - ROMount: isReadonlyMount(m.Options) || conf.Overlay, - PanicOnWrite: g.panicOnWrite, - HostUDS: conf.FSGoferHostUDS, + ROMount: isReadonlyMount(m.Options) || conf.Overlay, + HostUDS: conf.FSGoferHostUDS, } ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) if err != nil { @@ -263,7 +259,7 @@ func isReadonlyMount(opts []string) bool { return false } -func setupRootFS(spec *specs.Spec, conf *boot.Config) error { +func setupRootFS(spec *specs.Spec, conf *config.Config) error { // Convert all shared mounts into slaves to be sure that nothing will be // propagated outside of our namespace. if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { @@ -306,7 +302,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error { } // Replace the current spec, with the clean spec with symlinks resolved. - if err := setupMounts(spec.Mounts, root); err != nil { + if err := setupMounts(conf, spec.Mounts, root); err != nil { Fatalf("error setting up FS: %v", err) } @@ -316,13 +312,14 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error { if err != nil { return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) } + log.Infof("Create working directory %q if needed", spec.Process.Cwd) if err := os.MkdirAll(dst, 0755); err != nil { return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err) } } // Check if root needs to be remounted as readonly. - if spec.Root.Readonly { + if spec.Root.Readonly || conf.Overlay { // If root is a mount point but not read-only, we can change mount options // to make it read-only for extra safety. log.Infof("Remounting root as readonly: %q", root) @@ -346,7 +343,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error { // setupMounts binds mount all mounts specified in the spec in their correct // location inside root. It will resolve relative paths and symlinks. It also // creates directories as needed. -func setupMounts(mounts []specs.Mount, root string) error { +func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error { for _, m := range mounts { if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { continue @@ -358,6 +355,11 @@ func setupMounts(mounts []specs.Mount, root string) error { } flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND + if conf.Overlay { + // Force mount read-only if writes are not going to be sent to it. + flags |= syscall.MS_RDONLY + } + log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil { return fmt.Errorf("mounting %v: %v", m, err) @@ -380,7 +382,7 @@ func setupMounts(mounts []specs.Mount, root string) error { // Otherwise, it may follow symlinks to locations that would be overwritten // with another mount point and return the wrong location. In short, make sure // setupMounts() has been called before. -func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) { +func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) { cleanMounts := make([]specs.Mount, 0, len(mounts)) for _, m := range mounts { if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { @@ -462,7 +464,7 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro } // adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs. -func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) { +func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) { rv := make([]string, len(opts)) copy(rv, opts) diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go index 8282ea0e0..04eee99b2 100644 --- a/runsc/cmd/kill.go +++ b/runsc/cmd/kill.go @@ -23,7 +23,7 @@ import ( "github.com/google/subcommands" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -63,7 +63,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) if k.pid != 0 && k.all { Fatalf("it is invalid to specify both --all and --pid") diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go index d8d906fe3..f92d6fef9 100644 --- a/runsc/cmd/list.go +++ b/runsc/cmd/list.go @@ -24,7 +24,7 @@ import ( "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -63,7 +63,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitUsageError } - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) ids, err := container.List(conf.RootDir) if err != nil { Fatalf("%v", err) diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go index 6f95a9837..0eb1402ed 100644 --- a/runsc/cmd/pause.go +++ b/runsc/cmd/pause.go @@ -18,7 +18,7 @@ import ( "context" "github.com/google/subcommands" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -53,7 +53,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) cont, err := container.Load(conf.RootDir, id) if err != nil { diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go index 7fb8041af..bc58c928f 100644 --- a/runsc/cmd/ps.go +++ b/runsc/cmd/ps.go @@ -20,7 +20,7 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -58,7 +58,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, id) if err != nil { diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index 72584b326..096ec814c 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -20,7 +20,7 @@ import ( "syscall" "github.com/google/subcommands" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" @@ -77,7 +77,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) waitStatus := args[1].(*syscall.WaitStatus) if conf.Rootless { @@ -88,7 +88,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ if bundleDir == "" { bundleDir = getwdOrDie() } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return Errorf("reading spec: %v", err) } diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go index 61a55a554..f24823f99 100644 --- a/runsc/cmd/resume.go +++ b/runsc/cmd/resume.go @@ -18,7 +18,7 @@ import ( "context" "github.com/google/subcommands" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -54,7 +54,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) cont, err := container.Load(conf.RootDir, id) if err != nil { diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index cf41581ad..c48cbe4cd 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -19,7 +19,7 @@ import ( "syscall" "github.com/google/subcommands" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" @@ -64,7 +64,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) waitStatus := args[1].(*syscall.WaitStatus) if conf.Rootless { @@ -75,7 +75,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s if bundleDir == "" { bundleDir = getwdOrDie() } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return Errorf("reading spec: %v", err) } diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go index a2b0a4b14..55194e641 100644 --- a/runsc/cmd/spec.go +++ b/runsc/cmd/spec.go @@ -16,124 +16,122 @@ package cmd import ( "context" - "fmt" - "io/ioutil" + "encoding/json" + "io" "os" "path/filepath" "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/flag" ) -func genSpec(cwd string) []byte { - var template = fmt.Sprintf(`{ - "ociVersion": "1.0.0", - "process": { - "terminal": true, - "user": { - "uid": 0, - "gid": 0 - }, - "args": [ - "sh" - ], - "env": [ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm" - ], - "cwd": "%s", - "capabilities": { - "bounding": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "effective": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "inheritable": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "permitted": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ], - "ambient": [ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE" - ] - }, - "rlimits": [ - { - "type": "RLIMIT_NOFILE", - "hard": 1024, - "soft": 1024 - } - ] - }, - "root": { - "path": "rootfs", - "readonly": true - }, - "hostname": "runsc", - "mounts": [ - { - "destination": "/proc", - "type": "proc", - "source": "proc" +func writeSpec(w io.Writer, cwd string, netns string, args []string) error { + spec := &specs.Spec{ + Version: "1.0.0", + Process: &specs.Process{ + Terminal: true, + User: specs.User{ + UID: 0, + GID: 0, + }, + Args: args, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: cwd, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + // TODO(gvisor.dev/issue/3166): support ambient capabilities + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: 1024, + Soft: 1024, + }, + }, }, - { - "destination": "/dev", - "type": "tmpfs", - "source": "tmpfs", - "options": [] + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, }, - { - "destination": "/sys", - "type": "sysfs", - "source": "sysfs", - "options": [ - "nosuid", - "noexec", - "nodev", - "ro" - ] - } - ], - "linux": { - "namespaces": [ + Hostname: "runsc", + Mounts: []specs.Mount{ { - "type": "pid" + Destination: "/proc", + Type: "proc", + Source: "proc", }, { - "type": "network" + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", }, { - "type": "ipc" + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{ + "nosuid", + "noexec", + "nodev", + "ro", + }, }, - { - "type": "uts" + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + Path: netns, + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, }, - { - "type": "mount" - } - ] + }, } -}`, cwd) - return []byte(template) + e := json.NewEncoder(w) + e.SetIndent("", " ") + return e.Encode(spec) } // Spec implements subcommands.Command for the "spec" command. type Spec struct { bundle string cwd string + netns string } // Name implements subcommands.Command.Name. @@ -148,21 +146,26 @@ func (*Spec) Synopsis() string { // Usage implements subcommands.Command.Usage. func (*Spec) Usage() string { - return `spec [options] - create a new OCI bundle specification file. + return `spec [options] [-- args...] - create a new OCI bundle specification file. + +The spec command creates a new specification file (config.json) for a new OCI +bundle. -The spec command creates a new specification file (config.json) for a new OCI bundle. +The specification file is a starter file that runs the command specified by +'args' in the container. If 'args' is not specified the default is to run the +'sh' program. -The specification file is a starter file that runs the "sh" command in the container. You -should edit the file to suit your needs. You can find out more about the format of the -specification file by visiting the OCI runtime spec repository: +While a number of flags are provided to change values in the specification, you +can examine the file and edit it to suit your needs after this command runs. +You can find out more about the format of the specification file by visiting +the OCI runtime spec repository: https://github.com/opencontainers/runtime-spec/ EXAMPLE: $ mkdir -p bundle/rootfs $ cd bundle - $ runsc spec + $ runsc spec -- /hello $ docker export $(docker create hello-world) | tar -xf - -C rootfs - $ sed -i 's;"sh";"/hello";' config.json $ sudo runsc run hello ` @@ -173,18 +176,29 @@ func (s *Spec) SetFlags(f *flag.FlagSet) { f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle") f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+ "this value MUST be an absolute path") + f.StringVar(&s.netns, "netns", "", "network namespace path") } // Execute implements subcommands.Command.Execute. func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + // Grab the arguments. + containerArgs := f.Args() + if len(containerArgs) == 0 { + containerArgs = []string{"sh"} + } + confPath := filepath.Join(s.bundle, "config.json") if _, err := os.Stat(confPath); !os.IsNotExist(err) { Fatalf("file %q already exists", confPath) } - var spec = genSpec(s.cwd) + configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664) + if err != nil { + Fatalf("opening file %q: %v", confPath, err) + } - if err := ioutil.WriteFile(confPath, spec, 0664); err != nil { + err = writeSpec(configFile, s.cwd, s.netns, containerArgs) + if err != nil { Fatalf("writing to %q: %v", confPath, err) } diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index 0205fd9f7..139edbd49 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -18,9 +18,10 @@ import ( "context" "github.com/google/subcommands" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" + "gvisor.dev/gvisor/runsc/specutils" ) // Start implements subcommands.Command for the "start" command. @@ -52,12 +53,18 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, id) if err != nil { Fatalf("loading container: %v", err) } + // Read the spec again here to ensure flag annotations from the spec are + // applied to "conf". + if _, err := specutils.ReadSpec(c.BundleDir, conf); err != nil { + Fatalf("reading spec: %v", err) + } + if err := c.Start(conf); err != nil { Fatalf("starting container: %v", err) } diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go index cf2413deb..2bd2ab9f8 100644 --- a/runsc/cmd/state.go +++ b/runsc/cmd/state.go @@ -21,7 +21,7 @@ import ( "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -55,7 +55,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, id) if err != nil { diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go index e6f1907da..daed9e728 100644 --- a/runsc/cmd/statefile.go +++ b/runsc/cmd/statefile.go @@ -20,7 +20,7 @@ import ( "os" "github.com/google/subcommands" - "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/pretty" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/runsc/flag" ) @@ -105,8 +105,14 @@ func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interfac if err != nil { Fatalf("error parsing statefile: %v", err) } - if err := state.PrettyPrint(output, rc, s.html); err != nil { - Fatalf("error printing state: %v", err) + if s.html { + if err := pretty.PrintHTML(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } + } else { + if err := pretty.PrintText(output, rc); err != nil { + Fatalf("error printing state: %v", err) + } } return subcommands.ExitSuccess } diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index 29c0a15f0..28d0642ed 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -21,7 +21,7 @@ import ( "syscall" "github.com/google/subcommands" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) @@ -70,7 +70,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } id := f.Arg(0) - conf := args[0].(*boot.Config) + conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, id) if err != nil { diff --git a/runsc/config/BUILD b/runsc/config/BUILD new file mode 100644 index 000000000..b1672bb9d --- /dev/null +++ b/runsc/config/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "config", + srcs = [ + "config.go", + "flags.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/refs", + "//pkg/sentry/watchdog", + "//pkg/sync", + "//runsc/flag", + ], +) + +go_test( + name = "config_test", + size = "small", + srcs = [ + "config_test.go", + ], + library = ":config", + deps = ["//runsc/flag"], +) diff --git a/runsc/boot/config.go b/runsc/config/config.go index bcec7e4db..b02d8e2e1 100644 --- a/runsc/boot/config.go +++ b/runsc/config/config.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,214 +12,115 @@ // See the License for the specific language governing permissions and // limitations under the License. -package boot +// Package config provides basic infrastructure to set configuration settings +// for runsc. The configuration is set by flags to the command line. They can +// also propagate to a different process using the same flags. +package config import ( "fmt" - "strconv" - "strings" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/watchdog" ) -// FileAccessType tells how the filesystem is accessed. -type FileAccessType int - -const ( - // FileAccessShared sends IO requests to a Gofer process that validates the - // requests and forwards them to the host. - FileAccessShared FileAccessType = iota - - // FileAccessExclusive is the same as FileAccessShared, but enables - // extra caching for improved performance. It should only be used if - // the sandbox has exclusive access to the filesystem. - FileAccessExclusive -) - -// MakeFileAccessType converts type from string. -func MakeFileAccessType(s string) (FileAccessType, error) { - switch s { - case "shared": - return FileAccessShared, nil - case "exclusive": - return FileAccessExclusive, nil - default: - return 0, fmt.Errorf("invalid file access type %q", s) - } -} - -func (f FileAccessType) String() string { - switch f { - case FileAccessShared: - return "shared" - case FileAccessExclusive: - return "exclusive" - default: - return fmt.Sprintf("unknown(%d)", f) - } -} - -// NetworkType tells which network stack to use. -type NetworkType int - -const ( - // NetworkSandbox uses internal network stack, isolated from the host. - NetworkSandbox NetworkType = iota - - // NetworkHost redirects network related syscalls to the host network. - NetworkHost - - // NetworkNone sets up just loopback using netstack. - NetworkNone -) - -// MakeNetworkType converts type from string. -func MakeNetworkType(s string) (NetworkType, error) { - switch s { - case "sandbox": - return NetworkSandbox, nil - case "host": - return NetworkHost, nil - case "none": - return NetworkNone, nil - default: - return 0, fmt.Errorf("invalid network type %q", s) - } -} - -func (n NetworkType) String() string { - switch n { - case NetworkSandbox: - return "sandbox" - case NetworkHost: - return "host" - case NetworkNone: - return "none" - default: - return fmt.Sprintf("unknown(%d)", n) - } -} - -// MakeWatchdogAction converts type from string. -func MakeWatchdogAction(s string) (watchdog.Action, error) { - switch strings.ToLower(s) { - case "log", "logwarning": - return watchdog.LogWarning, nil - case "panic": - return watchdog.Panic, nil - default: - return 0, fmt.Errorf("invalid watchdog action %q", s) - } -} - -// MakeRefsLeakMode converts type from string. -func MakeRefsLeakMode(s string) (refs.LeakMode, error) { - switch strings.ToLower(s) { - case "disabled": - return refs.NoLeakChecking, nil - case "log-names": - return refs.LeaksLogWarning, nil - case "log-traces": - return refs.LeaksLogTraces, nil - default: - return 0, fmt.Errorf("invalid refs leakmode %q", s) - } -} - -func refsLeakModeToString(mode refs.LeakMode) string { - switch mode { - // If not set, default it to disabled. - case refs.UninitializedLeakChecking, refs.NoLeakChecking: - return "disabled" - case refs.LeaksLogWarning: - return "log-names" - case refs.LeaksLogTraces: - return "log-traces" - default: - panic(fmt.Sprintf("Invalid leakmode: %d", mode)) - } -} - // Config holds configuration that is not part of the runtime spec. +// +// Follow these steps to add a new flag: +// 1. Create a new field in Config. +// 2. Add a field tag with the flag name +// 3. Register a new flag in flags.go, with name and description +// 4. Add any necessary validation into validate() +// 5. If adding an enum, follow the same pattern as FileAccessType +// type Config struct { // RootDir is the runtime root directory. - RootDir string + RootDir string `flag:"root"` + + // Traceback changes the Go runtime's traceback level. + Traceback string `flag:"traceback"` // Debug indicates that debug logging should be enabled. - Debug bool + Debug bool `flag:"debug"` // LogFilename is the filename to log to, if not empty. - LogFilename string + LogFilename string `flag:"log"` // LogFormat is the log format. - LogFormat string + LogFormat string `flag:"log-format"` // DebugLog is the path to log debug information to, if not empty. - DebugLog string + DebugLog string `flag:"debug-log"` // PanicLog is the path to log GO's runtime messages, if not empty. - PanicLog string + PanicLog string `flag:"panic-log"` // DebugLogFormat is the log format for debug. - DebugLogFormat string + DebugLogFormat string `flag:"debug-log-format"` // FileAccess indicates how the filesystem is accessed. - FileAccess FileAccessType + FileAccess FileAccessType `flag:"file-access"` // Overlay is whether to wrap the root filesystem in an overlay. - Overlay bool + Overlay bool `flag:"overlay"` // FSGoferHostUDS enables the gofer to mount a host UDS. - FSGoferHostUDS bool + FSGoferHostUDS bool `flag:"fsgofer-host-uds"` // Network indicates what type of network to use. - Network NetworkType + Network NetworkType `flag:"network"` // EnableRaw indicates whether raw sockets should be enabled. Raw // sockets are disabled by stripping CAP_NET_RAW from the list of // capabilities. - EnableRaw bool + EnableRaw bool `flag:"net-raw"` // HardwareGSO indicates that hardware segmentation offload is enabled. - HardwareGSO bool + HardwareGSO bool `flag:"gso"` // SoftwareGSO indicates that software segmentation offload is enabled. - SoftwareGSO bool + SoftwareGSO bool `flag:"software-gso"` + + // TXChecksumOffload indicates that TX Checksum Offload is enabled. + TXChecksumOffload bool `flag:"tx-checksum-offload"` + + // RXChecksumOffload indicates that RX Checksum Offload is enabled. + RXChecksumOffload bool `flag:"rx-checksum-offload"` // QDisc indicates the type of queuening discipline to use by default // for non-loopback interfaces. - QDisc QueueingDiscipline + QDisc QueueingDiscipline `flag:"qdisc"` // LogPackets indicates that all network packets should be logged. - LogPackets bool + LogPackets bool `flag:"log-packets"` // Platform is the platform to run on. - Platform string + Platform string `flag:"platform"` // Strace indicates that strace should be enabled. - Strace bool + Strace bool `flag:"strace"` - // StraceSyscalls is the set of syscalls to trace. If StraceEnable is - // true and this list is empty, then all syscalls will be traced. - StraceSyscalls []string + // StraceSyscalls is the set of syscalls to trace (comma-separated values). + // If StraceEnable is true and this string is empty, then all syscalls will + // be traced. + StraceSyscalls string `flag:"strace-syscalls"` // StraceLogSize is the max size of data blobs to display. - StraceLogSize uint + StraceLogSize uint `flag:"strace-log-size"` // DisableSeccomp indicates whether seccomp syscall filters should be // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool // WatchdogAction sets what action the watchdog takes when triggered. - WatchdogAction watchdog.Action + WatchdogAction watchdog.Action `flag:"watchdog-action"` // PanicSignal registers signal handling that panics. Usually set to // SIGUSR2(12) to troubleshoot hangs. -1 disables it. - PanicSignal int + PanicSignal int `flag:"panic-signal"` // ProfileEnable is set to prepare the sandbox to be profiled. - ProfileEnable bool + ProfileEnable bool `flag:"profile"` // RestoreFile is the path to the saved container image RestoreFile string @@ -227,95 +128,215 @@ type Config struct { // NumNetworkChannels controls the number of AF_PACKET sockets that map // to the same underlying network device. This allows netstack to better // scale for high throughput use cases. - NumNetworkChannels int + NumNetworkChannels int `flag:"num-network-channels"` // Rootless allows the sandbox to be started with a user that is not root. // Defense is depth measures are weaker with rootless. Specifically, the // sandbox and Gofer process run as root inside a user namespace with root // mapped to the caller's user. - Rootless bool + Rootless bool `flag:"rootless"` // AlsoLogToStderr allows to send log messages to stderr. - AlsoLogToStderr bool + AlsoLogToStderr bool `flag:"alsologtostderr"` // ReferenceLeakMode sets reference leak check mode - ReferenceLeakMode refs.LeakMode + ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"` // OverlayfsStaleRead instructs the sandbox to assume that the root mount // is on a Linux overlayfs mount, which does not necessarily preserve // coherence between read-only and subsequent writable file descriptors // representing the "same" file. - OverlayfsStaleRead bool + OverlayfsStaleRead bool `flag:"overlayfs-stale-read"` + + // CPUNumFromQuota sets CPU number count to available CPU quota, using + // least integer value greater than or equal to quota. + // + // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. + CPUNumFromQuota bool `flag:"cpu-num-from-quota"` + + // Enables VFS2. + VFS2 bool `flag:"vfs2"` + + // Enables FUSE usage. + FUSE bool `flag:"fuse"` + + // Allows overriding of flags in OCI annotations. + AllowFlagOverride bool `flag:"allow-flag-override"` + + // Enables seccomp inside the sandbox. + OCISeccomp bool `flag:"oci-seccomp"` // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be // necessary in test environments that have limited capabilities. - TestOnlyAllowRunAsCurrentUserWithoutChroot bool + TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"` // TestOnlyTestNameEnv should only be used in tests. It looks up for the // test name in the container environment variables and adds it to the debug // log file name. This is done to help identify the log with the test when // multiple tests are run in parallel, since there is no way to pass // parameters to the runtime from docker. - TestOnlyTestNameEnv string + TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"` +} - // CPUNumFromQuota sets CPU number count to available CPU quota, using - // least integer value greater than or equal to quota. - // - // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. - CPUNumFromQuota bool +func (c *Config) validate() error { + if c.FileAccess == FileAccessShared && c.Overlay { + return fmt.Errorf("overlay flag is incompatible with shared file access") + } + if c.NumNetworkChannels <= 0 { + return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels) + } + return nil +} + +// FileAccessType tells how the filesystem is accessed. +type FileAccessType int - // Enables VFS2 (not plumbled through yet). - VFS2 bool +const ( + // FileAccessExclusive is the same as FileAccessShared, but enables + // extra caching for improved performance. It should only be used if + // the sandbox has exclusive access to the filesystem. + FileAccessExclusive FileAccessType = iota + + // FileAccessShared sends IO requests to a Gofer process that validates the + // requests and forwards them to the host. + FileAccessShared +) + +func fileAccessTypePtr(v FileAccessType) *FileAccessType { + return &v } -// ToFlags returns a slice of flags that correspond to the given Config. -func (c *Config) ToFlags() []string { - f := []string{ - "--root=" + c.RootDir, - "--debug=" + strconv.FormatBool(c.Debug), - "--log=" + c.LogFilename, - "--log-format=" + c.LogFormat, - "--debug-log=" + c.DebugLog, - "--panic-log=" + c.PanicLog, - "--debug-log-format=" + c.DebugLogFormat, - "--file-access=" + c.FileAccess.String(), - "--overlay=" + strconv.FormatBool(c.Overlay), - "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), - "--network=" + c.Network.String(), - "--log-packets=" + strconv.FormatBool(c.LogPackets), - "--platform=" + c.Platform, - "--strace=" + strconv.FormatBool(c.Strace), - "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), - "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), - "--watchdog-action=" + c.WatchdogAction.String(), - "--panic-signal=" + strconv.Itoa(c.PanicSignal), - "--profile=" + strconv.FormatBool(c.ProfileEnable), - "--net-raw=" + strconv.FormatBool(c.EnableRaw), - "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), - "--rootless=" + strconv.FormatBool(c.Rootless), - "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), - "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), - "--gso=" + strconv.FormatBool(c.HardwareGSO), - "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), - "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), - "--qdisc=" + c.QDisc.String(), +// Set implements flag.Value. +func (f *FileAccessType) Set(v string) error { + switch v { + case "shared": + *f = FileAccessShared + case "exclusive": + *f = FileAccessExclusive + default: + return fmt.Errorf("invalid file access type %q", v) + } + return nil +} + +// Get implements flag.Value. +func (f *FileAccessType) Get() interface{} { + return *f +} + +// String implements flag.Value. +func (f *FileAccessType) String() string { + switch *f { + case FileAccessShared: + return "shared" + case FileAccessExclusive: + return "exclusive" } - if c.CPUNumFromQuota { - f = append(f, "--cpu-num-from-quota") + panic(fmt.Sprintf("Invalid file access type %v", *f)) +} + +// NetworkType tells which network stack to use. +type NetworkType int + +const ( + // NetworkSandbox uses internal network stack, isolated from the host. + NetworkSandbox NetworkType = iota + + // NetworkHost redirects network related syscalls to the host network. + NetworkHost + + // NetworkNone sets up just loopback using netstack. + NetworkNone +) + +func networkTypePtr(v NetworkType) *NetworkType { + return &v +} + +// Set implements flag.Value. +func (n *NetworkType) Set(v string) error { + switch v { + case "sandbox": + *n = NetworkSandbox + case "host": + *n = NetworkHost + case "none": + *n = NetworkNone + default: + return fmt.Errorf("invalid network type %q", v) } - // Only include these if set since it is never to be used by users. - if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { - f = append(f, "--TESTONLY-unsafe-nonroot=true") + return nil +} + +// Get implements flag.Value. +func (n *NetworkType) Get() interface{} { + return *n +} + +// String implements flag.Value. +func (n *NetworkType) String() string { + switch *n { + case NetworkSandbox: + return "sandbox" + case NetworkHost: + return "host" + case NetworkNone: + return "none" } - if len(c.TestOnlyTestNameEnv) != 0 { - f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) + panic(fmt.Sprintf("Invalid network type %v", *n)) +} + +// QueueingDiscipline is used to specify the kind of Queueing Discipline to +// apply for a give FDBasedLink. +type QueueingDiscipline int + +const ( + // QDiscNone disables any queueing for the underlying FD. + QDiscNone QueueingDiscipline = iota + + // QDiscFIFO applies a simple fifo based queue to the underlying FD. + QDiscFIFO +) + +func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline { + return &v +} + +// Set implements flag.Value. +func (q *QueueingDiscipline) Set(v string) error { + switch v { + case "none": + *q = QDiscNone + case "fifo": + *q = QDiscFIFO + default: + return fmt.Errorf("invalid qdisc %q", v) } + return nil +} + +// Get implements flag.Value. +func (q *QueueingDiscipline) Get() interface{} { + return *q +} - if c.VFS2 { - f = append(f, "--vfs2=true") +// String implements flag.Value. +func (q *QueueingDiscipline) String() string { + switch *q { + case QDiscNone: + return "none" + case QDiscFIFO: + return "fifo" } + panic(fmt.Sprintf("Invalid qdisc %v", *q)) +} + +func leakModePtr(v refs.LeakMode) *refs.LeakMode { + return &v +} - return f +func watchdogActionPtr(v watchdog.Action) *watchdog.Action { + return &v } diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go new file mode 100644 index 000000000..fb162b7eb --- /dev/null +++ b/runsc/config/config_test.go @@ -0,0 +1,272 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package config + +import ( + "strings" + "testing" + + "gvisor.dev/gvisor/runsc/flag" +) + +func init() { + RegisterFlags() +} + +func TestDefault(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + // "--root" is always set to something different than the default. Reset it + // to make it easier to test that default values do not generate flags. + c.RootDir = "" + + // All defaults doesn't require setting flags. + flags := c.ToFlags() + if len(flags) > 0 { + t.Errorf("default flags not set correctly for: %s", flags) + } +} + +func setDefault(name string) { + fl := flag.CommandLine.Lookup(name) + fl.Value.Set(fl.DefValue) +} + +func TestFromFlags(t *testing.T) { + flag.CommandLine.Lookup("root").Value.Set("some-path") + flag.CommandLine.Lookup("debug").Value.Set("true") + flag.CommandLine.Lookup("num-network-channels").Value.Set("123") + flag.CommandLine.Lookup("network").Value.Set("none") + defer func() { + setDefault("root") + setDefault("debug") + setDefault("num-network-channels") + setDefault("network") + }() + + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + if want := "some-path"; c.RootDir != want { + t.Errorf("RootDir=%v, want: %v", c.RootDir, want) + } + if want := true; c.Debug != want { + t.Errorf("Debug=%v, want: %v", c.Debug, want) + } + if want := 123; c.NumNetworkChannels != want { + t.Errorf("NumNetworkChannels=%v, want: %v", c.NumNetworkChannels, want) + } + if want := NetworkNone; c.Network != want { + t.Errorf("Network=%v, want: %v", c.Network, want) + } +} + +func TestToFlags(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + c.RootDir = "some-path" + c.Debug = true + c.NumNetworkChannels = 123 + c.Network = NetworkNone + + flags := c.ToFlags() + if len(flags) != 4 { + t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags) + } + t.Logf("Flags: %s", flags) + fm := map[string]string{} + for _, f := range flags { + kv := strings.Split(f, "=") + fm[kv[0]] = kv[1] + } + for name, want := range map[string]string{ + "--root": "some-path", + "--debug": "true", + "--num-network-channels": "123", + "--network": "none", + } { + if got, ok := fm[name]; ok { + if got != want { + t.Errorf("flag %q, want: %q, got: %q", name, want, got) + } + } else { + t.Errorf("flag %q not set", name) + } + } +} + +// TestInvalidFlags checks that enum flags fail when value is not in enum set. +func TestInvalidFlags(t *testing.T) { + for _, tc := range []struct { + name string + error string + }{ + { + name: "file-access", + error: "invalid file access type", + }, + { + name: "network", + error: "invalid network type", + }, + { + name: "qdisc", + error: "invalid qdisc", + }, + { + name: "watchdog-action", + error: "invalid watchdog action", + }, + { + name: "ref-leak-mode", + error: "invalid ref leak mode", + }, + } { + t.Run(tc.name, func(t *testing.T) { + defer setDefault(tc.name) + if err := flag.CommandLine.Lookup(tc.name).Value.Set("invalid"); err == nil || !strings.Contains(err.Error(), tc.error) { + t.Errorf("flag.Value.Set(invalid) wrong error reported: %v", err) + } + }) + } +} + +func TestValidationFail(t *testing.T) { + for _, tc := range []struct { + name string + flags map[string]string + error string + }{ + { + name: "shared+overlay", + flags: map[string]string{ + "file-access": "shared", + "overlay": "true", + }, + error: "overlay flag is incompatible", + }, + { + name: "network-channels", + flags: map[string]string{ + "num-network-channels": "-1", + }, + error: "num_network_channels must be > 0", + }, + } { + t.Run(tc.name, func(t *testing.T) { + for name, val := range tc.flags { + defer setDefault(name) + if err := flag.CommandLine.Lookup(name).Value.Set(val); err != nil { + t.Errorf("%s=%q: %v", name, val, err) + } + } + if _, err := NewFromFlags(); err == nil || !strings.Contains(err.Error(), tc.error) { + t.Errorf("NewFromFlags() wrong error reported: %v", err) + } + }) + } +} + +func TestOverride(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + c.AllowFlagOverride = true + + t.Run("string", func(t *testing.T) { + c.RootDir = "foobar" + if err := c.Override("root", "bar"); err != nil { + t.Fatalf("Override(root, bar) failed: %v", err) + } + defer setDefault("root") + if c.RootDir != "bar" { + t.Errorf("Override(root, bar) didn't work: %+v", c) + } + }) + + t.Run("bool", func(t *testing.T) { + c.Debug = true + if err := c.Override("debug", "false"); err != nil { + t.Fatalf("Override(debug, false) failed: %v", err) + } + defer setDefault("debug") + if c.Debug { + t.Errorf("Override(debug, false) didn't work: %+v", c) + } + }) + + t.Run("enum", func(t *testing.T) { + c.FileAccess = FileAccessShared + if err := c.Override("file-access", "exclusive"); err != nil { + t.Fatalf("Override(file-access, exclusive) failed: %v", err) + } + defer setDefault("file-access") + if c.FileAccess != FileAccessExclusive { + t.Errorf("Override(file-access, exclusive) didn't work: %+v", c) + } + }) +} + +func TestOverrideDisabled(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + const errMsg = "flag override disabled" + if err := c.Override("root", "path"); err == nil || !strings.Contains(err.Error(), errMsg) { + t.Errorf("Override() wrong error: %v", err) + } +} + +func TestOverrideError(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + c.AllowFlagOverride = true + for _, tc := range []struct { + name string + value string + error string + }{ + { + name: "invalid", + value: "valid", + error: `flag "invalid" not found`, + }, + { + name: "debug", + value: "invalid", + error: "error setting flag debug", + }, + { + name: "file-access", + value: "invalid", + error: "invalid file access type", + }, + } { + t.Run(tc.name, func(t *testing.T) { + if err := c.Override(tc.name, tc.value); err == nil || !strings.Contains(err.Error(), tc.error) { + t.Errorf("Override(%q, %q) wrong error: %v", tc.name, tc.value, err) + } + }) + } +} diff --git a/runsc/config/flags.go b/runsc/config/flags.go new file mode 100644 index 000000000..d3203b565 --- /dev/null +++ b/runsc/config/flags.go @@ -0,0 +1,206 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package config + +import ( + "fmt" + "os" + "path/filepath" + "reflect" + "strconv" + + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/runsc/flag" +) + +var registration sync.Once + +// This is the set of flags used to populate Config. +func RegisterFlags() { + registration.Do(func() { + // Although these flags are not part of the OCI spec, they are used by + // Docker, and thus should not be changed. + flag.String("root", "", "root directory for storage of container state.") + flag.String("log", "", "file path where internal debug information is written, default is stdout.") + flag.String("log-format", "text", "log format: text (default), json, or json-k8s.") + flag.Bool("debug", false, "enable debug logging.") + + // These flags are unique to runsc, and are used to configure parts of the + // system that are not covered by the runtime spec. + + // Debugging flags. + flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") + flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.") + flag.Bool("log-packets", false, "enable network packet logging.") + flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") + flag.Bool("alsologtostderr", false, "send log messages to stderr.") + flag.Bool("allow-flag-override", false, "allow OCI annotations (dev.gvisor.flag.<name>) to override flags for debugging.") + flag.String("traceback", "system", "golang runtime's traceback level") + + // Debugging flags: strace related + flag.Bool("strace", false, "enable strace.") + flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") + flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.") + + // Flags that control sandbox runtime behavior. + flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") + flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.") + flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") + flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") + flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") + flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.") + flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") + flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.") + + // Flags that control sandbox runtime behavior: FS related. + flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") + flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem") + flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") + flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") + + // Flags that control sandbox runtime behavior: network related. + flag.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") + flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") + flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.") + flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.") + flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.") + flag.Var(queueingDisciplinePtr(QDiscFIFO), "qdisc", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.") + flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") + + // Test flags, not to be used outside tests, ever. + flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") + flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.") + }) +} + +// NewFromFlags creates a new Config with values coming from command line flags. +func NewFromFlags() (*Config, error) { + conf := &Config{} + + obj := reflect.ValueOf(conf).Elem() + st := obj.Type() + for i := 0; i < st.NumField(); i++ { + f := st.Field(i) + name, ok := f.Tag.Lookup("flag") + if !ok { + // No flag set for this field. + continue + } + fl := flag.CommandLine.Lookup(name) + if fl == nil { + panic(fmt.Sprintf("Flag %q not found", name)) + } + x := reflect.ValueOf(flag.Get(fl.Value)) + obj.Field(i).Set(x) + } + + if len(conf.RootDir) == 0 { + // If not set, set default root dir to something (hopefully) user-writeable. + conf.RootDir = "/var/run/runsc" + if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { + conf.RootDir = filepath.Join(runtimeDir, "runsc") + } + } + + if err := conf.validate(); err != nil { + return nil, err + } + return conf, nil +} + +// ToFlags returns a slice of flags that correspond to the given Config. +func (c *Config) ToFlags() []string { + var rv []string + + obj := reflect.ValueOf(c).Elem() + st := obj.Type() + for i := 0; i < st.NumField(); i++ { + f := st.Field(i) + name, ok := f.Tag.Lookup("flag") + if !ok { + // No flag set for this field. + continue + } + val := getVal(obj.Field(i)) + + flag := flag.CommandLine.Lookup(name) + if flag == nil { + panic(fmt.Sprintf("Flag %q not found", name)) + } + if val == flag.DefValue { + continue + } + rv = append(rv, fmt.Sprintf("--%s=%s", flag.Name, val)) + } + return rv +} + +// Override writes a new value to a flag. +func (c *Config) Override(name string, value string) error { + if !c.AllowFlagOverride { + return fmt.Errorf("flag override disabled, use --allow-flag-override to enable it") + } + + obj := reflect.ValueOf(c).Elem() + st := obj.Type() + for i := 0; i < st.NumField(); i++ { + f := st.Field(i) + fieldName, ok := f.Tag.Lookup("flag") + if !ok || fieldName != name { + // Not a flag field, or flag name doesn't match. + continue + } + fl := flag.CommandLine.Lookup(name) + if fl == nil { + // Flag must exist if there is a field match above. + panic(fmt.Sprintf("Flag %q not found", name)) + } + + // Use flag to convert the string value to the underlying flag type, using + // the same rules as the command-line for consistency. + if err := fl.Value.Set(value); err != nil { + return fmt.Errorf("error setting flag %s=%q: %w", name, value, err) + } + x := reflect.ValueOf(flag.Get(fl.Value)) + obj.Field(i).Set(x) + + // Validates the config again to ensure it's left in a consistent state. + return c.validate() + } + return fmt.Errorf("flag %q not found. Cannot set it to %q", name, value) +} + +func getVal(field reflect.Value) string { + if str, ok := field.Addr().Interface().(fmt.Stringer); ok { + return str.String() + } + switch field.Kind() { + case reflect.Bool: + return strconv.FormatBool(field.Bool()) + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + return strconv.FormatInt(field.Int(), 10) + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: + return strconv.FormatUint(field.Uint(), 10) + case reflect.String: + return field.String() + default: + panic("unknown type " + field.Kind().String()) + } +} diff --git a/runsc/console/console.go b/runsc/console/console.go index 64b23639a..dbb88e117 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -24,11 +24,11 @@ import ( "golang.org/x/sys/unix" ) -// NewWithSocket creates pty master/slave pair, sends the master FD over the given -// socket, and returns the slave. +// NewWithSocket creates pty master/replica pair, sends the master FD over the given +// socket, and returns the replica. func NewWithSocket(socketPath string) (*os.File, error) { - // Create a new pty master and slave. - ptyMaster, ptySlave, err := pty.Open() + // Create a new pty master and replica. + ptyMaster, ptyReplica, err := pty.Open() if err != nil { return nil, fmt.Errorf("opening pty: %v", err) } @@ -37,18 +37,18 @@ func NewWithSocket(socketPath string) (*os.File, error) { // Get a connection to the socket path. conn, err := net.Dial("unix", socketPath) if err != nil { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err) } defer conn.Close() uc, ok := conn.(*net.UnixConn) if !ok { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) } socket, err := uc.File() if err != nil { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err) } defer socket.Close() @@ -56,8 +56,8 @@ func NewWithSocket(socketPath string) (*os.File, error) { // Send the master FD over the connection. msg := unix.UnixRights(int(ptyMaster.Fd())) if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err) } - return ptySlave, nil + return ptyReplica, nil } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 49cfb0837..c33755482 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -23,11 +23,12 @@ go_library( "//pkg/sync", "//runsc/boot", "//runsc/cgroup", + "//runsc/config", "//runsc/sandbox", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_gofrs_flock//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) @@ -65,10 +66,11 @@ go_test( "//pkg/urpc", "//runsc/boot", "//runsc/boot/platforms", + "//runsc/config", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_kr_pty//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 3813c6b93..4228399b8 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -122,6 +122,7 @@ func TestConsoleSocket(t *testing.T) { for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { spec := testutil.NewSpecWithArgs("true") + spec.Process.Terminal = true _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) @@ -184,14 +185,14 @@ func TestJobControlSignalExec(t *testing.T) { t.Fatalf("error starting container: %v", err) } - // Create a pty master/slave. The slave will be passed to the exec + // Create a pty master/replica. The replica will be passed to the exec // process. - ptyMaster, ptySlave, err := pty.Open() + ptyMaster, ptyReplica, err := pty.Open() if err != nil { t.Fatalf("error opening pty: %v", err) } defer ptyMaster.Close() - defer ptySlave.Close() + defer ptyReplica.Close() // Exec bash and attach a terminal. Note that occasionally /bin/sh // may be a different shell or have a different configuration (such @@ -202,9 +203,9 @@ func TestJobControlSignalExec(t *testing.T) { // Don't let bash execute from profile or rc files, otherwise // our PID counts get messed up. Argv: []string{"/bin/bash", "--noprofile", "--norc"}, - // Pass the pty slave as FD 0, 1, and 2. + // Pass the pty replica as FD 0, 1, and 2. FilePayload: urpc.FilePayload{ - Files: []*os.File{ptySlave, ptySlave, ptySlave}, + Files: []*os.File{ptyReplica, ptyReplica, ptyReplica}, }, StdioIsPty: true, } diff --git a/runsc/container/container.go b/runsc/container/container.go index 6d297d0df..52e1755ce 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/sighandling" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/sandbox" "gvisor.dev/gvisor/runsc/specutils" ) @@ -269,7 +270,7 @@ type Args struct { // New creates the container in a new Sandbox process, unless the metadata // indicates that an existing Sandbox should be used. The caller must call // Destroy() on the container. -func New(conf *boot.Config, args Args) (*Container, error) { +func New(conf *config.Config, args Args) (*Container, error) { log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir) if err := validateID(args.ID); err != nil { return nil, err @@ -311,6 +312,14 @@ func New(conf *boot.Config, args Args) (*Container, error) { if isRoot(args.Spec) { log.Debugf("Creating new sandbox for container %q", args.ID) + if args.Spec.Linux == nil { + args.Spec.Linux = &specs.Linux{} + } + // Don't force the use of cgroups in tests because they lack permission to do so. + if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { + args.Spec.Linux.CgroupsPath = "/" + args.ID + } + // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all their children processes). cg, err := cgroup.New(args.Spec) @@ -320,11 +329,17 @@ func New(conf *boot.Config, args Args) (*Container, error) { if cg != nil { // If there is cgroup config, install it before creating sandbox process. if err := cg.Install(args.Spec.Linux.Resources); err != nil { - return nil, fmt.Errorf("configuring cgroup: %v", err) + switch { + case errors.Is(err, syscall.EACCES) && conf.Rootless: + log.Warningf("Skipping cgroup configuration in rootless mode: %v", err) + cg = nil + default: + return nil, fmt.Errorf("configuring cgroup: %v", err) + } } } if err := runInCgroup(cg, func() error { - ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir) + ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached) if err != nil { return err } @@ -397,7 +412,7 @@ func New(conf *boot.Config, args Args) (*Container, error) { } // Start starts running the containerized process inside the sandbox. -func (c *Container) Start(conf *boot.Config) error { +func (c *Container) Start(conf *config.Config) error { log.Debugf("Start container %q", c.ID) if err := c.Saver.lock(); err != nil { @@ -427,7 +442,7 @@ func (c *Container) Start(conf *boot.Config) error { // the start (and all their children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir) + ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) if err != nil { return err } @@ -472,7 +487,7 @@ func (c *Container) Start(conf *boot.Config) error { // Restore takes a container and replaces its kernel and file system // to restore a container from its state file. -func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error { +func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile string) error { log.Debugf("Restore container %q", c.ID) if err := c.Saver.lock(); err != nil { return err @@ -499,7 +514,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str } // Run is a helper that calls Create + Start + Wait. -func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) { +func Run(conf *config.Config, args Args) (syscall.WaitStatus, error) { log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir) c, err := New(conf, args) if err != nil { @@ -861,7 +876,7 @@ func (c *Container) waitForStopped() error { return backoff.Retry(op, b) } -func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) { +func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) { // Start with the general config flags. args := conf.ToFlags() @@ -901,9 +916,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund } args = append(args, "gofer", "--bundle", bundleDir) - if conf.Overlay { - args = append(args, "--panic-on-write=true") - } // Open the spec file to donate to the sandbox. specFile, err := specutils.OpenSpec(bundleDir) @@ -955,6 +967,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund cmd.ExtraFiles = goferEnds cmd.Args[0] = "runsc-gofer" + if attached { + // The gofer is attached to the lifetime of this process, so it + // should synchronously die when this process dies. + cmd.SysProcAttr = &syscall.SysProcAttr{ + Pdeathsig: syscall.SIGKILL, + } + } + // Enter new namespaces to isolate from the rest of the system. Don't unshare // cgroup because gofer is added to a cgroup in the caller's namespace. nss := []specs.LinuxNamespace{ @@ -979,7 +999,7 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund // Start the gofer in the given namespace. log.Debugf("Starting gofer: %s %v", binPath, args) if err := specutils.StartInNS(cmd, nss); err != nil { - return nil, nil, fmt.Errorf("Gofer: %v", err) + return nil, nil, fmt.Errorf("gofer: %v", err) } log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index e7715b6f7..cc188f45b 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -20,6 +20,7 @@ import ( "fmt" "io" "io/ioutil" + "math" "os" "path" "path/filepath" @@ -40,8 +41,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/test/testutil" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot/platforms" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -53,9 +55,8 @@ func waitForProcessList(cont *Container, want []*control.Process) error { err = fmt.Errorf("error getting process data from container: %v", err) return &backoff.PermanentError{Err: err} } - if r, err := procListsEqual(got, want); !r { - return fmt.Errorf("container got process list: %s, want: %s: error: %v", - procListToString(got), procListToString(want), err) + if !procListsEqual(got, want) { + return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want)) } return nil } @@ -92,36 +93,72 @@ func blockUntilWaitable(pid int) error { return err } -// procListsEqual is used to check whether 2 Process lists are equal for all -// implemented fields. -func procListsEqual(got, want []*control.Process) (bool, error) { - if len(got) != len(want) { - return false, nil - } - for i := range got { - pd1 := got[i] - pd2 := want[i] - // Zero out timing dependant fields. - pd1.Time = "" - pd1.STime = "" - pd1.C = 0 - // Ignore TTY field too, since it's not relevant in the cases - // where we use this method. Tests that care about the TTY - // field should check for it themselves. - pd1.TTY = "" - pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1}) - if err != nil { - return false, err +// procListsEqual is used to check whether 2 Process lists are equal. Fields +// set to -1 in wants are ignored. Timestamp and threads fields are always +// ignored. +func procListsEqual(gots, wants []*control.Process) bool { + if len(gots) != len(wants) { + return false + } + for i := range gots { + got := gots[i] + want := wants[i] + + if want.UID != math.MaxUint32 && want.UID != got.UID { + return false } - pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2}) - if err != nil { - return false, err + if want.PID != -1 && want.PID != got.PID { + return false } - if pd1Json != pd2Json { - return false, nil + if want.PPID != -1 && want.PPID != got.PPID { + return false + } + if len(want.TTY) != 0 && want.TTY != got.TTY { + return false + } + if len(want.Cmd) != 0 && want.Cmd != got.Cmd { + return false } } - return true, nil + return true +} + +type processBuilder struct { + process control.Process +} + +func newProcessBuilder() *processBuilder { + return &processBuilder{ + process: control.Process{ + UID: math.MaxUint32, + PID: -1, + PPID: -1, + }, + } +} + +func (p *processBuilder) Cmd(cmd string) *processBuilder { + p.process.Cmd = cmd + return p +} + +func (p *processBuilder) PID(pid kernel.ThreadID) *processBuilder { + p.process.PID = pid + return p +} + +func (p *processBuilder) PPID(ppid kernel.ThreadID) *processBuilder { + p.process.PPID = ppid + return p +} + +func (p *processBuilder) UID(uid auth.KUID) *processBuilder { + p.process.UID = uid + return p +} + +func (p *processBuilder) Process() *control.Process { + return &p.process } func procListToString(pl []*control.Process) string { @@ -214,7 +251,7 @@ func readOutputNum(file string, position int) (int, error) { // run starts the sandbox and waits for it to exit, checking that the // application succeeded. -func run(spec *specs.Spec, conf *boot.Config) error { +func run(spec *specs.Spec, conf *config.Config) error { _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { return fmt.Errorf("error setting up container: %v", err) @@ -253,26 +290,24 @@ var ( ) // configs generates different configurations to run tests. -func configs(t *testing.T, opts ...configOption) map[string]*boot.Config { +func configs(t *testing.T, opts ...configOption) map[string]*config.Config { // Always load the default config. - cs := make(map[string]*boot.Config) + cs := make(map[string]*config.Config) + testutil.TestConfig(t) for _, o := range opts { + c := testutil.TestConfig(t) switch o { case overlay: - c := testutil.TestConfig(t) c.Overlay = true cs["overlay"] = c case ptrace: - c := testutil.TestConfig(t) c.Platform = platforms.Ptrace cs["ptrace"] = c case kvm: - c := testutil.TestConfig(t) c.Platform = platforms.KVM cs["kvm"] = c case nonExclusiveFS: - c := testutil.TestConfig(t) - c.FileAccess = boot.FileAccessShared + c.FileAccess = config.FileAccessShared cs["non-exclusive"] = c default: panic(fmt.Sprintf("unknown config option %v", o)) @@ -281,23 +316,14 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config { return cs } -func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config { - vfs1 := configs(t, opts...) - - var optsVFS2 []configOption - for _, opt := range opts { - // TODO(gvisor.dev/issue/1487): Enable overlay tests. - if opt != overlay { - optsVFS2 = append(optsVFS2, opt) - } - } - - for key, value := range configs(t, optsVFS2...) { +// TODO(gvisor.dev/issue/1624): Merge with configs when VFS2 is the default. +func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config { + all := configs(t, opts...) + for key, value := range configs(t, opts...) { value.VFS2 = true - vfs1[key+"VFS2"] = value + all[key+"VFS2"] = value } - - return vfs1 + return all } // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. @@ -323,14 +349,7 @@ func TestLifecycle(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{1}, - }, + newProcessBuilder().Cmd("sleep").Process(), } // Create the container. args := Args{ @@ -483,7 +502,7 @@ func TestExePath(t *testing.T) { t.Fatalf("error making directory: %v", err) } - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { for _, test := range []struct { path string @@ -608,10 +627,16 @@ func doAppExitStatus(t *testing.T, vfs2 bool) { // TestExec verifies that a container can exec a new program. func TestExec(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { - const uid = 343 - spec := testutil.NewSpecWithArgs("sleep", "100") + dir, err := ioutil.TempDir(testutil.TmpDir(), "exec-test") + if err != nil { + t.Fatalf("error creating temporary directory: %v", err) + } + // Note that some shells may exec the final command in a sequence as + // an optimization. We avoid this here by adding the exit 0. + cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100 && exit 0", dir) + spec := testutil.NewSpecWithArgs("sh", "-c", cmd) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -634,29 +659,127 @@ func TestExec(t *testing.T) { t.Fatalf("error starting container: %v", err) } - // expectedPL lists the expected process state of the container. + // Wait until sleep is running to ensure the symlink was created. expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sh").Process(), + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatalf("waitForProcessList: %v", err) + } + + for _, tc := range []struct { + name string + args control.ExecArgs + }{ { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{1}, + name: "complete", + args: control.ExecArgs{ + Filename: "/bin/true", + Argv: []string{"/bin/true"}, + }, }, { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{2}, + name: "filename", + args: control.ExecArgs{ + Filename: "/bin/true", + }, }, + { + name: "argv", + args: control.ExecArgs{ + Argv: []string{"/bin/true"}, + }, + }, + { + name: "filename resolution", + args: control.ExecArgs{ + Filename: "true", + Envv: []string{"PATH=/bin"}, + }, + }, + { + name: "argv resolution", + args: control.ExecArgs{ + Argv: []string{"true"}, + Envv: []string{"PATH=/bin"}, + }, + }, + { + name: "argv symlink", + args: control.ExecArgs{ + Argv: []string{filepath.Join(dir, "symlink")}, + }, + }, + { + name: "working dir", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "${PWD}" != "/tmp" ]]; then exit 1; fi`}, + WorkingDirectory: "/tmp", + }, + }, + { + name: "user", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "$(id -u)" != "343" ]]; then exit 1; fi`}, + KUID: 343, + }, + }, + { + name: "group", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "$(id -g)" != "343" ]]; then exit 1; fi`}, + KGID: 343, + }, + }, + { + name: "env", + args: control.ExecArgs{ + Argv: []string{"/bin/sh", "-c", `if [[ "${FOO}" != "123" ]]; then exit 1; fi`}, + Envv: []string{"FOO=123"}, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + // t.Parallel() + if ws, err := cont.executeSync(&tc.args); err != nil { + t.Fatalf("executeAsync(%+v): %v", tc.args, err) + } else if ws != 0 { + t.Fatalf("executeAsync(%+v) failed with exit: %v", tc.args, ws) + } + }) } + }) + } +} - // Verify that "sleep 100" is running. - if err := waitForProcessList(cont, expectedPL[:1]); err != nil { - t.Error(err) +// TestExecProcList verifies that a container can exec a new program and it +// shows correcly in the process list. +func TestExecProcList(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "100") + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) } execArgs := &control.ExecArgs{ @@ -666,9 +789,8 @@ func TestExec(t *testing.T) { KUID: uid, } - // Verify that "sleep 100" and "sleep 5" are running - // after exec. First, start running exec (whick - // blocks). + // Verify that "sleep 100" and "sleep 5" are running after exec. First, + // start running exec (which blocks). ch := make(chan error) go func() { exitStatus, err := cont.executeSync(execArgs) @@ -681,6 +803,11 @@ func TestExec(t *testing.T) { } }() + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + newProcessBuilder().PID(1).PPID(0).Cmd("sleep").UID(0).Process(), + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").UID(uid).Process(), + } if err := waitForProcessList(cont, expectedPL); err != nil { t.Fatalf("error waiting for processes: %v", err) } @@ -700,7 +827,7 @@ func TestExec(t *testing.T) { // TestKillPid verifies that we can signal individual exec'd processes. func TestKillPid(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { @@ -768,13 +895,15 @@ func TestKillPid(t *testing.T) { } } -// TestCheckpointRestore creates a container that continuously writes successive integers -// to a file. To test checkpoint and restore functionality, the container is -// checkpointed and the last number printed to the file is recorded. Then, it is restored in two -// new containers and the first number printed from these containers is checked. Both should -// be the next consecutive number after the last number from the checkpointed container. +// TestCheckpointRestore creates a container that continuously writes successive +// integers to a file. To test checkpoint and restore functionality, the +// container is checkpointed and the last number printed to the file is +// recorded. Then, it is restored in two new containers and the first number +// printed from these containers is checked. Both should be the next consecutive +// number after the last number from the checkpointed container. func TestCheckpointRestore(t *testing.T) { // Skip overlay because test requires writing to host file. + // TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added. for name, conf := range configs(t, noOverlay...) { t.Run(name, func(t *testing.T) { dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test") @@ -936,6 +1065,7 @@ func TestCheckpointRestore(t *testing.T) { // with filesystem Unix Domain Socket use. func TestUnixDomainSockets(t *testing.T) { // Skip overlay because test requires writing to host file. + // TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added. for name, conf := range configs(t, noOverlay...) { t.Run(name, func(t *testing.T) { // UDS path is limited to 108 chars for compatibility with older systems. @@ -1073,7 +1203,7 @@ func TestUnixDomainSockets(t *testing.T) { // recreated. Then it resumes the container, verify that the file gets created // again. func TestPauseResume(t *testing.T) { - for name, conf := range configs(t, noOverlay...) { + for name, conf := range configsWithVFS2(t, noOverlay...) { t.Run(name, func(t *testing.T) { tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock") if err != nil { @@ -1242,24 +1372,9 @@ func TestCapabilities(t *testing.T) { // expectedPL lists the expected process state of the container. expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{1}, - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "exe", - Threads: []kernel.ThreadID{2}, - }, + newProcessBuilder().Cmd("sleep").Process(), } - if err := waitForProcessList(cont, expectedPL[:1]); err != nil { + if err := waitForProcessList(cont, expectedPL); err != nil { t.Fatalf("Failed to wait for sleep to start, err: %v", err) } @@ -1348,7 +1463,7 @@ func TestRunNonRoot(t *testing.T) { // TestMountNewDir checks that runsc will create destination directory if it // doesn't exit. func TestMountNewDir(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { root, err := ioutil.TempDir(testutil.TmpDir(), "root") if err != nil { @@ -1368,6 +1483,8 @@ func TestMountNewDir(t *testing.T) { Source: srcDir, Type: "bind", }) + // Extra points for creating the mount with a readonly root. + spec.Root.Readonly = true if err := run(spec, conf); err != nil { t.Fatalf("error running sandbox: %v", err) @@ -1377,17 +1494,17 @@ func TestMountNewDir(t *testing.T) { } func TestReadonlyRoot(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec := testutil.NewSpecWithArgs("sleep", "100") spec.Root.Readonly = true + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } defer cleanup() - // Create, start and wait for the container. args := Args{ ID: testutil.RandomContainerID(), Spec: spec, @@ -1402,12 +1519,82 @@ func TestReadonlyRoot(t *testing.T) { t.Fatalf("error starting container: %v", err) } - ws, err := c.Wait() + // Read mounts to check that root is readonly. + out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '") + if err != nil || ws != 0 { + t.Fatalf("exec failed, ws: %v, err: %v", ws, err) + } + t.Logf("root mount: %q", out) + if !strings.Contains(string(out), "(ro)") { + t.Errorf("root not mounted readonly: %q", out) + } + + // Check that file cannot be created. + ws, err = execute(c, "/bin/touch", "/foo") if err != nil { - t.Fatalf("error waiting on container: %v", err) + t.Fatalf("touch file in ro mount: %v", err) } if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) + t.Fatalf("wrong waitStatus: %v", ws) + } + }) + } +} + +func TestReadonlyMount(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Read mounts to check that volume is readonly. + cmd := fmt.Sprintf("mount | grep ' %s '", dir) + out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd) + if err != nil || ws != 0 { + t.Fatalf("exec failed, ws: %v, err: %v", ws, err) + } + t.Logf("mount: %q", out) + if !strings.Contains(string(out), "(ro)") { + t.Errorf("volume not mounted readonly: %q", out) + } + + // Check that file cannot be created. + ws, err = execute(c, "/bin/touch", path.Join(dir, "file")) + if err != nil { + t.Fatalf("touch file in ro mount: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("wrong WaitStatus: %v", ws) } }) } @@ -1494,54 +1681,6 @@ func TestUIDMap(t *testing.T) { } } -func TestReadonlyMount(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { - t.Run(name, func(t *testing.T) { - dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") - spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: dir, - Type: "bind", - Options: []string{"ro"}, - }) - spec.Root.Readonly = false - - _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer cleanup() - - // Create, start and wait for the container. - args := Args{ - ID: testutil.RandomContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - ws, err := c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) - } - }) - } -} - // TestAbbreviatedIDs checks that runsc supports using abbreviated container // IDs in place of full IDs. func TestAbbreviatedIDs(t *testing.T) { @@ -1708,8 +1847,9 @@ func TestUserLog(t *testing.T) { t.Fatal("error finding test_app:", err) } - // sched_rr_get_interval = 148 - not implemented in gvisor. - spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148") + // sched_rr_get_interval - not implemented in gvisor. + num := strconv.Itoa(syscall.SYS_SCHED_RR_GET_INTERVAL) + spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall="+num) conf := testutil.TestConfig(t) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -1891,7 +2031,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) { } func TestCreateWorkingDir(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create") if err != nil { @@ -1994,27 +2134,19 @@ func TestMountPropagation(t *testing.T) { // Check that mount didn't propagate to private mount. privFile := filepath.Join(priv, "mnt", "file") - execArgs := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "!", "-f", privFile}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 { t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err) } // Check that mount propagated to slave mount. slaveFile := filepath.Join(slave, "mnt", "file") - execArgs = &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", slaveFile}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err) } } func TestMountSymlink(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink") if err != nil { @@ -2074,11 +2206,7 @@ func TestMountSymlink(t *testing.T) { // Check that symlink was resolved and mount was created where the symlink // is pointing to. file := path.Join(target, "file") - execArgs := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", file}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) } }) @@ -2204,13 +2332,42 @@ func TestTTYField(t *testing.T) { } } +func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) { + args := &control.ExecArgs{ + Filename: name, + Argv: append([]string{name}, arg...), + } + return cont.executeSync(args) +} + +func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) { + r, w, err := os.Pipe() + if err != nil { + return nil, 0, err + } + defer r.Close() + + args := &control.ExecArgs{ + Filename: name, + Argv: append([]string{name}, arg...), + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}}, + } + ws, err := cont.executeSync(args) + w.Close() + if err != nil { + return nil, 0, err + } + out, err := ioutil.ReadAll(r) + return out, ws, err +} + // executeSync synchronously executes a new process. -func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { - pid, err := cont.Execute(args) +func (c *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { + pid, err := c.Execute(args) if err != nil { return 0, fmt.Errorf("error executing: %v", err) } - ws, err := cont.WaitPID(pid) + ws, err := c.WaitPID(pid) if err != nil { return 0, fmt.Errorf("error waiting: %v", err) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 207206dd2..850e80290 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -33,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -60,7 +61,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { return specs, ids } -func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) { +func startContainers(conf *config.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) { if len(conf.RootDir) == 0 { panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.") } @@ -100,19 +101,20 @@ type execDesc struct { c *Container cmd []string want int - desc string + name string } -func execMany(execs []execDesc) error { +func execMany(t *testing.T, execs []execDesc) { for _, exec := range execs { - args := &control.ExecArgs{Argv: exec.cmd} - if ws, err := exec.c.executeSync(args); err != nil { - return fmt.Errorf("error executing %+v: %v", args, err) - } else if ws.ExitStatus() != exec.want { - return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want) - } + t.Run(exec.name, func(t *testing.T) { + args := &control.ExecArgs{Argv: exec.cmd} + if ws, err := exec.c.executeSync(args); err != nil { + t.Errorf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() != exec.want { + t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want) + } + }) } - return nil } func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { @@ -149,13 +151,13 @@ func TestMultiContainerSanity(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -167,7 +169,7 @@ func TestMultiContainerSanity(t *testing.T) { // TestMultiPIDNS checks that it is possible to run 2 dead-simple // containers in the same sandbox with different pidns. func TestMultiPIDNS(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -195,13 +197,13 @@ func TestMultiPIDNS(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("sleep").Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("sleep").Process(), } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -212,7 +214,7 @@ func TestMultiPIDNS(t *testing.T) { // TestMultiPIDNSPath checks the pidns path. func TestMultiPIDNSPath(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -257,7 +259,7 @@ func TestMultiPIDNSPath(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -267,7 +269,7 @@ func TestMultiPIDNSPath(t *testing.T) { } expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -300,7 +302,7 @@ func TestMultiContainerWait(t *testing.T) { // Check via ps that multiple processes are running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(), } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -345,7 +347,7 @@ func TestMultiContainerWait(t *testing.T) { // After Wait returns, ensure that the root container is running and // the child has finished. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().Cmd("sleep").Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err) @@ -377,7 +379,7 @@ func TestExecWait(t *testing.T) { // Check via ps that process is running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().Cmd("sleep").Process(), } if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Fatalf("failed to wait for sleep to start: %v", err) @@ -412,7 +414,7 @@ func TestExecWait(t *testing.T) { // Wait for the exec'd process to exit. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("sleep").Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Fatalf("failed to wait for second container to stop: %v", err) @@ -478,7 +480,7 @@ func TestMultiContainerMount(t *testing.T) { // TestMultiContainerSignal checks that it is possible to signal individual // containers without killing the entire sandbox. func TestMultiContainerSignal(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -498,9 +500,8 @@ func TestMultiContainerSignal(t *testing.T) { // Check via ps that container 1 process is running. expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().Cmd("sleep").Process(), } - if err := waitForProcessList(containers[1], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) } @@ -512,7 +513,7 @@ func TestMultiContainerSignal(t *testing.T) { // Make sure process 1 is still running. expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("sleep").Process(), } if err := waitForProcessList(containers[0], expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -579,7 +580,7 @@ func TestMultiContainerDestroy(t *testing.T) { t.Fatal("error finding test_app:", err) } - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -626,8 +627,10 @@ func TestMultiContainerDestroy(t *testing.T) { if err != nil { t.Fatalf("error getting process data from sandbox: %v", err) } - expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}} - if r, err := procListsEqual(pss, expectedPL); !r { + expectedPL := []*control.Process{ + newProcessBuilder().PID(1).Cmd("sleep").Process(), + } + if !procListsEqual(pss, expectedPL) { t.Errorf("container got process list: %s, want: %s: error: %v", procListToString(pss), procListToString(expectedPL), err) } @@ -664,7 +667,7 @@ func TestMultiContainerProcesses(t *testing.T) { // Check root's container process list doesn't include other containers. expectedPL0 := []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("sleep").Process(), } if err := waitForProcessList(containers[0], expectedPL0); err != nil { t.Errorf("failed to wait for process to start: %v", err) @@ -672,8 +675,8 @@ func TestMultiContainerProcesses(t *testing.T) { // Same for the other container. expectedPL1 := []*control.Process{ - {PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}}, - {PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}, + newProcessBuilder().PID(2).Cmd("sh").Process(), + newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process(), } if err := waitForProcessList(containers[1], expectedPL1); err != nil { t.Errorf("failed to wait for process to start: %v", err) @@ -687,7 +690,7 @@ func TestMultiContainerProcesses(t *testing.T) { if _, err := containers[1].Execute(args); err != nil { t.Fatalf("error exec'ing: %v", err) } - expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}}) + expectedPL1 = append(expectedPL1, newProcessBuilder().PID(4).Cmd("sleep").Process()) if err := waitForProcessList(containers[1], expectedPL1); err != nil { t.Errorf("failed to wait for process to start: %v", err) } @@ -1071,7 +1074,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { // Test that pod shared mounts are properly mounted in 2 containers and that // changes from one container is reflected in the other. func TestMultiContainerSharedMount(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1109,84 +1112,82 @@ func TestMultiContainerSharedMount(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "remove file from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[1], cmd: []string{"/bin/mkdir", file1}, - desc: "create directory in container1", + name: "create directory in container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-d", file0}, - desc: "dir appears in container0", + name: "dir appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", file1}, - desc: "dir appears in container1", + name: "dir appears in container1", }, { c: containers[0], cmd: []string{"/bin/rmdir", file0}, - desc: "create directory in container0", + name: "remove directory from container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-d", file0}, - desc: "dir removed from container0", + name: "dir removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-d", file1}, - desc: "dir removed from container1", + name: "dir removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that pod mounts are mounted as readonly when requested. func TestMultiContainerSharedMountReadonly(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1224,36 +1225,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, + cmd: []string{"/bin/touch", file0}, want: 1, - desc: "fails to write to container0", + name: "fails to write to container0", }, { c: containers[1], - cmd: []string{"/usr/bin/touch", file1}, + cmd: []string{"/bin/touch", file1}, want: 1, - desc: "fails to write to container1", + name: "fails to write to container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that shared pod mounts continue to work after container is restarted. func TestMultiContainerSharedMountRestart(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1290,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { execs := []execDesc{ { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) containers[1].Destroy() @@ -1333,86 +1330,84 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file is still in container0", + name: "file is still in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file is still in container1", + name: "file is still in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that unsupported pod mounts options are ignored when matching master and -// slave mounts. +// replica mounts. func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { - rootDir, cleanup, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer cleanup() - - conf := testutil.TestConfig(t) - conf.RootDir = rootDir + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir - // Setup the containers. - sleep := []string{"/bin/sleep", "100"} - podSpec, ids := createSpecs(sleep, sleep) - mnt0 := specs.Mount{ - Destination: "/mydir/test", - Source: "/some/dir", - Type: "tmpfs", - Options: []string{"rw", "rbind", "relatime"}, - } - podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + // Setup the containers. + sleep := []string{"/bin/sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"rw", "rbind", "relatime"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) - mnt1 := mnt0 - mnt1.Destination = "/mydir2/test2" - mnt1.Options = []string{"rw", "nosuid"} - podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + mnt1.Options = []string{"rw", "nosuid"} + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) - createSharedMount(mnt0, "test-mount", podSpec...) + createSharedMount(mnt0, "test-mount", podSpec...) - containers, cleanup, err := startContainers(conf, podSpec, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - execs := []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + name: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + name: "directory is mounted in container1", + }, + } + execMany(t, execs) + }) } } @@ -1505,7 +1500,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Ensure container is running c := containers[2] expectedPL := []*control.Process{ - {PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}}, + newProcessBuilder().PID(3).Cmd("sleep").Process(), } if err := waitForProcessList(c, expectedPL); err != nil { t.Errorf("failed to wait for sleep to start: %v", err) @@ -1522,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { } // Check that container isn't running anymore. - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err == nil { + if _, err := execute(c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } @@ -1533,13 +1527,12 @@ func TestMultiContainerGoferKilled(t *testing.T) { continue // container[2] has been killed. } pl := []*control.Process{ - {PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}}, + newProcessBuilder().PID(kernel.ThreadID(i + 1)).Cmd("sleep").Process(), } if err := waitForProcessList(c, pl); err != nil { t.Errorf("Container %q was affected by another container: %v", c.ID, err) } - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err != nil { + if _, err := execute(c, "/bin/true"); err != nil { t.Fatalf("Container %q was affected by another container: %v", c.ID, err) } } @@ -1553,7 +1546,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Wait until sandbox stops. waitForProcessList will loop until sandbox exits // and RPC errors out. impossiblePL := []*control.Process{ - {PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}}, + newProcessBuilder().Cmd("non-existent-process").Process(), } if err := waitForProcessList(c, impossiblePL); err == nil { t.Fatalf("Sandbox was not killed after gofer death") @@ -1561,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Check that entire sandbox isn't running anymore. for _, c := range containers { - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err == nil { + if _, err := execute(c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } } @@ -1697,3 +1689,80 @@ func TestMultiContainerRunNonRoot(t *testing.T) { t.Fatalf("child container failed, waitStatus: %v", ws) } } + +// TestMultiContainerHomeEnvDir tests that the HOME environment variable is set +// for root containers, sub-containers, and exec'ed processes. +func TestMultiContainerHomeEnvDir(t *testing.T) { + // NOTE: Don't use overlay since we need changes to persist to the temp dir + // outside the sandbox. + for testName, conf := range configsWithVFS2(t, noOverlay...) { + t.Run(testName, func(t *testing.T) { + + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Create temp files we can write the value of $HOME to. + homeDirs := map[string]*os.File{} + for _, name := range []string{"root", "sub", "exec"} { + homeFile, err := ioutil.TempFile(testutil.TmpDir(), name) + if err != nil { + t.Fatalf("creating temp file: %v", err) + } + homeDirs[name] = homeFile + } + + // We will sleep in the root container in order to ensure that the root + //container doesn't terminate before sub containers can be created. + rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s; sleep 1000`, homeDirs["root"].Name())} + subCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["sub"].Name())} + execCmd := fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["exec"].Name()) + + // Setup the containers, a root container and sub container. + specConfig, ids := createSpecs(rootCmd, subCmd) + containers, cleanup, err := startContainers(conf, specConfig, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() + + // Exec into the root container synchronously. + if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil { + t.Errorf("error executing %+v: %v", execCmd, err) + } + + // Wait for the subcontainer to finish. + _, err = containers[1].Wait() + if err != nil { + t.Errorf("wait on child container: %v", err) + } + + // Wait for the root container to run. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("sh").Process(), + newProcessBuilder().Cmd("sleep").Process(), + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + + // Check the written files. + for name, tmpFile := range homeDirs { + dirBytes, err := ioutil.ReadAll(tmpFile) + if err != nil { + t.Fatalf("reading %s temp file: %v", name, err) + } + got := string(dirBytes) + + want := "/" + if got != want { + t.Errorf("%s $HOME incorrect: got: %q, want: %q", name, got, want) + } + } + + }) + } +} diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go index bac177a88..cb5bffb89 100644 --- a/runsc/container/shared_volume_test.go +++ b/runsc/container/shared_volume_test.go @@ -25,14 +25,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/test/testutil" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" ) // TestSharedVolume checks that modifications to a volume mount are propagated // into and out of the sandbox. func TestSharedVolume(t *testing.T) { conf := testutil.TestConfig(t) - conf.FileAccess = boot.FileAccessShared + conf.FileAccess = config.FileAccessShared // Main process just sleeps. We will use "exec" to probe the state of // the filesystem. @@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) { func checkFile(c *Container, filename string, want []byte) error { cpy := filename + ".copy" - argsCp := &control.ExecArgs{ - Filename: "/bin/cp", - Argv: []string{"cp", "-f", filename, cpy}, - } - if _, err := c.executeSync(argsCp); err != nil { + if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil { return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err) } got, err := ioutil.ReadFile(cpy) @@ -189,7 +185,7 @@ func checkFile(c *Container, filename string, want []byte) error { // is reflected inside. func TestSharedVolumeFile(t *testing.T) { conf := testutil.TestConfig(t) - conf.FileAccess = boot.FileAccessShared + conf.FileAccess = config.FileAccessShared // Main process just sleeps. We will use "exec" to probe the state of // the filesystem. @@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) { } // Append to file inside the container and check that content is not lost. - argsAppend := &control.ExecArgs{ - Filename: "/bin/bash", - Argv: []string{"bash", "-c", "echo -n sandbox- >> " + filename}, - } - if _, err := c.executeSync(argsAppend); err != nil { + if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil { t.Fatalf("unexpected error appending file %q: %v", filename, err) } want = []byte("host-sandbox-") diff --git a/runsc/debian/description b/runsc/debian/description deleted file mode 100644 index 9e8e08805..000000000 --- a/runsc/debian/description +++ /dev/null @@ -1 +0,0 @@ -gVisor container sandbox runtime diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh deleted file mode 100755 index dc7aeee87..000000000 --- a/runsc/debian/postinst.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -e - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if [ "$1" != configure ]; then - exit 0 -fi - -if [ -f /etc/docker/daemon.json ]; then - runsc install - systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2 -fi diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go index 0ca4829d7..775325c06 100644 --- a/runsc/flag/flag.go +++ b/runsc/flag/flag.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package flag wraps flag primitives. package flag import ( @@ -21,13 +22,19 @@ import ( type FlagSet = flag.FlagSet var ( - NewFlagSet = flag.NewFlagSet - String = flag.String Bool = flag.Bool - Int = flag.Int - Uint = flag.Uint CommandLine = flag.CommandLine + Int = flag.Int + NewFlagSet = flag.NewFlagSet Parse = flag.Parse + String = flag.String + Uint = flag.Uint + Var = flag.Var ) const ContinueOnError = flag.ContinueOnError + +// Get returns the flag's underlying object. +func Get(v flag.Value) interface{} { + return v.(flag.Getter).Get() +} diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index 1036b0630..96c57a426 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -31,5 +31,7 @@ go_test( deps = [ "//pkg/log", "//pkg/p9", + "//pkg/test/testutil", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 1dce36965..39b8a0b1e 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -27,62 +27,51 @@ import ( var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_ACCEPT: {}, syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: []seccomp.Rule{ - { - seccomp.AllowValue( - syscall.CLONE_VM | - syscall.CLONE_FS | - syscall.CLONE_FILES | - syscall.CLONE_SIGHAND | - syscall.CLONE_SYSVSEM | - syscall.CLONE_THREAD), - }, - }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, - syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_EVENTFD2: []seccomp.Rule{ { - seccomp.AllowValue(0), - seccomp.AllowValue(0), + seccomp.EqualTo(0), + seccomp.EqualTo(0), }, }, syscall.SYS_EXIT: {}, syscall.SYS_EXIT_GROUP: {}, syscall.SYS_FALLOCATE: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_FCHMOD: {}, syscall.SYS_FCHOWNAT: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_SETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_SETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFD), }, // Used by flipcall.PacketWindowAllocator.Init(). { - seccomp.AllowAny{}, - seccomp.AllowValue(unix.F_ADD_SEALS), + seccomp.MatchAny{}, + seccomp.EqualTo(unix.F_ADD_SEALS), }, }, syscall.SYS_FSTAT: {}, @@ -91,31 +80,31 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FTRUNCATE: {}, syscall.SYS_FUTEX: { seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, // Non-private futex used for flipcall. seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, }, syscall.SYS_GETDENTS64: {}, @@ -128,6 +117,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MADVISE: {}, unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init(). syscall.SYS_MKDIRAT: {}, + syscall.SYS_MKNODAT: {}, // Used by the Go runtime as a temporarily workaround for a Linux // 5.2-5.4 bug. // @@ -136,28 +126,28 @@ var allowedSyscalls = seccomp.SyscallRules{ // TODO(b/148688965): Remove once this is gone from Go. syscall.SYS_MLOCK: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(4096), + seccomp.MatchAny{}, + seccomp.EqualTo(4096), }, }, syscall.SYS_MMAP: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_SHARED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_SHARED), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, syscall.SYS_MPROTECT: {}, @@ -171,14 +161,14 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_READLINKAT: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), }, }, syscall.SYS_RENAMEAT: {}, @@ -189,33 +179,33 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SENDMSG: []seccomp.Rule{ // Used by fdchannel.Endpoint.SendFD(). { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, // Used by unet.SocketWriter.WriteVec(). { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, // Used by fdchannel.NewConnectedSockets(). syscall.SYS_SOCKETPAIR: { { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, }, syscall.SYS_SYMLINKAT: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { - seccomp.AllowValue(uint64(os.Getpid())), + seccomp.EqualTo(uint64(os.Getpid())), }, }, syscall.SYS_UNLINKAT: {}, @@ -226,24 +216,24 @@ var allowedSyscalls = seccomp.SyscallRules{ var udsSyscalls = seccomp.SyscallRules{ syscall.SYS_SOCKET: []seccomp.Rule{ { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_STREAM), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_STREAM), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_DGRAM), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_DGRAM), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_SEQPACKET), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_SEQPACKET), + seccomp.EqualTo(0), }, }, syscall.SYS_CONNECT: []seccomp.Rule{ { - seccomp.AllowAny{}, + seccomp.MatchAny{}, }, }, } diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go index a4b28cb8b..686753d96 100644 --- a/runsc/fsgofer/filter/config_amd64.go +++ b/runsc/fsgofer/filter/config_amd64.go @@ -25,8 +25,41 @@ import ( func init() { allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_GET_FS)}, - {seccomp.AllowValue(linux.ARCH_SET_FS)}, + // TODO(b/168828518): No longer used in Go 1.16+. + {seccomp.EqualTo(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SETTLS | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + { + // TODO(b/168828518): No longer used in Go 1.16+ (on amd64). + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, } allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{} diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go index d2697deb7..ff0cf77a0 100644 --- a/runsc/fsgofer/filter/config_arm64.go +++ b/runsc/fsgofer/filter/config_arm64.go @@ -23,5 +23,26 @@ import ( ) func init() { + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + // These arguments are left uninitialized by the Go + // runtime, so they may be anything (and are unused by + // the host). + seccomp.MatchAny{}, // parent_tidptr + seccomp.MatchAny{}, // tls + seccomp.MatchAny{}, // child_tidptr + }, + } + allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{} } diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go index 885c92f7a..20a0732be 100644 --- a/runsc/fsgofer/filter/extra_filters_race.go +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -35,6 +35,7 @@ func instrumentationFilters() seccomp.SyscallRules { syscall.SYS_MUNLOCK: {}, syscall.SYS_NANOSLEEP: {}, syscall.SYS_OPEN: {}, + syscall.SYS_OPENAT: {}, syscall.SYS_SET_ROBUST_LIST: {}, // Used within glibc's malloc. syscall.SYS_TIME: {}, diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index edc239013..0b628c8ce 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -29,7 +29,6 @@ import ( "path/filepath" "runtime" "strconv" - "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" @@ -45,39 +44,11 @@ const ( // modes to ensure an unopened/closed file fails all mode checks. invalidMode = p9.OpenFlags(math.MaxUint32) - openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC -) - -type fileType int + openFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC -const ( - regular fileType = iota - directory - symlink - socket - unknown + allowedOpenFlags = unix.O_TRUNC ) -// String implements fmt.Stringer. -func (f fileType) String() string { - switch f { - case regular: - return "regular" - case directory: - return "directory" - case symlink: - return "symlink" - case socket: - return "socket" - } - return "unknown" -} - -// ControlSocketAddr generates an abstract unix socket name for the given id. -func ControlSocketAddr(id string) string { - return fmt.Sprintf("\x00runsc-gofer.%s", id) -} - // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. @@ -132,19 +103,19 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } - f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { + f, readable, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { return fd.Open(a.prefix, openFlags|mode, 0) }) if err != nil { return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } - lf, err := newLocalFile(a, f, a.prefix, stat) + lf, err := newLocalFile(a, f, a.prefix, readable, stat) if err != nil { return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) } @@ -153,7 +124,7 @@ func (a *attachPoint) Attach() (p9.File, error) { } // makeQID returns a unique QID for the given stat buffer. -func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { +func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID { a.deviceMu.Lock() defer a.deviceMu.Unlock() @@ -175,8 +146,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino) } ino := uint64(dev)<<56 | maskedIno - log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino) - return p9.QID{ Type: p9.FileMode(stat.Mode).QIDType(), Path: ino, @@ -186,9 +155,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // localFile implements p9.File wrapping a local file. The underlying file // is opened during Walk() and stored in 'file' to be used with other // operations. The file is opened as readonly, unless it's a symlink or there is -// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is -// called to clone the file. This reduces the number of walks that need to be -// done by the host file system when files are reused. +// no read access, which requires O_PATH. // // The file may be reopened if the requested mode in Open() is not a subset of // current mode. Consequently, 'file' could have a mode wider than requested and @@ -200,13 +167,30 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID { // performance with 'overlay2' storage driver. overlay2 eagerly copies the // entire file up when it's opened in write mode, and would perform badly when // multiple files are only being opened for read (esp. startup). +// +// File operations must use "at" functions whenever possible: +// * Local operations must use AT_EMPTY_PATH: +// fchownat(fd, "", AT_EMPTY_PATH, ...), instead of chown(fullpath, ...) +// * Creation operations must use (fd + name): +// mkdirat(fd, name, ...), instead of mkdir(fullpath, ...) +// +// Apart from being faster, it also adds another layer of defense against +// symlink attacks (note that O_NOFOLLOW applies only to the last element in +// the path). +// +// The few exceptions where it cannot be done are: utimensat on symlinks, and +// Connect() for the socket address. type localFile struct { - p9.DefaultWalkGetAttr + p9.DisallowClientCalls // attachPoint is the attachPoint that serves this localFile. attachPoint *attachPoint - // hostPath will be safely updated by the Renamed hook. + // hostPath is the full path to the host file. It can be used for logging and + // the few cases where full path is required to operation the host file. In + // all other cases, use "file" directly. + // + // Note: it's safely updated by the Renamed hook. hostPath string // file is opened when localFile is created and it's never nil. It may be @@ -214,12 +198,19 @@ type localFile struct { // opened with. file *fd.FD + // controlReadable tells whether 'file' was opened with read permissions + // during a walk. + controlReadable bool + // mode is the mode in which the file was opened. Set to invalidMode // if localFile isn't opened. mode p9.OpenFlags - // ft is the fileType for this file. - ft fileType + // fileType for this file. It is equivalent to: + // unix.Stat_t.Mode & unix.S_IFMT + fileType uint32 + + qid p9.QID // readDirMu protects against concurrent Readdir calls. readDirMu sync.Mutex @@ -236,7 +227,7 @@ var procSelfFD *fd.FD // OpenProcSelfFD opens the /proc/self/fd directory, which will be used to // reopen file descriptors. func OpenProcSelfFD() error { - d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0) + d, err := unix.Open("/proc/self/fd", unix.O_RDONLY|unix.O_DIRECTORY, 0) if err != nil { return fmt.Errorf("error opening /proc/self/fd: %v", err) } @@ -245,7 +236,7 @@ func OpenProcSelfFD() error { } func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { - d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0) + d, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^unix.O_NOFOLLOW, 0) if err != nil { return nil, err } @@ -253,83 +244,88 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) { return fd.New(d), nil } -func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) { - path := path.Join(parent.hostPath, name) - f, err := openAnyFile(path, func(mode int) (*fd.FD, error) { +func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) { + pathDebug := path.Join(parent.hostPath, name) + f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) { return fd.OpenAt(parent.file, name, openFlags|mode, 0) }) - return f, path, err + return f, pathDebug, readable, err } -// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback +// openAnyFile attempts to open the file in O_RDONLY. If it fails, falls back // to O_PATH. 'path' is used for logging messages only. 'fn' is what does the // actual file open and is customizable by the caller. -func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { +func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) { // Attempt to open file in the following mode in order: // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. // Use non-blocking to prevent getting stuck inside open(2) for // FIFOs. This option has no effect on regular files. // 2. PATH: for symlinks, sockets. - modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} + options := []struct { + mode int + readable bool + }{ + { + mode: unix.O_RDONLY | unix.O_NONBLOCK, + readable: true, + }, + { + mode: unix.O_PATH, + readable: false, + }, + } var err error - var file *fd.FD - for i, mode := range modes { - file, err = fn(mode) + for i, option := range options { + var file *fd.FD + file, err = fn(option.mode) if err == nil { - // openat succeeded, we're done. - break + // Succeeded opening the file, we're done. + return file, option.readable, nil } switch e := extractErrno(err); e { - case syscall.ENOENT: + case unix.ENOENT: // File doesn't exist, no point in retrying. - return nil, e + return nil, false, e } - // openat failed. Try again with next mode, preserving 'err' in case this - // was the last attempt. - log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err) + // File failed to open. Try again with next mode, preserving 'err' in case + // this was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, pathDebug, err) } - if err != nil { - // All attempts to open file have failed, return the last error. - log.Debugf("Failed to open file, path: %q, err: %v", path, err) - return nil, extractErrno(err) - } - - return file, nil + // All attempts to open file have failed, return the last error. + log.Debugf("Failed to open file, path: %q, err: %v", pathDebug, err) + return nil, false, extractErrno(err) } -func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) { - var ft fileType - switch stat.Mode & syscall.S_IFMT { - case syscall.S_IFREG: - ft = regular - case syscall.S_IFDIR: - ft = directory - case syscall.S_IFLNK: - ft = symlink - case syscall.S_IFSOCK: +func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error { + switch stat.Mode & unix.S_IFMT { + case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK: + return nil + + case unix.S_IFSOCK: if !permitSocket { - return unknown, syscall.EPERM + return unix.EPERM } - ft = socket + return nil + default: - return unknown, syscall.EPERM + return unix.EPERM } - return ft, nil } -func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) { - ft, err := getSupportedFileType(stat, a.conf.HostUDS) - if err != nil { +func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) { + if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil { return nil, err } return &localFile{ - attachPoint: a, - hostPath: path, - file: file, - mode: invalidMode, - ft: ft, + attachPoint: a, + hostPath: path, + file: file, + mode: invalidMode, + fileType: stat.Mode & unix.S_IFMT, + qid: a.makeQID(stat), + controlReadable: readable, }, nil } @@ -337,7 +333,7 @@ func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) // non-blocking. If anything fails, returns nil. It's better to have a file // without host FD, than to fail the operation. func newFDMaybe(file *fd.FD) *fd.FD { - dupFD, err := syscall.Dup(file.FD()) + dupFD, err := unix.Dup(file.FD()) // Technically, the runtime may call the finalizer on file as soon as // FD() returns. runtime.KeepAlive(file) @@ -347,23 +343,23 @@ func newFDMaybe(file *fd.FD) *fd.FD { dup := fd.New(dupFD) // fd is blocking; non-blocking is required. - if err := syscall.SetNonblock(dup.FD(), true); err != nil { - dup.Close() + if err := unix.SetNonblock(dup.FD(), true); err != nil { + _ = dup.Close() return nil } return dup } -func stat(fd int) (syscall.Stat_t, error) { - var stat syscall.Stat_t - if err := syscall.Fstat(fd, &stat); err != nil { - return syscall.Stat_t{}, err +func fstat(fd int) (unix.Stat_t, error) { + var stat unix.Stat_t + if err := unix.Fstat(fd, &stat); err != nil { + return unix.Stat_t{}, err } return stat, nil } func fchown(fd int, uid p9.UID, gid p9.GID) error { - return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) + return unix.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } // Open implements p9.File. @@ -371,10 +367,16 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { if l.isOpen() { panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath)) } + mode := flags & p9.OpenFlagsModeMask + if mode == p9.WriteOnly || mode == p9.ReadWrite || flags&p9.OpenTruncate != 0 { + if err := l.checkROMount(); err != nil { + return nil, p9.QID{}, 0, err + } + } // Check if control file can be used or if a new open must be created. var newFile *fd.FD - if flags == p9.ReadOnly { + if mode == p9.ReadOnly && l.controlReadable && flags.OSFlags()&allowedOpenFlags == 0 { log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath) newFile = l.file } else { @@ -383,23 +385,15 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { // name_to_handle_at and open_by_handle_at aren't supported by overlay2. log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath) var err error - // Constrain open flags to the open mode and O_TRUNC. - newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC))) + osFlags := flags.OSFlags() & (unix.O_ACCMODE | allowedOpenFlags) + newFile, err = reopenProcFd(l.file, openFlags|osFlags) if err != nil { return nil, p9.QID{}, 0, extractErrno(err) } } - stat, err := stat(newFile.FD()) - if err != nil { - if newFile != l.file { - newFile.Close() - } - return nil, p9.QID{}, 0, extractErrno(err) - } - var fd *fd.FD - if stat.Mode&syscall.S_IFMT == syscall.S_IFREG { + if l.fileType == unix.S_IFREG { // Donate FD for regular files only. fd = newFDMaybe(newFile) } @@ -411,38 +405,38 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { } l.file = newFile } - l.mode = flags & p9.OpenFlagsModeMask - return fd, l.attachPoint.makeQID(stat), 0, nil + l.mode = mode + return fd, l.qid, 0, nil } // Create implements p9.File. -func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { - conf := l.attachPoint.conf - if conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } - return nil, nil, p9.QID{}, 0, syscall.EBADF +func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { + if err := l.checkROMount(); err != nil { + return nil, nil, p9.QID{}, 0, err } + // Set file creation flags, plus allowed open flags from caller. + osFlags := openFlags | unix.O_CREAT | unix.O_EXCL + osFlags |= p9Flags.OSFlags() & allowedOpenFlags + // 'file' may be used for other operations (e.g. Walk), so read access is // always added to flags. Note that resulting file might have a wider mode // than needed for each particular case. - flags := openFlags | syscall.O_CREAT | syscall.O_EXCL + mode := p9Flags & p9.OpenFlagsModeMask if mode == p9.WriteOnly { - flags |= syscall.O_RDWR + osFlags |= unix.O_RDWR } else { - flags |= mode.OSFlags() + osFlags |= mode.OSFlags() } - child, err := fd.OpenAt(l.file, name, flags, uint32(perm.Permissions())) + child, err := fd.OpenAt(l.file, name, osFlags, uint32(perm.Permissions())) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } cu := cleanup.Make(func() { - child.Close() + _ = child.Close() // Best effort attempt to remove the file in case of failure. - if err := syscall.Unlinkat(l.file.FD(), name); err != nil { + if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) } }) @@ -451,7 +445,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid if err := fchown(child.FD(), uid, gid); err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } - stat, err := stat(child.FD()) + stat, err := fstat(child.FD()) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } @@ -461,23 +455,21 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid hostPath: path.Join(l.hostPath, name), file: child, mode: mode, + fileType: unix.S_IFREG, + qid: l.attachPoint.makeQID(stat), } cu.Release() - return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil + return newFDMaybe(c.file), c, c.qid, 0, nil } // Mkdir implements p9.File. func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { - conf := l.attachPoint.conf - if conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } - return p9.QID{}, syscall.EBADF + if err := l.checkROMount(); err != nil { + return p9.QID{}, err } - if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil { + if err := unix.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil { return p9.QID{}, extractErrno(err) } cu := cleanup.Make(func() { @@ -489,7 +481,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) defer cu.Clean() // Open directory to change ownership and stat it. - flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags + flags := unix.O_DIRECTORY | unix.O_RDONLY | openFlags f, err := fd.OpenAt(l.file, name, flags, 0) if err != nil { return p9.QID{}, extractErrno(err) @@ -499,7 +491,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -510,61 +502,80 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) // Walk implements p9.File. func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { + qids, file, _, err := l.walk(names) + return qids, file, err +} + +// WalkGetAttr implements p9.File. +func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) { + qids, file, stat, err := l.walk(names) + if err != nil { + return nil, nil, p9.AttrMask{}, p9.Attr{}, err + } + mask, attr := l.fillAttr(stat) + return qids, file, mask, attr, nil +} + +func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) { // Duplicate current file if 'names' is empty. if len(names) == 0 { - newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { + newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) { return reopenProcFd(l.file, openFlags|mode) }) if err != nil { - return nil, nil, extractErrno(err) + return nil, nil, unix.Stat_t{}, extractErrno(err) } - stat, err := stat(newFile.FD()) + stat, err := fstat(newFile.FD()) if err != nil { - newFile.Close() - return nil, nil, extractErrno(err) + _ = newFile.Close() + return nil, nil, unix.Stat_t{}, extractErrno(err) } c := &localFile{ - attachPoint: l.attachPoint, - hostPath: l.hostPath, - file: newFile, - mode: invalidMode, + attachPoint: l.attachPoint, + hostPath: l.hostPath, + file: newFile, + mode: invalidMode, + fileType: l.fileType, + qid: l.attachPoint.makeQID(stat), + controlReadable: readable, } - return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil + return []p9.QID{c.qid}, c, stat, nil } var qids []p9.QID + var lastStat unix.Stat_t last := l for _, name := range names { - f, path, err := openAnyFileFromParent(last, name) + f, path, readable, err := openAnyFileFromParent(last, name) if last != l { - last.Close() + _ = last.Close() } if err != nil { - return nil, nil, extractErrno(err) + return nil, nil, unix.Stat_t{}, extractErrno(err) } - stat, err := stat(f.FD()) + lastStat, err = fstat(f.FD()) if err != nil { - f.Close() - return nil, nil, extractErrno(err) + _ = f.Close() + return nil, nil, unix.Stat_t{}, extractErrno(err) } - c, err := newLocalFile(last.attachPoint, f, path, stat) + c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat) if err != nil { - f.Close() - return nil, nil, extractErrno(err) + _ = f.Close() + return nil, nil, unix.Stat_t{}, extractErrno(err) } - qids = append(qids, l.attachPoint.makeQID(stat)) + qids = append(qids, c.qid) last = c } - return qids, last, nil + return qids, last, lastStat, nil } // StatFS implements p9.File. func (l *localFile) StatFS() (p9.FSStat, error) { - var s syscall.Statfs_t - if err := syscall.Fstatfs(l.file.FD(), &s); err != nil { + var s unix.Statfs_t + if err := unix.Fstatfs(l.file.FD(), &s); err != nil { return p9.FSStat{}, extractErrno(err) } @@ -584,9 +595,9 @@ func (l *localFile) StatFS() (p9.FSStat, error) { // FSync implements p9.File. func (l *localFile) FSync() error { if !l.isOpen() { - return syscall.EBADF + return unix.EBADF } - if err := syscall.Fsync(l.file.FD()); err != nil { + if err := unix.Fsync(l.file.FD()); err != nil { return extractErrno(err) } return nil @@ -594,11 +605,15 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.file.FD()) + stat, err := fstat(l.file.FD()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } + mask, attr := l.fillAttr(stat) + return l.qid, mask, attr, nil +} +func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) { attr := p9.Attr{ Mode: p9.FileMode(stat.Mode), UID: p9.UID(stat.Uid), @@ -627,20 +642,15 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) MTime: true, CTime: true, } - - return l.attachPoint.makeQID(stat), valid, attr, nil + return valid, attr } // SetAttr implements p9.File. Due to mismatch in file API, options // cannot be changed atomically and user may see partial changes when // an error happens. func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { - conf := l.attachPoint.conf - if conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } - return syscall.EBADF + if err := l.checkROMount(); err != nil { + return err } allowed := p9.SetAttrMask{ @@ -663,13 +673,13 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // consistent result that is not attribute dependent. if !valid.IsSubsetOf(allowed) { log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid) - return syscall.EPERM + return unix.EPERM } // Check if it's possible to use cached file, or if another one needs to be // opened for write. f := l.file - if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + if l.fileType == unix.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { var err error f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY) if err != nil { @@ -690,21 +700,21 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { // over another. var err error if valid.Permissions { - if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil { + if cerr := unix.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil { log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr) err = extractErrno(cerr) } } if valid.Size { - if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil { + if terr := unix.Ftruncate(f.FD(), int64(attr.Size)); terr != nil { log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr) err = extractErrno(terr) } } if valid.ATime || valid.MTime { - utimes := [2]syscall.Timespec{ + utimes := [2]unix.Timespec{ {Sec: 0, Nsec: linux.UTIME_OMIT}, {Sec: 0, Nsec: linux.UTIME_OMIT}, } @@ -725,15 +735,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } } - if l.ft == symlink { + if l.fileType == unix.S_IFLNK { // utimensat operates different that other syscalls. To operate on a // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. - parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) + parent, err := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) if err != nil { return extractErrno(err) } - defer syscall.Close(parent) + defer unix.Close(parent) if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) @@ -758,7 +768,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { if valid.GID { gid = int(attr.GID) } - if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { + if oerr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr) err = extractErrno(oerr) } @@ -768,28 +778,28 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { } func (*localFile) GetXattr(string, uint64) (string, error) { - return "", syscall.EOPNOTSUPP + return "", unix.EOPNOTSUPP } func (*localFile) SetXattr(string, string, uint32) error { - return syscall.EOPNOTSUPP + return unix.EOPNOTSUPP } func (*localFile) ListXattr(uint64) (map[string]struct{}, error) { - return nil, syscall.EOPNOTSUPP + return nil, unix.EOPNOTSUPP } func (*localFile) RemoveXattr(string) error { - return syscall.EOPNOTSUPP + return unix.EOPNOTSUPP } // Allocate implements p9.File. func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error { if !l.isOpen() { - return syscall.EBADF + return unix.EBADF } - if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil { + if err := unix.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil { return extractErrno(err) } return nil @@ -802,12 +812,8 @@ func (*localFile) Rename(p9.File, string) error { // RenameAt implements p9.File.RenameAt. func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error { - conf := l.attachPoint.conf - if conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } - return syscall.EBADF + if err := l.checkROMount(); err != nil { + return err } newParent := directory.(*localFile) @@ -820,10 +826,10 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) // ReadAt implements p9.File. func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { - return 0, syscall.EBADF + return 0, unix.EBADF } if !l.isOpen() { - return 0, syscall.EBADF + return 0, unix.EBADF } r, err := l.file.ReadAt(p, int64(offset)) @@ -838,10 +844,10 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { // WriteAt implements p9.File. func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { - return 0, syscall.EBADF + return 0, unix.EBADF } if !l.isOpen() { - return 0, syscall.EBADF + return 0, unix.EBADF } w, err := l.file.WriteAt(p, int64(offset)) @@ -853,12 +859,8 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { // Symlink implements p9.File. func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { - conf := l.attachPoint.conf - if conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } - return p9.QID{}, syscall.EBADF + if err := l.checkROMount(); err != nil { + return p9.QID{}, err } if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil { @@ -866,7 +868,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. } cu := cleanup.Make(func() { // Best effort attempt to remove the symlink in case of failure. - if err := syscall.Unlinkat(l.file.FD(), newName); err != nil { + if err := unix.Unlinkat(l.file.FD(), newName, 0); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err) } }) @@ -882,7 +884,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -893,12 +895,8 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. // Link implements p9.File. func (l *localFile) Link(target p9.File, newName string) error { - conf := l.attachPoint.conf - if conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } - return syscall.EBADF + if err := l.checkROMount(); err != nil { + return err } targetFile := target.(*localFile) @@ -909,23 +907,53 @@ func (l *localFile) Link(target p9.File, newName string) error { } // Mknod implements p9.File. -// -// Not implemented. -func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { +func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + if err := l.checkROMount(); err != nil { + return p9.QID{}, err + } + // From mknod(2) man page: // "EPERM: [...] if the filesystem containing pathname does not support // the type of node requested." - return p9.QID{}, syscall.EPERM + if mode.FileType() != p9.ModeRegular { + return p9.QID{}, unix.EPERM + } + + // Allow Mknod to create regular files. + if err := unix.Mknodat(l.file.FD(), name, uint32(mode), 0); err != nil { + return p9.QID{}, err + } + cu := cleanup.Make(func() { + // Best effort attempt to remove the file in case of failure. + if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil { + log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err) + } + }) + defer cu.Clean() + + // Open file to change ownership and stat it. + child, err := fd.OpenAt(l.file, name, unix.O_PATH|openFlags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer child.Close() + + if err := fchown(child.FD(), uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := fstat(child.FD()) + if err != nil { + return p9.QID{}, extractErrno(err) + } + + cu.Release() + return l.attachPoint.makeQID(stat), nil } // UnlinkAt implements p9.File. func (l *localFile) UnlinkAt(name string, flags uint32) error { - conf := l.attachPoint.conf - if conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } - return syscall.EBADF + if err := l.checkROMount(); err != nil { + return err } if err := unix.Unlinkat(l.file.FD(), name, int(flags)); err != nil { @@ -937,10 +965,10 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error { // Readdir implements p9.File. func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { - return nil, syscall.EBADF + return nil, unix.EBADF } if !l.isOpen() { - return nil, syscall.EBADF + return nil, unix.EBADF } // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's @@ -951,10 +979,13 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { skip := uint64(0) - // Check if the file is at the correct position already. If not, seek to the - // beginning and read the entire directory again. - if l.lastDirentOffset != offset { - if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil { + // Check if the file is at the correct position already. If not, seek to + // the beginning and read the entire directory again. We always seek if + // offset is 0, since this is side-effectual (equivalent to rewinddir(3), + // which causes the directory stream to resynchronize with the directory's + // current contents). + if l.lastDirentOffset != offset || offset == 0 { + if _, err := unix.Seek(l.file.FD(), 0, 0); err != nil { return nil, extractErrno(err) } skip = offset @@ -987,7 +1018,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) end := offset + uint64(count) for offset < end { - dirSize, err := syscall.ReadDirent(f, direntsBuf) + dirSize, err := unix.ReadDirent(f, direntsBuf) if err != nil { return dirents, err } @@ -996,7 +1027,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) } names := names[:0] - _, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names) + _, _, names = unix.ParseDirent(direntsBuf[:dirSize], -1, names) // Skip over entries that the caller is not interested in. if skip > 0 { @@ -1041,7 +1072,7 @@ func (l *localFile) Readlink() (string, error) { return string(b[:n]), nil } } - return "", syscall.ENOMEM + return "", unix.ENOMEM } // Flush implements p9.File. @@ -1052,7 +1083,7 @@ func (l *localFile) Flush() error { // Connect implements p9.File. func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { if !l.attachPoint.conf.HostUDS { - return nil, syscall.ECONNREFUSED + return nil, unix.ECONNREFUSED } // TODO(gvisor.dev/issue/1003): Due to different app vs replacement @@ -1060,34 +1091,34 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { // fit f.path in our sockaddr. We'd need to redirect through a shorter // path in order to actually connect to this socket. if len(l.hostPath) > linux.UnixPathMax { - return nil, syscall.ECONNREFUSED + return nil, unix.ECONNREFUSED } var stype int switch flags { case p9.StreamSocket: - stype = syscall.SOCK_STREAM + stype = unix.SOCK_STREAM case p9.DgramSocket: - stype = syscall.SOCK_DGRAM + stype = unix.SOCK_DGRAM case p9.SeqpacketSocket: - stype = syscall.SOCK_SEQPACKET + stype = unix.SOCK_SEQPACKET default: - return nil, syscall.ENXIO + return nil, unix.ENXIO } - f, err := syscall.Socket(syscall.AF_UNIX, stype, 0) + f, err := unix.Socket(unix.AF_UNIX, stype, 0) if err != nil { return nil, err } - if err := syscall.SetNonblock(f, true); err != nil { - syscall.Close(f) + if err := unix.SetNonblock(f, true); err != nil { + _ = unix.Close(f) return nil, err } - sa := syscall.SockaddrUnix{Name: l.hostPath} - if err := syscall.Connect(f, &sa); err != nil { - syscall.Close(f) + sa := unix.SockaddrUnix{Name: l.hostPath} + if err := unix.Connect(f, &sa); err != nil { + _ = unix.Close(f) return nil, err } @@ -1112,7 +1143,7 @@ func (l *localFile) Renamed(newDir p9.File, newName string) { } // extractErrno tries to determine the errno. -func extractErrno(err error) syscall.Errno { +func extractErrno(err error) unix.Errno { if err == nil { // This should never happen. The likely result will be that // some user gets the frustrating "error: SUCCESS" message. @@ -1122,18 +1153,18 @@ func extractErrno(err error) syscall.Errno { switch err { case os.ErrNotExist: - return syscall.ENOENT + return unix.ENOENT case os.ErrExist: - return syscall.EEXIST + return unix.EEXIST case os.ErrPermission: - return syscall.EACCES + return unix.EACCES case os.ErrInvalid: - return syscall.EINVAL + return unix.EINVAL } // See if it's an errno or a common wrapped error. switch e := err.(type) { - case syscall.Errno: + case unix.Errno: return e case *os.PathError: return extractErrno(e.Err) @@ -1145,5 +1176,12 @@ func extractErrno(err error) syscall.Errno { // Fall back to EIO. log.Debugf("Unknown error: %v, defaulting to EIO", err) - return syscall.EIO + return unix.EIO +} + +func (l *localFile) checkROMount() error { + if conf := l.attachPoint.conf; conf.ROMount { + return unix.EROFS + } + return nil } diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go index 5d4aab597..c46958185 100644 --- a/runsc/fsgofer/fsgofer_amd64_unsafe.go +++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go @@ -17,25 +17,25 @@ package fsgofer import ( - "syscall" "unsafe" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/syserr" ) -func statAt(dirFd int, name string) (syscall.Stat_t, error) { - nameBytes, err := syscall.BytePtrFromString(name) +func statAt(dirFd int, name string) (unix.Stat_t, error) { + nameBytes, err := unix.BytePtrFromString(name) if err != nil { - return syscall.Stat_t{}, err + return unix.Stat_t{}, err } namePtr := unsafe.Pointer(nameBytes) - var stat syscall.Stat_t + var stat unix.Stat_t statPtr := unsafe.Pointer(&stat) - if _, _, errno := syscall.Syscall6( - syscall.SYS_NEWFSTATAT, + if _, _, errno := unix.Syscall6( + unix.SYS_NEWFSTATAT, uintptr(dirFd), uintptr(namePtr), uintptr(statPtr), @@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) { 0, 0); errno != 0 { - return syscall.Stat_t{}, syserr.FromHost(errno).ToError() + return unix.Stat_t{}, syserr.FromHost(errno).ToError() } return stat, nil } diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go index 8041fd352..491460718 100644 --- a/runsc/fsgofer/fsgofer_arm64_unsafe.go +++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go @@ -17,25 +17,25 @@ package fsgofer import ( - "syscall" "unsafe" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/syserr" ) -func statAt(dirFd int, name string) (syscall.Stat_t, error) { - nameBytes, err := syscall.BytePtrFromString(name) +func statAt(dirFd int, name string) (unix.Stat_t, error) { + nameBytes, err := unix.BytePtrFromString(name) if err != nil { - return syscall.Stat_t{}, err + return unix.Stat_t{}, err } namePtr := unsafe.Pointer(nameBytes) - var stat syscall.Stat_t + var stat unix.Stat_t statPtr := unsafe.Pointer(&stat) - if _, _, errno := syscall.Syscall6( - syscall.SYS_FSTATAT, + if _, _, errno := unix.Syscall6( + unix.SYS_FSTATAT, uintptr(dirFd), uintptr(namePtr), uintptr(statPtr), @@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) { 0, 0); errno != 0 { - return syscall.Stat_t{}, syserr.FromHost(errno).ToError() + return unix.Stat_t{}, syserr.FromHost(errno).ToError() } return stat, nil } diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 05af7e397..a84206686 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -21,11 +21,24 @@ import ( "os" "path" "path/filepath" - "syscall" "testing" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} + +var ( + allTypes = []uint32{unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK} + + // allConfs is set in init(). + allConfs []Config + + rwConfs = []Config{{ROMount: false}} + roConfs = []Config{{ROMount: true}} ) func init() { @@ -39,6 +52,13 @@ func init() { } } +func configTestName(conf *Config) string { + if conf.ROMount { + return "ROMount" + } + return "RWMount" +} + func assertPanic(t *testing.T, f func()) { defer func() { if r := recover(); r == nil { @@ -63,7 +83,7 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { } want = append(want, b...) } else { - if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF { + if e, ok := err.(unix.Errno); !ok || e != unix.EBADF { return fmt.Errorf("WriteAt() should have failed, got: %d, want: EBADFD", err) } } @@ -81,78 +101,83 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error { return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want) } } else { - if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF { + if e, ok := err.(unix.Errno); !ok || e != unix.EBADF { return fmt.Errorf("ReadAt() should have failed, got: %d, want: EBADFD", err) } } return nil } -var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite} - -var ( - allTypes = []fileType{regular, directory, symlink} - - // allConfs is set in init() above. - allConfs []Config - - rwConfs = []Config{{ROMount: false}} - roConfs = []Config{{ROMount: true}} -) - type state struct { - root *localFile - file *localFile - conf Config - ft fileType + root *localFile + file *localFile + conf Config + fileType uint32 } func (s state) String() string { - return fmt.Sprintf("type(%v)", s.ft) + return fmt.Sprintf("type(%v)", s.fileType) +} + +func typeName(fileType uint32) string { + switch fileType { + case unix.S_IFREG: + return "file" + case unix.S_IFDIR: + return "directory" + case unix.S_IFLNK: + return "symlink" + default: + panic(fmt.Sprintf("invalid file type for test: %d", fileType)) + } } func runAll(t *testing.T, test func(*testing.T, state)) { runCustom(t, allTypes, allConfs, test) } -func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) { +func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) { for _, c := range confs { - t.Logf("Config: %+v", c) - for _, ft := range types { - t.Logf("File type: %v", ft) + name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft)) + t.Run(name, func(t *testing.T) { + path, name, err := setup(ft) + if err != nil { + t.Fatalf("%v", err) + } + defer os.RemoveAll(path) - path, name, err := setup(ft) - if err != nil { - t.Fatalf("%v", err) - } - defer os.RemoveAll(path) + a, err := NewAttachPoint(path, c) + if err != nil { + t.Fatalf("NewAttachPoint failed: %v", err) + } + root, err := a.Attach() + if err != nil { + t.Fatalf("Attach failed, err: %v", err) + } - a, err := NewAttachPoint(path, c) - if err != nil { - t.Fatalf("NewAttachPoint failed: %v", err) - } - root, err := a.Attach() - if err != nil { - t.Fatalf("Attach failed, err: %v", err) - } + _, file, err := root.Walk([]string{name}) + if err != nil { + root.Close() + t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) + } - _, file, err := root.Walk([]string{name}) - if err != nil { + st := state{ + root: root.(*localFile), + file: file.(*localFile), + conf: c, + fileType: ft, + } + test(t, st) + file.Close() root.Close() - t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err) - } - - st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft} - test(t, st) - file.Close() - root.Close() + }) } } } -func setup(ft fileType) (string, string, error) { - path, err := ioutil.TempDir("", "root-") +func setup(fileType uint32) (string, string, error) { + path, err := ioutil.TempDir(testutil.TmpDir(), "root-") if err != nil { return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) } @@ -169,26 +194,26 @@ func setup(ft fileType) (string, string, error) { defer root.Close() var name string - switch ft { - case regular: + switch fileType { + case unix.S_IFREG: name = "file" _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) } defer f.Close() - case directory: + case unix.S_IFDIR: name = "dir" if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err) } - case symlink: + case unix.S_IFLNK: name = "symlink" if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err) } default: - panic(fmt.Sprintf("unknown file type %v", ft)) + panic(fmt.Sprintf("unknown file type %v", fileType)) } return path, name, nil } @@ -202,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) { } func TestReadWrite(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -221,9 +246,13 @@ func TestReadWrite(t *testing.T) { if err != nil { t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err) } - if _, _, _, err := l.Open(flags); err != nil { + fd, _, _, err := l.Open(flags) + if err != nil { t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err) } + if fd != nil { + defer fd.Close() + } if err := testReadWrite(l, flags, want); err != nil { t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err) } @@ -232,14 +261,14 @@ func TestReadWrite(t *testing.T) { } func TestCreate(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { for i, flags := range allOpenFlags { _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) if err != nil { t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, flags, err) } - if err := testReadWrite(l, flags, []byte{}); err != nil { + if err := testReadWrite(l, flags, nil); err != nil { t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err) } } @@ -249,7 +278,7 @@ func TestCreate(t *testing.T) { // TestReadWriteDup tests that a file opened in any mode can be dup'ed and // reopened in any other mode. func TestReadWriteDup(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { child, err := createFile(s.file, "test") if err != nil { t.Fatalf("%v: createFile() failed, err: %v", s, err) @@ -279,9 +308,13 @@ func TestReadWriteDup(t *testing.T) { t.Fatalf("%v: Walk(<empty>) failed: %v", s, err) } defer dup.Close() - if _, _, _, err := dup.Open(dupFlags); err != nil { + fd, _, _, err := dup.Open(dupFlags) + if err != nil { t.Fatalf("%v: Open(%v) failed: %v", s, flags, err) } + if fd != nil { + defer fd.Close() + } if err := testReadWrite(dup, dupFlags, want); err != nil { t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err) } @@ -291,19 +324,45 @@ func TestReadWriteDup(t *testing.T) { } func TestUnopened(t *testing.T) { - runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s state) { b := []byte("foobar") - if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF { - t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + if _, err := s.file.WriteAt(b, 0); err != unix.EBADF { + t.Errorf("%v: WriteAt() should have failed, got: %v, expected: unix.EBADF", s, err) } - if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF { - t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + if _, err := s.file.ReadAt(b, 0); err != unix.EBADF { + t.Errorf("%v: ReadAt() should have failed, got: %v, expected: unix.EBADF", s, err) } - if _, err := s.file.Readdir(0, 100); err != syscall.EBADF { - t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err) + if _, err := s.file.Readdir(0, 100); err != unix.EBADF { + t.Errorf("%v: Readdir() should have failed, got: %v, expected: unix.EBADF", s, err) } - if err := s.file.FSync(); err != syscall.EBADF { - t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err) + if err := s.file.FSync(); err != unix.EBADF { + t.Errorf("%v: FSync() should have failed, got: %v, expected: unix.EBADF", s, err) + } + }) +} + +// TestOpenOPath is a regression test to ensure that a file that cannot be open +// for read is allowed to be open. This was happening because the control file +// was open with O_PATH, but Open() was not checking for it and allowing the +// control file to be reused. +func TestOpenOPath(t *testing.T) { + runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) { + // Fist remove all permissions on the file. + if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil { + t.Fatalf("SetAttr(): %v", err) + } + // Then walk to the file again to open a new control file. + filename := filepath.Base(s.file.hostPath) + _, newFile, err := s.root.Walk([]string{filename}) + if err != nil { + t.Fatalf("root.Walk(%q): %v", filename, err) + } + + if newFile.(*localFile).controlReadable { + t.Fatalf("control file didn't open with O_PATH: %+v", newFile) + } + if _, _, _, err := newFile.Open(p9.ReadOnly); err != unix.EACCES { + t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err) } }) } @@ -324,7 +383,7 @@ func TestSetAttrPerm(t *testing.T) { valid := p9.SetAttrMask{Permissions: true} attr := p9.SetAttr{Permissions: 0777} got, err := SetGetAttr(s.file, valid, attr) - if s.ft == symlink { + if s.fileType == unix.S_IFLNK { if err == nil { t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) } @@ -345,7 +404,7 @@ func TestSetAttrSize(t *testing.T) { valid := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: size} got, err := SetGetAttr(s.file, valid, attr) - if s.ft == symlink || s.ft == directory { + if s.fileType == unix.S_IFLNK || s.fileType == unix.S_IFDIR { if err == nil { t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) } @@ -427,9 +486,9 @@ func TestLink(t *testing.T) { } err = dir.Link(s.file, linkFile) - if s.ft == directory { - if err != syscall.EPERM { - t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err) + if s.fileType == unix.S_IFDIR { + if err != unix.EPERM { + t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: unix.EPERM", s, linkFile, err) } return } @@ -440,54 +499,64 @@ func TestLink(t *testing.T) { } func TestROMountChecks(t *testing.T) { + const want = unix.EROFS + uid := p9.UID(os.Getuid()) + gid := p9.GID(os.Getgid()) + runCustom(t, allTypes, roConfs, func(t *testing.T, s state) { - if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { - t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err) + if s.fileType != unix.S_IFLNK { + if _, _, _, err := s.file.Open(p9.WriteOnly); err != want { + t.Errorf("Open() should have failed, got: %v, expected: %v", err, want) + } + if _, _, _, err := s.file.Open(p9.ReadWrite); err != want { + t.Errorf("Open() should have failed, got: %v, expected: %v", err, want) + } + if _, _, _, err := s.file.Open(p9.ReadOnly | p9.OpenTruncate); err != want { + t.Errorf("Open() should have failed, got: %v, expected: %v", err, want) + } + f, _, _, err := s.file.Open(p9.ReadOnly) + if err != nil { + t.Errorf("Open() failed: %v", err) + } + if f != nil { + _ = f.Close() + } } - if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { - t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err) + + if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid); err != want { + t.Errorf("Create() should have failed, got: %v, expected: %v", err, want) } - if err := s.file.RenameAt("some_file", s.file, "other_file"); err != syscall.EBADF { - t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err) + if _, err := s.file.Mkdir("some_dir", 0777, uid, gid); err != want { + t.Errorf("MkDir() should have failed, got: %v, expected: %v", err, want) } - if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { - t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err) + if err := s.file.RenameAt("some_file", s.file, "other_file"); err != want { + t.Errorf("Rename() should have failed, got: %v, expected: %v", err, want) } - if err := s.file.UnlinkAt("some_file", 0); err != syscall.EBADF { - t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + if _, err := s.file.Symlink("some_place", "some_symlink", uid, gid); err != want { + t.Errorf("Symlink() should have failed, got: %v, expected: %v", err, want) } - if err := s.file.Link(s.file, "some_link"); err != syscall.EBADF { - t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err) + if err := s.file.UnlinkAt("some_file", 0); err != want { + t.Errorf("UnlinkAt() should have failed, got: %v, expected: %v", err, want) } - - valid := p9.SetAttrMask{Size: true} - attr := p9.SetAttr{Size: 0} - if err := s.file.SetAttr(valid, attr); err != syscall.EBADF { - t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err) + if err := s.file.Link(s.file, "some_link"); err != want { + t.Errorf("Link() should have failed, got: %v, expected: %v", err, want) + } + if _, err := s.file.Mknod("some-nod", 0777, 1, 2, uid, gid); err != want { + t.Errorf("Mknod() should have failed, got: %v, expected: %v", err, want) } - }) -} - -func TestROMountPanics(t *testing.T) { - conf := Config{ROMount: true, PanicOnWrite: true} - runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) { - assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") }) - assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) }) - assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) }) - assertPanic(t, func() { s.file.Link(s.file, "some_link") }) valid := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: 0} - assertPanic(t, func() { s.file.SetAttr(valid, attr) }) + if err := s.file.SetAttr(valid, attr); err != want { + t.Errorf("SetAttr() should have failed, got: %v, expected: %v", err, want) + } }) } func TestWalkNotFound(t *testing.T) { - runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { - if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { - t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err) + runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { + if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT { + t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err) } }) } @@ -506,7 +575,7 @@ func TestWalkDup(t *testing.T) { } func TestReaddir(t *testing.T) { - runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { name := "dir" if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err) @@ -631,7 +700,7 @@ func TestAttachInvalidType(t *testing.T) { defer os.RemoveAll(dir) fifo := filepath.Join(dir, "fifo") - if err := syscall.Mkfifo(fifo, 0755); err != nil { + if err := unix.Mkfifo(fifo, 0755); err != nil { t.Fatalf("Mkfifo(%q): %v", fifo, err) } @@ -690,3 +759,63 @@ func TestDoubleAttachError(t *testing.T) { t.Fatalf("Attach should have failed, got %v want non-nil", err) } } + +func TestTruncate(t *testing.T) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + child, err := createFile(s.file, "test") + if err != nil { + t.Fatalf("createFile() failed: %v", err) + } + defer child.Close() + want := []byte("foobar") + w, err := child.WriteAt(want, 0) + if err != nil { + t.Fatalf("Write() failed: %v", err) + } + if w != len(want) { + t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(want)) + } + + _, l, err := s.file.Walk([]string{"test"}) + if err != nil { + t.Fatalf("Walk(%s) failed: %v", "test", err) + } + if _, _, _, err := l.Open(p9.ReadOnly | p9.OpenTruncate); err != nil { + t.Fatalf("Open() failed: %v", err) + } + _, mask, attr, err := l.GetAttr(p9.AttrMask{Size: true}) + if err != nil { + t.Fatalf("GetAttr() failed: %v", err) + } + if !mask.Size { + t.Fatalf("GetAttr() didn't return size: %+v", mask) + } + if attr.Size != 0 { + t.Fatalf("truncate didn't work, want: 0, got: %d", attr.Size) + } + }) +} + +func TestMknod(t *testing.T) { + runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) { + _, err := s.file.Mknod("test", p9.ModeRegular|0777, 1, 2, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("Mknod() failed: %v", err) + } + + _, f, err := s.file.Walk([]string{"test"}) + if err != nil { + t.Fatalf("Walk() failed: %v", err) + } + fd, _, _, err := f.Open(p9.ReadWrite) + if err != nil { + t.Fatalf("Open() failed: %v", err) + } + if fd != nil { + defer fd.Close() + } + if err := testReadWrite(f, p9.ReadWrite, nil); err != nil { + t.Fatalf("testReadWrite() failed: %v", err) + } + }) +} diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go index 542b54365..f11fea40d 100644 --- a/runsc/fsgofer/fsgofer_unsafe.go +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -15,18 +15,18 @@ package fsgofer import ( - "syscall" "unsafe" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/syserr" ) -func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error { +func utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error { // utimensat(2) doesn't accept empty name, instead name must be nil to make it // operate directly on 'dirFd' unlike other *at syscalls. var namePtr unsafe.Pointer if name != "" { - nameBytes, err := syscall.BytePtrFromString(name) + nameBytes, err := unix.BytePtrFromString(name) if err != nil { return err } @@ -35,8 +35,8 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err timesPtr := unsafe.Pointer(×[0]) - if _, _, errno := syscall.Syscall6( - syscall.SYS_UTIMENSAT, + if _, _, errno := unix.Syscall6( + unix.SYS_UTIMENSAT, uintptr(dirFd), uintptr(namePtr), uintptr(timesPtr), @@ -52,7 +52,7 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error { var oldNamePtr unsafe.Pointer if oldName != "" { - nameBytes, err := syscall.BytePtrFromString(oldName) + nameBytes, err := unix.BytePtrFromString(oldName) if err != nil { return err } @@ -60,15 +60,15 @@ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error } var newNamePtr unsafe.Pointer if newName != "" { - nameBytes, err := syscall.BytePtrFromString(newName) + nameBytes, err := unix.BytePtrFromString(newName) if err != nil { return err } newNamePtr = unsafe.Pointer(nameBytes) } - if _, _, errno := syscall.Syscall6( - syscall.SYS_RENAMEAT, + if _, _, errno := unix.Syscall6( + unix.SYS_RENAMEAT, uintptr(oldDirFD), uintptr(oldNamePtr), uintptr(newDirFD), diff --git a/runsc/main.go b/runsc/main.go index 920ed84a5..4ce5ebee9 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -1,4 +1,4 @@ -// Copyright 2018 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,357 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Binary runsc is an implementation of the Open Container Initiative Runtime -// that runs applications inside a sandbox. +// Binary runsc implements the OCI runtime interface. package main import ( - "context" - "fmt" - "io" - "io/ioutil" - "os" - "os/signal" - "path/filepath" - "strings" - "syscall" - "time" - - "github.com/google/subcommands" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/runsc/boot" - "gvisor.dev/gvisor/runsc/cmd" - "gvisor.dev/gvisor/runsc/flag" - "gvisor.dev/gvisor/runsc/specutils" -) - -var ( - // Although these flags are not part of the OCI spec, they are used by - // Docker, and thus should not be changed. - rootDir = flag.String("root", "", "root directory for storage of container state.") - logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.") - logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.") - debug = flag.Bool("debug", false, "enable debug logging.") - showVersion = flag.Bool("version", false, "show version and exit.") - // TODO(gvisor.dev/issue/193): support systemd cgroups - systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.") - - // These flags are unique to runsc, and are used to configure parts of the - // system that are not covered by the runtime spec. - - // Debugging flags. - debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") - panicLog = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.") - logPackets = flag.Bool("log-packets", false, "enable network packet logging.") - logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") - debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") - panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.") - debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") - alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.") - - // Debugging flags: strace related - strace = flag.Bool("strace", false, "enable strace.") - straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") - straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.") - - // Flags that control sandbox runtime behavior. - platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") - network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") - softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.") - qDisc = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.") - fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") - overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem") - watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") - panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") - profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") - netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.") - numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") - rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") - referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") - cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") - vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") - - // Test flags, not to be used outside tests, ever. - testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") - testOnlyTestNameEnv = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.") + "gvisor.dev/gvisor/runsc/cli" ) func main() { - // Help and flags commands are generated automatically. - help := cmd.NewHelp(subcommands.DefaultCommander) - help.Register(new(cmd.Syscalls)) - subcommands.Register(help, "") - subcommands.Register(subcommands.FlagsCommand(), "") - - // Installation helpers. - const helperGroup = "helpers" - subcommands.Register(new(cmd.Install), helperGroup) - subcommands.Register(new(cmd.Uninstall), helperGroup) - - // Register user-facing runsc commands. - subcommands.Register(new(cmd.Checkpoint), "") - subcommands.Register(new(cmd.Create), "") - subcommands.Register(new(cmd.Delete), "") - subcommands.Register(new(cmd.Do), "") - subcommands.Register(new(cmd.Events), "") - subcommands.Register(new(cmd.Exec), "") - subcommands.Register(new(cmd.Gofer), "") - subcommands.Register(new(cmd.Kill), "") - subcommands.Register(new(cmd.List), "") - subcommands.Register(new(cmd.Pause), "") - subcommands.Register(new(cmd.PS), "") - subcommands.Register(new(cmd.Restore), "") - subcommands.Register(new(cmd.Resume), "") - subcommands.Register(new(cmd.Run), "") - subcommands.Register(new(cmd.Spec), "") - subcommands.Register(new(cmd.State), "") - subcommands.Register(new(cmd.Start), "") - subcommands.Register(new(cmd.Wait), "") - - // Register internal commands with the internal group name. This causes - // them to be sorted below the user-facing commands with empty group. - // The string below will be printed above the commands. - const internalGroup = "internal use only" - subcommands.Register(new(cmd.Boot), internalGroup) - subcommands.Register(new(cmd.Debug), internalGroup) - subcommands.Register(new(cmd.Gofer), internalGroup) - subcommands.Register(new(cmd.Statefile), internalGroup) - - // All subcommands must be registered before flag parsing. - flag.Parse() - - // Are we showing the version? - if *showVersion { - // The format here is the same as runc. - fmt.Fprintf(os.Stdout, "runsc version %s\n", version) - fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version) - os.Exit(0) - } - - // TODO(gvisor.dev/issue/193): support systemd cgroups - if *systemdCgroup { - fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193") - os.Exit(1) - } - - var errorLogger io.Writer - if *logFD > -1 { - errorLogger = os.NewFile(uintptr(*logFD), "error log file") - - } else if *logFilename != "" { - // We must set O_APPEND and not O_TRUNC because Docker passes - // the same log file for all commands (and also parses these - // log files), so we can't destroy them on each command. - var err error - errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) - if err != nil { - cmd.Fatalf("error opening log file %q: %v", *logFilename, err) - } - } - cmd.ErrorLogger = errorLogger - - platformType := *platformName - if _, err := platform.Lookup(platformType); err != nil { - cmd.Fatalf("%v", err) - } - - fsAccess, err := boot.MakeFileAccessType(*fileAccess) - if err != nil { - cmd.Fatalf("%v", err) - } - - if fsAccess == boot.FileAccessShared && *overlay { - cmd.Fatalf("overlay flag is incompatible with shared file access") - } - - netType, err := boot.MakeNetworkType(*network) - if err != nil { - cmd.Fatalf("%v", err) - } - - wa, err := boot.MakeWatchdogAction(*watchdogAction) - if err != nil { - cmd.Fatalf("%v", err) - } - - if *numNetworkChannels <= 0 { - cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels) - } - - refsLeakMode, err := boot.MakeRefsLeakMode(*referenceLeakMode) - if err != nil { - cmd.Fatalf("%v", err) - } - - queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc) - if err != nil { - cmd.Fatalf("%s", err) - } - - // Sets the reference leak check mode. Also set it in config below to - // propagate it to child processes. - refs.SetLeakMode(refsLeakMode) - - // Create a new Config from the flags. - conf := &boot.Config{ - RootDir: *rootDir, - Debug: *debug, - LogFilename: *logFilename, - LogFormat: *logFormat, - DebugLog: *debugLog, - PanicLog: *panicLog, - DebugLogFormat: *debugLogFormat, - FileAccess: fsAccess, - FSGoferHostUDS: *fsGoferHostUDS, - Overlay: *overlay, - Network: netType, - HardwareGSO: *hardwareGSO, - SoftwareGSO: *softwareGSO, - LogPackets: *logPackets, - Platform: platformType, - Strace: *strace, - StraceLogSize: *straceLogSize, - WatchdogAction: wa, - PanicSignal: *panicSignal, - ProfileEnable: *profile, - EnableRaw: *netRaw, - NumNetworkChannels: *numNetworkChannels, - Rootless: *rootless, - AlsoLogToStderr: *alsoLogToStderr, - ReferenceLeakMode: refsLeakMode, - OverlayfsStaleRead: *overlayfsStaleRead, - CPUNumFromQuota: *cpuNumFromQuota, - VFS2: *vfs2Enabled, - QDisc: queueingDiscipline, - TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, - TestOnlyTestNameEnv: *testOnlyTestNameEnv, - } - if len(*straceSyscalls) != 0 { - conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") - } - - // Set up logging. - if *debug { - log.SetLevel(log.Debug) - } - - // Logging will include the local date and time via the time package. - // - // On first use, time.Local initializes the local time zone, which - // involves opening tzdata files on the host. Since this requires - // opening host files, it must be done before syscall filter - // installation. - // - // Generally there will be a log message before filter installation - // that will force initialization, but force initialization here in - // case that does not occur. - _ = time.Local.String() - - subcommand := flag.CommandLine.Arg(0) - - var e log.Emitter - if *debugLogFD > -1 { - f := os.NewFile(uintptr(*debugLogFD), "debug log file") - - e = newEmitter(*debugLogFormat, f) - - } else if *debugLog != "" { - f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */) - if err != nil { - cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err) - } - e = newEmitter(*debugLogFormat, f) - - } else { - // Stderr is reserved for the application, just discard the logs if no debug - // log is specified. - e = newEmitter("text", ioutil.Discard) - } - - if *panicLogFD > -1 || *debugLogFD > -1 { - fd := *panicLogFD - if fd < 0 { - fd = *debugLogFD - } - // Quick sanity check to make sure no other commands get passed - // a log fd (they should use log dir instead). - if subcommand != "boot" && subcommand != "gofer" { - cmd.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand) - } - - // If we are the boot process, then we own our stdio FDs and can do what we - // want with them. Since Docker and Containerd both eat boot's stderr, we - // dup our stderr to the provided log FD so that panics will appear in the - // logs, rather than just disappear. - if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil { - cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err) - } - } else if *alsoLogToStderr { - e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)} - } - - log.SetTarget(e) - - log.Infof("***************************") - log.Infof("Args: %s", os.Args) - log.Infof("Version %s", version) - log.Infof("PID: %d", os.Getpid()) - log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) - log.Infof("Configuration:") - log.Infof("\t\tRootDir: %s", conf.RootDir) - log.Infof("\t\tPlatform: %v", conf.Platform) - log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay) - log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets) - log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls) - log.Infof("\t\tVFS2 enabled: %v", conf.VFS2) - log.Infof("***************************") - - if *testOnlyAllowRunAsCurrentUserWithoutChroot { - // SIGTERM is sent to all processes if a test exceeds its - // timeout and this case is handled by syscall_test_runner. - log.Warningf("Block the TERM signal. This is only safe in tests!") - signal.Ignore(syscall.SIGTERM) - } - - // Call the subcommand and pass in the configuration. - var ws syscall.WaitStatus - subcmdCode := subcommands.Execute(context.Background(), conf, &ws) - if subcmdCode == subcommands.ExitSuccess { - log.Infof("Exiting with status: %v", ws) - if ws.Signaled() { - // No good way to return it, emulate what the shell does. Maybe raise - // signal to self? - os.Exit(128 + int(ws.Signal())) - } - os.Exit(ws.ExitStatus()) - } - // Return an error that is unlikely to be used by the application. - log.Warningf("Failure to execute command, err: %v", subcmdCode) - os.Exit(128) -} - -func newEmitter(format string, logFile io.Writer) log.Emitter { - switch format { - case "text": - return log.GoogleEmitter{&log.Writer{Next: logFile}} - case "json": - return log.JSONEmitter{&log.Writer{Next: logFile}} - case "json-k8s": - return log.K8sJSONEmitter{&log.Writer{Next: logFile}} - } - cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format) - panic("unreachable") -} - -func init() { - // Set default root dir to something (hopefully) user-writeable. - *rootDir = "/var/run/runsc" - if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { - *rootDir = filepath.Join(runtimeDir, "runsc") - } + cli.Main(version) } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 035dcd3e3..f0a551a1e 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -26,10 +26,11 @@ go_library( "//runsc/boot", "//runsc/boot/platforms", "//runsc/cgroup", + "//runsc/config", "//runsc/console", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 209bfdb20..8f66dd1f8 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -49,26 +50,26 @@ import ( // // Run the following container to test it: // docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 -func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { +func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error { log.Infof("Setting up network") switch conf.Network { - case boot.NetworkNone: + case config.NetworkNone: log.Infof("Network is disabled, create loopback interface only") if err := createDefaultLoopbackInterface(conn); err != nil { return fmt.Errorf("creating default loopback interface: %v", err) } - case boot.NetworkSandbox: + case config.NetworkSandbox: // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels, conf.QDisc); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } - case boot.NetworkHost: + case config.NetworkHost: // Nothing to do here. default: - return fmt.Errorf("invalid network type: %d", conf.Network) + return fmt.Errorf("invalid network type: %v", conf.Network) } return nil } @@ -115,7 +116,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -134,7 +135,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG return err } if isRoot { - return fmt.Errorf("cannot run with network enabled in root network namespace") } @@ -197,11 +197,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG } link := boot.FDBasedLink{ - Name: iface.Name, - MTU: iface.MTU, - Routes: routes, - NumChannels: numNetworkChannels, - QDisc: qDisc, + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + TXChecksumOffload: txChecksumOffload, + RXChecksumOffload: rxChecksumOffload, + NumChannels: numNetworkChannels, + QDisc: qDisc, } // Get the link for the interface. @@ -307,11 +309,20 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) ( const bufSize = 4 << 20 // 4MB. if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil { - return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err) + syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize) + sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF) + + if sz < bufSize { + log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err) + } } if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil { - return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err) + syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize) + sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if sz < bufSize { + log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err) + } } return &socketEntry{deviceFile, gsoMaxSize}, nil diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 6e1a2af25..c4309feb3 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -41,6 +41,7 @@ import ( "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/cgroup" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/specutils" ) @@ -71,11 +72,14 @@ type Sandbox struct { // will have it as a child process. child bool - // status is an exit status of a sandbox process. - status syscall.WaitStatus - // statusMu protects status. statusMu sync.Mutex + + // status is the exit status of a sandbox process. It's only set if the + // child==true and the sandbox was waited on. This field allows for multiple + // threads to wait on sandbox and get the exit code, since Linux will return + // WaitStatus to one of the waiters only. + status syscall.WaitStatus } // Args is used to configure a new sandbox. @@ -116,7 +120,7 @@ type Args struct { // New creates the sandbox process. The caller must call Destroy() on the // sandbox. -func New(conf *boot.Config, args *Args) (*Sandbox, error) { +func New(conf *config.Config, args *Args) (*Sandbox, error) { s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup} // The Cleanup object cleans up partially created sandboxes when an error // occurs. Any errors occurring during cleanup itself are ignored. @@ -180,7 +184,7 @@ func (s *Sandbox) CreateContainer(cid string) error { } // StartRoot starts running the root container process inside the sandbox. -func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { +func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error { log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid) conn, err := s.sandboxConnect() if err != nil { @@ -203,7 +207,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error { } // StartContainer starts running a non-root container inside the sandbox. -func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error { +func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error { for _, f := range goferFiles { defer f.Close() } @@ -232,7 +236,7 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string } // Restore sends the restore call for a container in the sandbox. -func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error { +func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, filename string) error { log.Debugf("Restore sandbox %q", s.ID) rf, err := os.Open(filename) @@ -344,7 +348,7 @@ func (s *Sandbox) connError(err error) error { // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. -func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncFile *os.File) error { +func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error { // nextFD is used to get unused FDs that we can pass to the sandbox. It // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := 3 @@ -477,12 +481,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF cmd.Stderr = nil // If the console control socket file is provided, then create a new - // pty master/slave pair and set the TTY on the sandbox process. - if args.ConsoleSocket != "" { - cmd.Args = append(cmd.Args, "--console=true") - + // pty master/replica pair and set the TTY on the sandbox process. + if args.Spec.Process.Terminal && args.ConsoleSocket != "" { // console.NewWithSocket will send the master on the given - // socket, and return the slave. + // socket, and return the replica. tty, err := console.NewWithSocket(args.ConsoleSocket) if err != nil { return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err) @@ -557,10 +559,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the // namespace. - if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != boot.NetworkNone { + if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone { log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) - } else if conf.Network == boot.NetworkHost { + } else if conf.Network == config.NetworkHost { log.Infof("Sandbox will be started in the host network namespace") } else { log.Infof("Sandbox will be started in new network namespace") @@ -570,7 +572,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF // User namespace depends on the network type. Host network requires to run // inside the user namespace specified in the spec or the current namespace // if none is configured. - if conf.Network == boot.NetworkHost { + if conf.Network == config.NetworkHost { if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) nss = append(nss, userns) @@ -747,35 +749,47 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF // Wait waits for the containerized process to exit, and returns its WaitStatus. func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) - var ws syscall.WaitStatus if conn, err := s.sandboxConnect(); err != nil { - // The sandbox may have exited while before we had a chance to - // wait on it. + // The sandbox may have exited while before we had a chance to wait on it. + // There is nothing we can do for subcontainers. For the init container, we + // can try to get the sandbox exit code. + if !s.IsRootContainer(cid) { + return syscall.WaitStatus(0), err + } log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) } else { defer conn.Close() + // Try the Wait RPC to the sandbox. + var ws syscall.WaitStatus err = conn.Call(boot.ContainerWait, &cid, &ws) if err == nil { // It worked! return ws, nil } + // See comment above. + if !s.IsRootContainer(cid) { + return syscall.WaitStatus(0), err + } + // The sandbox may have exited after we connected, but before // or during the Wait RPC. log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) } - // The sandbox may have already exited, or exited while handling the - // Wait RPC. The best we can do is ask Linux what the sandbox exit - // status was, since in most cases that will be the same as the - // container exit status. + // The sandbox may have already exited, or exited while handling the Wait RPC. + // The best we can do is ask Linux what the sandbox exit status was, since in + // most cases that will be the same as the container exit status. if err := s.waitForStopped(); err != nil { - return ws, err + return syscall.WaitStatus(0), err } if !s.child { - return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable") + return syscall.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable") } + + s.statusMu.Lock() + defer s.statusMu.Unlock() return s.status, nil } @@ -1014,26 +1028,6 @@ func (s *Sandbox) StopCPUProfile() error { return nil } -// GoroutineProfile writes a goroutine profile to the given file. -func (s *Sandbox) GoroutineProfile(f *os.File) error { - log.Debugf("Goroutine profile %q", s.ID) - conn, err := s.sandboxConnect() - if err != nil { - return err - } - defer conn.Close() - - opts := control.ProfileOpts{ - FilePayload: urpc.FilePayload{ - Files: []*os.File{f}, - }, - } - if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil { - return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err) - } - return nil -} - // BlockProfile writes a block profile to the given file. func (s *Sandbox) BlockProfile(f *os.File) error { log.Debugf("Block profile %q", s.ID) @@ -1201,7 +1195,7 @@ func deviceFileForPlatform(name string) (*os.File, error) { // checkBinaryPermissions verifies that the required binary bits are set on // the runsc executable. -func checkBinaryPermissions(conf *boot.Config) error { +func checkBinaryPermissions(conf *config.Config) error { // All platforms need the other exe bit neededBits := os.FileMode(0001) if conf.Platform == platforms.Ptrace { diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 4ccd77f63..679d8bc8e 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -16,8 +16,10 @@ go_library( "//pkg/bits", "//pkg/log", "//pkg/sentry/kernel/auth", + "//runsc/config", "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_mohae_deepcopy//:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], @@ -28,5 +30,5 @@ go_test( size = "small", srcs = ["specutils_test.go"], library = ":specutils", - deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"], + deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"], ) diff --git a/runsc/specutils/seccomp/BUILD b/runsc/specutils/seccomp/BUILD new file mode 100644 index 000000000..3520f2d6d --- /dev/null +++ b/runsc/specutils/seccomp/BUILD @@ -0,0 +1,34 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "seccomp", + srcs = [ + "audit_amd64.go", + "audit_arm64.go", + "seccomp.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/bpf", + "//pkg/log", + "//pkg/seccomp", + "//pkg/sentry/kernel", + "//pkg/sentry/syscalls/linux", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + ], +) + +go_test( + name = "seccomp_test", + size = "small", + srcs = ["seccomp_test.go"], + library = ":seccomp", + deps = [ + "//pkg/binary", + "//pkg/bpf", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + ], +) diff --git a/runsc/specutils/seccomp/audit_amd64.go b/runsc/specutils/seccomp/audit_amd64.go new file mode 100644 index 000000000..417cf4a7a --- /dev/null +++ b/runsc/specutils/seccomp/audit_amd64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package seccomp + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" +) + +const ( + nativeArchAuditNo = linux.AUDIT_ARCH_X86_64 +) diff --git a/runsc/specutils/seccomp/audit_arm64.go b/runsc/specutils/seccomp/audit_arm64.go new file mode 100644 index 000000000..b727ceff2 --- /dev/null +++ b/runsc/specutils/seccomp/audit_arm64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package seccomp + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" +) + +const ( + nativeArchAuditNo = linux.AUDIT_ARCH_AARCH64 +) diff --git a/runsc/specutils/seccomp/seccomp.go b/runsc/specutils/seccomp/seccomp.go new file mode 100644 index 000000000..5932f7a41 --- /dev/null +++ b/runsc/specutils/seccomp/seccomp.go @@ -0,0 +1,229 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package seccomp implements some features of libseccomp in order to support +// OCI. +package seccomp + +import ( + "fmt" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +var ( + killThreadAction = linux.SECCOMP_RET_KILL_THREAD + trapAction = linux.SECCOMP_RET_TRAP + // runc always returns EPERM as the errorcode for SECCOMP_RET_ERRNO + errnoAction = linux.SECCOMP_RET_ERRNO.WithReturnCode(uint16(syscall.EPERM)) + // runc always returns EPERM as the errorcode for SECCOMP_RET_TRACE + traceAction = linux.SECCOMP_RET_TRACE.WithReturnCode(uint16(syscall.EPERM)) + allowAction = linux.SECCOMP_RET_ALLOW +) + +// BuildProgram generates a bpf program based on the given OCI seccomp +// config. +func BuildProgram(s *specs.LinuxSeccomp) (bpf.Program, error) { + defaultAction, err := convertAction(s.DefaultAction) + if err != nil { + return bpf.Program{}, fmt.Errorf("secomp default action: %w", err) + } + ruleset, err := convertRules(s) + if err != nil { + return bpf.Program{}, fmt.Errorf("invalid seccomp rules: %w", err) + } + + instrs, err := seccomp.BuildProgram(ruleset, defaultAction, killThreadAction) + if err != nil { + return bpf.Program{}, fmt.Errorf("building seccomp program: %w", err) + } + + program, err := bpf.Compile(instrs) + if err != nil { + return bpf.Program{}, fmt.Errorf("compiling seccomp program: %w", err) + } + + return program, nil +} + +// lookupSyscallNo gets the syscall number for the syscall with the given name +// for the given architecture. +func lookupSyscallNo(arch uint32, name string) (uint32, error) { + var table *kernel.SyscallTable + switch arch { + case linux.AUDIT_ARCH_X86_64: + table = slinux.AMD64 + case linux.AUDIT_ARCH_AARCH64: + table = slinux.ARM64 + } + if table == nil { + return 0, fmt.Errorf("unsupported architecture: %d", arch) + } + n, err := table.LookupNo(name) + if err != nil { + return 0, err + } + return uint32(n), nil +} + +// convertAction converts a LinuxSeccompAction to BPFAction +func convertAction(act specs.LinuxSeccompAction) (linux.BPFAction, error) { + // TODO(gvisor.dev/issue/3124): Update specs package to include ActLog and ActKillProcess. + switch act { + case specs.ActKill: + return killThreadAction, nil + case specs.ActTrap: + return trapAction, nil + case specs.ActErrno: + return errnoAction, nil + case specs.ActTrace: + return traceAction, nil + case specs.ActAllow: + return allowAction, nil + default: + return 0, fmt.Errorf("invalid action: %v", act) + } +} + +// convertRules converts OCI linux seccomp rules into RuleSets that can be used by +// the seccomp package to build a seccomp program. +func convertRules(s *specs.LinuxSeccomp) ([]seccomp.RuleSet, error) { + // NOTE: Architectures are only really relevant when calling 32bit syscalls + // on a 64bit system. Since we don't support that in gVisor anyway, we + // ignore Architectures and only test against the native architecture. + + ruleset := []seccomp.RuleSet{} + + for _, syscall := range s.Syscalls { + sysRules := seccomp.NewSyscallRules() + + action, err := convertAction(syscall.Action) + if err != nil { + return nil, err + } + + // Args + rules, err := convertArgs(syscall.Args) + if err != nil { + return nil, err + } + + for _, name := range syscall.Names { + syscallNo, err := lookupSyscallNo(nativeArchAuditNo, name) + if err != nil { + // If there is an error looking up the syscall number, assume it is + // not supported on this architecture and ignore it. This is, for + // better or worse, what runc does. + log.Warningf("OCI seccomp: ignoring syscall %q", name) + continue + } + + for _, rule := range rules { + sysRules.AddRule(uintptr(syscallNo), rule) + } + } + + ruleset = append(ruleset, seccomp.RuleSet{ + Rules: sysRules, + Action: action, + }) + } + + return ruleset, nil +} + +// convertArgs converts an OCI seccomp argument rule to a list of seccomp.Rule. +func convertArgs(args []specs.LinuxSeccompArg) ([]seccomp.Rule, error) { + argCounts := make([]uint, 6) + + for _, arg := range args { + if arg.Index > 6 { + return nil, fmt.Errorf("invalid index: %d", arg.Index) + } + + argCounts[arg.Index]++ + } + + // NOTE: If multiple rules apply to the same argument (same index) the + // action is triggered if any one of the rules matches (OR). If not, then + // all rules much match in order to trigger the action (AND). This appears to + // be some kind of legacy behavior of runc that nevertheless needs to be + // supported to maintain compatibility. + + hasMultipleArgs := false + for _, count := range argCounts { + if count > 1 { + hasMultipleArgs = true + break + } + } + + if hasMultipleArgs { + rules := []seccomp.Rule{} + + // Old runc behavior - do this for compatibility. + // Add rules as ORs by adding separate Rules. + for _, arg := range args { + rule := seccomp.Rule{nil, nil, nil, nil, nil, nil} + + if err := convertRule(arg, &rule); err != nil { + return nil, err + } + + rules = append(rules, rule) + } + + return rules, nil + } + + // Add rules as ANDs by adding to the same Rule. + rule := seccomp.Rule{nil, nil, nil, nil, nil, nil} + for _, arg := range args { + if err := convertRule(arg, &rule); err != nil { + return nil, err + } + } + + return []seccomp.Rule{rule}, nil +} + +// convertRule converts and adds the arg to a rule. +func convertRule(arg specs.LinuxSeccompArg, rule *seccomp.Rule) error { + switch arg.Op { + case specs.OpEqualTo: + rule[arg.Index] = seccomp.EqualTo(arg.Value) + case specs.OpNotEqual: + rule[arg.Index] = seccomp.NotEqual(arg.Value) + case specs.OpGreaterThan: + rule[arg.Index] = seccomp.GreaterThan(arg.Value) + case specs.OpGreaterEqual: + rule[arg.Index] = seccomp.GreaterThanOrEqual(arg.Value) + case specs.OpLessThan: + rule[arg.Index] = seccomp.LessThan(arg.Value) + case specs.OpLessEqual: + rule[arg.Index] = seccomp.LessThanOrEqual(arg.Value) + case specs.OpMaskedEqual: + rule[arg.Index] = seccomp.MaskedEqual(uintptr(arg.Value), uintptr(arg.ValueTwo)) + default: + return fmt.Errorf("unsupported operand: %q", arg.Op) + } + return nil +} diff --git a/runsc/specutils/seccomp/seccomp_test.go b/runsc/specutils/seccomp/seccomp_test.go new file mode 100644 index 000000000..850c237ba --- /dev/null +++ b/runsc/specutils/seccomp/seccomp_test.go @@ -0,0 +1,414 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccomp + +import ( + "fmt" + "syscall" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/bpf" +) + +type seccompData struct { + nr uint32 + arch uint32 + instructionPointer uint64 + args [6]uint64 +} + +// asInput converts a seccompData to a bpf.Input. +func asInput(d seccompData) bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian} +} + +// testInput creates an Input struct with given seccomp input values. +func testInput(arch uint32, syscallName string, args *[6]uint64) bpf.Input { + syscallNo, err := lookupSyscallNo(arch, syscallName) + if err != nil { + // Assume tests set valid syscall names. + panic(err) + } + + if args == nil { + argArray := [6]uint64{0, 0, 0, 0, 0, 0} + args = &argArray + } + + data := seccompData{ + nr: syscallNo, + arch: arch, + args: *args, + } + + return asInput(data) +} + +// testCase holds a seccomp test case. +type testCase struct { + name string + config specs.LinuxSeccomp + input bpf.Input + expected uint32 +} + +var ( + // seccompTests is a list of speccomp test cases. + seccompTests = []testCase{ + { + name: "default_allow", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + }, + input: testInput(nativeArchAuditNo, "read", nil), + expected: uint32(allowAction), + }, + { + name: "default_deny", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + }, + input: testInput(nativeArchAuditNo, "read", nil), + expected: uint32(errnoAction), + }, + { + name: "deny_arch", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + }, + Action: specs.ActErrno, + }, + }, + }, + // Syscall matches but the arch is AUDIT_ARCH_X86 so the return + // value is the bad arch action. + input: asInput(seccompData{nr: 183, arch: 0x40000003}), // + expected: uint32(killThreadAction), + }, + { + name: "match_name_errno", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + "chmod", + }, + Action: specs.ActErrno, + }, + { + Names: []string{ + "write", + }, + Action: specs.ActTrace, + }, + }, + }, + input: testInput(nativeArchAuditNo, "getcwd", nil), + expected: uint32(errnoAction), + }, + { + name: "match_name_trace", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + "chmod", + }, + Action: specs.ActErrno, + }, + { + Names: []string{ + "write", + }, + Action: specs.ActTrace, + }, + }, + }, + input: testInput(nativeArchAuditNo, "write", nil), + expected: uint32(traceAction), + }, + { + name: "no_match_name_allow", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + "chmod", + }, + Action: specs.ActErrno, + }, + { + Names: []string{ + "write", + }, + Action: specs.ActTrace, + }, + }, + }, + input: testInput(nativeArchAuditNo, "openat", nil), + expected: uint32(allowAction), + }, + { + name: "simple_match_args", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}), + expected: uint32(errnoAction), + }, + { + name: "match_args_or", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + Op: specs.OpEqualTo, + }, + { + Index: 0, + Value: syscall.CLONE_VM, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}), + expected: uint32(errnoAction), + }, + { + name: "match_args_and", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getsockopt", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 1, + Value: syscall.SOL_SOCKET, + Op: specs.OpEqualTo, + }, + { + Index: 2, + Value: syscall.SO_PEERCRED, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET, syscall.SO_PEERCRED}), + expected: uint32(errnoAction), + }, + { + name: "no_match_args_and", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getsockopt", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 1, + Value: syscall.SOL_SOCKET, + Op: specs.OpEqualTo, + }, + { + Index: 2, + Value: syscall.SO_PEERCRED, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET}), + expected: uint32(allowAction), + }, + { + name: "Simple args (no match)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_VM}), + expected: uint32(allowAction), + }, + { + name: "OpMaskedEqual (match)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + ValueTwo: syscall.CLONE_FS, + Op: specs.OpMaskedEqual, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS | syscall.CLONE_VM}), + expected: uint32(errnoAction), + }, + { + name: "OpMaskedEqual (no match)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS | syscall.CLONE_VM, + ValueTwo: syscall.CLONE_FS | syscall.CLONE_VM, + Op: specs.OpMaskedEqual, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}), + expected: uint32(allowAction), + }, + { + name: "OpMaskedEqual (clone)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + // This comes from the Docker default seccomp + // profile for clone. + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: 0x7e020000, + ValueTwo: 0x0, + Op: specs.OpMaskedEqual, + }, + }, + Action: specs.ActAllow, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{0x50f00}), + expected: uint32(allowAction), + }, + } +) + +// TestRunscSeccomp generates seccomp programs from OCI config and executes +// them using runsc's library, comparing against expected results. +func TestRunscSeccomp(t *testing.T) { + for _, tc := range seccompTests { + t.Run(tc.name, func(t *testing.T) { + runscProgram, err := BuildProgram(&tc.config) + if err != nil { + t.Fatalf("generating runsc BPF: %v", err) + } + + if err := checkProgram(runscProgram, tc.input, tc.expected); err != nil { + t.Fatalf("running runsc BPF: %v", err) + } + }) + } +} + +// checkProgram runs the given program over the given input and checks the +// result against the expected output. +func checkProgram(p bpf.Program, in bpf.Input, expected uint32) error { + result, err := bpf.Exec(p, in) + if err != nil { + return err + } + + if result != expected { + // Include a decoded version of the program in output for debugging purposes. + decoded, _ := bpf.DecodeProgram(p) + return fmt.Errorf("Unexpected result: got: %d, expected: %d\nBPF Program\n%s", result, expected, decoded) + } + + return nil +} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index f1fa573c5..45abc1425 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -29,11 +29,13 @@ import ( "time" "github.com/cenkalti/backoff" + "github.com/mohae/deepcopy" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/runsc/config" ) // ExePath must point to runsc binary, which is normally the same binary. It's @@ -44,20 +46,31 @@ var ExePath = "/proc/self/exe" var Version = specs.Version // LogSpec logs the spec in a human-friendly way. -func LogSpec(spec *specs.Spec) { - log.Debugf("Spec: %+v", spec) - log.Debugf("Spec.Hooks: %+v", spec.Hooks) - log.Debugf("Spec.Linux: %+v", spec.Linux) - if spec.Linux != nil && spec.Linux.Resources != nil { - res := spec.Linux.Resources - log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory) - log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU) - log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO) - log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network) - } - log.Debugf("Spec.Process: %+v", spec.Process) - log.Debugf("Spec.Root: %+v", spec.Root) - log.Debugf("Spec.Mounts: %+v", spec.Mounts) +func LogSpec(orig *specs.Spec) { + if !log.IsLogging(log.Debug) { + return + } + + // Strip down parts of the spec that are not interesting. + spec := deepcopy.Copy(orig).(*specs.Spec) + if spec.Process != nil { + spec.Process.Capabilities = nil + } + if spec.Linux != nil { + spec.Linux.Seccomp = nil + spec.Linux.MaskedPaths = nil + spec.Linux.ReadonlyPaths = nil + if spec.Linux.Resources != nil { + spec.Linux.Resources.Devices = nil + } + } + + out, err := json.MarshalIndent(spec, "", " ") + if err != nil { + log.Debugf("Failed to marshal spec: %v", err) + return + } + log.Debugf("Spec:\n%s", out) } // ValidateSpec validates that the spec is compatible with runsc. @@ -98,11 +111,6 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") } - // TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox. - if spec.Linux != nil && spec.Linux.Seccomp != nil { - log.Warningf("Seccomp spec is being ignored") - } - if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { return err @@ -149,18 +157,18 @@ func OpenSpec(bundleDir string) (*os.File, error) { // ReadSpec reads an OCI runtime spec from the given bundle directory. // ReadSpec also normalizes all potential relative paths into absolute // path, e.g. spec.Root.Path, mount.Source. -func ReadSpec(bundleDir string) (*specs.Spec, error) { +func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) { specFile, err := OpenSpec(bundleDir) if err != nil { return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) } defer specFile.Close() - return ReadSpecFromFile(bundleDir, specFile) + return ReadSpecFromFile(bundleDir, specFile, conf) } // ReadSpecFromFile reads an OCI runtime spec from the given File, and // normalizes all relative paths into absolute by prepending the bundle dir. -func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) { +func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) { if _, err := specFile.Seek(0, os.SEEK_SET); err != nil { return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) } @@ -183,6 +191,20 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) m.Source = absPath(bundleDir, m.Source) } } + + // Override flags using annotation to allow customization per sandbox + // instance. + for annotation, val := range spec.Annotations { + const flagPrefix = "dev.gvisor.flag." + if strings.HasPrefix(annotation, flagPrefix) { + name := annotation[len(flagPrefix):] + log.Infof("Overriding flag: %s=%q", name, val) + if err := conf.Override(name, val); err != nil { + return nil, err + } + } + } + return &spec, nil } @@ -322,15 +344,9 @@ func IsSupportedDevMount(m specs.Mount) bool { var existingDevices = []string{ "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr", "/dev/null", "/dev/zero", "/dev/full", "/dev/random", - "/dev/urandom", "/dev/shm", "/dev/pts", "/dev/ptmx", + "/dev/urandom", "/dev/shm", "/dev/ptmx", } dst := filepath.Clean(m.Destination) - if dst == "/dev" { - // OCI spec uses many different mounts for the things inside of '/dev'. We - // have a single mount at '/dev' that is always mounted, regardless of - // whether it was asked for, as the spec says we SHOULD. - return false - } for _, dev := range existingDevices { if dst == dev || strings.HasPrefix(dst, dev+"/") { return false @@ -403,7 +419,7 @@ func Mount(src, dst, typ string, flags uint32) error { // Special case, as there is no source directory for proc mounts. isDir = true } else if fi, err := os.Stat(src); err != nil { - return fmt.Errorf("Stat(%q) failed: %v", src, err) + return fmt.Errorf("stat(%q) failed: %v", src, err) } else { isDir = fi.IsDir() } @@ -411,25 +427,25 @@ func Mount(src, dst, typ string, flags uint32) error { if isDir { // Create the destination directory. if err := os.MkdirAll(dst, 0777); err != nil { - return fmt.Errorf("Mkdir(%q) failed: %v", dst, err) + return fmt.Errorf("mkdir(%q) failed: %v", dst, err) } } else { // Create the parent destination directory. parent := path.Dir(dst) if err := os.MkdirAll(parent, 0777); err != nil { - return fmt.Errorf("Mkdir(%q) failed: %v", parent, err) + return fmt.Errorf("mkdir(%q) failed: %v", parent, err) } // Create the destination file if it does not exist. f, err := os.OpenFile(dst, syscall.O_CREAT, 0777) if err != nil { - return fmt.Errorf("Open(%q) failed: %v", dst, err) + return fmt.Errorf("open(%q) failed: %v", dst, err) } f.Close() } // Do the mount. if err := syscall.Mount(src, dst, typ, uintptr(flags), ""); err != nil { - return fmt.Errorf("Mount(%q, %q, %d) failed: %v", src, dst, flags, err) + return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err) } return nil } |