summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
Diffstat (limited to 'runsc')
-rw-r--r--runsc/BUILD58
-rw-r--r--runsc/boot/BUILD13
-rw-r--r--runsc/boot/controller.go80
-rw-r--r--runsc/boot/filter/config.go502
-rw-r--r--runsc/boot/filter/config_amd64.go41
-rw-r--r--runsc/boot/filter/config_arm64.go25
-rw-r--r--runsc/boot/filter/config_profile.go6
-rw-r--r--runsc/boot/fs.go87
-rw-r--r--runsc/boot/fs_test.go11
-rw-r--r--runsc/boot/loader.go446
-rw-r--r--runsc/boot/loader_test.go74
-rw-r--r--runsc/boot/network.go49
-rw-r--r--runsc/boot/strace.go7
-rw-r--r--runsc/boot/vfs.go426
-rw-r--r--runsc/cgroup/BUILD4
-rw-r--r--runsc/cgroup/cgroup.go24
-rw-r--r--runsc/cmd/BUILD11
-rw-r--r--runsc/cmd/boot.go11
-rw-r--r--runsc/cmd/capability_test.go4
-rw-r--r--runsc/cmd/checkpoint.go6
-rw-r--r--runsc/cmd/create.go6
-rw-r--r--runsc/cmd/debug.go44
-rw-r--r--runsc/cmd/delete.go6
-rw-r--r--runsc/cmd/delete_test.go4
-rw-r--r--runsc/cmd/do.go10
-rw-r--r--runsc/cmd/events.go4
-rw-r--r--runsc/cmd/exec.go8
-rw-r--r--runsc/cmd/gofer.go31
-rw-r--r--runsc/cmd/kill.go4
-rw-r--r--runsc/cmd/list.go4
-rw-r--r--runsc/cmd/pause.go4
-rw-r--r--runsc/cmd/ps.go4
-rw-r--r--runsc/cmd/restore.go6
-rw-r--r--runsc/cmd/resume.go4
-rw-r--r--runsc/cmd/run.go6
-rw-r--r--runsc/cmd/spec.go220
-rw-r--r--runsc/cmd/start.go4
-rw-r--r--runsc/cmd/state.go4
-rw-r--r--runsc/cmd/wait.go4
-rw-r--r--runsc/config/BUILD28
-rw-r--r--runsc/config/config.go (renamed from runsc/boot/config.go)422
-rw-r--r--runsc/config/config_test.go272
-rw-r--r--runsc/config/flags.go205
-rw-r--r--runsc/console/console.go18
-rw-r--r--runsc/container/BUILD6
-rw-r--r--runsc/container/console_test.go11
-rw-r--r--runsc/container/container.go26
-rw-r--r--runsc/container/container_test.go251
-rw-r--r--runsc/container/multi_container_test.go228
-rw-r--r--runsc/container/shared_volume_test.go18
-rw-r--r--runsc/debian/description1
-rwxr-xr-xrunsc/debian/postinst.sh24
-rw-r--r--runsc/flag/flag.go14
-rw-r--r--runsc/fsgofer/BUILD2
-rw-r--r--runsc/fsgofer/filter/config.go170
-rw-r--r--runsc/fsgofer/filter/config_amd64.go37
-rw-r--r--runsc/fsgofer/filter/config_arm64.go21
-rw-r--r--runsc/fsgofer/filter/extra_filters_race.go1
-rw-r--r--runsc/fsgofer/fsgofer.go534
-rw-r--r--runsc/fsgofer/fsgofer_amd64_unsafe.go16
-rw-r--r--runsc/fsgofer/fsgofer_arm64_unsafe.go16
-rw-r--r--runsc/fsgofer/fsgofer_test.go345
-rw-r--r--runsc/fsgofer/fsgofer_unsafe.go18
-rw-r--r--runsc/main.go170
-rw-r--r--runsc/sandbox/BUILD3
-rw-r--r--runsc/sandbox/network.go14
-rw-r--r--runsc/sandbox/sandbox.go86
-rw-r--r--runsc/specutils/BUILD5
-rw-r--r--runsc/specutils/seccomp/BUILD34
-rw-r--r--runsc/specutils/seccomp/audit_amd64.go25
-rw-r--r--runsc/specutils/seccomp/audit_arm64.go25
-rw-r--r--runsc/specutils/seccomp/seccomp.go229
-rw-r--r--runsc/specutils/seccomp/seccomp_test.go414
-rw-r--r--runsc/specutils/specutils.go26
74 files changed, 3802 insertions, 2175 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
index 757f6d44c..33d8554af 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar")
+load("//tools:defs.bzl", "go_binary")
package(licenses = ["notice"])
@@ -17,8 +17,8 @@ go_binary(
"//pkg/log",
"//pkg/refs",
"//pkg/sentry/platform",
- "//runsc/boot",
"//runsc/cmd",
+ "//runsc/config",
"//runsc/flag",
"//runsc/specutils",
"@com_github_google_subcommands//:go_default_library",
@@ -53,66 +53,14 @@ go_binary(
"//pkg/log",
"//pkg/refs",
"//pkg/sentry/platform",
- "//runsc/boot",
"//runsc/cmd",
+ "//runsc/config",
"//runsc/flag",
"//runsc/specutils",
"@com_github_google_subcommands//:go_default_library",
],
)
-pkg_tar(
- name = "runsc-bin",
- srcs = [":runsc"],
- mode = "0755",
- package_dir = "/usr/bin",
- strip_prefix = "/runsc/linux_amd64_pure_stripped",
-)
-
-pkg_tar(
- name = "debian-data",
- extension = "tar.gz",
- deps = [
- ":runsc-bin",
- ],
-)
-
-genrule(
- name = "deb-version",
- # Note that runsc must appear in the srcs parameter and not the tools
- # parameter, otherwise it will not be stamped. This is reasonable, as tools
- # may be encoded differently in the build graph (cached more aggressively
- # because they are assumes to be hermetic).
- srcs = [":runsc"],
- outs = ["version.txt"],
- # Note that the little dance here is necessary because files in the $(SRCS)
- # attribute are not executable by default, and we can't touch in place.
- cmd = "cp $(location :runsc) $(@D)/runsc && \
- chmod a+x $(@D)/runsc && \
- $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \
- rm -f $(@D)/runsc",
- stamp = 1,
-)
-
-pkg_deb(
- name = "runsc-debian",
- architecture = "amd64",
- data = ":debian-data",
- # Note that the description_file will be flatten (all newlines removed),
- # and therefore it is kept to a simple one-line description. The expected
- # format for debian packages is "short summary\nLonger explanation of
- # tool." and this is impossible with the flattening.
- description_file = "debian/description",
- homepage = "https://gvisor.dev/",
- maintainer = "The gVisor Authors <gvisor-dev@googlegroups.com>",
- package = "runsc",
- postinst = "debian/postinst.sh",
- version_file = ":version.txt",
- visibility = [
- "//visibility:public",
- ],
-)
-
sh_test(
name = "version_test",
size = "small",
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index aad2a41de..2d9517f4a 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -8,7 +8,6 @@ go_library(
"compat.go",
"compat_amd64.go",
"compat_arm64.go",
- "config.go",
"controller.go",
"debug.go",
"events.go",
@@ -27,10 +26,13 @@ go_library(
deps = [
"//pkg/abi",
"//pkg/abi/linux",
+ "//pkg/bpf",
+ "//pkg/cleanup",
"//pkg/context",
"//pkg/control/server",
"//pkg/cpuid",
"//pkg/eventchannel",
+ "//pkg/fd",
"//pkg/fspath",
"//pkg/log",
"//pkg/memutil",
@@ -90,6 +92,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/link/fdbased",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/link/packetsocket",
"//pkg/tcpip/link/qdisc/fifo",
"//pkg/tcpip/link/sniffer",
"//pkg/tcpip/network/arp",
@@ -104,9 +107,11 @@ go_library(
"//runsc/boot/filter",
"//runsc/boot/platforms",
"//runsc/boot/pprof",
+ "//runsc/config",
"//runsc/specutils",
+ "//runsc/specutils/seccomp",
"@com_github_golang_protobuf//proto:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
@@ -122,6 +127,7 @@ go_test(
library = ":boot",
deps = [
"//pkg/control/server",
+ "//pkg/fd",
"//pkg/fspath",
"//pkg/log",
"//pkg/p9",
@@ -130,8 +136,9 @@ go_test(
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/unet",
+ "//runsc/config",
"//runsc/fsgofer",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 8125d5061..894651519 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -33,6 +34,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot/pprof"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -101,14 +103,13 @@ const (
// Profiling related commands (see pprof.go for more details).
const (
- StartCPUProfile = "Profile.StartCPUProfile"
- StopCPUProfile = "Profile.StopCPUProfile"
- HeapProfile = "Profile.HeapProfile"
- GoroutineProfile = "Profile.GoroutineProfile"
- BlockProfile = "Profile.BlockProfile"
- MutexProfile = "Profile.MutexProfile"
- StartTrace = "Profile.StartTrace"
- StopTrace = "Profile.StopTrace"
+ StartCPUProfile = "Profile.StartCPUProfile"
+ StopCPUProfile = "Profile.StopCPUProfile"
+ HeapProfile = "Profile.HeapProfile"
+ BlockProfile = "Profile.BlockProfile"
+ MutexProfile = "Profile.MutexProfile"
+ StartTrace = "Profile.StartTrace"
+ StopTrace = "Profile.StopTrace"
)
// Logging related commands (see logging.go for more details).
@@ -129,42 +130,52 @@ type controller struct {
// manager holds the containerManager methods.
manager *containerManager
+
+ // pprop holds the profile instance if enabled. It may be nil.
+ pprof *control.Profile
}
// newController creates a new controller. The caller must call
// controller.srv.StartServing() to start the controller.
func newController(fd int, l *Loader) (*controller, error) {
- srv, err := server.CreateFromFD(fd)
+ ctrl := &controller{}
+ var err error
+ ctrl.srv, err = server.CreateFromFD(fd)
if err != nil {
return nil, err
}
- manager := &containerManager{
+ ctrl.manager = &containerManager{
startChan: make(chan struct{}),
startResultChan: make(chan error),
l: l,
}
- srv.Register(manager)
+ ctrl.srv.Register(ctrl.manager)
if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
net := &Network{
Stack: eps.Stack,
}
- srv.Register(net)
+ ctrl.srv.Register(net)
}
- srv.Register(&debug{})
- srv.Register(&control.Logging{})
- if l.conf.ProfileEnable {
- srv.Register(&control.Profile{
- Kernel: l.k,
- })
+ ctrl.srv.Register(&debug{})
+ ctrl.srv.Register(&control.Logging{})
+
+ if l.root.conf.ProfileEnable {
+ ctrl.pprof = &control.Profile{Kernel: l.k}
+ ctrl.srv.Register(ctrl.pprof)
}
- return &controller{
- srv: srv,
- manager: manager,
- }, nil
+ return ctrl, nil
+}
+
+func (c *controller) stop() {
+ if c.pprof != nil {
+ // These are noop if there is nothing being profiled.
+ _ = c.pprof.StopCPUProfile(nil, nil)
+ _ = c.pprof.StopTrace(nil, nil)
+ }
}
// containerManager manages sandbox containers.
@@ -211,7 +222,7 @@ type StartArgs struct {
Spec *specs.Spec
// Config is the runsc-specific configuration for the sandbox.
- Conf *Config
+ Conf *config.Config
// CID is the ID of the container to start.
CID string
@@ -247,13 +258,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
// All validation passed, logs the spec for debugging.
specutils.LogSpec(args.Spec)
- err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+ fds, err := fd.NewFromFiles(args.FilePayload.Files)
if err != nil {
+ return err
+ }
+ defer func() {
+ for _, fd := range fds {
+ _ = fd.Close()
+ }
+ }()
+ if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
return err
}
log.Debugf("Container %q started", args.CID)
-
return nil
}
@@ -333,7 +351,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Pause the kernel while we build a new one.
cm.l.k.Pause()
- p, err := createPlatform(cm.l.conf, deviceFile)
+ p, err := createPlatform(cm.l.root.conf, deviceFile)
if err != nil {
return fmt.Errorf("creating platform: %v", err)
}
@@ -349,8 +367,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
- renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+ mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
+ renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
}
@@ -368,7 +386,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("file cannot be empty")
}
- if cm.l.conf.ProfileEnable {
+ if cm.l.root.conf.ProfileEnable {
// pprof.Initialize opens /proc/self/maps, so has to be called before
// installing seccomp filters.
pprof.Initialize()
@@ -387,13 +405,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Since we have a new kernel we also must make a new watchdog.
dogOpts := watchdog.DefaultOpts
- dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+ dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
// Change the loader fields to reflect the changes made when restoring.
cm.l.k = k
cm.l.watchdog = dog
- cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+ cm.l.root.procArgs = kernel.CreateProcessArgs{}
cm.l.restore = true
// Reinitialize the sandbox ID and processes map. Note that it doesn't
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 60e33425f..6ac19668f 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -27,41 +27,30 @@ import (
// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_CLOCK_GETTIME: {},
- syscall.SYS_CLONE: []seccomp.Rule{
- {
- seccomp.AllowValue(
- syscall.CLONE_VM |
- syscall.CLONE_FS |
- syscall.CLONE_FILES |
- syscall.CLONE_SIGHAND |
- syscall.CLONE_SYSVSEM |
- syscall.CLONE_THREAD),
- },
- },
- syscall.SYS_CLOSE: {},
- syscall.SYS_DUP: {},
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
syscall.SYS_DUP3: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.O_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.O_CLOEXEC),
},
},
syscall.SYS_EPOLL_CREATE1: {},
syscall.SYS_EPOLL_CTL: {},
syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EVENTFD2: []seccomp.Rule{
{
- seccomp.AllowValue(0),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(0),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EXIT: {},
@@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_FCHMOD: {},
syscall.SYS_FCNTL: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_SETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_SETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFD),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFD),
},
},
syscall.SYS_FSTAT: {},
@@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_FTRUNCATE: {},
syscall.SYS_FUTEX: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
},
// Non-private variants are included for flipcall support. They are otherwise
// unncessary, as the sentry will use only private futexes internally.
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE),
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE),
+ seccomp.MatchAny{},
},
},
syscall.SYS_GETPID: {},
unix.SYS_GETRANDOM: {},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_DOMAIN),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_DOMAIN),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_TYPE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TYPE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_ERROR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_ERROR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
},
},
syscall.SYS_GETTID: {},
@@ -141,34 +130,34 @@ var allowedSyscalls = seccomp.SyscallRules{
// setting/getting termios and winsize.
syscall.SYS_IOCTL: []seccomp.Rule{
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCGETS),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCGETS),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETS),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETS),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETSF),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETSF),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETSW),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETSW),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TIOCSWINSZ),
- seccomp.AllowAny{}, /* winsize struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TIOCSWINSZ),
+ seccomp.MatchAny{}, /* winsize struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TIOCGWINSZ),
- seccomp.AllowAny{}, /* winsize struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TIOCGWINSZ),
+ seccomp.MatchAny{}, /* winsize struct */
},
},
syscall.SYS_LSEEK: {},
@@ -182,46 +171,46 @@ var allowedSyscalls = seccomp.SyscallRules{
// TODO(b/148688965): Remove once this is gone from Go.
syscall.SYS_MLOCK: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(4096),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4096),
},
},
syscall.SYS_MMAP: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_SHARED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_SHARED),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ),
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
},
},
syscall.SYS_MPROTECT: {},
@@ -237,32 +226,32 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_READ: {},
syscall.SYS_RECVMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
},
},
syscall.SYS_RECVMMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
- seccomp.AllowValue(syscall.MSG_DONTWAIT),
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(fdbased.MaxMsgsPerRecv),
+ seccomp.EqualTo(syscall.MSG_DONTWAIT),
+ seccomp.EqualTo(0),
},
},
unix.SYS_SENDMMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT),
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_RESTART_SYSCALL: {},
@@ -272,57 +261,50 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_SCHED_YIELD: {},
syscall.SYS_SENDMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
},
},
syscall.SYS_SETITIMER: {},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
// Used by fs/host to shutdown host sockets.
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)},
// Used by unet to shutdown connections.
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
},
syscall.SYS_SIGALTSTACK: {},
unix.SYS_STATX: {},
syscall.SYS_SYNC_FILE_RANGE: {},
syscall.SYS_TEE: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(1), /* len */
- seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(1), /* len */
+ seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
},
},
syscall.SYS_TGKILL: []seccomp.Rule{
{
- seccomp.AllowValue(uint64(os.Getpid())),
+ seccomp.EqualTo(uint64(os.Getpid())),
},
},
syscall.SYS_UTIMENSAT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(0), /* null pathname */
- seccomp.AllowAny{},
- seccomp.AllowValue(0), /* flags */
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0), /* null pathname */
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0), /* flags */
},
},
syscall.SYS_WRITE: {},
- // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
- // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
- // option is enabled for a packet socket.
+ // For rawfile.NonBlockingWriteIovec.
syscall.SYS_WRITEV: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(2),
- },
- {
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(3),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.GreaterThan(0),
},
},
}
@@ -332,10 +314,10 @@ func hostInetFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT4: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
},
},
syscall.SYS_BIND: {},
@@ -344,84 +326,84 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_GETSOCKNAME: {},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_TOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_TOS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_RECVTOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVTOS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_TCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_TCLASS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_V6ONLY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_V6ONLY),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_ERROR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_ERROR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_KEEPALIVE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_KEEPALIVE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_RCVBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_RCVBUF),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_REUSEADDR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_TYPE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TYPE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_LINGER),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_LINGER),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_NODELAY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_NODELAY),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_INFO),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_INFO),
},
},
syscall.SYS_IOCTL: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.TIOCOUTQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.TIOCOUTQ),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.TIOCINQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.TIOCINQ),
},
},
syscall.SYS_LISTEN: {},
@@ -432,103 +414,103 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_SENDTO: {},
syscall.SYS_SETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_V6ONLY),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_V6ONLY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_RCVBUF),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_RCVBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_REUSEADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_NODELAY),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_NODELAY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_TOS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_TOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_RECVTOS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVTOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_TCLASS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_TCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_RD),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_RD),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_WR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_WR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_RDWR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_RDWR),
},
},
syscall.SYS_SOCKET: []seccomp.Rule{
{
- seccomp.AllowValue(syscall.AF_INET),
- seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET),
+ seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET),
- seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET),
+ seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET6),
- seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET6),
+ seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET6),
- seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET6),
+ seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_WRITEV: {},
@@ -539,20 +521,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT: []seccomp.Rule{
{
- seccomp.AllowValue(fd),
+ seccomp.EqualTo(fd),
},
},
syscall.SYS_LISTEN: []seccomp.Rule{
{
- seccomp.AllowValue(fd),
- seccomp.AllowValue(16 /* unet.backlog */),
+ seccomp.EqualTo(fd),
+ seccomp.EqualTo(16 /* unet.backlog */),
},
},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_PEERCRED),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_PEERCRED),
},
},
}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 5335ff82c..cea5613b8 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -24,8 +24,41 @@ import (
)
func init() {
- allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
- seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
- seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
- )
+ allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+ // TODO(b/168828518): No longer used in Go 1.16+.
+ {seccomp.EqualTo(linux.ARCH_SET_FS)},
+ }
+
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ // parent_tidptr and child_tidptr are always 0 because neither
+ // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SETTLS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
+ {
+ // TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
+ }
}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
index 7fa9bbda3..37313f97f 100644
--- a/runsc/boot/filter/config_arm64.go
+++ b/runsc/boot/filter/config_arm64.go
@@ -16,6 +16,29 @@
package filter
-// Reserve for future customization.
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
func init() {
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ // These arguments are left uninitialized by the Go
+ // runtime, so they may be anything (and are unused by
+ // the host).
+ seccomp.MatchAny{}, // parent_tidptr
+ seccomp.MatchAny{}, // tls
+ seccomp.MatchAny{}, // child_tidptr
+ },
+ }
}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
index 194952a7b..7b8669595 100644
--- a/runsc/boot/filter/config_profile.go
+++ b/runsc/boot/filter/config_profile.go
@@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_OPENAT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
},
},
}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e83584b82..ddf288456 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,10 +29,12 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -47,6 +49,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -65,7 +68,7 @@ const (
// tmpfs has some extra supported options that we must pass through.
var tmpfsAllowedData = []string{"mode", "uid", "gid"}
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
// Upper layer uses the same flags as lower, but it must be read-write.
upperFlags := lowerFlags
upperFlags.ReadOnly = false
@@ -155,7 +158,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
}
// p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
opts := []string{
"trans=fd",
"rfdno=" + strconv.Itoa(fd),
@@ -166,7 +169,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
// enablement.
opts = append(opts, "privateunixsocket=true")
}
- if fa == FileAccessShared {
+ if fa == config.FileAccessShared {
opts = append(opts, "cache=remote_revalidating")
}
return opts
@@ -251,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
// addSubmountOverlay overlays the inode over a ramfs tree containing the given
// paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
// Construct a ramfs tree of mount points. The contents never
// change, so this can be fully caching. There's no real
// filesystem backing this tree, so we set the filesystem to
@@ -261,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
if err != nil {
return nil, fmt.Errorf("creating mount tree: %v", err)
}
- overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+ overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
if err != nil {
return nil, fmt.Errorf("adding mount overlay: %v", err)
}
@@ -280,7 +283,7 @@ func subtargets(root string, mnts []specs.Mount) []string {
return targets
}
-func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
if conf.VFS2 {
return setupContainerVFS2(ctx, conf, mntr, procArgs)
}
@@ -318,14 +321,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
}
type fdDispenser struct {
- fds []int
+ fds []*fd.FD
}
func (f *fdDispenser) remove() int {
if f.empty() {
panic("fdDispenser out of fds")
}
- rv := f.fds[0]
+ rv := f.fds[0].Release()
f.fds = f.fds[1:]
return rv
}
@@ -390,6 +393,10 @@ type mountHint struct {
// root is the inode where the volume is mounted. For mounts with 'pod' share
// the volume is mounted once and then bind mounted inside the containers.
root *fs.Inode
+
+ // vfsMount is the master mount for the volume. For mounts with 'pod' share
+ // the master volume is bind mounted inside the containers.
+ vfsMount *vfs.Mount
}
func (m *mountHint) setField(key, val string) error {
@@ -447,27 +454,27 @@ func (m *mountHint) isSupported() bool {
func (m *mountHint) checkCompatible(mount specs.Mount) error {
// Remove options that don't affect to mount's behavior.
masterOpts := filterUnsupportedOptions(m.mount)
- slaveOpts := filterUnsupportedOptions(mount)
+ replicaOpts := filterUnsupportedOptions(mount)
- if len(masterOpts) != len(slaveOpts) {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ if len(masterOpts) != len(replicaOpts) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
}
sort.Strings(masterOpts)
- sort.Strings(slaveOpts)
+ sort.Strings(replicaOpts)
for i, opt := range masterOpts {
- if opt != slaveOpts[i] {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ if opt != replicaOpts[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
}
}
return nil
}
-func (m *mountHint) fileAccessType() FileAccessType {
+func (m *mountHint) fileAccessType() config.FileAccessType {
if m.share == container {
- return FileAccessExclusive
+ return config.FileAccessExclusive
}
- return FileAccessShared
+ return config.FileAccessShared
}
func filterUnsupportedOptions(mount specs.Mount) []string {
@@ -558,7 +565,7 @@ type containerMounter struct {
hints *podMountHints
}
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
return &containerMounter{
root: spec.Root,
mounts: compileMounts(spec),
@@ -571,9 +578,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
// processHints processes annotations that container hints about how volumes
// should be mounted (e.g. a volume shared between containers). It must be
// called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
if conf.VFS2 {
- return nil
+ return c.processHintsVFS2(conf, creds)
}
ctx := c.k.SupervisorContext()
for _, hint := range c.hints.mounts {
@@ -595,7 +602,7 @@ func (c *containerMounter) processHints(conf *Config) error {
// setupFS is used to set up the file system for all containers. This is the
// main entry point method, with most of the other being internal only. It
// returns the mount namespace that is created for the container.
-func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
log.Infof("Configuring container's file system")
// Create context with root credentials to mount the filesystem (the current
@@ -621,7 +628,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA
return mns, nil
}
-func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
rootInode, err := c.createRootMount(ctx, conf)
if err != nil {
return nil, fmt.Errorf("creating filesystem for container: %v", err)
@@ -633,9 +640,9 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
return mns, nil
}
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, m := range c.mounts {
log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
@@ -669,7 +676,7 @@ func (c *containerMounter) checkDispenser() error {
// mountSharedMaster mounts the master of a volume that is shared among
// containers in a pod. It returns the root mount's inode.
-func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
// Map mount type to filesystem name, and parse out the options that we are
// capable of dealing with.
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
@@ -709,7 +716,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config,
}
// createRootMount creates the root filesystem.
-func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
// First construct the filesystem from the spec.Root.
mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
@@ -734,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
// for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
// mounted even if they are not in the spec.
submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
- rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
if err != nil {
return nil, fmt.Errorf("adding submount overlay: %v", err)
}
@@ -754,7 +761,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
// used for mounts.
-func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) {
var (
fsName string
opts []string
@@ -788,19 +795,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
return fsName, opts, useOverlay, nil
}
-func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType {
if hint := c.hints.findMount(mount); hint != nil {
return hint.fileAccessType()
}
// Non-root bind mounts are always shared if no hints were provided.
- return FileAccessShared
+ return config.FileAccessShared
}
// mountSubmount mounts volumes inside the container's root. Because mounts may
// be readonly, a lower ramfs overlay is added to create the mount point dir.
// Another overlay is added with tmpfs on top if Config.Overlay is true.
// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
// Map mount type to filesystem name, and parse out the options that we are
// capable of dealing with.
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
@@ -844,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
submounts := subtargets(m.Destination, c.mounts)
if len(submounts) > 0 {
log.Infof("Adding submount overlay over %q", m.Destination)
- inode, err = addSubmountOverlay(ctx, inode, submounts)
+ inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
if err != nil {
return fmt.Errorf("adding submount overlay: %v", err)
}
@@ -863,7 +870,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
if err != nil {
return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
}
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
if err := mns.Mount(ctx, dirent, inode); err != nil {
return fmt.Errorf("mount %q error: %v", m.Destination, err)
}
@@ -884,12 +891,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
if err != nil {
return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
}
- defer target.DecRef()
+ defer target.DecRef(ctx)
// Take a ref on the inode that is about to be (re)-mounted.
source.root.IncRef()
if err := mns.Mount(ctx, target, source.root); err != nil {
- source.root.DecRef()
+ source.root.DecRef(ctx)
return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
}
@@ -899,7 +906,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
// addRestoreMount adds a mount to the MountSources map used for restoring a
// checkpointed container.
-func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
if err != nil {
return err
@@ -924,7 +931,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
// the mounts to the environment.
-func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
renv := &fs.RestoreEnvironment{
MountSources: make(map[string][]fs.MountArgs),
}
@@ -979,7 +986,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
for _, m := range c.mounts {
if filepath.Clean(m.Destination) == "/tmp" {
log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
@@ -992,12 +999,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
switch err {
case nil:
// Found '/tmp' in filesystem, check if it's empty.
- defer tmp.DecRef()
+ defer tmp.DecRef(ctx)
f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
if err != nil {
return err
}
- defer f.DecRef()
+ defer f.DecRef(ctx)
serializer := &fs.CollectEntriesSerializer{}
if err := f.Readdir(ctx, serializer); err != nil {
return err
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 912037075..e986231e5 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -20,6 +20,7 @@ import (
"testing"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/runsc/config"
)
func TestPodMountHintsHappy(t *testing.T) {
@@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) {
for _, tst := range []struct {
name string
annotations map[string]string
- want FileAccessType
+ want config.FileAccessType
}{
{
name: "container=exclusive",
@@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "container",
},
- want: FileAccessExclusive,
+ want: config.FileAccessExclusive,
},
{
name: "pod=shared",
@@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "pod",
},
- want: FileAccessShared,
+ want: config.FileAccessShared,
},
{
name: "shared=shared",
@@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "shared",
},
- want: FileAccessShared,
+ want: config.FileAccessShared,
},
{
name: "default=shared",
@@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "container",
},
- want: FileAccessShared,
+ want: config.FileAccessShared,
},
} {
t.Run(tst.name, func(t *testing.T) {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index b5df1deb9..dee2c4fbb 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,22 +16,25 @@
package boot
import (
+ "errors"
"fmt"
mrand "math/rand"
"os"
"runtime"
"sync/atomic"
- "syscall"
gtime "time"
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -66,7 +69,9 @@ import (
"gvisor.dev/gvisor/runsc/boot/filter"
_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
"gvisor.dev/gvisor/runsc/boot/pprof"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
+ "gvisor.dev/gvisor/runsc/specutils/seccomp"
// Include supported socket providers.
"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
@@ -77,6 +82,22 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
+type containerInfo struct {
+ conf *config.Config
+
+ // spec is the base configuration for the root container.
+ spec *specs.Spec
+
+ // procArgs refers to the container's init task.
+ procArgs kernel.CreateProcessArgs
+
+ // stdioFDs contains stdin, stdout, and stderr.
+ stdioFDs []*fd.FD
+
+ // goferFDs are the FDs that attach the sandbox to the gofers.
+ goferFDs []*fd.FD
+}
+
// Loader keeps state needed to start the kernel and run the container..
type Loader struct {
// k is the kernel.
@@ -85,22 +106,11 @@ type Loader struct {
// ctrl is the control server.
ctrl *controller
- conf *Config
-
- // console is set to true if terminal is enabled.
- console bool
+ // root contains information about the root container in the sandbox.
+ root containerInfo
watchdog *watchdog.Watchdog
- // stdioFDs contains stdin, stdout, and stderr.
- stdioFDs []int
-
- // goferFDs are the FDs that attach the sandbox to the gofers.
- goferFDs []int
-
- // spec is the base configuration for the root container.
- spec *specs.Spec
-
// stopSignalForwarding disables forwarding of signals to the sandboxed
// container. It should be called when a sandbox is destroyed.
stopSignalForwarding func()
@@ -108,9 +118,6 @@ type Loader struct {
// restore is set to true if we are restoring a container.
restore bool
- // rootProcArgs refers to the root sandbox init task.
- rootProcArgs kernel.CreateProcessArgs
-
// sandboxID is the ID for the whole sandbox.
sandboxID string
@@ -162,7 +169,7 @@ type Args struct {
// Spec is the sandbox specification.
Spec *specs.Spec
// Conf is the system configuration.
- Conf *Config
+ Conf *config.Config
// ControllerFD is the FD to the URPC controller. The Loader takes ownership
// of this FD and may close it at any time.
ControllerFD int
@@ -175,8 +182,6 @@ type Args struct {
// StdioFDs is the stdio for the application. The Loader takes ownership of
// these FDs and may close them at any time.
StdioFDs []int
- // Console is set to true if using TTY.
- Console bool
// NumCPU is the number of CPUs to create inside the sandbox.
NumCPU int
// TotalMem is the initial amount of total memory to report back to the
@@ -187,7 +192,7 @@ type Args struct {
}
// make sure stdioFDs are always the same on initial start and on restore
-const startingStdioFD = 64
+const startingStdioFD = 256
// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
@@ -205,6 +210,10 @@ func New(args Args) (*Loader, error) {
// Is this a VFSv2 kernel?
if args.Conf.VFS2 {
kernel.VFS2Enabled = true
+ if args.Conf.FUSE {
+ kernel.FUSEEnabled = true
+ }
+
vfs2.Override()
}
@@ -227,9 +236,7 @@ func New(args Args) (*Loader, error) {
// Create VDSO.
//
// Pass k as the platform since it is savable, unlike the actual platform.
- //
- // FIXME(b/109889800): Use non-nil context.
- vdso, err := loader.PrepareVDSO(nil, k)
+ vdso, err := loader.PrepareVDSO(k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
@@ -275,6 +282,7 @@ func New(args Args) (*Loader, error) {
args.NumCPU = runtime.NumCPU()
}
log.Infof("CPUs: %d", args.NumCPU)
+ runtime.GOMAXPROCS(args.NumCPU)
if args.TotalMem > 0 {
// Adjust the total memory returned by the Sentry so that applications that
@@ -300,6 +308,12 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
+ if kernel.VFS2Enabled {
+ if err := registerFilesystems(k); err != nil {
+ return nil, fmt.Errorf("registering filesystems: %w", err)
+ }
+ }
+
if err := adjustDirentCache(k); err != nil {
return nil, err
}
@@ -318,7 +332,7 @@ func New(args Args) (*Loader, error) {
dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
- procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+ procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
if err != nil {
return nil, fmt.Errorf("creating init process for root container: %v", err)
}
@@ -338,7 +352,7 @@ func New(args Args) (*Loader, error) {
if err != nil {
return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
}
- defer hostFilesystem.DecRef()
+ defer hostFilesystem.DecRef(k.SupervisorContext())
hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
@@ -346,37 +360,45 @@ func New(args Args) (*Loader, error) {
k.SetHostMount(hostMount)
}
+ info := containerInfo{
+ conf: args.Conf,
+ spec: args.Spec,
+ procArgs: procArgs,
+ }
+
// Make host FDs stable between invocations. Host FDs must map to the exact
// same number when the sandbox is restored. Otherwise the wrong FD will be
// used.
- var stdioFDs []int
newfd := startingStdioFD
- for _, fd := range args.StdioFDs {
- err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
- if err != nil {
- return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+ for _, stdioFD := range args.StdioFDs {
+ // Check that newfd is unused to avoid clobbering over it.
+ if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+ if err != nil {
+ return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+ }
+ return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
}
- stdioFDs = append(stdioFDs, newfd)
- err = syscall.Close(fd)
+
+ err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
if err != nil {
- return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+ return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
}
+ info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+ _ = unix.Close(stdioFD)
newfd++
}
+ for _, goferFD := range args.GoferFDs {
+ info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+ }
eid := execID{cid: args.ID}
l := &Loader{
- k: k,
- conf: args.Conf,
- console: args.Console,
- watchdog: dog,
- spec: args.Spec,
- goferFDs: args.GoferFDs,
- stdioFDs: stdioFDs,
- rootProcArgs: procArgs,
- sandboxID: args.ID,
- processes: map[execID]*execProcess{eid: {}},
- mountHints: mountHints,
+ k: k,
+ watchdog: dog,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
+ root: info,
}
// We don't care about child signals; some platforms can generate a
@@ -404,8 +426,8 @@ func New(args Args) (*Loader, error) {
return l, nil
}
-// newProcess creates a process that can be run with kernel.CreateProcess.
-func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+// createProcessArgs creates args that can be used with kernel.CreateProcess.
+func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
// Create initial limits.
ls, err := createLimitSet(spec)
if err != nil {
@@ -449,9 +471,19 @@ func (l *Loader) Destroy() {
l.stopSignalForwarding()
}
l.watchdog.Stop()
+
+ // In the success case, stdioFDs and goferFDs will only contain
+ // released/closed FDs that ownership has been passed over to host FDs and
+ // gofer sessions. Close them here in case on failure.
+ for _, fd := range l.root.stdioFDs {
+ _ = fd.Close()
+ }
+ for _, fd := range l.root.goferFDs {
+ _ = fd.Close()
+ }
}
-func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
p, err := platform.Lookup(conf.Platform)
if err != nil {
panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
@@ -478,14 +510,15 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return mf, nil
}
+// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
- if l.conf.DisableSeccomp {
+ if l.root.conf.DisableSeccomp {
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
} else {
opts := filter.Options{
Platform: l.k.Platform,
- HostNetwork: l.conf.Network == NetworkHost,
- ProfileEnable: l.conf.ProfileEnable,
+ HostNetwork: l.root.conf.Network == config.NetworkHost,
+ ProfileEnable: l.root.conf.ProfileEnable,
ControllerFD: l.ctrl.srv.FD(),
}
if err := filter.Install(opts); err != nil {
@@ -511,7 +544,7 @@ func (l *Loader) Run() error {
}
func (l *Loader) run() error {
- if l.conf.Network == NetworkHost {
+ if l.root.conf.Network == config.NetworkHost {
// Delay host network configuration to this point because network namespace
// is configured after the loader is created and before Run() is called.
log.Debugf("Configuring host network")
@@ -532,10 +565,8 @@ func (l *Loader) run() error {
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
- var ttyFile *host.TTYFileOperations
- var ttyFileVFS2 *hostvfs2.TTYFileDescription
if !l.restore {
- if l.conf.ProfileEnable {
+ if l.root.conf.ProfileEnable {
pprof.Initialize()
}
@@ -545,82 +576,30 @@ func (l *Loader) run() error {
return err
}
- // Create the FD map, which will set stdin, stdout, and stderr. If console
- // is true, then ioctl calls will be passed through to the host fd.
- ctx := l.rootProcArgs.NewContext(l.k)
- var err error
-
- // CreateProcess takes a reference on FDMap if successful. We won't need
- // ours either way.
- l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
- if err != nil {
- return fmt.Errorf("importing fds: %v", err)
- }
-
- // Setup the root container file system.
- l.startGoferMonitor(l.sandboxID, l.goferFDs)
-
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
- return err
- }
- if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
- return err
- }
-
- // Add the HOME enviroment variable if it is not already set.
- var envv []string
- if kernel.VFS2Enabled {
- envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-
- } else {
- envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
- }
- if err != nil {
- return err
- }
- l.rootProcArgs.Envv = envv
-
// Create the root container init task. It will begin running
// when the kernel is started.
- if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
- return fmt.Errorf("creating init process: %v", err)
+ if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+ return err
}
- // CreateProcess takes a reference on FDTable if successful.
- l.rootProcArgs.FDTable.DecRef()
}
ep.tg = l.k.GlobalInit()
- if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+ if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
ep.pidnsPath = ns.Path
}
- if l.console {
- // Set the foreground process group on the TTY to the global init process
- // group, since that is what we are about to start running.
- switch {
- case ttyFileVFS2 != nil:
- ep.ttyVFS2 = ttyFileVFS2
- ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- case ttyFile != nil:
- ep.tty = ttyFile
- ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- }
- }
// Handle signals by forwarding them to the root container process
// (except for panic signal, which should cause a panic).
l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
// Panic signal should cause a panic.
- if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+ if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
panic("Signal-induced panic")
}
// Otherwise forward to root container.
deliveryMode := DeliverToProcess
- if l.console {
+ if l.root.spec.Process.Terminal {
// Since we are running with a console, we should forward the signal to
// the foreground process group so that job control signals like ^C can
// be handled properly.
@@ -632,19 +611,6 @@ func (l *Loader) run() error {
}
})
- // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
- // either in createFDTable() during initial start or in descriptor.initAfterLoad()
- // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
- // passed FDs, so only close for VFS1.
- if !kernel.VFS2Enabled {
- for _, fd := range l.stdioFDs {
- err := syscall.Close(fd)
- if err != nil {
- return fmt.Errorf("close dup()ed stdioFDs: %v", err)
- }
- }
- }
-
log.Infof("Process should have started...")
l.watchdog.Start()
return l.k.Start()
@@ -664,9 +630,9 @@ func (l *Loader) createContainer(cid string) error {
}
// startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -676,8 +642,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
l.mu.Lock()
defer l.mu.Unlock()
- eid := execID{cid: cid}
- if _, ok := l.processes[eid]; !ok {
+ ep := l.processes[execID{cid: cid}]
+ if ep == nil {
return fmt.Errorf("trying to start a deleted container %q", cid)
}
@@ -711,88 +677,136 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
if pidns == nil {
pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
}
- l.processes[eid].pidnsPath = ns.Path
+ ep.pidnsPath = ns.Path
} else {
pidns = l.k.RootPIDNamespace()
}
- procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+
+ info := &containerInfo{
+ conf: conf,
+ spec: spec,
+ stdioFDs: files[:3],
+ goferFDs: files[3:],
+ }
+ info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
+ tg, err := l.createContainerProcess(false, cid, info, ep)
+ if err != nil {
+ return err
+ }
+
+ // Success!
+ l.k.StartProcess(tg)
+ ep.tg = tg
+ return nil
+}
- // setupContainerFS() dups stdioFDs, so we don't need to dup them here.
- var stdioFDs []int
- for _, f := range files[:3] {
- stdioFDs = append(stdioFDs, int(f.Fd()))
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
+ console := false
+ if root {
+ // Only root container supports terminal for now.
+ console = info.spec.Process.Terminal
}
// Create the FD map, which will set stdin, stdout, and stderr.
- ctx := procArgs.NewContext(l.k)
- fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
+ ctx := info.procArgs.NewContext(l.k)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
if err != nil {
- return fmt.Errorf("importing fds: %v", err)
- }
- // CreateProcess takes a reference on fdTable if successful. We won't
- // need ours either way.
- procArgs.FDTable = fdTable
-
- // Can't take ownership away from os.File. dup them to get a new FDs.
- var goferFDs []int
- for _, f := range files[3:] {
- fd, err := syscall.Dup(int(f.Fd()))
- if err != nil {
- return fmt.Errorf("failed to dup file: %v", err)
- }
- goferFDs = append(goferFDs, fd)
+ return nil, fmt.Errorf("importing fds: %v", err)
}
+ // CreateProcess takes a reference on fdTable if successful. We won't need
+ // ours either way.
+ info.procArgs.FDTable = fdTable
// Setup the child container file system.
- l.startGoferMonitor(cid, goferFDs)
+ l.startGoferMonitor(cid, info.goferFDs)
- mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
- if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
- return err
+ mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
+ if root {
+ if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
+ return nil, err
+ }
+ }
+ if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
+ return nil, err
}
// Add the HOME enviroment variable if it is not already set.
var envv []string
if kernel.VFS2Enabled {
- envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2,
- procArgs.Credentials.RealKUID, procArgs.Envv)
+ envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
} else {
- envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace,
- procArgs.Credentials.RealKUID, procArgs.Envv)
+ envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
}
if err != nil {
- return err
+ return nil, err
}
- procArgs.Envv = envv
+ info.procArgs.Envv = envv
// Create and start the new process.
- tg, _, err := l.k.CreateProcess(procArgs)
+ tg, _, err := l.k.CreateProcess(info.procArgs)
if err != nil {
- return fmt.Errorf("creating process: %v", err)
+ return nil, fmt.Errorf("creating process: %v", err)
}
- l.k.StartProcess(tg)
-
// CreateProcess takes a reference on FDTable if successful.
- procArgs.FDTable.DecRef()
+ info.procArgs.FDTable.DecRef(ctx)
- l.processes[eid].tg = tg
- return nil
+ // Set the foreground process group on the TTY to the global init process
+ // group, since that is what we are about to start running.
+ if root {
+ switch {
+ case ttyFileVFS2 != nil:
+ ep.ttyVFS2 = ttyFileVFS2
+ ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+ case ttyFile != nil:
+ ep.tty = ttyFile
+ ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+ }
+ }
+
+ // Install seccomp filters with the new task if there are any.
+ if info.conf.OCISeccomp {
+ if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+ program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
+ if err != nil {
+ return nil, fmt.Errorf("building seccomp program: %v", err)
+ }
+
+ if log.IsLogging(log.Debug) {
+ out, _ := bpf.DecodeProgram(program)
+ log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
+ }
+
+ task := tg.Leader()
+ // NOTE: It seems Flags are ignored by runc so we ignore them too.
+ if err := task.AppendSyscallFilter(program, true); err != nil {
+ return nil, fmt.Errorf("appending seccomp filters: %v", err)
+ }
+ }
+ } else {
+ if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+ log.Warningf("Seccomp spec is being ignored")
+ }
+ }
+
+ return tg, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
// disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
go func() {
log.Debugf("Monitoring gofer health for container %q", cid)
var events []unix.PollFd
- for _, fd := range goferFDs {
+ for _, goferFD := range goferFDs {
events = append(events, unix.PollFd{
- Fd: int32(fd),
+ Fd: int32(goferFD.FD()),
Events: unix.POLLHUP | unix.POLLRDHUP,
})
}
@@ -805,18 +819,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
}
- // Check if the gofer has stopped as part of normal container destruction.
- // This is done just to avoid sending an annoying error message to the log.
- // Note that there is a small race window in between mu.Unlock() and the
- // lock being reacquired in destroyContainer(), but it's harmless to call
- // destroyContainer() multiple times.
l.mu.Lock()
- _, ok := l.processes[execID{cid: cid}]
- l.mu.Unlock()
- if ok {
- log.Infof("Gofer socket disconnected, destroying container %q", cid)
- if err := l.destroyContainer(cid); err != nil {
- log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+ defer l.mu.Unlock()
+
+ // The gofer could have been stopped due to a normal container shutdown.
+ // Check if the container has not stopped yet.
+ if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+ log.Infof("Gofer socket disconnected, killing container %q", cid)
+ if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+ log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
}
}
}()
@@ -885,37 +896,42 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
return 0, fmt.Errorf("container %q not started", args.ContainerID)
}
- // Get the container MountNamespace from the Task.
+ // Get the container MountNamespace from the Task. Try to acquire ref may fail
+ // in case it raced with task exit.
if kernel.VFS2Enabled {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
- args.MountNamespaceVFS2.IncRef()
+ if !args.MountNamespaceVFS2.TryIncRef() {
+ return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+ }
} else {
+ var reffed bool
tg.Leader().WithMuLocked(func(t *kernel.Task) {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
args.MountNamespace = t.MountNamespace()
- args.MountNamespace.IncRef()
+ reffed = args.MountNamespace.TryIncRef()
})
+ if !reffed {
+ return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+ }
}
// Add the HOME environment variable if it is not already set.
if kernel.VFS2Enabled {
- defer args.MountNamespaceVFS2.DecRef()
-
root := args.MountNamespaceVFS2.Root()
- defer root.DecRef()
ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+ defer args.MountNamespaceVFS2.DecRef(ctx)
+ defer root.DecRef(ctx)
envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
if err != nil {
return 0, err
}
args.Envv = envv
} else {
- defer args.MountNamespace.DecRef()
-
root := args.MountNamespace.Root()
- defer root.DecRef()
ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+ defer args.MountNamespace.DecRef(ctx)
+ defer root.DecRef(ctx)
envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
if err != nil {
return 0, err
@@ -1012,20 +1028,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
// Wait for container.
l.k.WaitExited()
+ // Cleanup
+ l.ctrl.stop()
+
+ refs.OnExit()
+
return l.k.GlobalInit().ExitStatus()
}
-func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
// Create an empty network stack because the network namespace may be empty at
// this point. Netns is configured before Run() is called. Netstack is
// configured using a control uRPC message. Host network is configured inside
// Run().
switch conf.Network {
- case NetworkHost:
+ case config.NetworkHost:
// No network namespacing support for hostinet yet, hence creator is nil.
return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
- case NetworkNone, NetworkSandbox:
+ case config.NetworkNone, config.NetworkSandbox:
s, err := newEmptySandboxNetworkStack(clock, uniqueID)
if err != nil {
return nil, err
@@ -1043,8 +1064,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni
}
func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
- netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
- transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+ netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
+ transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
s := netstack.Stack{stack.New(stack.Options{
NetworkProtocols: netProtos,
TransportProtocols: transProtos,
@@ -1058,17 +1079,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
})}
// Enable SACK Recovery.
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
- return nil, fmt.Errorf("failed to enable SACK: %s", err)
+ {
+ opt := tcpip.TCPSACKEnabled(true)
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+ }
}
// Set default TTLs as required by socket/netstack.
- s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
- s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+ {
+ opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+ if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+ }
+ if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+ }
+ }
// Enable Receive Buffer Auto-Tuning.
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
- return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+ {
+ opt := tcpip.TCPModerateReceiveBufferOption(true)
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+ }
}
return &s, nil
@@ -1264,7 +1298,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
return ep.tty, ep.ttyVFS2, nil
}
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
if len(stdioFDs) != 3 {
return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
@@ -1273,7 +1307,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
fdTable := k.NewFDTable()
ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
if err != nil {
- fdTable.DecRef()
+ fdTable.DecRef(ctx)
return nil, nil, nil, err
}
return fdTable, ttyFile, ttyFileVFS2, nil
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e448fd773..1f49431a2 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
@@ -34,6 +35,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/fsgofer"
)
@@ -43,15 +45,19 @@ func init() {
if err := fsgofer.OpenProcSelfFD(); err != nil {
panic(err)
}
+ config.RegisterFlags()
}
-func testConfig() *Config {
- return &Config{
- RootDir: "unused_root_dir",
- Network: NetworkNone,
- DisableSeccomp: true,
- Platform: "ptrace",
+func testConfig() *config.Config {
+ conf, err := config.NewFromFlags()
+ if err != nil {
+ panic(err)
}
+ // Change test defaults.
+ conf.RootDir = "unused_root_dir"
+ conf.Network = config.NetworkNone
+ conf.DisableSeccomp = true
+ return conf
}
// testSpec returns a simple spec that can be used in tests.
@@ -258,7 +264,7 @@ type CreateMountTestcase struct {
expectedPaths []string
}
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
testCases := []*CreateMountTestcase{
&CreateMountTestcase{
// Only proc.
@@ -403,32 +409,26 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
Destination: "/proc",
Type: "tmpfs",
},
- // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
- // MkDirAt in VFS2 (and remove the reduntant append).
- // {
- // Destination: "/sys/bar",
- // Type: "tmpfs",
- // },
- //
+ {
+ Destination: "/sys/bar",
+ Type: "tmpfs",
+ },
+
{
Destination: "/tmp/baz",
Type: "tmpfs",
},
},
},
- expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+ expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
}
- if !vfs2 {
- vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
- vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
- }
return append(testCases, vfsCase)
}
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespace(t *testing.T) {
- for _, tc := range createMountTestcases(false /* vfs2 */) {
+ for _, tc := range createMountTestcases() {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
ctx := contexttest.Context(t)
@@ -439,7 +439,7 @@ func TestCreateMountNamespace(t *testing.T) {
}
defer cleanup()
- mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+ mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
mns, err := mntr.createMountNamespace(ctx, conf)
if err != nil {
t.Fatalf("failed to create mount namespace: %v", err)
@@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) {
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, p := range tc.expectedPaths {
maxTraversals := uint(0)
if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
} else {
- d.DecRef()
+ d.DecRef(ctx)
}
}
})
@@ -465,7 +465,7 @@ func TestCreateMountNamespace(t *testing.T) {
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespaceVFS2(t *testing.T) {
- for _, tc := range createMountTestcases(true /* vfs2 */) {
+ for _, tc := range createMountTestcases() {
t.Run(tc.name, func(t *testing.T) {
spec := testSpec()
spec.Mounts = tc.spec.Mounts
@@ -479,19 +479,19 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
defer l.Destroy()
defer loaderCleanup()
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
+ mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints)
+ if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil {
t.Fatalf("failed process hints: %v", err)
}
ctx := l.k.SupervisorContext()
- mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+ mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
if err != nil {
- t.Fatalf("failed to setupVFS2: %v", err)
+ t.Fatalf("mountAll: %v", err)
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, p := range tc.expectedPaths {
target := &vfs.PathOperation{
Root: root,
@@ -499,10 +499,10 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
Path: fspath.Parse(p),
}
- if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+ if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
} else {
- d.DecRef()
+ d.DecRef(ctx)
}
}
})
@@ -545,7 +545,7 @@ func TestRestoreEnvironment(t *testing.T) {
{
Dev: "9pfs-/",
Flags: fs.MountSourceFlags{ReadOnly: true},
- DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+ DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
},
},
"tmpfs": {
@@ -599,7 +599,7 @@ func TestRestoreEnvironment(t *testing.T) {
{
Dev: "9pfs-/",
Flags: fs.MountSourceFlags{ReadOnly: true},
- DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+ DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
},
{
Dev: "9pfs-/dev/fd-foo",
@@ -657,7 +657,7 @@ func TestRestoreEnvironment(t *testing.T) {
{
Dev: "9pfs-/",
Flags: fs.MountSourceFlags{ReadOnly: true},
- DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+ DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
},
},
"tmpfs": {
@@ -697,7 +697,11 @@ func TestRestoreEnvironment(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
- mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+ var ioFDs []*fd.FD
+ for _, ioFD := range tc.ioFDs {
+ ioFDs = append(ioFDs, fd.New(ioFD))
+ }
+ mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
actualRenv, err := mntr.createRestoreEnvironment(conf)
if !tc.errorExpected && err != nil {
t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 14d2f56a5..988573640 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -32,6 +33,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
+ "gvisor.dev/gvisor/runsc/config"
)
var (
@@ -77,44 +79,6 @@ type DefaultRoute struct {
Name string
}
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
- // QDiscNone disables any queueing for the underlying FD.
- QDiscNone QueueingDiscipline = iota
-
- // QDiscFIFO applies a simple fifo based queue to the underlying
- // FD.
- QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
- switch s {
- case "none":
- return QDiscNone, nil
- case "fifo":
- return QDiscFIFO, nil
- default:
- return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
- }
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
- switch q {
- case QDiscNone:
- return "none"
- case QDiscFIFO:
- return "fifo"
- default:
- panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
- }
-}
-
// FDBasedLink configures an fd-based link.
type FDBasedLink struct {
Name string
@@ -126,7 +90,7 @@ type FDBasedLink struct {
TXChecksumOffload bool
RXChecksumOffload bool
LinkAddress net.HardwareAddr
- QDisc QueueingDiscipline
+ QDisc config.QueueingDiscipline
// NumChannels controls how many underlying FD's are to be used to
// create this endpoint.
@@ -246,12 +210,15 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
switch link.QDisc {
- case QDiscNone:
- case QDiscFIFO:
+ case config.QDiscNone:
+ case config.QDiscFIFO:
log.Infof("Enabling FIFO QDisc on %q", link.Name)
linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
}
+ // Enable support for AF_PACKET sockets to receive outgoing packets.
+ linkEP = packetsocket.New(linkEP)
+
log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index fbfd3b07c..c21648a32 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,10 +15,13 @@
package boot
import (
+ "strings"
+
"gvisor.dev/gvisor/pkg/sentry/strace"
+ "gvisor.dev/gvisor/runsc/config"
)
-func enableStrace(conf *Config) error {
+func enableStrace(conf *config.Config) error {
// We must initialize even if strace is not enabled.
strace.Initialize()
@@ -36,5 +39,5 @@ func enableStrace(conf *Config) error {
strace.EnableAll(strace.SinkTypeLog)
return nil
}
- return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+ return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index b68117867..e36664938 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,12 +16,12 @@ package boot
import (
"fmt"
- "path"
"sort"
"strings"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
@@ -37,13 +37,19 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/runsc/config"
)
-func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+func registerFilesystems(k *kernel.Kernel) error {
+ ctx := k.SupervisorContext()
+ creds := auth.NewRootCredentials(k.RootUserNamespace())
+ vfsObj := k.VFS()
+
vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserList: true,
// TODO(b/29356795): Users may mount this once the terminals are in a
@@ -73,6 +79,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
AllowUserMount: true,
AllowUserList: true,
})
+ vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
// Setup files in devtmpfs.
if err := memdev.Register(vfsObj); err != nil {
@@ -81,18 +91,24 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
if err := ttydev.Register(vfsObj); err != nil {
return fmt.Errorf("registering ttydev: %w", err)
}
-
- if err := fuse.Register(vfsObj); err != nil {
- return fmt.Errorf("registering fusedev: %w", err)
+ tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
+ if tunSupported {
+ if err := tundev.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering tundev: %v", err)
+ }
}
- if err := tundev.Register(vfsObj); err != nil {
- return fmt.Errorf("registering tundev: %v", err)
+
+ if kernel.FUSEEnabled {
+ if err := fuse.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering fusedev: %w", err)
+ }
}
+
a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
if err != nil {
return fmt.Errorf("creating devtmpfs accessor: %w", err)
}
- defer a.Release()
+ defer a.Release(ctx)
if err := a.UserspaceInit(ctx); err != nil {
return fmt.Errorf("initializing userspace: %w", err)
@@ -103,20 +119,23 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
}
- if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
- return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+ if tunSupported {
+ if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+ return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+ }
}
- if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
- return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+
+ if kernel.FUSEEnabled {
+ if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+ return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+ }
}
+
return nil
}
-func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
- if err := mntr.k.VFS().Init(); err != nil {
- return fmt.Errorf("failed to initialize VFS: %w", err)
- }
- mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+ mns, err := mntr.mountAll(conf, procArgs)
if err != nil {
return fmt.Errorf("failed to setupFS: %w", err)
}
@@ -131,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
return nil
}
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
log.Infof("Configuring container's file system with VFS2")
// Create context with root credentials to mount the filesystem (the current
@@ -144,36 +163,115 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
rootCtx := procArgs.NewContext(c.k)
- if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
- return nil, fmt.Errorf("register filesystems: %w", err)
- }
-
mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
if err != nil {
return nil, fmt.Errorf("creating mount namespace: %w", err)
}
rootProcArgs.MountNamespaceVFS2 = mns
+ root := mns.Root()
+ defer root.DecRef(rootCtx)
+ if root.Mount().ReadOnly() {
+ // Switch to ReadWrite while we setup submounts.
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+ return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+ }
+ // Restore back to ReadOnly at the end.
+ defer func() {
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+ panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+ }
+ }()
+ }
+
// Mount submounts.
if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
}
+
return mns, nil
}
-func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
fd := c.fds.remove()
- opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+ data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+ if conf.OverlayfsStaleRead {
+ // We can't check for overlayfs here because sandbox is chroot'ed and gofer
+ // can only send mount options for specs.Mounts (specs.Root is missing
+ // Options field). So assume root is always on top of overlayfs.
+ data = append(data, "overlayfs_stale_read")
+ }
log.Infof("Mounting root over 9P, ioFD: %d", fd)
- mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+ opts := &vfs.MountOptions{
+ ReadOnly: c.root.Readonly,
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ Data: strings.Join(data, ","),
+ },
+ InternalMount: true,
+ }
+
+ fsName := gofer.Name
+ if conf.Overlay && !c.root.Readonly {
+ log.Infof("Adding overlay on top of root")
+ var err error
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting root with overlay: %w", err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
if err != nil {
return nil, fmt.Errorf("setting up mount namespace: %w", err)
}
return mns, nil
}
-func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
+// layer using tmpfs, and return overlay mount options. "cleanup" must be called
+// after the options have been used to mount the overlay, to release refs on
+// lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+ // First copy options from lower layer to upper layer and overlay. Clear
+ // filesystem specific options.
+ upperOpts := *lowerOpts
+ upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ overlayOpts := *lowerOpts
+ overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ // Next mount upper and lower. Upper is a tmpfs mount to keep all
+ // modifications inside the sandbox.
+ upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+ }
+ cu := cleanup.Make(func() { upper.DecRef(ctx) })
+ defer cu.Clean()
+
+ // All writes go to the upper layer, be paranoid and make lower readonly.
+ lowerOpts.ReadOnly = true
+ lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+ if err != nil {
+ return nil, nil, err
+ }
+ cu.Add(func() { lower.DecRef(ctx) })
+
+ // Configure overlay with both layers.
+ overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+ UpperRoot: vfs.MakeVirtualDentry(upper, upper.Root()),
+ LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())},
+ }
+ return &overlayOpts, cu.Release(), nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
mounts, err := c.prepareMountsVFS2()
if err != nil {
return err
@@ -182,8 +280,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
for i := range mounts {
submount := &mounts[i]
log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
- if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
- return err
+ var (
+ mnt *vfs.Mount
+ err error
+ )
+
+ if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+ mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
+ if err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+ }
+ } else {
+ mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
+ if err != nil {
+ return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+ }
+ }
+
+ if mnt != nil && mnt.ReadOnly() {
+ // Switch to ReadWrite while we setup submounts.
+ if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
+ return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
+ }
+ // Restore back to ReadOnly at the end.
+ defer func() {
+ if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
+ panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
+ }
+ }()
}
}
@@ -227,62 +351,83 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
return mounts, nil
}
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
- root := mns.Root()
- defer root.DecRef()
- target := &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(submount.Destination),
- }
- fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
if err != nil {
- return fmt.Errorf("mountOptions failed: %w", err)
+ return nil, fmt.Errorf("mountOptions failed: %w", err)
}
if len(fsName) == 0 {
// Filesystem is not supported (e.g. cgroup), just skip it.
- return nil
+ return nil, nil
}
- if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
- return err
+ if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
}
- if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
- return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of mount %q", submount.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(submount.Destination),
+ }
+ mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
+ if err != nil {
+ return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
}
log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
- return nil
+ return mnt, nil
}
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
// used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
- var (
- fsName string
- data []string
- )
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
+ fsName := m.Type
+ useOverlay := false
+ var data []string
// Find filesystem name and FS specific data field.
switch m.Type {
case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
- fsName = m.Type
+ // Nothing to do.
+
case nonefs:
fsName = sys.Name
- case tmpfs.Name:
- fsName = m.Type
+ case tmpfs.Name:
var err error
data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
if err != nil {
- return "", nil, err
+ return "", nil, false, err
}
case bind:
fsName = gofer.Name
+ if m.fd == 0 {
+ // Check that an FD was provided to fails fast. Technically FD=0 is valid,
+ // but unlikely to be correct in this context.
+ return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
+ }
data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
default:
log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ return "", nil, false, nil
}
opts := &vfs.MountOptions{
@@ -307,38 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
}
}
- if conf.Overlay {
- // All writes go to upper, be paranoid and make lower readonly.
- opts.ReadOnly = true
- }
- return fsName, opts, nil
-}
-
-func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
- target := &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(currentPath),
- }
- _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
- if err == nil {
- // Mount point exists, nothing else to do.
- return nil
- }
- if err != syserror.ENOENT {
- return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
- }
-
- // Recurse to ensure parent is created and then create the mount point.
- if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
- return err
- }
- log.Debugf("Creating dir %q for mount point", currentPath)
- mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
- if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
- return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
- }
- return nil
+ return fsName, opts, useOverlay, nil
}
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -350,7 +464,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
for _, m := range c.mounts {
// m.Destination has been cleaned, so it's to use equality here.
if m.Destination == "/tmp" {
@@ -360,28 +474,35 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
pop := vfs.PathOperation{
Root: root,
Start: root,
Path: fspath.Parse("/tmp"),
}
// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
- statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
+ fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
switch err {
case nil:
- // Found '/tmp' in filesystem, check if it's empty.
- if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
- // Not a dir?! Leave it be.
+ defer fd.DecRef(ctx)
+
+ err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name != "." && dirent.Name != ".." {
+ return syserror.ENOTEMPTY
+ }
return nil
- }
- if statx.Nlink > 2 {
+ }))
+ switch err {
+ case nil:
+ log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+ case syserror.ENOTEMPTY:
// If more than "." and ".." is found, skip internal tmpfs to prevent
// hiding existing files.
log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
return nil
+ default:
+ return err
}
- log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
fallthrough
case syserror.ENOENT:
@@ -394,9 +515,122 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
// another user. This is normally done for /tmp.
Options: []string{"mode=01777"},
}
- return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+ _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+ return err
+
+ case syserror.ENOTDIR:
+ // Not a dir?! Let it be.
+ return nil
default:
- return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
+ return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
+ }
+}
+
+// processHintsVFS2 processes annotations that container hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
+ ctx := c.k.SupervisorContext()
+ for _, hint := range c.hints.mounts {
+ // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+ // common gofer to mount all shared volumes.
+ if hint.mount.Type != tmpfs.Name {
+ continue
+ }
+
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.vfsMount = mnt
+ }
+ return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ mntFD := &mountAndFD{Mount: hint.mount}
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+ if err != nil {
+ return nil, err
+ }
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
+ if err := source.checkCompatible(mount); err != nil {
+ return nil, err
+ }
+
+ // Ignore data and useOverlay because these were already applied to
+ // the master mount.
+ _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+ if err != nil {
+ return nil, err
+ }
+ newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+ if err != nil {
+ return nil, err
+ }
+ defer newMnt.DecRef(ctx)
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(mount.Destination),
+ }
+
+ if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+ }
+
+ if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+ return nil, err
+ }
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return newMnt, nil
+}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dest),
+ }
+ // First check if mount point exists. When overlay is enabled, gofer doesn't
+ // allow changes to the FS, making MakeSytheticMountpoint() ineffective
+ // because MkdirAt fails with EROFS even if file exists.
+ vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+ if err == nil {
+ // File exists, we're done.
+ vd.DecRef(ctx)
+ return nil
}
+ return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
}
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index 7e34a284a..37f4253ba 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -10,7 +10,7 @@ go_library(
"//pkg/cleanup",
"//pkg/log",
"@com_github_cenkalti_backoff//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
],
)
@@ -22,6 +22,6 @@ go_test(
tags = ["local"],
deps = [
"//pkg/test/testutil",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
],
)
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index e5cc9d622..8fbc3887a 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -92,7 +92,17 @@ func setOptionalValueUint16(path, name string, val *uint16) error {
func setValue(path, name, data string) error {
fullpath := filepath.Join(path, name)
- return ioutil.WriteFile(fullpath, []byte(data), 0700)
+
+ // Retry writes on EINTR; see:
+ // https://github.com/golang/go/issues/38033
+ for {
+ err := ioutil.WriteFile(fullpath, []byte(data), 0700)
+ if err == nil {
+ return nil
+ } else if !errors.Is(err, syscall.EINTR) {
+ return err
+ }
+ }
}
func getValue(path, name string) (string, error) {
@@ -132,8 +142,16 @@ func fillFromAncestor(path string) (string, error) {
if err != nil {
return "", err
}
- if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil {
- return "", err
+
+ // Retry writes on EINTR; see:
+ // https://github.com/golang/go/issues/38033
+ for {
+ err := ioutil.WriteFile(path, []byte(val), 0700)
+ if err == nil {
+ break
+ } else if !errors.Is(err, syscall.EINTR) {
+ return "", err
+ }
}
return val, nil
}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index dae9b3b3e..2556f6d9e 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -51,6 +51,7 @@ go_library(
"//pkg/unet",
"//pkg/urpc",
"//runsc/boot",
+ "//runsc/config",
"//runsc/console",
"//runsc/container",
"//runsc/flag",
@@ -58,7 +59,7 @@ go_library(
"//runsc/fsgofer/filter",
"//runsc/specutils",
"@com_github_google_subcommands//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
@@ -84,12 +85,12 @@ go_test(
"//pkg/sentry/kernel/auth",
"//pkg/test/testutil",
"//pkg/urpc",
- "//runsc/boot",
+ "//runsc/config",
"//runsc/container",
"//runsc/specutils",
- "@com_github_google_go-cmp//cmp:go_default_library",
- "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
],
)
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 01204ab4d..cd419e1aa 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -27,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -54,10 +55,6 @@ type Boot struct {
// provided in that order.
stdioFDs intFlags
- // console is set to true if the sandbox should allow terminal ioctl(2)
- // syscalls.
- console bool
-
// applyCaps determines if capabilities defined in the spec should be applied
// to the process.
applyCaps bool
@@ -115,7 +112,6 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
- f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
@@ -138,7 +134,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Ensure that if there is a panic, all goroutine stacks are printed.
debug.SetTraceback("system")
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
if b.attached {
// Ensure this process is killed after parent process terminates when
@@ -172,7 +168,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Get the spec from the specFD.
specFile := os.NewFile(uintptr(b.specFD), "spec file")
defer specFile.Close()
- spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile)
+ spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
if err != nil {
Fatalf("reading spec: %v", err)
}
@@ -229,7 +225,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
Device: os.NewFile(uintptr(b.deviceFD), "platform device"),
GoferFDs: b.ioFDs.GetArray(),
StdioFDs: b.stdioFDs.GetArray(),
- Console: b.console,
NumCPU: b.cpuNum,
TotalMem: b.totalMem,
UserLogFD: b.userLogFD,
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index a84067112..e13a94486 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -24,7 +24,7 @@ import (
"github.com/syndtr/gocapability/capability"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/test/testutil"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -88,7 +88,7 @@ func TestCapabilities(t *testing.T) {
conf := testutil.TestConfig(t)
// Use --network=host to make sandbox use spec's capabilities.
- conf.Network = boot.NetworkHost
+ conf.Network = config.NetworkHost
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index 8a29e521e..8fe0c427a 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -22,7 +22,7 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/specutils"
@@ -72,7 +72,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
waitStatus := args[1].(*syscall.WaitStatus)
cont, err := container.Load(conf.RootDir, id)
@@ -118,7 +118,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
Fatalf("setting bundleDir")
}
- spec, err := specutils.ReadSpec(bundleDir)
+ spec, err := specutils.ReadSpec(bundleDir, conf)
if err != nil {
Fatalf("reading spec: %v", err)
}
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 910e97577..e76f7ba1d 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -18,7 +18,7 @@ import (
"context"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/specutils"
@@ -81,7 +81,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
if conf.Rootless {
return Errorf("Rootless mode not supported with %q", c.Name())
@@ -91,7 +91,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
if bundleDir == "" {
bundleDir = getwdOrDie()
}
- spec, err := specutils.ReadSpec(bundleDir)
+ spec, err := specutils.ReadSpec(bundleDir, conf)
if err != nil {
return Errorf("reading spec: %v", err)
}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index b5de2588b..132198222 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -25,27 +25,26 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
// Debug implements subcommands.Command for the "debug" command.
type Debug struct {
- pid int
- stacks bool
- signal int
- profileHeap string
- profileCPU string
- profileGoroutine string
- profileBlock string
- profileMutex string
- trace string
- strace string
- logLevel string
- logPackets string
- duration time.Duration
- ps bool
+ pid int
+ stacks bool
+ signal int
+ profileHeap string
+ profileCPU string
+ profileBlock string
+ profileMutex string
+ trace string
+ strace string
+ logLevel string
+ logPackets string
+ duration time.Duration
+ ps bool
}
// Name implements subcommands.Command.
@@ -69,7 +68,6 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
- f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
@@ -84,7 +82,7 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
// Execute implements subcommands.Command.Execute.
func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
var c *container.Container
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
if d.pid == 0 {
// No pid, container ID must have been provided.
@@ -153,18 +151,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
log.Infof("Heap profile written to %q", d.profileHeap)
}
- if d.profileGoroutine != "" {
- f, err := os.Create(d.profileGoroutine)
- if err != nil {
- return Errorf(err.Error())
- }
- defer f.Close()
-
- if err := c.Sandbox.GoroutineProfile(f); err != nil {
- return Errorf(err.Error())
- }
- log.Infof("Goroutine profile written to %q", d.profileGoroutine)
- }
if d.profileBlock != "" {
f, err := os.Create(d.profileBlock)
if err != nil {
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index 0e4863f50..4e49deff8 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -21,7 +21,7 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -59,14 +59,14 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
return subcommands.ExitUsageError
}
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
if err := d.execute(f.Args(), conf); err != nil {
Fatalf("%v", err)
}
return subcommands.ExitSuccess
}
-func (d *Delete) execute(ids []string, conf *boot.Config) error {
+func (d *Delete) execute(ids []string, conf *config.Config) error {
for _, id := range ids {
c, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/cmd/delete_test.go b/runsc/cmd/delete_test.go
index cb59516a3..e2d994a05 100644
--- a/runsc/cmd/delete_test.go
+++ b/runsc/cmd/delete_test.go
@@ -18,7 +18,7 @@ import (
"io/ioutil"
"testing"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
)
func TestNotFound(t *testing.T) {
@@ -27,7 +27,7 @@ func TestNotFound(t *testing.T) {
if err != nil {
t.Fatalf("error creating dir: %v", err)
}
- conf := &boot.Config{RootDir: dir}
+ conf := &config.Config{RootDir: dir}
d := Delete{}
if err := d.execute(ids, conf); err == nil {
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 7d1310c96..d1f2e9e6d 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -30,7 +30,7 @@ import (
"github.com/google/subcommands"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/specutils"
@@ -82,7 +82,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
return subcommands.ExitUsageError
}
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
waitStatus := args[1].(*syscall.WaitStatus)
if conf.Rootless {
@@ -125,7 +125,7 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
specutils.LogSpec(spec)
cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
- if conf.Network == boot.NetworkNone {
+ if conf.Network == config.NetworkNone {
netns := specs.LinuxNamespace{
Type: specs.NetworkNamespace,
}
@@ -135,9 +135,9 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
} else if conf.Rootless {
- if conf.Network == boot.NetworkSandbox {
+ if conf.Network == config.NetworkSandbox {
c.notifyUser("*** Warning: using host network due to --rootless ***")
- conf.Network = boot.NetworkHost
+ conf.Network = config.NetworkHost
}
} else {
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 51f6a98ed..25fe2cf1c 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -22,7 +22,7 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -72,7 +72,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
c, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index d9a94903e..775ed4b43 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -33,7 +33,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/urpc"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/console"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
@@ -105,7 +105,7 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
// Execute implements subcommands.Command.Execute. It starts a process in an
// already created container.
func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
e, id, err := ex.parseArgs(f, conf.EnableRaw)
if err != nil {
Fatalf("parsing process spec: %v", err)
@@ -220,7 +220,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
cmd.Stderr = os.Stderr
// If the console control socket file is provided, then create a new
- // pty master/slave pair and set the TTY on the sandbox process.
+ // pty master/replica pair and set the TTY on the sandbox process.
if ex.consoleSocket != "" {
// Create a new TTY pair and send the master on the provided socket.
tty, err := console.NewWithSocket(ex.consoleSocket)
@@ -229,7 +229,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi
}
defer tty.Close()
- // Set stdio to the new TTY slave.
+ // Set stdio to the new TTY replica.
cmd.Stdin = tty
cmd.Stdout = tty
cmd.Stderr = tty
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 3966e2d21..371fcc0ae 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -30,7 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/fsgofer"
"gvisor.dev/gvisor/runsc/fsgofer/filter"
@@ -62,9 +62,8 @@ type Gofer struct {
applyCaps bool
setUpRoot bool
- panicOnWrite bool
- specFD int
- mountsFD int
+ specFD int
+ mountsFD int
}
// Name implements subcommands.Command.
@@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) {
f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
- f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected")
f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
@@ -100,15 +98,15 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
return subcommands.ExitUsageError
}
+ conf := args[0].(*config.Config)
+
specFile := os.NewFile(uintptr(g.specFD), "spec file")
defer specFile.Close()
- spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile)
+ spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
if err != nil {
Fatalf("reading spec: %v", err)
}
- conf := args[0].(*boot.Config)
-
if g.setUpRoot {
if err := setupRootFS(spec, conf); err != nil {
Fatalf("Error setting up root FS: %v", err)
@@ -168,8 +166,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Start with root mount, then add any other additional mount as needed.
ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
- ROMount: spec.Root.Readonly || conf.Overlay,
- PanicOnWrite: g.panicOnWrite,
+ ROMount: spec.Root.Readonly || conf.Overlay,
})
if err != nil {
Fatalf("creating attach point: %v", err)
@@ -181,9 +178,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
for _, m := range spec.Mounts {
if specutils.Is9PMount(m) {
cfg := fsgofer.Config{
- ROMount: isReadonlyMount(m.Options) || conf.Overlay,
- PanicOnWrite: g.panicOnWrite,
- HostUDS: conf.FSGoferHostUDS,
+ ROMount: isReadonlyMount(m.Options) || conf.Overlay,
+ HostUDS: conf.FSGoferHostUDS,
}
ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
if err != nil {
@@ -263,7 +259,7 @@ func isReadonlyMount(opts []string) bool {
return false
}
-func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
+func setupRootFS(spec *specs.Spec, conf *config.Config) error {
// Convert all shared mounts into slaves to be sure that nothing will be
// propagated outside of our namespace.
if err := syscall.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
@@ -316,6 +312,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
if err != nil {
return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
}
+ log.Infof("Create working directory %q if needed", spec.Process.Cwd)
if err := os.MkdirAll(dst, 0755); err != nil {
return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
}
@@ -346,7 +343,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
// setupMounts binds mount all mounts specified in the spec in their correct
// location inside root. It will resolve relative paths and symlinks. It also
// creates directories as needed.
-func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
+func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error {
for _, m := range mounts {
if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
continue
@@ -385,7 +382,7 @@ func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
// Otherwise, it may follow symlinks to locations that would be overwritten
// with another mount point and return the wrong location. In short, make sure
// setupMounts() has been called before.
-func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
+func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
cleanMounts := make([]specs.Mount, 0, len(mounts))
for _, m := range mounts {
if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
@@ -467,7 +464,7 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro
}
// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
-func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) {
+func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
rv := make([]string, len(opts))
copy(rv, opts)
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index 8282ea0e0..04eee99b2 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -23,7 +23,7 @@ import (
"github.com/google/subcommands"
"golang.org/x/sys/unix"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -63,7 +63,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
if k.pid != 0 && k.all {
Fatalf("it is invalid to specify both --all and --pid")
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index d8d906fe3..f92d6fef9 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -24,7 +24,7 @@ import (
"github.com/google/subcommands"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -63,7 +63,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
return subcommands.ExitUsageError
}
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
ids, err := container.List(conf.RootDir)
if err != nil {
Fatalf("%v", err)
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index 6f95a9837..0eb1402ed 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -18,7 +18,7 @@ import (
"context"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -53,7 +53,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
cont, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 7fb8041af..bc58c928f 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -20,7 +20,7 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/sentry/control"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -58,7 +58,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
c, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 72584b326..096ec814c 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -20,7 +20,7 @@ import (
"syscall"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/specutils"
@@ -77,7 +77,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
waitStatus := args[1].(*syscall.WaitStatus)
if conf.Rootless {
@@ -88,7 +88,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
if bundleDir == "" {
bundleDir = getwdOrDie()
}
- spec, err := specutils.ReadSpec(bundleDir)
+ spec, err := specutils.ReadSpec(bundleDir, conf)
if err != nil {
return Errorf("reading spec: %v", err)
}
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index 61a55a554..f24823f99 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -18,7 +18,7 @@ import (
"context"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -54,7 +54,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
cont, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index cf41581ad..c48cbe4cd 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -19,7 +19,7 @@ import (
"syscall"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/specutils"
@@ -64,7 +64,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
waitStatus := args[1].(*syscall.WaitStatus)
if conf.Rootless {
@@ -75,7 +75,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
if bundleDir == "" {
bundleDir = getwdOrDie()
}
- spec, err := specutils.ReadSpec(bundleDir)
+ spec, err := specutils.ReadSpec(bundleDir, conf)
if err != nil {
return Errorf("reading spec: %v", err)
}
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index a2b0a4b14..55194e641 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -16,124 +16,122 @@ package cmd
import (
"context"
- "fmt"
- "io/ioutil"
+ "encoding/json"
+ "io"
"os"
"path/filepath"
"github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/runsc/flag"
)
-func genSpec(cwd string) []byte {
- var template = fmt.Sprintf(`{
- "ociVersion": "1.0.0",
- "process": {
- "terminal": true,
- "user": {
- "uid": 0,
- "gid": 0
- },
- "args": [
- "sh"
- ],
- "env": [
- "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
- "TERM=xterm"
- ],
- "cwd": "%s",
- "capabilities": {
- "bounding": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "effective": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "inheritable": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "permitted": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "ambient": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ]
- },
- "rlimits": [
- {
- "type": "RLIMIT_NOFILE",
- "hard": 1024,
- "soft": 1024
- }
- ]
- },
- "root": {
- "path": "rootfs",
- "readonly": true
- },
- "hostname": "runsc",
- "mounts": [
- {
- "destination": "/proc",
- "type": "proc",
- "source": "proc"
+func writeSpec(w io.Writer, cwd string, netns string, args []string) error {
+ spec := &specs.Spec{
+ Version: "1.0.0",
+ Process: &specs.Process{
+ Terminal: true,
+ User: specs.User{
+ UID: 0,
+ GID: 0,
+ },
+ Args: args,
+ Env: []string{
+ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+ "TERM=xterm",
+ },
+ Cwd: cwd,
+ Capabilities: &specs.LinuxCapabilities{
+ Bounding: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Effective: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Inheritable: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Permitted: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ // TODO(gvisor.dev/issue/3166): support ambient capabilities
+ },
+ Rlimits: []specs.POSIXRlimit{
+ {
+ Type: "RLIMIT_NOFILE",
+ Hard: 1024,
+ Soft: 1024,
+ },
+ },
},
- {
- "destination": "/dev",
- "type": "tmpfs",
- "source": "tmpfs",
- "options": []
+ Root: &specs.Root{
+ Path: "rootfs",
+ Readonly: true,
},
- {
- "destination": "/sys",
- "type": "sysfs",
- "source": "sysfs",
- "options": [
- "nosuid",
- "noexec",
- "nodev",
- "ro"
- ]
- }
- ],
- "linux": {
- "namespaces": [
+ Hostname: "runsc",
+ Mounts: []specs.Mount{
{
- "type": "pid"
+ Destination: "/proc",
+ Type: "proc",
+ Source: "proc",
},
{
- "type": "network"
+ Destination: "/dev",
+ Type: "tmpfs",
+ Source: "tmpfs",
},
{
- "type": "ipc"
+ Destination: "/sys",
+ Type: "sysfs",
+ Source: "sysfs",
+ Options: []string{
+ "nosuid",
+ "noexec",
+ "nodev",
+ "ro",
+ },
},
- {
- "type": "uts"
+ },
+ Linux: &specs.Linux{
+ Namespaces: []specs.LinuxNamespace{
+ {
+ Type: "pid",
+ },
+ {
+ Type: "network",
+ Path: netns,
+ },
+ {
+ Type: "ipc",
+ },
+ {
+ Type: "uts",
+ },
+ {
+ Type: "mount",
+ },
},
- {
- "type": "mount"
- }
- ]
+ },
}
-}`, cwd)
- return []byte(template)
+ e := json.NewEncoder(w)
+ e.SetIndent("", " ")
+ return e.Encode(spec)
}
// Spec implements subcommands.Command for the "spec" command.
type Spec struct {
bundle string
cwd string
+ netns string
}
// Name implements subcommands.Command.Name.
@@ -148,21 +146,26 @@ func (*Spec) Synopsis() string {
// Usage implements subcommands.Command.Usage.
func (*Spec) Usage() string {
- return `spec [options] - create a new OCI bundle specification file.
+ return `spec [options] [-- args...] - create a new OCI bundle specification file.
+
+The spec command creates a new specification file (config.json) for a new OCI
+bundle.
-The spec command creates a new specification file (config.json) for a new OCI bundle.
+The specification file is a starter file that runs the command specified by
+'args' in the container. If 'args' is not specified the default is to run the
+'sh' program.
-The specification file is a starter file that runs the "sh" command in the container. You
-should edit the file to suit your needs. You can find out more about the format of the
-specification file by visiting the OCI runtime spec repository:
+While a number of flags are provided to change values in the specification, you
+can examine the file and edit it to suit your needs after this command runs.
+You can find out more about the format of the specification file by visiting
+the OCI runtime spec repository:
https://github.com/opencontainers/runtime-spec/
EXAMPLE:
$ mkdir -p bundle/rootfs
$ cd bundle
- $ runsc spec
+ $ runsc spec -- /hello
$ docker export $(docker create hello-world) | tar -xf - -C rootfs
- $ sed -i 's;"sh";"/hello";' config.json
$ sudo runsc run hello
`
@@ -173,18 +176,29 @@ func (s *Spec) SetFlags(f *flag.FlagSet) {
f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+
"this value MUST be an absolute path")
+ f.StringVar(&s.netns, "netns", "", "network namespace path")
}
// Execute implements subcommands.Command.Execute.
func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ // Grab the arguments.
+ containerArgs := f.Args()
+ if len(containerArgs) == 0 {
+ containerArgs = []string{"sh"}
+ }
+
confPath := filepath.Join(s.bundle, "config.json")
if _, err := os.Stat(confPath); !os.IsNotExist(err) {
Fatalf("file %q already exists", confPath)
}
- var spec = genSpec(s.cwd)
+ configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664)
+ if err != nil {
+ Fatalf("opening file %q: %v", confPath, err)
+ }
- if err := ioutil.WriteFile(confPath, spec, 0664); err != nil {
+ err = writeSpec(configFile, s.cwd, s.netns, containerArgs)
+ if err != nil {
Fatalf("writing to %q: %v", confPath, err)
}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 0205fd9f7..88991b521 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -18,7 +18,7 @@ import (
"context"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -52,7 +52,7 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
c, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index cf2413deb..2bd2ab9f8 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -21,7 +21,7 @@ import (
"github.com/google/subcommands"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -55,7 +55,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
c, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index 29c0a15f0..28d0642ed 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -21,7 +21,7 @@ import (
"syscall"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -70,7 +70,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
id := f.Arg(0)
- conf := args[0].(*boot.Config)
+ conf := args[0].(*config.Config)
c, err := container.Load(conf.RootDir, id)
if err != nil {
diff --git a/runsc/config/BUILD b/runsc/config/BUILD
new file mode 100644
index 000000000..b1672bb9d
--- /dev/null
+++ b/runsc/config/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "config",
+ srcs = [
+ "config.go",
+ "flags.go",
+ ],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/refs",
+ "//pkg/sentry/watchdog",
+ "//pkg/sync",
+ "//runsc/flag",
+ ],
+)
+
+go_test(
+ name = "config_test",
+ size = "small",
+ srcs = [
+ "config_test.go",
+ ],
+ library = ":config",
+ deps = ["//runsc/flag"],
+)
diff --git a/runsc/boot/config.go b/runsc/config/config.go
index bb01b8fb5..f30f79f68 100644
--- a/runsc/boot/config.go
+++ b/runsc/config/config.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,220 +12,112 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package boot
+// Package config provides basic infrastructure to set configuration settings
+// for runsc. The configuration is set by flags to the command line. They can
+// also propagate to a different process using the same flags.
+package config
import (
"fmt"
- "strconv"
- "strings"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
)
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
- // FileAccessShared sends IO requests to a Gofer process that validates the
- // requests and forwards them to the host.
- FileAccessShared FileAccessType = iota
-
- // FileAccessExclusive is the same as FileAccessShared, but enables
- // extra caching for improved performance. It should only be used if
- // the sandbox has exclusive access to the filesystem.
- FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
- switch s {
- case "shared":
- return FileAccessShared, nil
- case "exclusive":
- return FileAccessExclusive, nil
- default:
- return 0, fmt.Errorf("invalid file access type %q", s)
- }
-}
-
-func (f FileAccessType) String() string {
- switch f {
- case FileAccessShared:
- return "shared"
- case FileAccessExclusive:
- return "exclusive"
- default:
- return fmt.Sprintf("unknown(%d)", f)
- }
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
- // NetworkSandbox uses internal network stack, isolated from the host.
- NetworkSandbox NetworkType = iota
-
- // NetworkHost redirects network related syscalls to the host network.
- NetworkHost
-
- // NetworkNone sets up just loopback using netstack.
- NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
- switch s {
- case "sandbox":
- return NetworkSandbox, nil
- case "host":
- return NetworkHost, nil
- case "none":
- return NetworkNone, nil
- default:
- return 0, fmt.Errorf("invalid network type %q", s)
- }
-}
-
-func (n NetworkType) String() string {
- switch n {
- case NetworkSandbox:
- return "sandbox"
- case NetworkHost:
- return "host"
- case NetworkNone:
- return "none"
- default:
- return fmt.Sprintf("unknown(%d)", n)
- }
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
- switch strings.ToLower(s) {
- case "log", "logwarning":
- return watchdog.LogWarning, nil
- case "panic":
- return watchdog.Panic, nil
- default:
- return 0, fmt.Errorf("invalid watchdog action %q", s)
- }
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
- switch strings.ToLower(s) {
- case "disabled":
- return refs.NoLeakChecking, nil
- case "log-names":
- return refs.LeaksLogWarning, nil
- case "log-traces":
- return refs.LeaksLogTraces, nil
- default:
- return 0, fmt.Errorf("invalid refs leakmode %q", s)
- }
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
- switch mode {
- // If not set, default it to disabled.
- case refs.UninitializedLeakChecking, refs.NoLeakChecking:
- return "disabled"
- case refs.LeaksLogWarning:
- return "log-names"
- case refs.LeaksLogTraces:
- return "log-traces"
- default:
- panic(fmt.Sprintf("Invalid leakmode: %d", mode))
- }
-}
-
// Config holds configuration that is not part of the runtime spec.
+//
+// Follow these steps to add a new flag:
+// 1. Create a new field in Config.
+// 2. Add a field tag with the flag name
+// 3. Register a new flag in flags.go, with name and description
+// 4. Add any necessary validation into validate()
+// 5. If adding an enum, follow the same pattern as FileAccessType
+//
type Config struct {
// RootDir is the runtime root directory.
- RootDir string
+ RootDir string `flag:"root"`
// Debug indicates that debug logging should be enabled.
- Debug bool
+ Debug bool `flag:"debug"`
// LogFilename is the filename to log to, if not empty.
- LogFilename string
+ LogFilename string `flag:"log"`
// LogFormat is the log format.
- LogFormat string
+ LogFormat string `flag:"log-format"`
// DebugLog is the path to log debug information to, if not empty.
- DebugLog string
+ DebugLog string `flag:"debug-log"`
// PanicLog is the path to log GO's runtime messages, if not empty.
- PanicLog string
+ PanicLog string `flag:"panic-log"`
// DebugLogFormat is the log format for debug.
- DebugLogFormat string
+ DebugLogFormat string `flag:"debug-log-format"`
// FileAccess indicates how the filesystem is accessed.
- FileAccess FileAccessType
+ FileAccess FileAccessType `flag:"file-access"`
// Overlay is whether to wrap the root filesystem in an overlay.
- Overlay bool
+ Overlay bool `flag:"overlay"`
// FSGoferHostUDS enables the gofer to mount a host UDS.
- FSGoferHostUDS bool
+ FSGoferHostUDS bool `flag:"fsgofer-host-uds"`
// Network indicates what type of network to use.
- Network NetworkType
+ Network NetworkType `flag:"network"`
// EnableRaw indicates whether raw sockets should be enabled. Raw
// sockets are disabled by stripping CAP_NET_RAW from the list of
// capabilities.
- EnableRaw bool
+ EnableRaw bool `flag:"net-raw"`
// HardwareGSO indicates that hardware segmentation offload is enabled.
- HardwareGSO bool
+ HardwareGSO bool `flag:"gso"`
// SoftwareGSO indicates that software segmentation offload is enabled.
- SoftwareGSO bool
+ SoftwareGSO bool `flag:"software-gso"`
// TXChecksumOffload indicates that TX Checksum Offload is enabled.
- TXChecksumOffload bool
+ TXChecksumOffload bool `flag:"tx-checksum-offload"`
// RXChecksumOffload indicates that RX Checksum Offload is enabled.
- RXChecksumOffload bool
+ RXChecksumOffload bool `flag:"rx-checksum-offload"`
// QDisc indicates the type of queuening discipline to use by default
// for non-loopback interfaces.
- QDisc QueueingDiscipline
+ QDisc QueueingDiscipline `flag:"qdisc"`
// LogPackets indicates that all network packets should be logged.
- LogPackets bool
+ LogPackets bool `flag:"log-packets"`
// Platform is the platform to run on.
- Platform string
+ Platform string `flag:"platform"`
// Strace indicates that strace should be enabled.
- Strace bool
+ Strace bool `flag:"strace"`
- // StraceSyscalls is the set of syscalls to trace. If StraceEnable is
- // true and this list is empty, then all syscalls will be traced.
- StraceSyscalls []string
+ // StraceSyscalls is the set of syscalls to trace (comma-separated values).
+ // If StraceEnable is true and this string is empty, then all syscalls will
+ // be traced.
+ StraceSyscalls string `flag:"strace-syscalls"`
// StraceLogSize is the max size of data blobs to display.
- StraceLogSize uint
+ StraceLogSize uint `flag:"strace-log-size"`
// DisableSeccomp indicates whether seccomp syscall filters should be
// disabled. Pardon the double negation, but default to enabled is important.
DisableSeccomp bool
// WatchdogAction sets what action the watchdog takes when triggered.
- WatchdogAction watchdog.Action
+ WatchdogAction watchdog.Action `flag:"watchdog-action"`
// PanicSignal registers signal handling that panics. Usually set to
// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
- PanicSignal int
+ PanicSignal int `flag:"panic-signal"`
// ProfileEnable is set to prepare the sandbox to be profiled.
- ProfileEnable bool
+ ProfileEnable bool `flag:"profile"`
// RestoreFile is the path to the saved container image
RestoreFile string
@@ -233,97 +125,215 @@ type Config struct {
// NumNetworkChannels controls the number of AF_PACKET sockets that map
// to the same underlying network device. This allows netstack to better
// scale for high throughput use cases.
- NumNetworkChannels int
+ NumNetworkChannels int `flag:"num-network-channels"`
// Rootless allows the sandbox to be started with a user that is not root.
// Defense is depth measures are weaker with rootless. Specifically, the
// sandbox and Gofer process run as root inside a user namespace with root
// mapped to the caller's user.
- Rootless bool
+ Rootless bool `flag:"rootless"`
// AlsoLogToStderr allows to send log messages to stderr.
- AlsoLogToStderr bool
+ AlsoLogToStderr bool `flag:"alsologtostderr"`
// ReferenceLeakMode sets reference leak check mode
- ReferenceLeakMode refs.LeakMode
+ ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"`
// OverlayfsStaleRead instructs the sandbox to assume that the root mount
// is on a Linux overlayfs mount, which does not necessarily preserve
// coherence between read-only and subsequent writable file descriptors
// representing the "same" file.
- OverlayfsStaleRead bool
+ OverlayfsStaleRead bool `flag:"overlayfs-stale-read"`
+
+ // CPUNumFromQuota sets CPU number count to available CPU quota, using
+ // least integer value greater than or equal to quota.
+ //
+ // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+ CPUNumFromQuota bool `flag:"cpu-num-from-quota"`
+
+ // Enables VFS2.
+ VFS2 bool `flag:"vfs2"`
+
+ // Enables FUSE usage.
+ FUSE bool `flag:"fuse"`
+
+ // Allows overriding of flags in OCI annotations.
+ AllowFlagOverride bool `flag:"allow-flag-override"`
+
+ // Enables seccomp inside the sandbox.
+ OCISeccomp bool `flag:"oci-seccomp"`
// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
// tests. It allows runsc to start the sandbox process as the current
// user, and without chrooting the sandbox process. This can be
// necessary in test environments that have limited capabilities.
- TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+ TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"`
// TestOnlyTestNameEnv should only be used in tests. It looks up for the
// test name in the container environment variables and adds it to the debug
// log file name. This is done to help identify the log with the test when
// multiple tests are run in parallel, since there is no way to pass
// parameters to the runtime from docker.
- TestOnlyTestNameEnv string
+ TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"`
+}
- // CPUNumFromQuota sets CPU number count to available CPU quota, using
- // least integer value greater than or equal to quota.
- //
- // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
- CPUNumFromQuota bool
+func (c *Config) validate() error {
+ if c.FileAccess == FileAccessShared && c.Overlay {
+ return fmt.Errorf("overlay flag is incompatible with shared file access")
+ }
+ if c.NumNetworkChannels <= 0 {
+ return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
+ }
+ return nil
+}
- // Enables VFS2 (not plumbled through yet).
- VFS2 bool
+// FileAccessType tells how the filesystem is accessed.
+type FileAccessType int
+
+const (
+ // FileAccessExclusive is the same as FileAccessShared, but enables
+ // extra caching for improved performance. It should only be used if
+ // the sandbox has exclusive access to the filesystem.
+ FileAccessExclusive FileAccessType = iota
+
+ // FileAccessShared sends IO requests to a Gofer process that validates the
+ // requests and forwards them to the host.
+ FileAccessShared
+)
+
+func fileAccessTypePtr(v FileAccessType) *FileAccessType {
+ return &v
}
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
- f := []string{
- "--root=" + c.RootDir,
- "--debug=" + strconv.FormatBool(c.Debug),
- "--log=" + c.LogFilename,
- "--log-format=" + c.LogFormat,
- "--debug-log=" + c.DebugLog,
- "--panic-log=" + c.PanicLog,
- "--debug-log-format=" + c.DebugLogFormat,
- "--file-access=" + c.FileAccess.String(),
- "--overlay=" + strconv.FormatBool(c.Overlay),
- "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
- "--network=" + c.Network.String(),
- "--log-packets=" + strconv.FormatBool(c.LogPackets),
- "--platform=" + c.Platform,
- "--strace=" + strconv.FormatBool(c.Strace),
- "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
- "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
- "--watchdog-action=" + c.WatchdogAction.String(),
- "--panic-signal=" + strconv.Itoa(c.PanicSignal),
- "--profile=" + strconv.FormatBool(c.ProfileEnable),
- "--net-raw=" + strconv.FormatBool(c.EnableRaw),
- "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
- "--rootless=" + strconv.FormatBool(c.Rootless),
- "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
- "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
- "--gso=" + strconv.FormatBool(c.HardwareGSO),
- "--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
- "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
- "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
- "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
- "--qdisc=" + c.QDisc.String(),
+// Set implements flag.Value.
+func (f *FileAccessType) Set(v string) error {
+ switch v {
+ case "shared":
+ *f = FileAccessShared
+ case "exclusive":
+ *f = FileAccessExclusive
+ default:
+ return fmt.Errorf("invalid file access type %q", v)
}
- if c.CPUNumFromQuota {
- f = append(f, "--cpu-num-from-quota")
+ return nil
+}
+
+// Get implements flag.Value.
+func (f *FileAccessType) Get() interface{} {
+ return *f
+}
+
+// String implements flag.Value.
+func (f *FileAccessType) String() string {
+ switch *f {
+ case FileAccessShared:
+ return "shared"
+ case FileAccessExclusive:
+ return "exclusive"
}
- // Only include these if set since it is never to be used by users.
- if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
- f = append(f, "--TESTONLY-unsafe-nonroot=true")
+ panic(fmt.Sprintf("Invalid file access type %v", *f))
+}
+
+// NetworkType tells which network stack to use.
+type NetworkType int
+
+const (
+ // NetworkSandbox uses internal network stack, isolated from the host.
+ NetworkSandbox NetworkType = iota
+
+ // NetworkHost redirects network related syscalls to the host network.
+ NetworkHost
+
+ // NetworkNone sets up just loopback using netstack.
+ NetworkNone
+)
+
+func networkTypePtr(v NetworkType) *NetworkType {
+ return &v
+}
+
+// Set implements flag.Value.
+func (n *NetworkType) Set(v string) error {
+ switch v {
+ case "sandbox":
+ *n = NetworkSandbox
+ case "host":
+ *n = NetworkHost
+ case "none":
+ *n = NetworkNone
+ default:
+ return fmt.Errorf("invalid network type %q", v)
+ }
+ return nil
+}
+
+// Get implements flag.Value.
+func (n *NetworkType) Get() interface{} {
+ return *n
+}
+
+// String implements flag.Value.
+func (n *NetworkType) String() string {
+ switch *n {
+ case NetworkSandbox:
+ return "sandbox"
+ case NetworkHost:
+ return "host"
+ case NetworkNone:
+ return "none"
}
- if len(c.TestOnlyTestNameEnv) != 0 {
- f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
+ panic(fmt.Sprintf("Invalid network type %v", *n))
+}
+
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a give FDBasedLink.
+type QueueingDiscipline int
+
+const (
+ // QDiscNone disables any queueing for the underlying FD.
+ QDiscNone QueueingDiscipline = iota
+
+ // QDiscFIFO applies a simple fifo based queue to the underlying FD.
+ QDiscFIFO
+)
+
+func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline {
+ return &v
+}
+
+// Set implements flag.Value.
+func (q *QueueingDiscipline) Set(v string) error {
+ switch v {
+ case "none":
+ *q = QDiscNone
+ case "fifo":
+ *q = QDiscFIFO
+ default:
+ return fmt.Errorf("invalid qdisc %q", v)
}
+ return nil
+}
+
+// Get implements flag.Value.
+func (q *QueueingDiscipline) Get() interface{} {
+ return *q
+}
- if c.VFS2 {
- f = append(f, "--vfs2=true")
+// String implements flag.Value.
+func (q *QueueingDiscipline) String() string {
+ switch *q {
+ case QDiscNone:
+ return "none"
+ case QDiscFIFO:
+ return "fifo"
}
+ panic(fmt.Sprintf("Invalid qdisc %v", *q))
+}
+
+func leakModePtr(v refs.LeakMode) *refs.LeakMode {
+ return &v
+}
- return f
+func watchdogActionPtr(v watchdog.Action) *watchdog.Action {
+ return &v
}
diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go
new file mode 100644
index 000000000..fb162b7eb
--- /dev/null
+++ b/runsc/config/config_test.go
@@ -0,0 +1,272 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+ "strings"
+ "testing"
+
+ "gvisor.dev/gvisor/runsc/flag"
+)
+
+func init() {
+ RegisterFlags()
+}
+
+func TestDefault(t *testing.T) {
+ c, err := NewFromFlags()
+ if err != nil {
+ t.Fatal(err)
+ }
+ // "--root" is always set to something different than the default. Reset it
+ // to make it easier to test that default values do not generate flags.
+ c.RootDir = ""
+
+ // All defaults doesn't require setting flags.
+ flags := c.ToFlags()
+ if len(flags) > 0 {
+ t.Errorf("default flags not set correctly for: %s", flags)
+ }
+}
+
+func setDefault(name string) {
+ fl := flag.CommandLine.Lookup(name)
+ fl.Value.Set(fl.DefValue)
+}
+
+func TestFromFlags(t *testing.T) {
+ flag.CommandLine.Lookup("root").Value.Set("some-path")
+ flag.CommandLine.Lookup("debug").Value.Set("true")
+ flag.CommandLine.Lookup("num-network-channels").Value.Set("123")
+ flag.CommandLine.Lookup("network").Value.Set("none")
+ defer func() {
+ setDefault("root")
+ setDefault("debug")
+ setDefault("num-network-channels")
+ setDefault("network")
+ }()
+
+ c, err := NewFromFlags()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if want := "some-path"; c.RootDir != want {
+ t.Errorf("RootDir=%v, want: %v", c.RootDir, want)
+ }
+ if want := true; c.Debug != want {
+ t.Errorf("Debug=%v, want: %v", c.Debug, want)
+ }
+ if want := 123; c.NumNetworkChannels != want {
+ t.Errorf("NumNetworkChannels=%v, want: %v", c.NumNetworkChannels, want)
+ }
+ if want := NetworkNone; c.Network != want {
+ t.Errorf("Network=%v, want: %v", c.Network, want)
+ }
+}
+
+func TestToFlags(t *testing.T) {
+ c, err := NewFromFlags()
+ if err != nil {
+ t.Fatal(err)
+ }
+ c.RootDir = "some-path"
+ c.Debug = true
+ c.NumNetworkChannels = 123
+ c.Network = NetworkNone
+
+ flags := c.ToFlags()
+ if len(flags) != 4 {
+ t.Errorf("wrong number of flags set, want: 4, got: %d: %s", len(flags), flags)
+ }
+ t.Logf("Flags: %s", flags)
+ fm := map[string]string{}
+ for _, f := range flags {
+ kv := strings.Split(f, "=")
+ fm[kv[0]] = kv[1]
+ }
+ for name, want := range map[string]string{
+ "--root": "some-path",
+ "--debug": "true",
+ "--num-network-channels": "123",
+ "--network": "none",
+ } {
+ if got, ok := fm[name]; ok {
+ if got != want {
+ t.Errorf("flag %q, want: %q, got: %q", name, want, got)
+ }
+ } else {
+ t.Errorf("flag %q not set", name)
+ }
+ }
+}
+
+// TestInvalidFlags checks that enum flags fail when value is not in enum set.
+func TestInvalidFlags(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ error string
+ }{
+ {
+ name: "file-access",
+ error: "invalid file access type",
+ },
+ {
+ name: "network",
+ error: "invalid network type",
+ },
+ {
+ name: "qdisc",
+ error: "invalid qdisc",
+ },
+ {
+ name: "watchdog-action",
+ error: "invalid watchdog action",
+ },
+ {
+ name: "ref-leak-mode",
+ error: "invalid ref leak mode",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ defer setDefault(tc.name)
+ if err := flag.CommandLine.Lookup(tc.name).Value.Set("invalid"); err == nil || !strings.Contains(err.Error(), tc.error) {
+ t.Errorf("flag.Value.Set(invalid) wrong error reported: %v", err)
+ }
+ })
+ }
+}
+
+func TestValidationFail(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ flags map[string]string
+ error string
+ }{
+ {
+ name: "shared+overlay",
+ flags: map[string]string{
+ "file-access": "shared",
+ "overlay": "true",
+ },
+ error: "overlay flag is incompatible",
+ },
+ {
+ name: "network-channels",
+ flags: map[string]string{
+ "num-network-channels": "-1",
+ },
+ error: "num_network_channels must be > 0",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ for name, val := range tc.flags {
+ defer setDefault(name)
+ if err := flag.CommandLine.Lookup(name).Value.Set(val); err != nil {
+ t.Errorf("%s=%q: %v", name, val, err)
+ }
+ }
+ if _, err := NewFromFlags(); err == nil || !strings.Contains(err.Error(), tc.error) {
+ t.Errorf("NewFromFlags() wrong error reported: %v", err)
+ }
+ })
+ }
+}
+
+func TestOverride(t *testing.T) {
+ c, err := NewFromFlags()
+ if err != nil {
+ t.Fatal(err)
+ }
+ c.AllowFlagOverride = true
+
+ t.Run("string", func(t *testing.T) {
+ c.RootDir = "foobar"
+ if err := c.Override("root", "bar"); err != nil {
+ t.Fatalf("Override(root, bar) failed: %v", err)
+ }
+ defer setDefault("root")
+ if c.RootDir != "bar" {
+ t.Errorf("Override(root, bar) didn't work: %+v", c)
+ }
+ })
+
+ t.Run("bool", func(t *testing.T) {
+ c.Debug = true
+ if err := c.Override("debug", "false"); err != nil {
+ t.Fatalf("Override(debug, false) failed: %v", err)
+ }
+ defer setDefault("debug")
+ if c.Debug {
+ t.Errorf("Override(debug, false) didn't work: %+v", c)
+ }
+ })
+
+ t.Run("enum", func(t *testing.T) {
+ c.FileAccess = FileAccessShared
+ if err := c.Override("file-access", "exclusive"); err != nil {
+ t.Fatalf("Override(file-access, exclusive) failed: %v", err)
+ }
+ defer setDefault("file-access")
+ if c.FileAccess != FileAccessExclusive {
+ t.Errorf("Override(file-access, exclusive) didn't work: %+v", c)
+ }
+ })
+}
+
+func TestOverrideDisabled(t *testing.T) {
+ c, err := NewFromFlags()
+ if err != nil {
+ t.Fatal(err)
+ }
+ const errMsg = "flag override disabled"
+ if err := c.Override("root", "path"); err == nil || !strings.Contains(err.Error(), errMsg) {
+ t.Errorf("Override() wrong error: %v", err)
+ }
+}
+
+func TestOverrideError(t *testing.T) {
+ c, err := NewFromFlags()
+ if err != nil {
+ t.Fatal(err)
+ }
+ c.AllowFlagOverride = true
+ for _, tc := range []struct {
+ name string
+ value string
+ error string
+ }{
+ {
+ name: "invalid",
+ value: "valid",
+ error: `flag "invalid" not found`,
+ },
+ {
+ name: "debug",
+ value: "invalid",
+ error: "error setting flag debug",
+ },
+ {
+ name: "file-access",
+ value: "invalid",
+ error: "invalid file access type",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ if err := c.Override(tc.name, tc.value); err == nil || !strings.Contains(err.Error(), tc.error) {
+ t.Errorf("Override(%q, %q) wrong error: %v", tc.name, tc.value, err)
+ }
+ })
+ }
+}
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
new file mode 100644
index 000000000..a5f25cfa2
--- /dev/null
+++ b/runsc/config/flags.go
@@ -0,0 +1,205 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "reflect"
+ "strconv"
+
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/watchdog"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/runsc/flag"
+)
+
+var registration sync.Once
+
+// This is the set of flags used to populate Config.
+func RegisterFlags() {
+ registration.Do(func() {
+ // Although these flags are not part of the OCI spec, they are used by
+ // Docker, and thus should not be changed.
+ flag.String("root", "", "root directory for storage of container state.")
+ flag.String("log", "", "file path where internal debug information is written, default is stdout.")
+ flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
+ flag.Bool("debug", false, "enable debug logging.")
+
+ // These flags are unique to runsc, and are used to configure parts of the
+ // system that are not covered by the runtime spec.
+
+ // Debugging flags.
+ flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+ flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
+ flag.Bool("log-packets", false, "enable network packet logging.")
+ flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
+ flag.Bool("alsologtostderr", false, "send log messages to stderr.")
+ flag.Bool("allow-flag-override", false, "allow OCI annotations (dev.gvisor.flag.<name>) to override flags for debugging.")
+
+ // Debugging flags: strace related
+ flag.Bool("strace", false, "enable strace.")
+ flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
+ flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
+
+ // Flags that control sandbox runtime behavior.
+ flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
+ flag.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.")
+ flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+ flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
+ flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+ flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
+ flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.")
+
+ // Flags that control sandbox runtime behavior: FS related.
+ flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+ flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+ flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
+ flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
+ flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
+ flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
+
+ // Flags that control sandbox runtime behavior: network related.
+ flag.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+ flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+ flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
+ flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
+ flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
+ flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
+ flag.Var(queueingDisciplinePtr(QDiscFIFO), "qdisc", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
+ flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
+
+ // Test flags, not to be used outside tests, ever.
+ flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
+ flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+ })
+}
+
+// NewFromFlags creates a new Config with values coming from command line flags.
+func NewFromFlags() (*Config, error) {
+ conf := &Config{}
+
+ obj := reflect.ValueOf(conf).Elem()
+ st := obj.Type()
+ for i := 0; i < st.NumField(); i++ {
+ f := st.Field(i)
+ name, ok := f.Tag.Lookup("flag")
+ if !ok {
+ // No flag set for this field.
+ continue
+ }
+ fl := flag.CommandLine.Lookup(name)
+ if fl == nil {
+ panic(fmt.Sprintf("Flag %q not found", name))
+ }
+ x := reflect.ValueOf(flag.Get(fl.Value))
+ obj.Field(i).Set(x)
+ }
+
+ if len(conf.RootDir) == 0 {
+ // If not set, set default root dir to something (hopefully) user-writeable.
+ conf.RootDir = "/var/run/runsc"
+ if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
+ conf.RootDir = filepath.Join(runtimeDir, "runsc")
+ }
+ }
+
+ if err := conf.validate(); err != nil {
+ return nil, err
+ }
+ return conf, nil
+}
+
+// ToFlags returns a slice of flags that correspond to the given Config.
+func (c *Config) ToFlags() []string {
+ var rv []string
+
+ obj := reflect.ValueOf(c).Elem()
+ st := obj.Type()
+ for i := 0; i < st.NumField(); i++ {
+ f := st.Field(i)
+ name, ok := f.Tag.Lookup("flag")
+ if !ok {
+ // No flag set for this field.
+ continue
+ }
+ val := getVal(obj.Field(i))
+
+ flag := flag.CommandLine.Lookup(name)
+ if flag == nil {
+ panic(fmt.Sprintf("Flag %q not found", name))
+ }
+ if val == flag.DefValue {
+ continue
+ }
+ rv = append(rv, fmt.Sprintf("--%s=%s", flag.Name, val))
+ }
+ return rv
+}
+
+// Override writes a new value to a flag.
+func (c *Config) Override(name string, value string) error {
+ if !c.AllowFlagOverride {
+ return fmt.Errorf("flag override disabled, use --allow-flag-override to enable it")
+ }
+
+ obj := reflect.ValueOf(c).Elem()
+ st := obj.Type()
+ for i := 0; i < st.NumField(); i++ {
+ f := st.Field(i)
+ fieldName, ok := f.Tag.Lookup("flag")
+ if !ok || fieldName != name {
+ // Not a flag field, or flag name doesn't match.
+ continue
+ }
+ fl := flag.CommandLine.Lookup(name)
+ if fl == nil {
+ // Flag must exist if there is a field match above.
+ panic(fmt.Sprintf("Flag %q not found", name))
+ }
+
+ // Use flag to convert the string value to the underlying flag type, using
+ // the same rules as the command-line for consistency.
+ if err := fl.Value.Set(value); err != nil {
+ return fmt.Errorf("error setting flag %s=%q: %w", name, value, err)
+ }
+ x := reflect.ValueOf(flag.Get(fl.Value))
+ obj.Field(i).Set(x)
+
+ // Validates the config again to ensure it's left in a consistent state.
+ return c.validate()
+ }
+ return fmt.Errorf("flag %q not found. Cannot set it to %q", name, value)
+}
+
+func getVal(field reflect.Value) string {
+ if str, ok := field.Addr().Interface().(fmt.Stringer); ok {
+ return str.String()
+ }
+ switch field.Kind() {
+ case reflect.Bool:
+ return strconv.FormatBool(field.Bool())
+ case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
+ return strconv.FormatInt(field.Int(), 10)
+ case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
+ return strconv.FormatUint(field.Uint(), 10)
+ case reflect.String:
+ return field.String()
+ default:
+ panic("unknown type " + field.Kind().String())
+ }
+}
diff --git a/runsc/console/console.go b/runsc/console/console.go
index 64b23639a..dbb88e117 100644
--- a/runsc/console/console.go
+++ b/runsc/console/console.go
@@ -24,11 +24,11 @@ import (
"golang.org/x/sys/unix"
)
-// NewWithSocket creates pty master/slave pair, sends the master FD over the given
-// socket, and returns the slave.
+// NewWithSocket creates pty master/replica pair, sends the master FD over the given
+// socket, and returns the replica.
func NewWithSocket(socketPath string) (*os.File, error) {
- // Create a new pty master and slave.
- ptyMaster, ptySlave, err := pty.Open()
+ // Create a new pty master and replica.
+ ptyMaster, ptyReplica, err := pty.Open()
if err != nil {
return nil, fmt.Errorf("opening pty: %v", err)
}
@@ -37,18 +37,18 @@ func NewWithSocket(socketPath string) (*os.File, error) {
// Get a connection to the socket path.
conn, err := net.Dial("unix", socketPath)
if err != nil {
- ptySlave.Close()
+ ptyReplica.Close()
return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err)
}
defer conn.Close()
uc, ok := conn.(*net.UnixConn)
if !ok {
- ptySlave.Close()
+ ptyReplica.Close()
return nil, fmt.Errorf("connection is not a UnixConn: %T", conn)
}
socket, err := uc.File()
if err != nil {
- ptySlave.Close()
+ ptyReplica.Close()
return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err)
}
defer socket.Close()
@@ -56,8 +56,8 @@ func NewWithSocket(socketPath string) (*os.File, error) {
// Send the master FD over the connection.
msg := unix.UnixRights(int(ptyMaster.Fd()))
if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil {
- ptySlave.Close()
+ ptyReplica.Close()
return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err)
}
- return ptySlave, nil
+ return ptyReplica, nil
}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 49cfb0837..c33755482 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -23,11 +23,12 @@ go_library(
"//pkg/sync",
"//runsc/boot",
"//runsc/cgroup",
+ "//runsc/config",
"//runsc/sandbox",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_gofrs_flock//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
],
)
@@ -65,10 +66,11 @@ go_test(
"//pkg/urpc",
"//runsc/boot",
"//runsc/boot/platforms",
+ "//runsc/config",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_kr_pty//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 3813c6b93..4228399b8 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -122,6 +122,7 @@ func TestConsoleSocket(t *testing.T) {
for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
spec := testutil.NewSpecWithArgs("true")
+ spec.Process.Terminal = true
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
t.Fatalf("error setting up container: %v", err)
@@ -184,14 +185,14 @@ func TestJobControlSignalExec(t *testing.T) {
t.Fatalf("error starting container: %v", err)
}
- // Create a pty master/slave. The slave will be passed to the exec
+ // Create a pty master/replica. The replica will be passed to the exec
// process.
- ptyMaster, ptySlave, err := pty.Open()
+ ptyMaster, ptyReplica, err := pty.Open()
if err != nil {
t.Fatalf("error opening pty: %v", err)
}
defer ptyMaster.Close()
- defer ptySlave.Close()
+ defer ptyReplica.Close()
// Exec bash and attach a terminal. Note that occasionally /bin/sh
// may be a different shell or have a different configuration (such
@@ -202,9 +203,9 @@ func TestJobControlSignalExec(t *testing.T) {
// Don't let bash execute from profile or rc files, otherwise
// our PID counts get messed up.
Argv: []string{"/bin/bash", "--noprofile", "--norc"},
- // Pass the pty slave as FD 0, 1, and 2.
+ // Pass the pty replica as FD 0, 1, and 2.
FilePayload: urpc.FilePayload{
- Files: []*os.File{ptySlave, ptySlave, ptySlave},
+ Files: []*os.File{ptyReplica, ptyReplica, ptyReplica},
},
StdioIsPty: true,
}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 6d297d0df..63478ba8c 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -37,6 +37,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/sighandling"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cgroup"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/sandbox"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -269,7 +270,7 @@ type Args struct {
// New creates the container in a new Sandbox process, unless the metadata
// indicates that an existing Sandbox should be used. The caller must call
// Destroy() on the container.
-func New(conf *boot.Config, args Args) (*Container, error) {
+func New(conf *config.Config, args Args) (*Container, error) {
log.Debugf("Create container %q in root dir: %s", args.ID, conf.RootDir)
if err := validateID(args.ID); err != nil {
return nil, err
@@ -324,7 +325,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
}
}
if err := runInCgroup(cg, func() error {
- ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir)
+ ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached)
if err != nil {
return err
}
@@ -397,7 +398,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
}
// Start starts running the containerized process inside the sandbox.
-func (c *Container) Start(conf *boot.Config) error {
+func (c *Container) Start(conf *config.Config) error {
log.Debugf("Start container %q", c.ID)
if err := c.Saver.lock(); err != nil {
@@ -427,7 +428,7 @@ func (c *Container) Start(conf *boot.Config) error {
// the start (and all their children processes).
if err := runInCgroup(c.Sandbox.Cgroup, func() error {
// Create the gofer process.
- ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir)
+ ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
if err != nil {
return err
}
@@ -472,7 +473,7 @@ func (c *Container) Start(conf *boot.Config) error {
// Restore takes a container and replaces its kernel and file system
// to restore a container from its state file.
-func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
+func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile string) error {
log.Debugf("Restore container %q", c.ID)
if err := c.Saver.lock(); err != nil {
return err
@@ -499,7 +500,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
}
// Run is a helper that calls Create + Start + Wait.
-func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
+func Run(conf *config.Config, args Args) (syscall.WaitStatus, error) {
log.Debugf("Run container %q in root dir: %s", args.ID, conf.RootDir)
c, err := New(conf, args)
if err != nil {
@@ -861,7 +862,7 @@ func (c *Container) waitForStopped() error {
return backoff.Retry(op, b)
}
-func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) {
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
// Start with the general config flags.
args := conf.ToFlags()
@@ -901,9 +902,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
}
args = append(args, "gofer", "--bundle", bundleDir)
- if conf.Overlay {
- args = append(args, "--panic-on-write=true")
- }
// Open the spec file to donate to the sandbox.
specFile, err := specutils.OpenSpec(bundleDir)
@@ -955,6 +953,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
cmd.ExtraFiles = goferEnds
cmd.Args[0] = "runsc-gofer"
+ if attached {
+ // The gofer is attached to the lifetime of this process, so it
+ // should synchronously die when this process dies.
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Pdeathsig: syscall.SIGKILL,
+ }
+ }
+
// Enter new namespaces to isolate from the rest of the system. Don't unshare
// cgroup because gofer is added to a cgroup in the caller's namespace.
nss := []specs.LinuxNamespace{
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index cd76645bd..1f8e277cc 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -41,8 +41,9 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/test/testutil"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot/platforms"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -250,7 +251,7 @@ func readOutputNum(file string, position int) (int, error) {
// run starts the sandbox and waits for it to exit, checking that the
// application succeeded.
-func run(spec *specs.Spec, conf *boot.Config) error {
+func run(spec *specs.Spec, conf *config.Config) error {
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
return fmt.Errorf("error setting up container: %v", err)
@@ -289,26 +290,24 @@ var (
)
// configs generates different configurations to run tests.
-func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
+func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
// Always load the default config.
- cs := make(map[string]*boot.Config)
+ cs := make(map[string]*config.Config)
+ testutil.TestConfig(t)
for _, o := range opts {
+ c := testutil.TestConfig(t)
switch o {
case overlay:
- c := testutil.TestConfig(t)
c.Overlay = true
cs["overlay"] = c
case ptrace:
- c := testutil.TestConfig(t)
c.Platform = platforms.Ptrace
cs["ptrace"] = c
case kvm:
- c := testutil.TestConfig(t)
c.Platform = platforms.KVM
cs["kvm"] = c
case nonExclusiveFS:
- c := testutil.TestConfig(t)
- c.FileAccess = boot.FileAccessShared
+ c.FileAccess = config.FileAccessShared
cs["non-exclusive"] = c
default:
panic(fmt.Sprintf("unknown config option %v", o))
@@ -317,23 +316,14 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
return cs
}
-func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
- vfs1 := configs(t, opts...)
-
- var optsVFS2 []configOption
- for _, opt := range opts {
- // TODO(gvisor.dev/issue/1487): Enable overlay tests.
- if opt != overlay {
- optsVFS2 = append(optsVFS2, opt)
- }
- }
-
- for key, value := range configs(t, optsVFS2...) {
+// TODO(gvisor.dev/issue/1624): Merge with configs when VFS2 is the default.
+func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
+ all := configs(t, opts...)
+ for key, value := range configs(t, opts...) {
value.VFS2 = true
- vfs1[key+"VFS2"] = value
+ all[key+"VFS2"] = value
}
-
- return vfs1
+ return all
}
// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
@@ -512,7 +502,7 @@ func TestExePath(t *testing.T) {
t.Fatalf("error making directory: %v", err)
}
- for name, conf := range configsWithVFS2(t, overlay) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
for _, test := range []struct {
path string
@@ -643,7 +633,9 @@ func TestExec(t *testing.T) {
if err != nil {
t.Fatalf("error creating temporary directory: %v", err)
}
- cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100", dir)
+ // Note that some shells may exec the final command in a sequence as
+ // an optimization. We avoid this here by adding the exit 0.
+ cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100 && exit 0", dir)
spec := testutil.NewSpecWithArgs("sh", "-c", cmd)
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
@@ -835,7 +827,7 @@ func TestExecProcList(t *testing.T) {
// TestKillPid verifies that we can signal individual exec'd processes.
func TestKillPid(t *testing.T) {
- for name, conf := range configsWithVFS2(t, overlay) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
app, err := testutil.FindFile("test/cmd/test_app/test_app")
if err != nil {
@@ -903,13 +895,15 @@ func TestKillPid(t *testing.T) {
}
}
-// TestCheckpointRestore creates a container that continuously writes successive integers
-// to a file. To test checkpoint and restore functionality, the container is
-// checkpointed and the last number printed to the file is recorded. Then, it is restored in two
-// new containers and the first number printed from these containers is checked. Both should
-// be the next consecutive number after the last number from the checkpointed container.
+// TestCheckpointRestore creates a container that continuously writes successive
+// integers to a file. To test checkpoint and restore functionality, the
+// container is checkpointed and the last number printed to the file is
+// recorded. Then, it is restored in two new containers and the first number
+// printed from these containers is checked. Both should be the next consecutive
+// number after the last number from the checkpointed container.
func TestCheckpointRestore(t *testing.T) {
// Skip overlay because test requires writing to host file.
+ // TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
for name, conf := range configs(t, noOverlay...) {
t.Run(name, func(t *testing.T) {
dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -1071,6 +1065,7 @@ func TestCheckpointRestore(t *testing.T) {
// with filesystem Unix Domain Socket use.
func TestUnixDomainSockets(t *testing.T) {
// Skip overlay because test requires writing to host file.
+ // TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
for name, conf := range configs(t, noOverlay...) {
t.Run(name, func(t *testing.T) {
// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1208,7 +1203,7 @@ func TestUnixDomainSockets(t *testing.T) {
// recreated. Then it resumes the container, verify that the file gets created
// again.
func TestPauseResume(t *testing.T) {
- for name, conf := range configs(t, noOverlay...) {
+ for name, conf := range configsWithVFS2(t, noOverlay...) {
t.Run(name, func(t *testing.T) {
tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
if err != nil {
@@ -1468,7 +1463,7 @@ func TestRunNonRoot(t *testing.T) {
// TestMountNewDir checks that runsc will create destination directory if it
// doesn't exit.
func TestMountNewDir(t *testing.T) {
- for name, conf := range configsWithVFS2(t, overlay) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
root, err := ioutil.TempDir(testutil.TmpDir(), "root")
if err != nil {
@@ -1488,6 +1483,8 @@ func TestMountNewDir(t *testing.T) {
Source: srcDir,
Type: "bind",
})
+ // Extra points for creating the mount with a readonly root.
+ spec.Root.Readonly = true
if err := run(spec, conf); err != nil {
t.Fatalf("error running sandbox: %v", err)
@@ -1497,17 +1494,17 @@ func TestMountNewDir(t *testing.T) {
}
func TestReadonlyRoot(t *testing.T) {
- for name, conf := range configsWithVFS2(t, overlay) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
- spec := testutil.NewSpecWithArgs("/bin/touch", "/foo")
+ spec := testutil.NewSpecWithArgs("sleep", "100")
spec.Root.Readonly = true
+
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
t.Fatalf("error setting up container: %v", err)
}
defer cleanup()
- // Create, start and wait for the container.
args := Args{
ID: testutil.RandomContainerID(),
Spec: spec,
@@ -1522,12 +1519,82 @@ func TestReadonlyRoot(t *testing.T) {
t.Fatalf("error starting container: %v", err)
}
- ws, err := c.Wait()
+ // Read mounts to check that root is readonly.
+ out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '")
+ if err != nil || ws != 0 {
+ t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+ }
+ t.Logf("root mount: %q", out)
+ if !strings.Contains(string(out), "(ro)") {
+ t.Errorf("root not mounted readonly: %q", out)
+ }
+
+ // Check that file cannot be created.
+ ws, err = execute(c, "/bin/touch", "/foo")
if err != nil {
- t.Fatalf("error waiting on container: %v", err)
+ t.Fatalf("touch file in ro mount: %v", err)
}
if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
- t.Fatalf("container failed, waitStatus: %v", ws)
+ t.Fatalf("wrong waitStatus: %v", ws)
+ }
+ })
+ }
+}
+
+func TestReadonlyMount(t *testing.T) {
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
+ if err != nil {
+ t.Fatalf("ioutil.TempDir() failed: %v", err)
+ }
+ spec := testutil.NewSpecWithArgs("sleep", "100")
+ spec.Mounts = append(spec.Mounts, specs.Mount{
+ Destination: dir,
+ Source: dir,
+ Type: "bind",
+ Options: []string{"ro"},
+ })
+ spec.Root.Readonly = false
+
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+ c, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer c.Destroy()
+ if err := c.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Read mounts to check that volume is readonly.
+ cmd := fmt.Sprintf("mount | grep ' %s '", dir)
+ out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd)
+ if err != nil || ws != 0 {
+ t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+ }
+ t.Logf("mount: %q", out)
+ if !strings.Contains(string(out), "(ro)") {
+ t.Errorf("volume not mounted readonly: %q", out)
+ }
+
+ // Check that file cannot be created.
+ ws, err = execute(c, "/bin/touch", path.Join(dir, "file"))
+ if err != nil {
+ t.Fatalf("touch file in ro mount: %v", err)
+ }
+ if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
+ t.Fatalf("wrong WaitStatus: %v", ws)
}
})
}
@@ -1614,54 +1681,6 @@ func TestUIDMap(t *testing.T) {
}
}
-func TestReadonlyMount(t *testing.T) {
- for name, conf := range configsWithVFS2(t, overlay) {
- t.Run(name, func(t *testing.T) {
- dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount")
- spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
- if err != nil {
- t.Fatalf("ioutil.TempDir() failed: %v", err)
- }
- spec.Mounts = append(spec.Mounts, specs.Mount{
- Destination: dir,
- Source: dir,
- Type: "bind",
- Options: []string{"ro"},
- })
- spec.Root.Readonly = false
-
- _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
- if err != nil {
- t.Fatalf("error setting up container: %v", err)
- }
- defer cleanup()
-
- // Create, start and wait for the container.
- args := Args{
- ID: testutil.RandomContainerID(),
- Spec: spec,
- BundleDir: bundleDir,
- }
- c, err := New(conf, args)
- if err != nil {
- t.Fatalf("error creating container: %v", err)
- }
- defer c.Destroy()
- if err := c.Start(conf); err != nil {
- t.Fatalf("error starting container: %v", err)
- }
-
- ws, err := c.Wait()
- if err != nil {
- t.Fatalf("error waiting on container: %v", err)
- }
- if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM {
- t.Fatalf("container failed, waitStatus: %v", ws)
- }
- })
- }
-}
-
// TestAbbreviatedIDs checks that runsc supports using abbreviated container
// IDs in place of full IDs.
func TestAbbreviatedIDs(t *testing.T) {
@@ -1828,8 +1847,9 @@ func TestUserLog(t *testing.T) {
t.Fatal("error finding test_app:", err)
}
- // sched_rr_get_interval = 148 - not implemented in gvisor.
- spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148")
+ // sched_rr_get_interval - not implemented in gvisor.
+ num := strconv.Itoa(syscall.SYS_SCHED_RR_GET_INTERVAL)
+ spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall="+num)
conf := testutil.TestConfig(t)
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
@@ -2011,7 +2031,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
}
func TestCreateWorkingDir(t *testing.T) {
- for name, conf := range configsWithVFS2(t, overlay) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
if err != nil {
@@ -2114,27 +2134,19 @@ func TestMountPropagation(t *testing.T) {
// Check that mount didn't propagate to private mount.
privFile := filepath.Join(priv, "mnt", "file")
- execArgs := &control.ExecArgs{
- Filename: "/usr/bin/test",
- Argv: []string{"test", "!", "-f", privFile},
- }
- if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+ if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 {
t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err)
}
// Check that mount propagated to slave mount.
slaveFile := filepath.Join(slave, "mnt", "file")
- execArgs = &control.ExecArgs{
- Filename: "/usr/bin/test",
- Argv: []string{"test", "-f", slaveFile},
- }
- if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+ if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 {
t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err)
}
}
func TestMountSymlink(t *testing.T) {
- for name, conf := range configsWithVFS2(t, overlay) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
if err != nil {
@@ -2194,11 +2206,7 @@ func TestMountSymlink(t *testing.T) {
// Check that symlink was resolved and mount was created where the symlink
// is pointing to.
file := path.Join(target, "file")
- execArgs := &control.ExecArgs{
- Filename: "/usr/bin/test",
- Argv: []string{"test", "-f", file},
- }
- if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 {
+ if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 {
t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err)
}
})
@@ -2324,6 +2332,35 @@ func TestTTYField(t *testing.T) {
}
}
+func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) {
+ args := &control.ExecArgs{
+ Filename: name,
+ Argv: append([]string{name}, arg...),
+ }
+ return cont.executeSync(args)
+}
+
+func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) {
+ r, w, err := os.Pipe()
+ if err != nil {
+ return nil, 0, err
+ }
+ defer r.Close()
+
+ args := &control.ExecArgs{
+ Filename: name,
+ Argv: append([]string{name}, arg...),
+ FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}},
+ }
+ ws, err := cont.executeSync(args)
+ w.Close()
+ if err != nil {
+ return nil, 0, err
+ }
+ out, err := ioutil.ReadAll(r)
+ return out, ws, err
+}
+
// executeSync synchronously executes a new process.
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index a27a01942..850e80290 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -33,6 +33,7 @@ import (
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/test/testutil"
"gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -60,7 +61,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
return specs, ids
}
-func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
+func startContainers(conf *config.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
if len(conf.RootDir) == 0 {
panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
}
@@ -100,19 +101,20 @@ type execDesc struct {
c *Container
cmd []string
want int
- desc string
+ name string
}
-func execMany(execs []execDesc) error {
+func execMany(t *testing.T, execs []execDesc) {
for _, exec := range execs {
- args := &control.ExecArgs{Argv: exec.cmd}
- if ws, err := exec.c.executeSync(args); err != nil {
- return fmt.Errorf("error executing %+v: %v", args, err)
- } else if ws.ExitStatus() != exec.want {
- return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want)
- }
+ t.Run(exec.name, func(t *testing.T) {
+ args := &control.ExecArgs{Argv: exec.cmd}
+ if ws, err := exec.c.executeSync(args); err != nil {
+ t.Errorf("error executing %+v: %v", args, err)
+ } else if ws.ExitStatus() != exec.want {
+ t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want)
+ }
+ })
}
- return nil
}
func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
@@ -167,7 +169,7 @@ func TestMultiContainerSanity(t *testing.T) {
// TestMultiPIDNS checks that it is possible to run 2 dead-simple
// containers in the same sandbox with different pidns.
func TestMultiPIDNS(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -212,7 +214,7 @@ func TestMultiPIDNS(t *testing.T) {
// TestMultiPIDNSPath checks the pidns path.
func TestMultiPIDNSPath(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -478,7 +480,7 @@ func TestMultiContainerMount(t *testing.T) {
// TestMultiContainerSignal checks that it is possible to signal individual
// containers without killing the entire sandbox.
func TestMultiContainerSignal(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -578,7 +580,7 @@ func TestMultiContainerDestroy(t *testing.T) {
t.Fatal("error finding test_app:", err)
}
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -1072,7 +1074,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
// Test that pod shared mounts are properly mounted in 2 containers and that
// changes from one container is reflected in the other.
func TestMultiContainerSharedMount(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -1110,84 +1112,82 @@ func TestMultiContainerSharedMount(t *testing.T) {
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
- desc: "directory is mounted in container0",
+ name: "directory is mounted in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
- desc: "directory is mounted in container1",
+ name: "directory is mounted in container1",
},
{
c: containers[0],
- cmd: []string{"/usr/bin/touch", file0},
- desc: "create file in container0",
+ cmd: []string{"/bin/touch", file0},
+ name: "create file in container0",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-f", file0},
- desc: "file appears in container0",
+ name: "file appears in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-f", file1},
- desc: "file appears in container1",
+ name: "file appears in container1",
},
{
c: containers[1],
cmd: []string{"/bin/rm", file1},
- desc: "file removed from container1",
+ name: "remove file from container1",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "!", "-f", file0},
- desc: "file removed from container0",
+ name: "file removed from container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "!", "-f", file1},
- desc: "file removed from container1",
+ name: "file removed from container1",
},
{
c: containers[1],
cmd: []string{"/bin/mkdir", file1},
- desc: "create directory in container1",
+ name: "create directory in container1",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-d", file0},
- desc: "dir appears in container0",
+ name: "dir appears in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-d", file1},
- desc: "dir appears in container1",
+ name: "dir appears in container1",
},
{
c: containers[0],
cmd: []string{"/bin/rmdir", file0},
- desc: "create directory in container0",
+ name: "remove directory from container0",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "!", "-d", file0},
- desc: "dir removed from container0",
+ name: "dir removed from container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "!", "-d", file1},
- desc: "dir removed from container1",
+ name: "dir removed from container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
})
}
}
// Test that pod mounts are mounted as readonly when requested.
func TestMultiContainerSharedMountReadonly(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -1225,36 +1225,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
- desc: "directory is mounted in container0",
+ name: "directory is mounted in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
- desc: "directory is mounted in container1",
+ name: "directory is mounted in container1",
},
{
c: containers[0],
- cmd: []string{"/usr/bin/touch", file0},
+ cmd: []string{"/bin/touch", file0},
want: 1,
- desc: "fails to write to container0",
+ name: "fails to write to container0",
},
{
c: containers[1],
- cmd: []string{"/usr/bin/touch", file1},
+ cmd: []string{"/bin/touch", file1},
want: 1,
- desc: "fails to write to container1",
+ name: "fails to write to container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
})
}
}
// Test that shared pod mounts continue to work after container is restarted.
func TestMultiContainerSharedMountRestart(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -1291,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
execs := []execDesc{
{
c: containers[0],
- cmd: []string{"/usr/bin/touch", file0},
- desc: "create file in container0",
+ cmd: []string{"/bin/touch", file0},
+ name: "create file in container0",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-f", file0},
- desc: "file appears in container0",
+ name: "file appears in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-f", file1},
- desc: "file appears in container1",
+ name: "file appears in container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
containers[1].Destroy()
@@ -1334,86 +1330,84 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-f", file0},
- desc: "file is still in container0",
+ name: "file is still in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-f", file1},
- desc: "file is still in container1",
+ name: "file is still in container1",
},
{
c: containers[1],
cmd: []string{"/bin/rm", file1},
- desc: "file removed from container1",
+ name: "file removed from container1",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "!", "-f", file0},
- desc: "file removed from container0",
+ name: "file removed from container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "!", "-f", file1},
- desc: "file removed from container1",
+ name: "file removed from container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
})
}
}
// Test that unsupported pod mounts options are ignored when matching master and
-// slave mounts.
+// replica mounts.
func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
- rootDir, cleanup, err := testutil.SetupRootDir()
- if err != nil {
- t.Fatalf("error creating root dir: %v", err)
- }
- defer cleanup()
-
- conf := testutil.TestConfig(t)
- conf.RootDir = rootDir
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
- // Setup the containers.
- sleep := []string{"/bin/sleep", "100"}
- podSpec, ids := createSpecs(sleep, sleep)
- mnt0 := specs.Mount{
- Destination: "/mydir/test",
- Source: "/some/dir",
- Type: "tmpfs",
- Options: []string{"rw", "rbind", "relatime"},
- }
- podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+ // Setup the containers.
+ sleep := []string{"/bin/sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: []string{"rw", "rbind", "relatime"},
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
- mnt1 := mnt0
- mnt1.Destination = "/mydir2/test2"
- mnt1.Options = []string{"rw", "nosuid"}
- podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ mnt1.Options = []string{"rw", "nosuid"}
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
- createSharedMount(mnt0, "test-mount", podSpec...)
+ createSharedMount(mnt0, "test-mount", podSpec...)
- containers, cleanup, err := startContainers(conf, podSpec, ids)
- if err != nil {
- t.Fatalf("error starting containers: %v", err)
- }
- defer cleanup()
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
- execs := []execDesc{
- {
- c: containers[0],
- cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
- desc: "directory is mounted in container0",
- },
- {
- c: containers[1],
- cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
- desc: "directory is mounted in container1",
- },
- }
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
+ name: "directory is mounted in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
+ name: "directory is mounted in container1",
+ },
+ }
+ execMany(t, execs)
+ })
}
}
@@ -1523,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
}
// Check that container isn't running anymore.
- args := &control.ExecArgs{Argv: []string{"/bin/true"}}
- if _, err := c.executeSync(args); err == nil {
+ if _, err := execute(c, "/bin/true"); err == nil {
t.Fatalf("Container %q was not stopped after gofer death", c.ID)
}
@@ -1539,8 +1532,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
if err := waitForProcessList(c, pl); err != nil {
t.Errorf("Container %q was affected by another container: %v", c.ID, err)
}
- args := &control.ExecArgs{Argv: []string{"/bin/true"}}
- if _, err := c.executeSync(args); err != nil {
+ if _, err := execute(c, "/bin/true"); err != nil {
t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
}
}
@@ -1562,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
// Check that entire sandbox isn't running anymore.
for _, c := range containers {
- args := &control.ExecArgs{Argv: []string{"/bin/true"}}
- if _, err := c.executeSync(args); err == nil {
+ if _, err := execute(c, "/bin/true"); err == nil {
t.Fatalf("Container %q was not stopped after gofer death", c.ID)
}
}
@@ -1700,12 +1691,11 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
}
// TestMultiContainerHomeEnvDir tests that the HOME environment variable is set
-// for root containers, sub-containers, and execed processes.
+// for root containers, sub-containers, and exec'ed processes.
func TestMultiContainerHomeEnvDir(t *testing.T) {
- // TODO(gvisor.dev/issue/1487): VFSv2 configs failing.
// NOTE: Don't use overlay since we need changes to persist to the temp dir
// outside the sandbox.
- for testName, conf := range configs(t, noOverlay...) {
+ for testName, conf := range configsWithVFS2(t, noOverlay...) {
t.Run(testName, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
@@ -1725,12 +1715,11 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
homeDirs[name] = homeFile
}
- // We will sleep in the root container in order to ensure that
- // the root container doesn't terminate before sub containers can be
- // created.
- rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
- subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
- execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())}
+ // We will sleep in the root container in order to ensure that the root
+ //container doesn't terminate before sub containers can be created.
+ rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s; sleep 1000`, homeDirs["root"].Name())}
+ subCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["sub"].Name())}
+ execCmd := fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["exec"].Name())
// Setup the containers, a root container and sub container.
specConfig, ids := createSpecs(rootCmd, subCmd)
@@ -1741,9 +1730,8 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
defer cleanup()
// Exec into the root container synchronously.
- args := &control.ExecArgs{Argv: execCmd}
- if _, err := containers[0].executeSync(args); err != nil {
- t.Errorf("error executing %+v: %v", args, err)
+ if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil {
+ t.Errorf("error executing %+v: %v", execCmd, err)
}
// Wait for the subcontainer to finish.
diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go
index bac177a88..cb5bffb89 100644
--- a/runsc/container/shared_volume_test.go
+++ b/runsc/container/shared_volume_test.go
@@ -25,14 +25,14 @@ import (
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/test/testutil"
- "gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
)
// TestSharedVolume checks that modifications to a volume mount are propagated
// into and out of the sandbox.
func TestSharedVolume(t *testing.T) {
conf := testutil.TestConfig(t)
- conf.FileAccess = boot.FileAccessShared
+ conf.FileAccess = config.FileAccessShared
// Main process just sleeps. We will use "exec" to probe the state of
// the filesystem.
@@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) {
func checkFile(c *Container, filename string, want []byte) error {
cpy := filename + ".copy"
- argsCp := &control.ExecArgs{
- Filename: "/bin/cp",
- Argv: []string{"cp", "-f", filename, cpy},
- }
- if _, err := c.executeSync(argsCp); err != nil {
+ if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil {
return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err)
}
got, err := ioutil.ReadFile(cpy)
@@ -189,7 +185,7 @@ func checkFile(c *Container, filename string, want []byte) error {
// is reflected inside.
func TestSharedVolumeFile(t *testing.T) {
conf := testutil.TestConfig(t)
- conf.FileAccess = boot.FileAccessShared
+ conf.FileAccess = config.FileAccessShared
// Main process just sleeps. We will use "exec" to probe the state of
// the filesystem.
@@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) {
}
// Append to file inside the container and check that content is not lost.
- argsAppend := &control.ExecArgs{
- Filename: "/bin/bash",
- Argv: []string{"bash", "-c", "echo -n sandbox- >> " + filename},
- }
- if _, err := c.executeSync(argsAppend); err != nil {
+ if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil {
t.Fatalf("unexpected error appending file %q: %v", filename, err)
}
want = []byte("host-sandbox-")
diff --git a/runsc/debian/description b/runsc/debian/description
deleted file mode 100644
index 9e8e08805..000000000
--- a/runsc/debian/description
+++ /dev/null
@@ -1 +0,0 @@
-gVisor container sandbox runtime
diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh
deleted file mode 100755
index dc7aeee87..000000000
--- a/runsc/debian/postinst.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh -e
-
-# Copyright 2019 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if [ "$1" != configure ]; then
- exit 0
-fi
-
-if [ -f /etc/docker/daemon.json ]; then
- runsc install
- systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
-fi
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index 0ca4829d7..ba1ff833f 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -21,13 +21,19 @@ import (
type FlagSet = flag.FlagSet
var (
- NewFlagSet = flag.NewFlagSet
- String = flag.String
Bool = flag.Bool
- Int = flag.Int
- Uint = flag.Uint
CommandLine = flag.CommandLine
+ Int = flag.Int
+ NewFlagSet = flag.NewFlagSet
Parse = flag.Parse
+ String = flag.String
+ Uint = flag.Uint
+ Var = flag.Var
)
const ContinueOnError = flag.ContinueOnError
+
+// Get returns the flag's underlying object.
+func Get(v flag.Value) interface{} {
+ return v.(flag.Getter).Get()
+}
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 1036b0630..96c57a426 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -31,5 +31,7 @@ go_test(
deps = [
"//pkg/log",
"//pkg/p9",
+ "//pkg/test/testutil",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 1dce36965..39b8a0b1e 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -27,62 +27,51 @@ import (
var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_ACCEPT: {},
syscall.SYS_CLOCK_GETTIME: {},
- syscall.SYS_CLONE: []seccomp.Rule{
- {
- seccomp.AllowValue(
- syscall.CLONE_VM |
- syscall.CLONE_FS |
- syscall.CLONE_FILES |
- syscall.CLONE_SIGHAND |
- syscall.CLONE_SYSVSEM |
- syscall.CLONE_THREAD),
- },
- },
- syscall.SYS_CLOSE: {},
- syscall.SYS_DUP: {},
- syscall.SYS_EPOLL_CTL: {},
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
+ syscall.SYS_EPOLL_CTL: {},
syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EVENTFD2: []seccomp.Rule{
{
- seccomp.AllowValue(0),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(0),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EXIT: {},
syscall.SYS_EXIT_GROUP: {},
syscall.SYS_FALLOCATE: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
},
syscall.SYS_FCHMOD: {},
syscall.SYS_FCHOWNAT: {},
syscall.SYS_FCNTL: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_SETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_SETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFD),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFD),
},
// Used by flipcall.PacketWindowAllocator.Init().
{
- seccomp.AllowAny{},
- seccomp.AllowValue(unix.F_ADD_SEALS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(unix.F_ADD_SEALS),
},
},
syscall.SYS_FSTAT: {},
@@ -91,31 +80,31 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_FTRUNCATE: {},
syscall.SYS_FUTEX: {
seccomp.Rule{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
seccomp.Rule{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
// Non-private futex used for flipcall.
seccomp.Rule{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
seccomp.Rule{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
},
syscall.SYS_GETDENTS64: {},
@@ -128,6 +117,7 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_MADVISE: {},
unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init().
syscall.SYS_MKDIRAT: {},
+ syscall.SYS_MKNODAT: {},
// Used by the Go runtime as a temporarily workaround for a Linux
// 5.2-5.4 bug.
//
@@ -136,28 +126,28 @@ var allowedSyscalls = seccomp.SyscallRules{
// TODO(b/148688965): Remove once this is gone from Go.
syscall.SYS_MLOCK: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(4096),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4096),
},
},
syscall.SYS_MMAP: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_SHARED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_SHARED),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
},
},
syscall.SYS_MPROTECT: {},
@@ -171,14 +161,14 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_READLINKAT: {},
syscall.SYS_RECVMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
},
},
syscall.SYS_RENAMEAT: {},
@@ -189,33 +179,33 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_SENDMSG: []seccomp.Rule{
// Used by fdchannel.Endpoint.SendFD().
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
// Used by unet.SocketWriter.WriteVec().
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
},
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
},
syscall.SYS_SIGALTSTACK: {},
// Used by fdchannel.NewConnectedSockets().
syscall.SYS_SOCKETPAIR: {
{
- seccomp.AllowValue(syscall.AF_UNIX),
- seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_UNIX),
+ seccomp.EqualTo(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_SYMLINKAT: {},
syscall.SYS_TGKILL: []seccomp.Rule{
{
- seccomp.AllowValue(uint64(os.Getpid())),
+ seccomp.EqualTo(uint64(os.Getpid())),
},
},
syscall.SYS_UNLINKAT: {},
@@ -226,24 +216,24 @@ var allowedSyscalls = seccomp.SyscallRules{
var udsSyscalls = seccomp.SyscallRules{
syscall.SYS_SOCKET: []seccomp.Rule{
{
- seccomp.AllowValue(syscall.AF_UNIX),
- seccomp.AllowValue(syscall.SOCK_STREAM),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_UNIX),
+ seccomp.EqualTo(syscall.SOCK_STREAM),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_UNIX),
- seccomp.AllowValue(syscall.SOCK_DGRAM),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_UNIX),
+ seccomp.EqualTo(syscall.SOCK_DGRAM),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_UNIX),
- seccomp.AllowValue(syscall.SOCK_SEQPACKET),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_UNIX),
+ seccomp.EqualTo(syscall.SOCK_SEQPACKET),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_CONNECT: []seccomp.Rule{
{
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
},
},
}
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
index a4b28cb8b..686753d96 100644
--- a/runsc/fsgofer/filter/config_amd64.go
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -25,8 +25,41 @@ import (
func init() {
allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
- {seccomp.AllowValue(linux.ARCH_GET_FS)},
- {seccomp.AllowValue(linux.ARCH_SET_FS)},
+ // TODO(b/168828518): No longer used in Go 1.16+.
+ {seccomp.EqualTo(linux.ARCH_SET_FS)},
+ }
+
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ // parent_tidptr and child_tidptr are always 0 because neither
+ // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SETTLS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
+ {
+ // TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
}
allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{}
diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go
index d2697deb7..ff0cf77a0 100644
--- a/runsc/fsgofer/filter/config_arm64.go
+++ b/runsc/fsgofer/filter/config_arm64.go
@@ -23,5 +23,26 @@ import (
)
func init() {
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ // parent_tidptr and child_tidptr are always 0 because neither
+ // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ // These arguments are left uninitialized by the Go
+ // runtime, so they may be anything (and are unused by
+ // the host).
+ seccomp.MatchAny{}, // parent_tidptr
+ seccomp.MatchAny{}, // tls
+ seccomp.MatchAny{}, // child_tidptr
+ },
+ }
+
allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{}
}
diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go
index 885c92f7a..20a0732be 100644
--- a/runsc/fsgofer/filter/extra_filters_race.go
+++ b/runsc/fsgofer/filter/extra_filters_race.go
@@ -35,6 +35,7 @@ func instrumentationFilters() seccomp.SyscallRules {
syscall.SYS_MUNLOCK: {},
syscall.SYS_NANOSLEEP: {},
syscall.SYS_OPEN: {},
+ syscall.SYS_OPENAT: {},
syscall.SYS_SET_ROBUST_LIST: {},
// Used within glibc's malloc.
syscall.SYS_TIME: {},
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 74977c313..0b628c8ce 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -29,7 +29,6 @@ import (
"path/filepath"
"runtime"
"strconv"
- "syscall"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -45,39 +44,11 @@ const (
// modes to ensure an unopened/closed file fails all mode checks.
invalidMode = p9.OpenFlags(math.MaxUint32)
- openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
-)
-
-type fileType int
+ openFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC
-const (
- regular fileType = iota
- directory
- symlink
- socket
- unknown
+ allowedOpenFlags = unix.O_TRUNC
)
-// String implements fmt.Stringer.
-func (f fileType) String() string {
- switch f {
- case regular:
- return "regular"
- case directory:
- return "directory"
- case symlink:
- return "symlink"
- case socket:
- return "socket"
- }
- return "unknown"
-}
-
-// ControlSocketAddr generates an abstract unix socket name for the given id.
-func ControlSocketAddr(id string) string {
- return fmt.Sprintf("\x00runsc-gofer.%s", id)
-}
-
// Config sets configuration options for each attach point.
type Config struct {
// ROMount is set to true if this is a readonly mount.
@@ -132,19 +103,19 @@ func (a *attachPoint) Attach() (p9.File, error) {
return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
}
- f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
+ f, readable, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
return fd.Open(a.prefix, openFlags|mode, 0)
})
if err != nil {
return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
}
- stat, err := stat(f.FD())
+ stat, err := fstat(f.FD())
if err != nil {
return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
}
- lf, err := newLocalFile(a, f, a.prefix, stat)
+ lf, err := newLocalFile(a, f, a.prefix, readable, stat)
if err != nil {
return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
}
@@ -153,7 +124,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
}
// makeQID returns a unique QID for the given stat buffer.
-func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
+func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID {
a.deviceMu.Lock()
defer a.deviceMu.Unlock()
@@ -184,9 +155,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
// localFile implements p9.File wrapping a local file. The underlying file
// is opened during Walk() and stored in 'file' to be used with other
// operations. The file is opened as readonly, unless it's a symlink or there is
-// no read access, which requires O_PATH. 'file' is dup'ed when Walk(nil) is
-// called to clone the file. This reduces the number of walks that need to be
-// done by the host file system when files are reused.
+// no read access, which requires O_PATH.
//
// The file may be reopened if the requested mode in Open() is not a subset of
// current mode. Consequently, 'file' could have a mode wider than requested and
@@ -198,13 +167,30 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
// performance with 'overlay2' storage driver. overlay2 eagerly copies the
// entire file up when it's opened in write mode, and would perform badly when
// multiple files are only being opened for read (esp. startup).
+//
+// File operations must use "at" functions whenever possible:
+// * Local operations must use AT_EMPTY_PATH:
+// fchownat(fd, "", AT_EMPTY_PATH, ...), instead of chown(fullpath, ...)
+// * Creation operations must use (fd + name):
+// mkdirat(fd, name, ...), instead of mkdir(fullpath, ...)
+//
+// Apart from being faster, it also adds another layer of defense against
+// symlink attacks (note that O_NOFOLLOW applies only to the last element in
+// the path).
+//
+// The few exceptions where it cannot be done are: utimensat on symlinks, and
+// Connect() for the socket address.
type localFile struct {
- p9.DefaultWalkGetAttr
+ p9.DisallowClientCalls
// attachPoint is the attachPoint that serves this localFile.
attachPoint *attachPoint
- // hostPath will be safely updated by the Renamed hook.
+ // hostPath is the full path to the host file. It can be used for logging and
+ // the few cases where full path is required to operation the host file. In
+ // all other cases, use "file" directly.
+ //
+ // Note: it's safely updated by the Renamed hook.
hostPath string
// file is opened when localFile is created and it's never nil. It may be
@@ -212,12 +198,19 @@ type localFile struct {
// opened with.
file *fd.FD
+ // controlReadable tells whether 'file' was opened with read permissions
+ // during a walk.
+ controlReadable bool
+
// mode is the mode in which the file was opened. Set to invalidMode
// if localFile isn't opened.
mode p9.OpenFlags
- // ft is the fileType for this file.
- ft fileType
+ // fileType for this file. It is equivalent to:
+ // unix.Stat_t.Mode & unix.S_IFMT
+ fileType uint32
+
+ qid p9.QID
// readDirMu protects against concurrent Readdir calls.
readDirMu sync.Mutex
@@ -234,7 +227,7 @@ var procSelfFD *fd.FD
// OpenProcSelfFD opens the /proc/self/fd directory, which will be used to
// reopen file descriptors.
func OpenProcSelfFD() error {
- d, err := syscall.Open("/proc/self/fd", syscall.O_RDONLY|syscall.O_DIRECTORY, 0)
+ d, err := unix.Open("/proc/self/fd", unix.O_RDONLY|unix.O_DIRECTORY, 0)
if err != nil {
return fmt.Errorf("error opening /proc/self/fd: %v", err)
}
@@ -243,7 +236,7 @@ func OpenProcSelfFD() error {
}
func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
- d, err := syscall.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^syscall.O_NOFOLLOW, 0)
+ d, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(f.FD()), mode&^unix.O_NOFOLLOW, 0)
if err != nil {
return nil, err
}
@@ -251,83 +244,88 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
return fd.New(d), nil
}
-func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) {
- path := path.Join(parent.hostPath, name)
- f, err := openAnyFile(path, func(mode int) (*fd.FD, error) {
+func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) {
+ pathDebug := path.Join(parent.hostPath, name)
+ f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) {
return fd.OpenAt(parent.file, name, openFlags|mode, 0)
})
- return f, path, err
+ return f, pathDebug, readable, err
}
-// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback
+// openAnyFile attempts to open the file in O_RDONLY. If it fails, falls back
// to O_PATH. 'path' is used for logging messages only. 'fn' is what does the
// actual file open and is customizable by the caller.
-func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
+func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) {
// Attempt to open file in the following mode in order:
// 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
// Use non-blocking to prevent getting stuck inside open(2) for
// FIFOs. This option has no effect on regular files.
// 2. PATH: for symlinks, sockets.
- modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
+ options := []struct {
+ mode int
+ readable bool
+ }{
+ {
+ mode: unix.O_RDONLY | unix.O_NONBLOCK,
+ readable: true,
+ },
+ {
+ mode: unix.O_PATH,
+ readable: false,
+ },
+ }
var err error
- var file *fd.FD
- for i, mode := range modes {
- file, err = fn(mode)
+ for i, option := range options {
+ var file *fd.FD
+ file, err = fn(option.mode)
if err == nil {
- // openat succeeded, we're done.
- break
+ // Succeeded opening the file, we're done.
+ return file, option.readable, nil
}
switch e := extractErrno(err); e {
- case syscall.ENOENT:
+ case unix.ENOENT:
// File doesn't exist, no point in retrying.
- return nil, e
+ return nil, false, e
}
- // openat failed. Try again with next mode, preserving 'err' in case this
- // was the last attempt.
- log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err)
+ // File failed to open. Try again with next mode, preserving 'err' in case
+ // this was the last attempt.
+ log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, pathDebug, err)
}
- if err != nil {
- // All attempts to open file have failed, return the last error.
- log.Debugf("Failed to open file, path: %q, err: %v", path, err)
- return nil, extractErrno(err)
- }
-
- return file, nil
+ // All attempts to open file have failed, return the last error.
+ log.Debugf("Failed to open file, path: %q, err: %v", pathDebug, err)
+ return nil, false, extractErrno(err)
}
-func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) {
- var ft fileType
- switch stat.Mode & syscall.S_IFMT {
- case syscall.S_IFREG:
- ft = regular
- case syscall.S_IFDIR:
- ft = directory
- case syscall.S_IFLNK:
- ft = symlink
- case syscall.S_IFSOCK:
+func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error {
+ switch stat.Mode & unix.S_IFMT {
+ case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK:
+ return nil
+
+ case unix.S_IFSOCK:
if !permitSocket {
- return unknown, syscall.EPERM
+ return unix.EPERM
}
- ft = socket
+ return nil
+
default:
- return unknown, syscall.EPERM
+ return unix.EPERM
}
- return ft, nil
}
-func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
- ft, err := getSupportedFileType(stat, a.conf.HostUDS)
- if err != nil {
+func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) {
+ if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil {
return nil, err
}
return &localFile{
- attachPoint: a,
- hostPath: path,
- file: file,
- mode: invalidMode,
- ft: ft,
+ attachPoint: a,
+ hostPath: path,
+ file: file,
+ mode: invalidMode,
+ fileType: stat.Mode & unix.S_IFMT,
+ qid: a.makeQID(stat),
+ controlReadable: readable,
}, nil
}
@@ -335,7 +333,7 @@ func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t)
// non-blocking. If anything fails, returns nil. It's better to have a file
// without host FD, than to fail the operation.
func newFDMaybe(file *fd.FD) *fd.FD {
- dupFD, err := syscall.Dup(file.FD())
+ dupFD, err := unix.Dup(file.FD())
// Technically, the runtime may call the finalizer on file as soon as
// FD() returns.
runtime.KeepAlive(file)
@@ -345,23 +343,23 @@ func newFDMaybe(file *fd.FD) *fd.FD {
dup := fd.New(dupFD)
// fd is blocking; non-blocking is required.
- if err := syscall.SetNonblock(dup.FD(), true); err != nil {
- dup.Close()
+ if err := unix.SetNonblock(dup.FD(), true); err != nil {
+ _ = dup.Close()
return nil
}
return dup
}
-func stat(fd int) (syscall.Stat_t, error) {
- var stat syscall.Stat_t
- if err := syscall.Fstat(fd, &stat); err != nil {
- return syscall.Stat_t{}, err
+func fstat(fd int) (unix.Stat_t, error) {
+ var stat unix.Stat_t
+ if err := unix.Fstat(fd, &stat); err != nil {
+ return unix.Stat_t{}, err
}
return stat, nil
}
func fchown(fd int, uid p9.UID, gid p9.GID) error {
- return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
+ return unix.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
}
// Open implements p9.File.
@@ -369,10 +367,16 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
if l.isOpen() {
panic(fmt.Sprintf("attempting to open already opened file: %q", l.hostPath))
}
+ mode := flags & p9.OpenFlagsModeMask
+ if mode == p9.WriteOnly || mode == p9.ReadWrite || flags&p9.OpenTruncate != 0 {
+ if err := l.checkROMount(); err != nil {
+ return nil, p9.QID{}, 0, err
+ }
+ }
// Check if control file can be used or if a new open must be created.
var newFile *fd.FD
- if flags == p9.ReadOnly {
+ if mode == p9.ReadOnly && l.controlReadable && flags.OSFlags()&allowedOpenFlags == 0 {
log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath)
newFile = l.file
} else {
@@ -381,23 +385,15 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
// name_to_handle_at and open_by_handle_at aren't supported by overlay2.
log.Debugf("Open reopening file, flags: %v, %q", flags, l.hostPath)
var err error
- // Constrain open flags to the open mode and O_TRUNC.
- newFile, err = reopenProcFd(l.file, openFlags|(flags.OSFlags()&(syscall.O_ACCMODE|syscall.O_TRUNC)))
+ osFlags := flags.OSFlags() & (unix.O_ACCMODE | allowedOpenFlags)
+ newFile, err = reopenProcFd(l.file, openFlags|osFlags)
if err != nil {
return nil, p9.QID{}, 0, extractErrno(err)
}
}
- stat, err := stat(newFile.FD())
- if err != nil {
- if newFile != l.file {
- newFile.Close()
- }
- return nil, p9.QID{}, 0, extractErrno(err)
- }
-
var fd *fd.FD
- if stat.Mode&syscall.S_IFMT == syscall.S_IFREG {
+ if l.fileType == unix.S_IFREG {
// Donate FD for regular files only.
fd = newFDMaybe(newFile)
}
@@ -409,38 +405,38 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
}
l.file = newFile
}
- l.mode = flags & p9.OpenFlagsModeMask
- return fd, l.attachPoint.makeQID(stat), 0, nil
+ l.mode = mode
+ return fd, l.qid, 0, nil
}
// Create implements p9.File.
-func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
- conf := l.attachPoint.conf
- if conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
- return nil, nil, p9.QID{}, 0, syscall.EBADF
+func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) {
+ if err := l.checkROMount(); err != nil {
+ return nil, nil, p9.QID{}, 0, err
}
+ // Set file creation flags, plus allowed open flags from caller.
+ osFlags := openFlags | unix.O_CREAT | unix.O_EXCL
+ osFlags |= p9Flags.OSFlags() & allowedOpenFlags
+
// 'file' may be used for other operations (e.g. Walk), so read access is
// always added to flags. Note that resulting file might have a wider mode
// than needed for each particular case.
- flags := openFlags | syscall.O_CREAT | syscall.O_EXCL
+ mode := p9Flags & p9.OpenFlagsModeMask
if mode == p9.WriteOnly {
- flags |= syscall.O_RDWR
+ osFlags |= unix.O_RDWR
} else {
- flags |= mode.OSFlags()
+ osFlags |= mode.OSFlags()
}
- child, err := fd.OpenAt(l.file, name, flags, uint32(perm.Permissions()))
+ child, err := fd.OpenAt(l.file, name, osFlags, uint32(perm.Permissions()))
if err != nil {
return nil, nil, p9.QID{}, 0, extractErrno(err)
}
cu := cleanup.Make(func() {
- child.Close()
+ _ = child.Close()
// Best effort attempt to remove the file in case of failure.
- if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
+ if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
}
})
@@ -449,7 +445,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
if err := fchown(child.FD(), uid, gid); err != nil {
return nil, nil, p9.QID{}, 0, extractErrno(err)
}
- stat, err := stat(child.FD())
+ stat, err := fstat(child.FD())
if err != nil {
return nil, nil, p9.QID{}, 0, extractErrno(err)
}
@@ -459,23 +455,21 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
hostPath: path.Join(l.hostPath, name),
file: child,
mode: mode,
+ fileType: unix.S_IFREG,
+ qid: l.attachPoint.makeQID(stat),
}
cu.Release()
- return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil
+ return newFDMaybe(c.file), c, c.qid, 0, nil
}
// Mkdir implements p9.File.
func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
- conf := l.attachPoint.conf
- if conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
- return p9.QID{}, syscall.EBADF
+ if err := l.checkROMount(); err != nil {
+ return p9.QID{}, err
}
- if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
+ if err := unix.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
return p9.QID{}, extractErrno(err)
}
cu := cleanup.Make(func() {
@@ -487,7 +481,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
defer cu.Clean()
// Open directory to change ownership and stat it.
- flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags
+ flags := unix.O_DIRECTORY | unix.O_RDONLY | openFlags
f, err := fd.OpenAt(l.file, name, flags, 0)
if err != nil {
return p9.QID{}, extractErrno(err)
@@ -497,7 +491,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
if err := fchown(f.FD(), uid, gid); err != nil {
return p9.QID{}, extractErrno(err)
}
- stat, err := stat(f.FD())
+ stat, err := fstat(f.FD())
if err != nil {
return p9.QID{}, extractErrno(err)
}
@@ -508,61 +502,80 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
// Walk implements p9.File.
func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+ qids, file, _, err := l.walk(names)
+ return qids, file, err
+}
+
+// WalkGetAttr implements p9.File.
+func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) {
+ qids, file, stat, err := l.walk(names)
+ if err != nil {
+ return nil, nil, p9.AttrMask{}, p9.Attr{}, err
+ }
+ mask, attr := l.fillAttr(stat)
+ return qids, file, mask, attr, nil
+}
+
+func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error) {
// Duplicate current file if 'names' is empty.
if len(names) == 0 {
- newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
+ newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
return reopenProcFd(l.file, openFlags|mode)
})
if err != nil {
- return nil, nil, extractErrno(err)
+ return nil, nil, unix.Stat_t{}, extractErrno(err)
}
- stat, err := stat(newFile.FD())
+ stat, err := fstat(newFile.FD())
if err != nil {
- newFile.Close()
- return nil, nil, extractErrno(err)
+ _ = newFile.Close()
+ return nil, nil, unix.Stat_t{}, extractErrno(err)
}
c := &localFile{
- attachPoint: l.attachPoint,
- hostPath: l.hostPath,
- file: newFile,
- mode: invalidMode,
+ attachPoint: l.attachPoint,
+ hostPath: l.hostPath,
+ file: newFile,
+ mode: invalidMode,
+ fileType: l.fileType,
+ qid: l.attachPoint.makeQID(stat),
+ controlReadable: readable,
}
- return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil
+ return []p9.QID{c.qid}, c, stat, nil
}
var qids []p9.QID
+ var lastStat unix.Stat_t
last := l
for _, name := range names {
- f, path, err := openAnyFileFromParent(last, name)
+ f, path, readable, err := openAnyFileFromParent(last, name)
if last != l {
- last.Close()
+ _ = last.Close()
}
if err != nil {
- return nil, nil, extractErrno(err)
+ return nil, nil, unix.Stat_t{}, extractErrno(err)
}
- stat, err := stat(f.FD())
+ lastStat, err = fstat(f.FD())
if err != nil {
- f.Close()
- return nil, nil, extractErrno(err)
+ _ = f.Close()
+ return nil, nil, unix.Stat_t{}, extractErrno(err)
}
- c, err := newLocalFile(last.attachPoint, f, path, stat)
+ c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat)
if err != nil {
- f.Close()
- return nil, nil, extractErrno(err)
+ _ = f.Close()
+ return nil, nil, unix.Stat_t{}, extractErrno(err)
}
- qids = append(qids, l.attachPoint.makeQID(stat))
+ qids = append(qids, c.qid)
last = c
}
- return qids, last, nil
+ return qids, last, lastStat, nil
}
// StatFS implements p9.File.
func (l *localFile) StatFS() (p9.FSStat, error) {
- var s syscall.Statfs_t
- if err := syscall.Fstatfs(l.file.FD(), &s); err != nil {
+ var s unix.Statfs_t
+ if err := unix.Fstatfs(l.file.FD(), &s); err != nil {
return p9.FSStat{}, extractErrno(err)
}
@@ -582,9 +595,9 @@ func (l *localFile) StatFS() (p9.FSStat, error) {
// FSync implements p9.File.
func (l *localFile) FSync() error {
if !l.isOpen() {
- return syscall.EBADF
+ return unix.EBADF
}
- if err := syscall.Fsync(l.file.FD()); err != nil {
+ if err := unix.Fsync(l.file.FD()); err != nil {
return extractErrno(err)
}
return nil
@@ -592,11 +605,15 @@ func (l *localFile) FSync() error {
// GetAttr implements p9.File.
func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
- stat, err := stat(l.file.FD())
+ stat, err := fstat(l.file.FD())
if err != nil {
return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
}
+ mask, attr := l.fillAttr(stat)
+ return l.qid, mask, attr, nil
+}
+func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) {
attr := p9.Attr{
Mode: p9.FileMode(stat.Mode),
UID: p9.UID(stat.Uid),
@@ -625,20 +642,15 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
MTime: true,
CTime: true,
}
-
- return l.attachPoint.makeQID(stat), valid, attr, nil
+ return valid, attr
}
// SetAttr implements p9.File. Due to mismatch in file API, options
// cannot be changed atomically and user may see partial changes when
// an error happens.
func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
- conf := l.attachPoint.conf
- if conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
- return syscall.EBADF
+ if err := l.checkROMount(); err != nil {
+ return err
}
allowed := p9.SetAttrMask{
@@ -661,13 +673,13 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
// consistent result that is not attribute dependent.
if !valid.IsSubsetOf(allowed) {
log.Warningf("SetAttr() failed for %q, mask: %v", l.hostPath, valid)
- return syscall.EPERM
+ return unix.EPERM
}
// Check if it's possible to use cached file, or if another one needs to be
// opened for write.
f := l.file
- if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+ if l.fileType == unix.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
var err error
f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY)
if err != nil {
@@ -688,21 +700,21 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
// over another.
var err error
if valid.Permissions {
- if cerr := syscall.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
+ if cerr := unix.Fchmod(f.FD(), uint32(attr.Permissions)); cerr != nil {
log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr)
err = extractErrno(cerr)
}
}
if valid.Size {
- if terr := syscall.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
+ if terr := unix.Ftruncate(f.FD(), int64(attr.Size)); terr != nil {
log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr)
err = extractErrno(terr)
}
}
if valid.ATime || valid.MTime {
- utimes := [2]syscall.Timespec{
+ utimes := [2]unix.Timespec{
{Sec: 0, Nsec: linux.UTIME_OMIT},
{Sec: 0, Nsec: linux.UTIME_OMIT},
}
@@ -723,15 +735,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
}
}
- if l.ft == symlink {
+ if l.fileType == unix.S_IFLNK {
// utimensat operates different that other syscalls. To operate on a
// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
// name.
- parent, err := syscall.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+ parent, err := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
if err != nil {
return extractErrno(err)
}
- defer syscall.Close(parent)
+ defer unix.Close(parent)
if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
@@ -756,7 +768,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
if valid.GID {
gid = int(attr.GID)
}
- if oerr := syscall.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
+ if oerr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
err = extractErrno(oerr)
}
@@ -766,28 +778,28 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
}
func (*localFile) GetXattr(string, uint64) (string, error) {
- return "", syscall.EOPNOTSUPP
+ return "", unix.EOPNOTSUPP
}
func (*localFile) SetXattr(string, string, uint32) error {
- return syscall.EOPNOTSUPP
+ return unix.EOPNOTSUPP
}
func (*localFile) ListXattr(uint64) (map[string]struct{}, error) {
- return nil, syscall.EOPNOTSUPP
+ return nil, unix.EOPNOTSUPP
}
func (*localFile) RemoveXattr(string) error {
- return syscall.EOPNOTSUPP
+ return unix.EOPNOTSUPP
}
// Allocate implements p9.File.
func (l *localFile) Allocate(mode p9.AllocateMode, offset, length uint64) error {
if !l.isOpen() {
- return syscall.EBADF
+ return unix.EBADF
}
- if err := syscall.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
+ if err := unix.Fallocate(l.file.FD(), mode.ToLinux(), int64(offset), int64(length)); err != nil {
return extractErrno(err)
}
return nil
@@ -800,12 +812,8 @@ func (*localFile) Rename(p9.File, string) error {
// RenameAt implements p9.File.RenameAt.
func (l *localFile) RenameAt(oldName string, directory p9.File, newName string) error {
- conf := l.attachPoint.conf
- if conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
- return syscall.EBADF
+ if err := l.checkROMount(); err != nil {
+ return err
}
newParent := directory.(*localFile)
@@ -818,10 +826,10 @@ func (l *localFile) RenameAt(oldName string, directory p9.File, newName string)
// ReadAt implements p9.File.
func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
- return 0, syscall.EBADF
+ return 0, unix.EBADF
}
if !l.isOpen() {
- return 0, syscall.EBADF
+ return 0, unix.EBADF
}
r, err := l.file.ReadAt(p, int64(offset))
@@ -836,10 +844,10 @@ func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) {
// WriteAt implements p9.File.
func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
- return 0, syscall.EBADF
+ return 0, unix.EBADF
}
if !l.isOpen() {
- return 0, syscall.EBADF
+ return 0, unix.EBADF
}
w, err := l.file.WriteAt(p, int64(offset))
@@ -851,12 +859,8 @@ func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) {
// Symlink implements p9.File.
func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
- conf := l.attachPoint.conf
- if conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
- return p9.QID{}, syscall.EBADF
+ if err := l.checkROMount(); err != nil {
+ return p9.QID{}, err
}
if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil {
@@ -864,7 +868,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
}
cu := cleanup.Make(func() {
// Best effort attempt to remove the symlink in case of failure.
- if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
+ if err := unix.Unlinkat(l.file.FD(), newName, 0); err != nil {
log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
}
})
@@ -880,7 +884,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
if err := fchown(f.FD(), uid, gid); err != nil {
return p9.QID{}, extractErrno(err)
}
- stat, err := stat(f.FD())
+ stat, err := fstat(f.FD())
if err != nil {
return p9.QID{}, extractErrno(err)
}
@@ -891,12 +895,8 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
// Link implements p9.File.
func (l *localFile) Link(target p9.File, newName string) error {
- conf := l.attachPoint.conf
- if conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
- return syscall.EBADF
+ if err := l.checkROMount(); err != nil {
+ return err
}
targetFile := target.(*localFile)
@@ -907,23 +907,53 @@ func (l *localFile) Link(target p9.File, newName string) error {
}
// Mknod implements p9.File.
-//
-// Not implemented.
-func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+ if err := l.checkROMount(); err != nil {
+ return p9.QID{}, err
+ }
+
// From mknod(2) man page:
// "EPERM: [...] if the filesystem containing pathname does not support
// the type of node requested."
- return p9.QID{}, syscall.EPERM
+ if mode.FileType() != p9.ModeRegular {
+ return p9.QID{}, unix.EPERM
+ }
+
+ // Allow Mknod to create regular files.
+ if err := unix.Mknodat(l.file.FD(), name, uint32(mode), 0); err != nil {
+ return p9.QID{}, err
+ }
+ cu := cleanup.Make(func() {
+ // Best effort attempt to remove the file in case of failure.
+ if err := unix.Unlinkat(l.file.FD(), name, 0); err != nil {
+ log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
+ }
+ })
+ defer cu.Clean()
+
+ // Open file to change ownership and stat it.
+ child, err := fd.OpenAt(l.file, name, unix.O_PATH|openFlags, 0)
+ if err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ defer child.Close()
+
+ if err := fchown(child.FD(), uid, gid); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ stat, err := fstat(child.FD())
+ if err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+
+ cu.Release()
+ return l.attachPoint.makeQID(stat), nil
}
// UnlinkAt implements p9.File.
func (l *localFile) UnlinkAt(name string, flags uint32) error {
- conf := l.attachPoint.conf
- if conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
- return syscall.EBADF
+ if err := l.checkROMount(); err != nil {
+ return err
}
if err := unix.Unlinkat(l.file.FD(), name, int(flags)); err != nil {
@@ -935,10 +965,10 @@ func (l *localFile) UnlinkAt(name string, flags uint32) error {
// Readdir implements p9.File.
func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite {
- return nil, syscall.EBADF
+ return nil, unix.EBADF
}
if !l.isOpen() {
- return nil, syscall.EBADF
+ return nil, unix.EBADF
}
// Readdirnames is a cursor over directories, so seek back to 0 to ensure it's
@@ -949,10 +979,13 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
skip := uint64(0)
- // Check if the file is at the correct position already. If not, seek to the
- // beginning and read the entire directory again.
- if l.lastDirentOffset != offset {
- if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
+ // Check if the file is at the correct position already. If not, seek to
+ // the beginning and read the entire directory again. We always seek if
+ // offset is 0, since this is side-effectual (equivalent to rewinddir(3),
+ // which causes the directory stream to resynchronize with the directory's
+ // current contents).
+ if l.lastDirentOffset != offset || offset == 0 {
+ if _, err := unix.Seek(l.file.FD(), 0, 0); err != nil {
return nil, extractErrno(err)
}
skip = offset
@@ -985,7 +1018,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
end := offset + uint64(count)
for offset < end {
- dirSize, err := syscall.ReadDirent(f, direntsBuf)
+ dirSize, err := unix.ReadDirent(f, direntsBuf)
if err != nil {
return dirents, err
}
@@ -994,7 +1027,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
}
names := names[:0]
- _, _, names = syscall.ParseDirent(direntsBuf[:dirSize], -1, names)
+ _, _, names = unix.ParseDirent(direntsBuf[:dirSize], -1, names)
// Skip over entries that the caller is not interested in.
if skip > 0 {
@@ -1039,7 +1072,7 @@ func (l *localFile) Readlink() (string, error) {
return string(b[:n]), nil
}
}
- return "", syscall.ENOMEM
+ return "", unix.ENOMEM
}
// Flush implements p9.File.
@@ -1050,7 +1083,7 @@ func (l *localFile) Flush() error {
// Connect implements p9.File.
func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
if !l.attachPoint.conf.HostUDS {
- return nil, syscall.ECONNREFUSED
+ return nil, unix.ECONNREFUSED
}
// TODO(gvisor.dev/issue/1003): Due to different app vs replacement
@@ -1058,34 +1091,34 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
// fit f.path in our sockaddr. We'd need to redirect through a shorter
// path in order to actually connect to this socket.
if len(l.hostPath) > linux.UnixPathMax {
- return nil, syscall.ECONNREFUSED
+ return nil, unix.ECONNREFUSED
}
var stype int
switch flags {
case p9.StreamSocket:
- stype = syscall.SOCK_STREAM
+ stype = unix.SOCK_STREAM
case p9.DgramSocket:
- stype = syscall.SOCK_DGRAM
+ stype = unix.SOCK_DGRAM
case p9.SeqpacketSocket:
- stype = syscall.SOCK_SEQPACKET
+ stype = unix.SOCK_SEQPACKET
default:
- return nil, syscall.ENXIO
+ return nil, unix.ENXIO
}
- f, err := syscall.Socket(syscall.AF_UNIX, stype, 0)
+ f, err := unix.Socket(unix.AF_UNIX, stype, 0)
if err != nil {
return nil, err
}
- if err := syscall.SetNonblock(f, true); err != nil {
- syscall.Close(f)
+ if err := unix.SetNonblock(f, true); err != nil {
+ _ = unix.Close(f)
return nil, err
}
- sa := syscall.SockaddrUnix{Name: l.hostPath}
- if err := syscall.Connect(f, &sa); err != nil {
- syscall.Close(f)
+ sa := unix.SockaddrUnix{Name: l.hostPath}
+ if err := unix.Connect(f, &sa); err != nil {
+ _ = unix.Close(f)
return nil, err
}
@@ -1110,7 +1143,7 @@ func (l *localFile) Renamed(newDir p9.File, newName string) {
}
// extractErrno tries to determine the errno.
-func extractErrno(err error) syscall.Errno {
+func extractErrno(err error) unix.Errno {
if err == nil {
// This should never happen. The likely result will be that
// some user gets the frustrating "error: SUCCESS" message.
@@ -1120,18 +1153,18 @@ func extractErrno(err error) syscall.Errno {
switch err {
case os.ErrNotExist:
- return syscall.ENOENT
+ return unix.ENOENT
case os.ErrExist:
- return syscall.EEXIST
+ return unix.EEXIST
case os.ErrPermission:
- return syscall.EACCES
+ return unix.EACCES
case os.ErrInvalid:
- return syscall.EINVAL
+ return unix.EINVAL
}
// See if it's an errno or a common wrapped error.
switch e := err.(type) {
- case syscall.Errno:
+ case unix.Errno:
return e
case *os.PathError:
return extractErrno(e.Err)
@@ -1143,5 +1176,12 @@ func extractErrno(err error) syscall.Errno {
// Fall back to EIO.
log.Debugf("Unknown error: %v, defaulting to EIO", err)
- return syscall.EIO
+ return unix.EIO
+}
+
+func (l *localFile) checkROMount() error {
+ if conf := l.attachPoint.conf; conf.ROMount {
+ return unix.EROFS
+ }
+ return nil
}
diff --git a/runsc/fsgofer/fsgofer_amd64_unsafe.go b/runsc/fsgofer/fsgofer_amd64_unsafe.go
index 5d4aab597..c46958185 100644
--- a/runsc/fsgofer/fsgofer_amd64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_amd64_unsafe.go
@@ -17,25 +17,25 @@
package fsgofer
import (
- "syscall"
"unsafe"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/syserr"
)
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
- nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+ nameBytes, err := unix.BytePtrFromString(name)
if err != nil {
- return syscall.Stat_t{}, err
+ return unix.Stat_t{}, err
}
namePtr := unsafe.Pointer(nameBytes)
- var stat syscall.Stat_t
+ var stat unix.Stat_t
statPtr := unsafe.Pointer(&stat)
- if _, _, errno := syscall.Syscall6(
- syscall.SYS_NEWFSTATAT,
+ if _, _, errno := unix.Syscall6(
+ unix.SYS_NEWFSTATAT,
uintptr(dirFd),
uintptr(namePtr),
uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
0,
0); errno != 0 {
- return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+ return unix.Stat_t{}, syserr.FromHost(errno).ToError()
}
return stat, nil
}
diff --git a/runsc/fsgofer/fsgofer_arm64_unsafe.go b/runsc/fsgofer/fsgofer_arm64_unsafe.go
index 8041fd352..491460718 100644
--- a/runsc/fsgofer/fsgofer_arm64_unsafe.go
+++ b/runsc/fsgofer/fsgofer_arm64_unsafe.go
@@ -17,25 +17,25 @@
package fsgofer
import (
- "syscall"
"unsafe"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/syserr"
)
-func statAt(dirFd int, name string) (syscall.Stat_t, error) {
- nameBytes, err := syscall.BytePtrFromString(name)
+func statAt(dirFd int, name string) (unix.Stat_t, error) {
+ nameBytes, err := unix.BytePtrFromString(name)
if err != nil {
- return syscall.Stat_t{}, err
+ return unix.Stat_t{}, err
}
namePtr := unsafe.Pointer(nameBytes)
- var stat syscall.Stat_t
+ var stat unix.Stat_t
statPtr := unsafe.Pointer(&stat)
- if _, _, errno := syscall.Syscall6(
- syscall.SYS_FSTATAT,
+ if _, _, errno := unix.Syscall6(
+ unix.SYS_FSTATAT,
uintptr(dirFd),
uintptr(namePtr),
uintptr(statPtr),
@@ -43,7 +43,7 @@ func statAt(dirFd int, name string) (syscall.Stat_t, error) {
0,
0); errno != 0 {
- return syscall.Stat_t{}, syserr.FromHost(errno).ToError()
+ return unix.Stat_t{}, syserr.FromHost(errno).ToError()
}
return stat, nil
}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 05af7e397..a84206686 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -21,11 +21,24 @@ import (
"os"
"path"
"path/filepath"
- "syscall"
"testing"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
+
+var (
+ allTypes = []uint32{unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK}
+
+ // allConfs is set in init().
+ allConfs []Config
+
+ rwConfs = []Config{{ROMount: false}}
+ roConfs = []Config{{ROMount: true}}
)
func init() {
@@ -39,6 +52,13 @@ func init() {
}
}
+func configTestName(conf *Config) string {
+ if conf.ROMount {
+ return "ROMount"
+ }
+ return "RWMount"
+}
+
func assertPanic(t *testing.T, f func()) {
defer func() {
if r := recover(); r == nil {
@@ -63,7 +83,7 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
}
want = append(want, b...)
} else {
- if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+ if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
return fmt.Errorf("WriteAt() should have failed, got: %d, want: EBADFD", err)
}
}
@@ -81,78 +101,83 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
return fmt.Errorf("ReadAt() wrong data, got: %s, want: %s", string(rBuf), want)
}
} else {
- if e, ok := err.(syscall.Errno); !ok || e != syscall.EBADF {
+ if e, ok := err.(unix.Errno); !ok || e != unix.EBADF {
return fmt.Errorf("ReadAt() should have failed, got: %d, want: EBADFD", err)
}
}
return nil
}
-var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
-
-var (
- allTypes = []fileType{regular, directory, symlink}
-
- // allConfs is set in init() above.
- allConfs []Config
-
- rwConfs = []Config{{ROMount: false}}
- roConfs = []Config{{ROMount: true}}
-)
-
type state struct {
- root *localFile
- file *localFile
- conf Config
- ft fileType
+ root *localFile
+ file *localFile
+ conf Config
+ fileType uint32
}
func (s state) String() string {
- return fmt.Sprintf("type(%v)", s.ft)
+ return fmt.Sprintf("type(%v)", s.fileType)
+}
+
+func typeName(fileType uint32) string {
+ switch fileType {
+ case unix.S_IFREG:
+ return "file"
+ case unix.S_IFDIR:
+ return "directory"
+ case unix.S_IFLNK:
+ return "symlink"
+ default:
+ panic(fmt.Sprintf("invalid file type for test: %d", fileType))
+ }
}
func runAll(t *testing.T, test func(*testing.T, state)) {
runCustom(t, allTypes, allConfs, test)
}
-func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) {
+func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) {
for _, c := range confs {
- t.Logf("Config: %+v", c)
-
for _, ft := range types {
- t.Logf("File type: %v", ft)
+ name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft))
+ t.Run(name, func(t *testing.T) {
+ path, name, err := setup(ft)
+ if err != nil {
+ t.Fatalf("%v", err)
+ }
+ defer os.RemoveAll(path)
- path, name, err := setup(ft)
- if err != nil {
- t.Fatalf("%v", err)
- }
- defer os.RemoveAll(path)
+ a, err := NewAttachPoint(path, c)
+ if err != nil {
+ t.Fatalf("NewAttachPoint failed: %v", err)
+ }
+ root, err := a.Attach()
+ if err != nil {
+ t.Fatalf("Attach failed, err: %v", err)
+ }
- a, err := NewAttachPoint(path, c)
- if err != nil {
- t.Fatalf("NewAttachPoint failed: %v", err)
- }
- root, err := a.Attach()
- if err != nil {
- t.Fatalf("Attach failed, err: %v", err)
- }
+ _, file, err := root.Walk([]string{name})
+ if err != nil {
+ root.Close()
+ t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err)
+ }
- _, file, err := root.Walk([]string{name})
- if err != nil {
+ st := state{
+ root: root.(*localFile),
+ file: file.(*localFile),
+ conf: c,
+ fileType: ft,
+ }
+ test(t, st)
+ file.Close()
root.Close()
- t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err)
- }
-
- st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft}
- test(t, st)
- file.Close()
- root.Close()
+ })
}
}
}
-func setup(ft fileType) (string, string, error) {
- path, err := ioutil.TempDir("", "root-")
+func setup(fileType uint32) (string, string, error) {
+ path, err := ioutil.TempDir(testutil.TmpDir(), "root-")
if err != nil {
return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err)
}
@@ -169,26 +194,26 @@ func setup(ft fileType) (string, string, error) {
defer root.Close()
var name string
- switch ft {
- case regular:
+ switch fileType {
+ case unix.S_IFREG:
name = "file"
_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
if err != nil {
return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err)
}
defer f.Close()
- case directory:
+ case unix.S_IFDIR:
name = "dir"
if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err)
}
- case symlink:
+ case unix.S_IFLNK:
name = "symlink"
if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err)
}
default:
- panic(fmt.Sprintf("unknown file type %v", ft))
+ panic(fmt.Sprintf("unknown file type %v", fileType))
}
return path, name, nil
}
@@ -202,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) {
}
func TestReadWrite(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
child, err := createFile(s.file, "test")
if err != nil {
t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -221,9 +246,13 @@ func TestReadWrite(t *testing.T) {
if err != nil {
t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err)
}
- if _, _, _, err := l.Open(flags); err != nil {
+ fd, _, _, err := l.Open(flags)
+ if err != nil {
t.Fatalf("%v: Open(%v) failed, err: %v", s, flags, err)
}
+ if fd != nil {
+ defer fd.Close()
+ }
if err := testReadWrite(l, flags, want); err != nil {
t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
}
@@ -232,14 +261,14 @@ func TestReadWrite(t *testing.T) {
}
func TestCreate(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
for i, flags := range allOpenFlags {
_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
if err != nil {
t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, flags, err)
}
- if err := testReadWrite(l, flags, []byte{}); err != nil {
+ if err := testReadWrite(l, flags, nil); err != nil {
t.Fatalf("%v: testReadWrite(%v) failed: %v", s, flags, err)
}
}
@@ -249,7 +278,7 @@ func TestCreate(t *testing.T) {
// TestReadWriteDup tests that a file opened in any mode can be dup'ed and
// reopened in any other mode.
func TestReadWriteDup(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
child, err := createFile(s.file, "test")
if err != nil {
t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -279,9 +308,13 @@ func TestReadWriteDup(t *testing.T) {
t.Fatalf("%v: Walk(<empty>) failed: %v", s, err)
}
defer dup.Close()
- if _, _, _, err := dup.Open(dupFlags); err != nil {
+ fd, _, _, err := dup.Open(dupFlags)
+ if err != nil {
t.Fatalf("%v: Open(%v) failed: %v", s, flags, err)
}
+ if fd != nil {
+ defer fd.Close()
+ }
if err := testReadWrite(dup, dupFlags, want); err != nil {
t.Fatalf("%v: testReadWrite(%v) failed: %v", s, dupFlags, err)
}
@@ -291,19 +324,45 @@ func TestReadWriteDup(t *testing.T) {
}
func TestUnopened(t *testing.T) {
- runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFREG}, allConfs, func(t *testing.T, s state) {
b := []byte("foobar")
- if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF {
- t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if _, err := s.file.WriteAt(b, 0); err != unix.EBADF {
+ t.Errorf("%v: WriteAt() should have failed, got: %v, expected: unix.EBADF", s, err)
}
- if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF {
- t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if _, err := s.file.ReadAt(b, 0); err != unix.EBADF {
+ t.Errorf("%v: ReadAt() should have failed, got: %v, expected: unix.EBADF", s, err)
}
- if _, err := s.file.Readdir(0, 100); err != syscall.EBADF {
- t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if _, err := s.file.Readdir(0, 100); err != unix.EBADF {
+ t.Errorf("%v: Readdir() should have failed, got: %v, expected: unix.EBADF", s, err)
}
- if err := s.file.FSync(); err != syscall.EBADF {
- t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if err := s.file.FSync(); err != unix.EBADF {
+ t.Errorf("%v: FSync() should have failed, got: %v, expected: unix.EBADF", s, err)
+ }
+ })
+}
+
+// TestOpenOPath is a regression test to ensure that a file that cannot be open
+// for read is allowed to be open. This was happening because the control file
+// was open with O_PATH, but Open() was not checking for it and allowing the
+// control file to be reused.
+func TestOpenOPath(t *testing.T) {
+ runCustom(t, []uint32{unix.S_IFREG}, rwConfs, func(t *testing.T, s state) {
+ // Fist remove all permissions on the file.
+ if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil {
+ t.Fatalf("SetAttr(): %v", err)
+ }
+ // Then walk to the file again to open a new control file.
+ filename := filepath.Base(s.file.hostPath)
+ _, newFile, err := s.root.Walk([]string{filename})
+ if err != nil {
+ t.Fatalf("root.Walk(%q): %v", filename, err)
+ }
+
+ if newFile.(*localFile).controlReadable {
+ t.Fatalf("control file didn't open with O_PATH: %+v", newFile)
+ }
+ if _, _, _, err := newFile.Open(p9.ReadOnly); err != unix.EACCES {
+ t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err)
}
})
}
@@ -324,7 +383,7 @@ func TestSetAttrPerm(t *testing.T) {
valid := p9.SetAttrMask{Permissions: true}
attr := p9.SetAttr{Permissions: 0777}
got, err := SetGetAttr(s.file, valid, attr)
- if s.ft == symlink {
+ if s.fileType == unix.S_IFLNK {
if err == nil {
t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
}
@@ -345,7 +404,7 @@ func TestSetAttrSize(t *testing.T) {
valid := p9.SetAttrMask{Size: true}
attr := p9.SetAttr{Size: size}
got, err := SetGetAttr(s.file, valid, attr)
- if s.ft == symlink || s.ft == directory {
+ if s.fileType == unix.S_IFLNK || s.fileType == unix.S_IFDIR {
if err == nil {
t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
}
@@ -427,9 +486,9 @@ func TestLink(t *testing.T) {
}
err = dir.Link(s.file, linkFile)
- if s.ft == directory {
- if err != syscall.EPERM {
- t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
+ if s.fileType == unix.S_IFDIR {
+ if err != unix.EPERM {
+ t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: unix.EPERM", s, linkFile, err)
}
return
}
@@ -440,54 +499,64 @@ func TestLink(t *testing.T) {
}
func TestROMountChecks(t *testing.T) {
+ const want = unix.EROFS
+ uid := p9.UID(os.Getuid())
+ gid := p9.GID(os.Getgid())
+
runCustom(t, allTypes, roConfs, func(t *testing.T, s state) {
- if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF {
- t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if s.fileType != unix.S_IFLNK {
+ if _, _, _, err := s.file.Open(p9.WriteOnly); err != want {
+ t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
+ }
+ if _, _, _, err := s.file.Open(p9.ReadWrite); err != want {
+ t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
+ }
+ if _, _, _, err := s.file.Open(p9.ReadOnly | p9.OpenTruncate); err != want {
+ t.Errorf("Open() should have failed, got: %v, expected: %v", err, want)
+ }
+ f, _, _, err := s.file.Open(p9.ReadOnly)
+ if err != nil {
+ t.Errorf("Open() failed: %v", err)
+ }
+ if f != nil {
+ _ = f.Close()
+ }
}
- if _, err := s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF {
- t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err)
+
+ if _, _, _, _, err := s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid); err != want {
+ t.Errorf("Create() should have failed, got: %v, expected: %v", err, want)
}
- if err := s.file.RenameAt("some_file", s.file, "other_file"); err != syscall.EBADF {
- t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if _, err := s.file.Mkdir("some_dir", 0777, uid, gid); err != want {
+ t.Errorf("MkDir() should have failed, got: %v, expected: %v", err, want)
}
- if _, err := s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF {
- t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if err := s.file.RenameAt("some_file", s.file, "other_file"); err != want {
+ t.Errorf("Rename() should have failed, got: %v, expected: %v", err, want)
}
- if err := s.file.UnlinkAt("some_file", 0); err != syscall.EBADF {
- t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if _, err := s.file.Symlink("some_place", "some_symlink", uid, gid); err != want {
+ t.Errorf("Symlink() should have failed, got: %v, expected: %v", err, want)
}
- if err := s.file.Link(s.file, "some_link"); err != syscall.EBADF {
- t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if err := s.file.UnlinkAt("some_file", 0); err != want {
+ t.Errorf("UnlinkAt() should have failed, got: %v, expected: %v", err, want)
}
-
- valid := p9.SetAttrMask{Size: true}
- attr := p9.SetAttr{Size: 0}
- if err := s.file.SetAttr(valid, attr); err != syscall.EBADF {
- t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err)
+ if err := s.file.Link(s.file, "some_link"); err != want {
+ t.Errorf("Link() should have failed, got: %v, expected: %v", err, want)
+ }
+ if _, err := s.file.Mknod("some-nod", 0777, 1, 2, uid, gid); err != want {
+ t.Errorf("Mknod() should have failed, got: %v, expected: %v", err, want)
}
- })
-}
-
-func TestROMountPanics(t *testing.T) {
- conf := Config{ROMount: true, PanicOnWrite: true}
- runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) {
- assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
- assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
- assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") })
- assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", p9.UID(os.Getuid()), p9.GID(os.Getgid())) })
- assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) })
- assertPanic(t, func() { s.file.Link(s.file, "some_link") })
valid := p9.SetAttrMask{Size: true}
attr := p9.SetAttr{Size: 0}
- assertPanic(t, func() { s.file.SetAttr(valid, attr) })
+ if err := s.file.SetAttr(valid, attr); err != want {
+ t.Errorf("SetAttr() should have failed, got: %v, expected: %v", err, want)
+ }
})
}
func TestWalkNotFound(t *testing.T) {
- runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) {
- if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT {
- t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err)
+ runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
+ if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT {
+ t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err)
}
})
}
@@ -506,7 +575,7 @@ func TestWalkDup(t *testing.T) {
}
func TestReaddir(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
name := "dir"
if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
@@ -631,7 +700,7 @@ func TestAttachInvalidType(t *testing.T) {
defer os.RemoveAll(dir)
fifo := filepath.Join(dir, "fifo")
- if err := syscall.Mkfifo(fifo, 0755); err != nil {
+ if err := unix.Mkfifo(fifo, 0755); err != nil {
t.Fatalf("Mkfifo(%q): %v", fifo, err)
}
@@ -690,3 +759,63 @@ func TestDoubleAttachError(t *testing.T) {
t.Fatalf("Attach should have failed, got %v want non-nil", err)
}
}
+
+func TestTruncate(t *testing.T) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ child, err := createFile(s.file, "test")
+ if err != nil {
+ t.Fatalf("createFile() failed: %v", err)
+ }
+ defer child.Close()
+ want := []byte("foobar")
+ w, err := child.WriteAt(want, 0)
+ if err != nil {
+ t.Fatalf("Write() failed: %v", err)
+ }
+ if w != len(want) {
+ t.Fatalf("Write() was partial, got: %d, expected: %d", w, len(want))
+ }
+
+ _, l, err := s.file.Walk([]string{"test"})
+ if err != nil {
+ t.Fatalf("Walk(%s) failed: %v", "test", err)
+ }
+ if _, _, _, err := l.Open(p9.ReadOnly | p9.OpenTruncate); err != nil {
+ t.Fatalf("Open() failed: %v", err)
+ }
+ _, mask, attr, err := l.GetAttr(p9.AttrMask{Size: true})
+ if err != nil {
+ t.Fatalf("GetAttr() failed: %v", err)
+ }
+ if !mask.Size {
+ t.Fatalf("GetAttr() didn't return size: %+v", mask)
+ }
+ if attr.Size != 0 {
+ t.Fatalf("truncate didn't work, want: 0, got: %d", attr.Size)
+ }
+ })
+}
+
+func TestMknod(t *testing.T) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ _, err := s.file.Mknod("test", p9.ModeRegular|0777, 1, 2, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+ if err != nil {
+ t.Fatalf("Mknod() failed: %v", err)
+ }
+
+ _, f, err := s.file.Walk([]string{"test"})
+ if err != nil {
+ t.Fatalf("Walk() failed: %v", err)
+ }
+ fd, _, _, err := f.Open(p9.ReadWrite)
+ if err != nil {
+ t.Fatalf("Open() failed: %v", err)
+ }
+ if fd != nil {
+ defer fd.Close()
+ }
+ if err := testReadWrite(f, p9.ReadWrite, nil); err != nil {
+ t.Fatalf("testReadWrite() failed: %v", err)
+ }
+ })
+}
diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go
index 542b54365..f11fea40d 100644
--- a/runsc/fsgofer/fsgofer_unsafe.go
+++ b/runsc/fsgofer/fsgofer_unsafe.go
@@ -15,18 +15,18 @@
package fsgofer
import (
- "syscall"
"unsafe"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/syserr"
)
-func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error {
+func utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error {
// utimensat(2) doesn't accept empty name, instead name must be nil to make it
// operate directly on 'dirFd' unlike other *at syscalls.
var namePtr unsafe.Pointer
if name != "" {
- nameBytes, err := syscall.BytePtrFromString(name)
+ nameBytes, err := unix.BytePtrFromString(name)
if err != nil {
return err
}
@@ -35,8 +35,8 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
timesPtr := unsafe.Pointer(&times[0])
- if _, _, errno := syscall.Syscall6(
- syscall.SYS_UTIMENSAT,
+ if _, _, errno := unix.Syscall6(
+ unix.SYS_UTIMENSAT,
uintptr(dirFd),
uintptr(namePtr),
uintptr(timesPtr),
@@ -52,7 +52,7 @@ func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) err
func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error {
var oldNamePtr unsafe.Pointer
if oldName != "" {
- nameBytes, err := syscall.BytePtrFromString(oldName)
+ nameBytes, err := unix.BytePtrFromString(oldName)
if err != nil {
return err
}
@@ -60,15 +60,15 @@ func renameat(oldDirFD int, oldName string, newDirFD int, newName string) error
}
var newNamePtr unsafe.Pointer
if newName != "" {
- nameBytes, err := syscall.BytePtrFromString(newName)
+ nameBytes, err := unix.BytePtrFromString(newName)
if err != nil {
return err
}
newNamePtr = unsafe.Pointer(nameBytes)
}
- if _, _, errno := syscall.Syscall6(
- syscall.SYS_RENAMEAT,
+ if _, _, errno := unix.Syscall6(
+ unix.SYS_RENAMEAT,
uintptr(oldDirFD),
uintptr(oldNamePtr),
uintptr(newDirFD),
diff --git a/runsc/main.go b/runsc/main.go
index c9f47c579..ed244c4ba 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -23,8 +23,6 @@ import (
"io/ioutil"
"os"
"os/signal"
- "path/filepath"
- "strings"
"syscall"
"time"
@@ -32,8 +30,8 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/platform"
- "gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cmd"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/flag"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -41,57 +39,17 @@ import (
var (
// Although these flags are not part of the OCI spec, they are used by
// Docker, and thus should not be changed.
- rootDir = flag.String("root", "", "root directory for storage of container state.")
- logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.")
- logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.")
- debug = flag.Bool("debug", false, "enable debug logging.")
- showVersion = flag.Bool("version", false, "show version and exit.")
// TODO(gvisor.dev/issue/193): support systemd cgroups
systemdCgroup = flag.Bool("systemd-cgroup", false, "Use systemd for cgroups. NOT SUPPORTED.")
+ showVersion = flag.Bool("version", false, "show version and exit.")
// These flags are unique to runsc, and are used to configure parts of the
// system that are not covered by the runtime spec.
// Debugging flags.
- debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
- panicLog = flag.String("panic-log", "", "file path were panic reports and other Go's runtime messages are written.")
- logPackets = flag.Bool("log-packets", false, "enable network packet logging.")
- logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
- debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.")
- panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
- debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.")
- alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.")
-
- // Debugging flags: strace related
- strace = flag.Bool("strace", false, "enable strace.")
- straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.")
- straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.")
-
- // Flags that control sandbox runtime behavior.
- platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
- network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
- hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
- softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
- txChecksumOffload = flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
- rxChecksumOffload = flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
- qDisc = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
- fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
- fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
- overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
- overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
- watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
- panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
- profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
- netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
- numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
- rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
- referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
- cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
- vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
-
- // Test flags, not to be used outside tests, ever.
- testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
- testOnlyTestNameEnv = flag.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.")
+ logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
+ debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.")
+ panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.")
)
func main() {
@@ -135,6 +93,8 @@ func main() {
subcommands.Register(new(cmd.Gofer), internalGroup)
subcommands.Register(new(cmd.Statefile), internalGroup)
+ config.RegisterFlags()
+
// All subcommands must be registered before flag parsing.
flag.Parse()
@@ -146,6 +106,12 @@ func main() {
os.Exit(0)
}
+ // Create a new Config from the flags.
+ conf, err := config.NewFromFlags()
+ if err != nil {
+ cmd.Fatalf(err.Error())
+ }
+
// TODO(gvisor.dev/issue/193): support systemd cgroups
if *systemdCgroup {
fmt.Fprintln(os.Stderr, "systemd cgroup flag passed, but systemd cgroups not supported. See gvisor.dev/issue/193")
@@ -156,102 +122,28 @@ func main() {
if *logFD > -1 {
errorLogger = os.NewFile(uintptr(*logFD), "error log file")
- } else if *logFilename != "" {
+ } else if conf.LogFilename != "" {
// We must set O_APPEND and not O_TRUNC because Docker passes
// the same log file for all commands (and also parses these
// log files), so we can't destroy them on each command.
var err error
- errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+ errorLogger, err = os.OpenFile(conf.LogFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
if err != nil {
- cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+ cmd.Fatalf("error opening log file %q: %v", conf.LogFilename, err)
}
}
cmd.ErrorLogger = errorLogger
- platformType := *platformName
- if _, err := platform.Lookup(platformType); err != nil {
- cmd.Fatalf("%v", err)
- }
-
- fsAccess, err := boot.MakeFileAccessType(*fileAccess)
- if err != nil {
- cmd.Fatalf("%v", err)
- }
-
- if fsAccess == boot.FileAccessShared && *overlay {
- cmd.Fatalf("overlay flag is incompatible with shared file access")
- }
-
- netType, err := boot.MakeNetworkType(*network)
- if err != nil {
+ if _, err := platform.Lookup(conf.Platform); err != nil {
cmd.Fatalf("%v", err)
}
- wa, err := boot.MakeWatchdogAction(*watchdogAction)
- if err != nil {
- cmd.Fatalf("%v", err)
- }
-
- if *numNetworkChannels <= 0 {
- cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
- }
-
- refsLeakMode, err := boot.MakeRefsLeakMode(*referenceLeakMode)
- if err != nil {
- cmd.Fatalf("%v", err)
- }
-
- queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc)
- if err != nil {
- cmd.Fatalf("%s", err)
- }
-
// Sets the reference leak check mode. Also set it in config below to
// propagate it to child processes.
- refs.SetLeakMode(refsLeakMode)
-
- // Create a new Config from the flags.
- conf := &boot.Config{
- RootDir: *rootDir,
- Debug: *debug,
- LogFilename: *logFilename,
- LogFormat: *logFormat,
- DebugLog: *debugLog,
- PanicLog: *panicLog,
- DebugLogFormat: *debugLogFormat,
- FileAccess: fsAccess,
- FSGoferHostUDS: *fsGoferHostUDS,
- Overlay: *overlay,
- Network: netType,
- HardwareGSO: *hardwareGSO,
- SoftwareGSO: *softwareGSO,
- TXChecksumOffload: *txChecksumOffload,
- RXChecksumOffload: *rxChecksumOffload,
- LogPackets: *logPackets,
- Platform: platformType,
- Strace: *strace,
- StraceLogSize: *straceLogSize,
- WatchdogAction: wa,
- PanicSignal: *panicSignal,
- ProfileEnable: *profile,
- EnableRaw: *netRaw,
- NumNetworkChannels: *numNetworkChannels,
- Rootless: *rootless,
- AlsoLogToStderr: *alsoLogToStderr,
- ReferenceLeakMode: refsLeakMode,
- OverlayfsStaleRead: *overlayfsStaleRead,
- CPUNumFromQuota: *cpuNumFromQuota,
- VFS2: *vfs2Enabled,
- QDisc: queueingDiscipline,
- TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
- TestOnlyTestNameEnv: *testOnlyTestNameEnv,
- }
- if len(*straceSyscalls) != 0 {
- conf.StraceSyscalls = strings.Split(*straceSyscalls, ",")
- }
+ refs.SetLeakMode(conf.ReferenceLeak)
// Set up logging.
- if *debug {
+ if conf.Debug {
log.SetLevel(log.Debug)
}
@@ -273,14 +165,14 @@ func main() {
if *debugLogFD > -1 {
f := os.NewFile(uintptr(*debugLogFD), "debug log file")
- e = newEmitter(*debugLogFormat, f)
+ e = newEmitter(conf.DebugLogFormat, f)
- } else if *debugLog != "" {
- f, err := specutils.DebugLogFile(*debugLog, subcommand, "" /* name */)
+ } else if conf.DebugLog != "" {
+ f, err := specutils.DebugLogFile(conf.DebugLog, subcommand, "" /* name */)
if err != nil {
- cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
+ cmd.Fatalf("error opening debug log file in %q: %v", conf.DebugLog, err)
}
- e = newEmitter(*debugLogFormat, f)
+ e = newEmitter(conf.DebugLogFormat, f)
} else {
// Stderr is reserved for the application, just discard the logs if no debug
@@ -306,8 +198,8 @@ func main() {
if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil {
cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err)
}
- } else if *alsoLogToStderr {
- e = &log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
+ } else if conf.AlsoLogToStderr {
+ e = &log.MultiEmitter{e, newEmitter(conf.DebugLogFormat, os.Stderr)}
}
log.SetTarget(e)
@@ -326,7 +218,7 @@ func main() {
log.Infof("\t\tVFS2 enabled: %v", conf.VFS2)
log.Infof("***************************")
- if *testOnlyAllowRunAsCurrentUserWithoutChroot {
+ if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// SIGTERM is sent to all processes if a test exceeds its
// timeout and this case is handled by syscall_test_runner.
log.Warningf("Block the TERM signal. This is only safe in tests!")
@@ -362,11 +254,3 @@ func newEmitter(format string, logFile io.Writer) log.Emitter {
cmd.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format)
panic("unreachable")
}
-
-func init() {
- // Set default root dir to something (hopefully) user-writeable.
- *rootDir = "/var/run/runsc"
- if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
- *rootDir = filepath.Join(runtimeDir, "runsc")
- }
-}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 035dcd3e3..f0a551a1e 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -26,10 +26,11 @@ go_library(
"//runsc/boot",
"//runsc/boot/platforms",
"//runsc/cgroup",
+ "//runsc/config",
"//runsc/console",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
"@com_github_vishvananda_netlink//:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index deee619f3..0b9f39466 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -49,26 +50,26 @@ import (
//
// Run the following container to test it:
// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4
-func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error {
+func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *config.Config) error {
log.Infof("Setting up network")
switch conf.Network {
- case boot.NetworkNone:
+ case config.NetworkNone:
log.Infof("Network is disabled, create loopback interface only")
if err := createDefaultLoopbackInterface(conn); err != nil {
return fmt.Errorf("creating default loopback interface: %v", err)
}
- case boot.NetworkSandbox:
+ case config.NetworkSandbox:
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
- case boot.NetworkHost:
+ case config.NetworkHost:
// Nothing to do here.
default:
- return fmt.Errorf("invalid network type: %d", conf.Network)
+ return fmt.Errorf("invalid network type: %v", conf.Network)
}
return nil
}
@@ -115,7 +116,7 @@ func isRootNS() (bool, error) {
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc config.QueueingDiscipline) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
@@ -134,7 +135,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
return err
}
if isRoot {
-
return fmt.Errorf("cannot run with network enabled in root network namespace")
}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 6e1a2af25..c4309feb3 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -41,6 +41,7 @@ import (
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/boot/platforms"
"gvisor.dev/gvisor/runsc/cgroup"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/console"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -71,11 +72,14 @@ type Sandbox struct {
// will have it as a child process.
child bool
- // status is an exit status of a sandbox process.
- status syscall.WaitStatus
-
// statusMu protects status.
statusMu sync.Mutex
+
+ // status is the exit status of a sandbox process. It's only set if the
+ // child==true and the sandbox was waited on. This field allows for multiple
+ // threads to wait on sandbox and get the exit code, since Linux will return
+ // WaitStatus to one of the waiters only.
+ status syscall.WaitStatus
}
// Args is used to configure a new sandbox.
@@ -116,7 +120,7 @@ type Args struct {
// New creates the sandbox process. The caller must call Destroy() on the
// sandbox.
-func New(conf *boot.Config, args *Args) (*Sandbox, error) {
+func New(conf *config.Config, args *Args) (*Sandbox, error) {
s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
// The Cleanup object cleans up partially created sandboxes when an error
// occurs. Any errors occurring during cleanup itself are ignored.
@@ -180,7 +184,7 @@ func (s *Sandbox) CreateContainer(cid string) error {
}
// StartRoot starts running the root container process inside the sandbox.
-func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
+func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
conn, err := s.sandboxConnect()
if err != nil {
@@ -203,7 +207,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
}
// StartContainer starts running a non-root container inside the sandbox.
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string, goferFiles []*os.File) error {
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error {
for _, f := range goferFiles {
defer f.Close()
}
@@ -232,7 +236,7 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *boot.Config, cid string
}
// Restore sends the restore call for a container in the sandbox.
-func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *boot.Config, filename string) error {
+func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, filename string) error {
log.Debugf("Restore sandbox %q", s.ID)
rf, err := os.Open(filename)
@@ -344,7 +348,7 @@ func (s *Sandbox) connError(err error) error {
// createSandboxProcess starts the sandbox as a subprocess by running the "boot"
// command, passing in the bundle dir.
-func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncFile *os.File) error {
+func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
// nextFD is used to get unused FDs that we can pass to the sandbox. It
// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
nextFD := 3
@@ -477,12 +481,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
cmd.Stderr = nil
// If the console control socket file is provided, then create a new
- // pty master/slave pair and set the TTY on the sandbox process.
- if args.ConsoleSocket != "" {
- cmd.Args = append(cmd.Args, "--console=true")
-
+ // pty master/replica pair and set the TTY on the sandbox process.
+ if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
// console.NewWithSocket will send the master on the given
- // socket, and return the slave.
+ // socket, and return the replica.
tty, err := console.NewWithSocket(args.ConsoleSocket)
if err != nil {
return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
@@ -557,10 +559,10 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
// Joins the network namespace if network is enabled. the sandbox talks
// directly to the host network, which may have been configured in the
// namespace.
- if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != boot.NetworkNone {
+ if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
nss = append(nss, ns)
- } else if conf.Network == boot.NetworkHost {
+ } else if conf.Network == config.NetworkHost {
log.Infof("Sandbox will be started in the host network namespace")
} else {
log.Infof("Sandbox will be started in new network namespace")
@@ -570,7 +572,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
// User namespace depends on the network type. Host network requires to run
// inside the user namespace specified in the spec or the current namespace
// if none is configured.
- if conf.Network == boot.NetworkHost {
+ if conf.Network == config.NetworkHost {
if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
nss = append(nss, userns)
@@ -747,35 +749,47 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
// Wait waits for the containerized process to exit, and returns its WaitStatus.
func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
- var ws syscall.WaitStatus
if conn, err := s.sandboxConnect(); err != nil {
- // The sandbox may have exited while before we had a chance to
- // wait on it.
+ // The sandbox may have exited while before we had a chance to wait on it.
+ // There is nothing we can do for subcontainers. For the init container, we
+ // can try to get the sandbox exit code.
+ if !s.IsRootContainer(cid) {
+ return syscall.WaitStatus(0), err
+ }
log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
} else {
defer conn.Close()
+
// Try the Wait RPC to the sandbox.
+ var ws syscall.WaitStatus
err = conn.Call(boot.ContainerWait, &cid, &ws)
if err == nil {
// It worked!
return ws, nil
}
+ // See comment above.
+ if !s.IsRootContainer(cid) {
+ return syscall.WaitStatus(0), err
+ }
+
// The sandbox may have exited after we connected, but before
// or during the Wait RPC.
log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
}
- // The sandbox may have already exited, or exited while handling the
- // Wait RPC. The best we can do is ask Linux what the sandbox exit
- // status was, since in most cases that will be the same as the
- // container exit status.
+ // The sandbox may have already exited, or exited while handling the Wait RPC.
+ // The best we can do is ask Linux what the sandbox exit status was, since in
+ // most cases that will be the same as the container exit status.
if err := s.waitForStopped(); err != nil {
- return ws, err
+ return syscall.WaitStatus(0), err
}
if !s.child {
- return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+ return syscall.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
}
+
+ s.statusMu.Lock()
+ defer s.statusMu.Unlock()
return s.status, nil
}
@@ -1014,26 +1028,6 @@ func (s *Sandbox) StopCPUProfile() error {
return nil
}
-// GoroutineProfile writes a goroutine profile to the given file.
-func (s *Sandbox) GoroutineProfile(f *os.File) error {
- log.Debugf("Goroutine profile %q", s.ID)
- conn, err := s.sandboxConnect()
- if err != nil {
- return err
- }
- defer conn.Close()
-
- opts := control.ProfileOpts{
- FilePayload: urpc.FilePayload{
- Files: []*os.File{f},
- },
- }
- if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil {
- return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err)
- }
- return nil
-}
-
// BlockProfile writes a block profile to the given file.
func (s *Sandbox) BlockProfile(f *os.File) error {
log.Debugf("Block profile %q", s.ID)
@@ -1201,7 +1195,7 @@ func deviceFileForPlatform(name string) (*os.File, error) {
// checkBinaryPermissions verifies that the required binary bits are set on
// the runsc executable.
-func checkBinaryPermissions(conf *boot.Config) error {
+func checkBinaryPermissions(conf *config.Config) error {
// All platforms need the other exe bit
neededBits := os.FileMode(0001)
if conf.Platform == platforms.Ptrace {
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 62d4f5113..679d8bc8e 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -16,9 +16,10 @@ go_library(
"//pkg/bits",
"//pkg/log",
"//pkg/sentry/kernel/auth",
+ "//runsc/config",
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_mohae_deepcopy//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
@@ -29,5 +30,5 @@ go_test(
size = "small",
srcs = ["specutils_test.go"],
library = ":specutils",
- deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"],
+ deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"],
)
diff --git a/runsc/specutils/seccomp/BUILD b/runsc/specutils/seccomp/BUILD
new file mode 100644
index 000000000..3520f2d6d
--- /dev/null
+++ b/runsc/specutils/seccomp/BUILD
@@ -0,0 +1,34 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "seccomp",
+ srcs = [
+ "audit_amd64.go",
+ "audit_arm64.go",
+ "seccomp.go",
+ ],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/bpf",
+ "//pkg/log",
+ "//pkg/seccomp",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/syscalls/linux",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+ ],
+)
+
+go_test(
+ name = "seccomp_test",
+ size = "small",
+ srcs = ["seccomp_test.go"],
+ library = ":seccomp",
+ deps = [
+ "//pkg/binary",
+ "//pkg/bpf",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+ ],
+)
diff --git a/runsc/specutils/seccomp/audit_amd64.go b/runsc/specutils/seccomp/audit_amd64.go
new file mode 100644
index 000000000..417cf4a7a
--- /dev/null
+++ b/runsc/specutils/seccomp/audit_amd64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package seccomp
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+const (
+ nativeArchAuditNo = linux.AUDIT_ARCH_X86_64
+)
diff --git a/runsc/specutils/seccomp/audit_arm64.go b/runsc/specutils/seccomp/audit_arm64.go
new file mode 100644
index 000000000..b727ceff2
--- /dev/null
+++ b/runsc/specutils/seccomp/audit_arm64.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package seccomp
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+const (
+ nativeArchAuditNo = linux.AUDIT_ARCH_AARCH64
+)
diff --git a/runsc/specutils/seccomp/seccomp.go b/runsc/specutils/seccomp/seccomp.go
new file mode 100644
index 000000000..5932f7a41
--- /dev/null
+++ b/runsc/specutils/seccomp/seccomp.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package seccomp implements some features of libseccomp in order to support
+// OCI.
+package seccomp
+
+import (
+ "fmt"
+ "syscall"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bpf"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/seccomp"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+)
+
+var (
+ killThreadAction = linux.SECCOMP_RET_KILL_THREAD
+ trapAction = linux.SECCOMP_RET_TRAP
+ // runc always returns EPERM as the errorcode for SECCOMP_RET_ERRNO
+ errnoAction = linux.SECCOMP_RET_ERRNO.WithReturnCode(uint16(syscall.EPERM))
+ // runc always returns EPERM as the errorcode for SECCOMP_RET_TRACE
+ traceAction = linux.SECCOMP_RET_TRACE.WithReturnCode(uint16(syscall.EPERM))
+ allowAction = linux.SECCOMP_RET_ALLOW
+)
+
+// BuildProgram generates a bpf program based on the given OCI seccomp
+// config.
+func BuildProgram(s *specs.LinuxSeccomp) (bpf.Program, error) {
+ defaultAction, err := convertAction(s.DefaultAction)
+ if err != nil {
+ return bpf.Program{}, fmt.Errorf("secomp default action: %w", err)
+ }
+ ruleset, err := convertRules(s)
+ if err != nil {
+ return bpf.Program{}, fmt.Errorf("invalid seccomp rules: %w", err)
+ }
+
+ instrs, err := seccomp.BuildProgram(ruleset, defaultAction, killThreadAction)
+ if err != nil {
+ return bpf.Program{}, fmt.Errorf("building seccomp program: %w", err)
+ }
+
+ program, err := bpf.Compile(instrs)
+ if err != nil {
+ return bpf.Program{}, fmt.Errorf("compiling seccomp program: %w", err)
+ }
+
+ return program, nil
+}
+
+// lookupSyscallNo gets the syscall number for the syscall with the given name
+// for the given architecture.
+func lookupSyscallNo(arch uint32, name string) (uint32, error) {
+ var table *kernel.SyscallTable
+ switch arch {
+ case linux.AUDIT_ARCH_X86_64:
+ table = slinux.AMD64
+ case linux.AUDIT_ARCH_AARCH64:
+ table = slinux.ARM64
+ }
+ if table == nil {
+ return 0, fmt.Errorf("unsupported architecture: %d", arch)
+ }
+ n, err := table.LookupNo(name)
+ if err != nil {
+ return 0, err
+ }
+ return uint32(n), nil
+}
+
+// convertAction converts a LinuxSeccompAction to BPFAction
+func convertAction(act specs.LinuxSeccompAction) (linux.BPFAction, error) {
+ // TODO(gvisor.dev/issue/3124): Update specs package to include ActLog and ActKillProcess.
+ switch act {
+ case specs.ActKill:
+ return killThreadAction, nil
+ case specs.ActTrap:
+ return trapAction, nil
+ case specs.ActErrno:
+ return errnoAction, nil
+ case specs.ActTrace:
+ return traceAction, nil
+ case specs.ActAllow:
+ return allowAction, nil
+ default:
+ return 0, fmt.Errorf("invalid action: %v", act)
+ }
+}
+
+// convertRules converts OCI linux seccomp rules into RuleSets that can be used by
+// the seccomp package to build a seccomp program.
+func convertRules(s *specs.LinuxSeccomp) ([]seccomp.RuleSet, error) {
+ // NOTE: Architectures are only really relevant when calling 32bit syscalls
+ // on a 64bit system. Since we don't support that in gVisor anyway, we
+ // ignore Architectures and only test against the native architecture.
+
+ ruleset := []seccomp.RuleSet{}
+
+ for _, syscall := range s.Syscalls {
+ sysRules := seccomp.NewSyscallRules()
+
+ action, err := convertAction(syscall.Action)
+ if err != nil {
+ return nil, err
+ }
+
+ // Args
+ rules, err := convertArgs(syscall.Args)
+ if err != nil {
+ return nil, err
+ }
+
+ for _, name := range syscall.Names {
+ syscallNo, err := lookupSyscallNo(nativeArchAuditNo, name)
+ if err != nil {
+ // If there is an error looking up the syscall number, assume it is
+ // not supported on this architecture and ignore it. This is, for
+ // better or worse, what runc does.
+ log.Warningf("OCI seccomp: ignoring syscall %q", name)
+ continue
+ }
+
+ for _, rule := range rules {
+ sysRules.AddRule(uintptr(syscallNo), rule)
+ }
+ }
+
+ ruleset = append(ruleset, seccomp.RuleSet{
+ Rules: sysRules,
+ Action: action,
+ })
+ }
+
+ return ruleset, nil
+}
+
+// convertArgs converts an OCI seccomp argument rule to a list of seccomp.Rule.
+func convertArgs(args []specs.LinuxSeccompArg) ([]seccomp.Rule, error) {
+ argCounts := make([]uint, 6)
+
+ for _, arg := range args {
+ if arg.Index > 6 {
+ return nil, fmt.Errorf("invalid index: %d", arg.Index)
+ }
+
+ argCounts[arg.Index]++
+ }
+
+ // NOTE: If multiple rules apply to the same argument (same index) the
+ // action is triggered if any one of the rules matches (OR). If not, then
+ // all rules much match in order to trigger the action (AND). This appears to
+ // be some kind of legacy behavior of runc that nevertheless needs to be
+ // supported to maintain compatibility.
+
+ hasMultipleArgs := false
+ for _, count := range argCounts {
+ if count > 1 {
+ hasMultipleArgs = true
+ break
+ }
+ }
+
+ if hasMultipleArgs {
+ rules := []seccomp.Rule{}
+
+ // Old runc behavior - do this for compatibility.
+ // Add rules as ORs by adding separate Rules.
+ for _, arg := range args {
+ rule := seccomp.Rule{nil, nil, nil, nil, nil, nil}
+
+ if err := convertRule(arg, &rule); err != nil {
+ return nil, err
+ }
+
+ rules = append(rules, rule)
+ }
+
+ return rules, nil
+ }
+
+ // Add rules as ANDs by adding to the same Rule.
+ rule := seccomp.Rule{nil, nil, nil, nil, nil, nil}
+ for _, arg := range args {
+ if err := convertRule(arg, &rule); err != nil {
+ return nil, err
+ }
+ }
+
+ return []seccomp.Rule{rule}, nil
+}
+
+// convertRule converts and adds the arg to a rule.
+func convertRule(arg specs.LinuxSeccompArg, rule *seccomp.Rule) error {
+ switch arg.Op {
+ case specs.OpEqualTo:
+ rule[arg.Index] = seccomp.EqualTo(arg.Value)
+ case specs.OpNotEqual:
+ rule[arg.Index] = seccomp.NotEqual(arg.Value)
+ case specs.OpGreaterThan:
+ rule[arg.Index] = seccomp.GreaterThan(arg.Value)
+ case specs.OpGreaterEqual:
+ rule[arg.Index] = seccomp.GreaterThanOrEqual(arg.Value)
+ case specs.OpLessThan:
+ rule[arg.Index] = seccomp.LessThan(arg.Value)
+ case specs.OpLessEqual:
+ rule[arg.Index] = seccomp.LessThanOrEqual(arg.Value)
+ case specs.OpMaskedEqual:
+ rule[arg.Index] = seccomp.MaskedEqual(uintptr(arg.Value), uintptr(arg.ValueTwo))
+ default:
+ return fmt.Errorf("unsupported operand: %q", arg.Op)
+ }
+ return nil
+}
diff --git a/runsc/specutils/seccomp/seccomp_test.go b/runsc/specutils/seccomp/seccomp_test.go
new file mode 100644
index 000000000..850c237ba
--- /dev/null
+++ b/runsc/specutils/seccomp/seccomp_test.go
@@ -0,0 +1,414 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package seccomp
+
+import (
+ "fmt"
+ "syscall"
+ "testing"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/bpf"
+)
+
+type seccompData struct {
+ nr uint32
+ arch uint32
+ instructionPointer uint64
+ args [6]uint64
+}
+
+// asInput converts a seccompData to a bpf.Input.
+func asInput(d seccompData) bpf.Input {
+ return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian}
+}
+
+// testInput creates an Input struct with given seccomp input values.
+func testInput(arch uint32, syscallName string, args *[6]uint64) bpf.Input {
+ syscallNo, err := lookupSyscallNo(arch, syscallName)
+ if err != nil {
+ // Assume tests set valid syscall names.
+ panic(err)
+ }
+
+ if args == nil {
+ argArray := [6]uint64{0, 0, 0, 0, 0, 0}
+ args = &argArray
+ }
+
+ data := seccompData{
+ nr: syscallNo,
+ arch: arch,
+ args: *args,
+ }
+
+ return asInput(data)
+}
+
+// testCase holds a seccomp test case.
+type testCase struct {
+ name string
+ config specs.LinuxSeccomp
+ input bpf.Input
+ expected uint32
+}
+
+var (
+ // seccompTests is a list of speccomp test cases.
+ seccompTests = []testCase{
+ {
+ name: "default_allow",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ },
+ input: testInput(nativeArchAuditNo, "read", nil),
+ expected: uint32(allowAction),
+ },
+ {
+ name: "default_deny",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActErrno,
+ },
+ input: testInput(nativeArchAuditNo, "read", nil),
+ expected: uint32(errnoAction),
+ },
+ {
+ name: "deny_arch",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "getcwd",
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ // Syscall matches but the arch is AUDIT_ARCH_X86 so the return
+ // value is the bad arch action.
+ input: asInput(seccompData{nr: 183, arch: 0x40000003}), //
+ expected: uint32(killThreadAction),
+ },
+ {
+ name: "match_name_errno",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "getcwd",
+ "chmod",
+ },
+ Action: specs.ActErrno,
+ },
+ {
+ Names: []string{
+ "write",
+ },
+ Action: specs.ActTrace,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "getcwd", nil),
+ expected: uint32(errnoAction),
+ },
+ {
+ name: "match_name_trace",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "getcwd",
+ "chmod",
+ },
+ Action: specs.ActErrno,
+ },
+ {
+ Names: []string{
+ "write",
+ },
+ Action: specs.ActTrace,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "write", nil),
+ expected: uint32(traceAction),
+ },
+ {
+ name: "no_match_name_allow",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "getcwd",
+ "chmod",
+ },
+ Action: specs.ActErrno,
+ },
+ {
+ Names: []string{
+ "write",
+ },
+ Action: specs.ActTrace,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "openat", nil),
+ expected: uint32(allowAction),
+ },
+ {
+ name: "simple_match_args",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "clone",
+ },
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 0,
+ Value: syscall.CLONE_FS,
+ Op: specs.OpEqualTo,
+ },
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+ expected: uint32(errnoAction),
+ },
+ {
+ name: "match_args_or",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "clone",
+ },
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 0,
+ Value: syscall.CLONE_FS,
+ Op: specs.OpEqualTo,
+ },
+ {
+ Index: 0,
+ Value: syscall.CLONE_VM,
+ Op: specs.OpEqualTo,
+ },
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+ expected: uint32(errnoAction),
+ },
+ {
+ name: "match_args_and",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "getsockopt",
+ },
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 1,
+ Value: syscall.SOL_SOCKET,
+ Op: specs.OpEqualTo,
+ },
+ {
+ Index: 2,
+ Value: syscall.SO_PEERCRED,
+ Op: specs.OpEqualTo,
+ },
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET, syscall.SO_PEERCRED}),
+ expected: uint32(errnoAction),
+ },
+ {
+ name: "no_match_args_and",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "getsockopt",
+ },
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 1,
+ Value: syscall.SOL_SOCKET,
+ Op: specs.OpEqualTo,
+ },
+ {
+ Index: 2,
+ Value: syscall.SO_PEERCRED,
+ Op: specs.OpEqualTo,
+ },
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET}),
+ expected: uint32(allowAction),
+ },
+ {
+ name: "Simple args (no match)",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "clone",
+ },
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 0,
+ Value: syscall.CLONE_FS,
+ Op: specs.OpEqualTo,
+ },
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_VM}),
+ expected: uint32(allowAction),
+ },
+ {
+ name: "OpMaskedEqual (match)",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "clone",
+ },
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 0,
+ Value: syscall.CLONE_FS,
+ ValueTwo: syscall.CLONE_FS,
+ Op: specs.OpMaskedEqual,
+ },
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS | syscall.CLONE_VM}),
+ expected: uint32(errnoAction),
+ },
+ {
+ name: "OpMaskedEqual (no match)",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActAllow,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "clone",
+ },
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 0,
+ Value: syscall.CLONE_FS | syscall.CLONE_VM,
+ ValueTwo: syscall.CLONE_FS | syscall.CLONE_VM,
+ Op: specs.OpMaskedEqual,
+ },
+ },
+ Action: specs.ActErrno,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}),
+ expected: uint32(allowAction),
+ },
+ {
+ name: "OpMaskedEqual (clone)",
+ config: specs.LinuxSeccomp{
+ DefaultAction: specs.ActErrno,
+ Syscalls: []specs.LinuxSyscall{
+ {
+ Names: []string{
+ "clone",
+ },
+ // This comes from the Docker default seccomp
+ // profile for clone.
+ Args: []specs.LinuxSeccompArg{
+ {
+ Index: 0,
+ Value: 0x7e020000,
+ ValueTwo: 0x0,
+ Op: specs.OpMaskedEqual,
+ },
+ },
+ Action: specs.ActAllow,
+ },
+ },
+ },
+ input: testInput(nativeArchAuditNo, "clone", &[6]uint64{0x50f00}),
+ expected: uint32(allowAction),
+ },
+ }
+)
+
+// TestRunscSeccomp generates seccomp programs from OCI config and executes
+// them using runsc's library, comparing against expected results.
+func TestRunscSeccomp(t *testing.T) {
+ for _, tc := range seccompTests {
+ t.Run(tc.name, func(t *testing.T) {
+ runscProgram, err := BuildProgram(&tc.config)
+ if err != nil {
+ t.Fatalf("generating runsc BPF: %v", err)
+ }
+
+ if err := checkProgram(runscProgram, tc.input, tc.expected); err != nil {
+ t.Fatalf("running runsc BPF: %v", err)
+ }
+ })
+ }
+}
+
+// checkProgram runs the given program over the given input and checks the
+// result against the expected output.
+func checkProgram(p bpf.Program, in bpf.Input, expected uint32) error {
+ result, err := bpf.Exec(p, in)
+ if err != nil {
+ return err
+ }
+
+ if result != expected {
+ // Include a decoded version of the program in output for debugging purposes.
+ decoded, _ := bpf.DecodeProgram(p)
+ return fmt.Errorf("Unexpected result: got: %d, expected: %d\nBPF Program\n%s", result, expected, decoded)
+ }
+
+ return nil
+}
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 5015c3a84..0392e3e83 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -35,6 +35,7 @@ import (
"gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/runsc/config"
)
// ExePath must point to runsc binary, which is normally the same binary. It's
@@ -110,11 +111,6 @@ func ValidateSpec(spec *specs.Spec) error {
log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
}
- // TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox.
- if spec.Linux != nil && spec.Linux.Seccomp != nil {
- log.Warningf("Seccomp spec is being ignored")
- }
-
if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil {
return err
@@ -161,18 +157,18 @@ func OpenSpec(bundleDir string) (*os.File, error) {
// ReadSpec reads an OCI runtime spec from the given bundle directory.
// ReadSpec also normalizes all potential relative paths into absolute
// path, e.g. spec.Root.Path, mount.Source.
-func ReadSpec(bundleDir string) (*specs.Spec, error) {
+func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) {
specFile, err := OpenSpec(bundleDir)
if err != nil {
return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err)
}
defer specFile.Close()
- return ReadSpecFromFile(bundleDir, specFile)
+ return ReadSpecFromFile(bundleDir, specFile, conf)
}
// ReadSpecFromFile reads an OCI runtime spec from the given File, and
// normalizes all relative paths into absolute by prepending the bundle dir.
-func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) {
+func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) {
if _, err := specFile.Seek(0, os.SEEK_SET); err != nil {
return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
}
@@ -195,6 +191,20 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error)
m.Source = absPath(bundleDir, m.Source)
}
}
+
+ // Override flags using annotation to allow customization per sandbox
+ // instance.
+ for annotation, val := range spec.Annotations {
+ const flagPrefix = "dev.gvisor.flag."
+ if strings.HasPrefix(annotation, flagPrefix) {
+ name := annotation[len(flagPrefix):]
+ log.Infof("Overriding flag: %s=%q", name, val)
+ if err := conf.Override(name, val); err != nil {
+ return nil, err
+ }
+ }
+ }
+
return &spec, nil
}