summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot')
-rw-r--r--runsc/boot/BUILD13
-rw-r--r--runsc/boot/config.go329
-rw-r--r--runsc/boot/controller.go80
-rw-r--r--runsc/boot/filter/config.go502
-rw-r--r--runsc/boot/filter/config_amd64.go41
-rw-r--r--runsc/boot/filter/config_arm64.go25
-rw-r--r--runsc/boot/filter/config_profile.go6
-rw-r--r--runsc/boot/fs.go87
-rw-r--r--runsc/boot/fs_test.go11
-rw-r--r--runsc/boot/loader.go446
-rw-r--r--runsc/boot/loader_test.go74
-rw-r--r--runsc/boot/network.go49
-rw-r--r--runsc/boot/strace.go7
-rw-r--r--runsc/boot/vfs.go426
14 files changed, 1040 insertions, 1056 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index aad2a41de..2d9517f4a 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -8,7 +8,6 @@ go_library(
"compat.go",
"compat_amd64.go",
"compat_arm64.go",
- "config.go",
"controller.go",
"debug.go",
"events.go",
@@ -27,10 +26,13 @@ go_library(
deps = [
"//pkg/abi",
"//pkg/abi/linux",
+ "//pkg/bpf",
+ "//pkg/cleanup",
"//pkg/context",
"//pkg/control/server",
"//pkg/cpuid",
"//pkg/eventchannel",
+ "//pkg/fd",
"//pkg/fspath",
"//pkg/log",
"//pkg/memutil",
@@ -90,6 +92,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/link/fdbased",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/link/packetsocket",
"//pkg/tcpip/link/qdisc/fifo",
"//pkg/tcpip/link/sniffer",
"//pkg/tcpip/network/arp",
@@ -104,9 +107,11 @@ go_library(
"//runsc/boot/filter",
"//runsc/boot/platforms",
"//runsc/boot/pprof",
+ "//runsc/config",
"//runsc/specutils",
+ "//runsc/specutils/seccomp",
"@com_github_golang_protobuf//proto:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
@@ -122,6 +127,7 @@ go_test(
library = ":boot",
deps = [
"//pkg/control/server",
+ "//pkg/fd",
"//pkg/fspath",
"//pkg/log",
"//pkg/p9",
@@ -130,8 +136,9 @@ go_test(
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/unet",
+ "//runsc/config",
"//runsc/fsgofer",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
deleted file mode 100644
index bb01b8fb5..000000000
--- a/runsc/boot/config.go
+++ /dev/null
@@ -1,329 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
- "fmt"
- "strconv"
- "strings"
-
- "gvisor.dev/gvisor/pkg/refs"
- "gvisor.dev/gvisor/pkg/sentry/watchdog"
-)
-
-// FileAccessType tells how the filesystem is accessed.
-type FileAccessType int
-
-const (
- // FileAccessShared sends IO requests to a Gofer process that validates the
- // requests and forwards them to the host.
- FileAccessShared FileAccessType = iota
-
- // FileAccessExclusive is the same as FileAccessShared, but enables
- // extra caching for improved performance. It should only be used if
- // the sandbox has exclusive access to the filesystem.
- FileAccessExclusive
-)
-
-// MakeFileAccessType converts type from string.
-func MakeFileAccessType(s string) (FileAccessType, error) {
- switch s {
- case "shared":
- return FileAccessShared, nil
- case "exclusive":
- return FileAccessExclusive, nil
- default:
- return 0, fmt.Errorf("invalid file access type %q", s)
- }
-}
-
-func (f FileAccessType) String() string {
- switch f {
- case FileAccessShared:
- return "shared"
- case FileAccessExclusive:
- return "exclusive"
- default:
- return fmt.Sprintf("unknown(%d)", f)
- }
-}
-
-// NetworkType tells which network stack to use.
-type NetworkType int
-
-const (
- // NetworkSandbox uses internal network stack, isolated from the host.
- NetworkSandbox NetworkType = iota
-
- // NetworkHost redirects network related syscalls to the host network.
- NetworkHost
-
- // NetworkNone sets up just loopback using netstack.
- NetworkNone
-)
-
-// MakeNetworkType converts type from string.
-func MakeNetworkType(s string) (NetworkType, error) {
- switch s {
- case "sandbox":
- return NetworkSandbox, nil
- case "host":
- return NetworkHost, nil
- case "none":
- return NetworkNone, nil
- default:
- return 0, fmt.Errorf("invalid network type %q", s)
- }
-}
-
-func (n NetworkType) String() string {
- switch n {
- case NetworkSandbox:
- return "sandbox"
- case NetworkHost:
- return "host"
- case NetworkNone:
- return "none"
- default:
- return fmt.Sprintf("unknown(%d)", n)
- }
-}
-
-// MakeWatchdogAction converts type from string.
-func MakeWatchdogAction(s string) (watchdog.Action, error) {
- switch strings.ToLower(s) {
- case "log", "logwarning":
- return watchdog.LogWarning, nil
- case "panic":
- return watchdog.Panic, nil
- default:
- return 0, fmt.Errorf("invalid watchdog action %q", s)
- }
-}
-
-// MakeRefsLeakMode converts type from string.
-func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
- switch strings.ToLower(s) {
- case "disabled":
- return refs.NoLeakChecking, nil
- case "log-names":
- return refs.LeaksLogWarning, nil
- case "log-traces":
- return refs.LeaksLogTraces, nil
- default:
- return 0, fmt.Errorf("invalid refs leakmode %q", s)
- }
-}
-
-func refsLeakModeToString(mode refs.LeakMode) string {
- switch mode {
- // If not set, default it to disabled.
- case refs.UninitializedLeakChecking, refs.NoLeakChecking:
- return "disabled"
- case refs.LeaksLogWarning:
- return "log-names"
- case refs.LeaksLogTraces:
- return "log-traces"
- default:
- panic(fmt.Sprintf("Invalid leakmode: %d", mode))
- }
-}
-
-// Config holds configuration that is not part of the runtime spec.
-type Config struct {
- // RootDir is the runtime root directory.
- RootDir string
-
- // Debug indicates that debug logging should be enabled.
- Debug bool
-
- // LogFilename is the filename to log to, if not empty.
- LogFilename string
-
- // LogFormat is the log format.
- LogFormat string
-
- // DebugLog is the path to log debug information to, if not empty.
- DebugLog string
-
- // PanicLog is the path to log GO's runtime messages, if not empty.
- PanicLog string
-
- // DebugLogFormat is the log format for debug.
- DebugLogFormat string
-
- // FileAccess indicates how the filesystem is accessed.
- FileAccess FileAccessType
-
- // Overlay is whether to wrap the root filesystem in an overlay.
- Overlay bool
-
- // FSGoferHostUDS enables the gofer to mount a host UDS.
- FSGoferHostUDS bool
-
- // Network indicates what type of network to use.
- Network NetworkType
-
- // EnableRaw indicates whether raw sockets should be enabled. Raw
- // sockets are disabled by stripping CAP_NET_RAW from the list of
- // capabilities.
- EnableRaw bool
-
- // HardwareGSO indicates that hardware segmentation offload is enabled.
- HardwareGSO bool
-
- // SoftwareGSO indicates that software segmentation offload is enabled.
- SoftwareGSO bool
-
- // TXChecksumOffload indicates that TX Checksum Offload is enabled.
- TXChecksumOffload bool
-
- // RXChecksumOffload indicates that RX Checksum Offload is enabled.
- RXChecksumOffload bool
-
- // QDisc indicates the type of queuening discipline to use by default
- // for non-loopback interfaces.
- QDisc QueueingDiscipline
-
- // LogPackets indicates that all network packets should be logged.
- LogPackets bool
-
- // Platform is the platform to run on.
- Platform string
-
- // Strace indicates that strace should be enabled.
- Strace bool
-
- // StraceSyscalls is the set of syscalls to trace. If StraceEnable is
- // true and this list is empty, then all syscalls will be traced.
- StraceSyscalls []string
-
- // StraceLogSize is the max size of data blobs to display.
- StraceLogSize uint
-
- // DisableSeccomp indicates whether seccomp syscall filters should be
- // disabled. Pardon the double negation, but default to enabled is important.
- DisableSeccomp bool
-
- // WatchdogAction sets what action the watchdog takes when triggered.
- WatchdogAction watchdog.Action
-
- // PanicSignal registers signal handling that panics. Usually set to
- // SIGUSR2(12) to troubleshoot hangs. -1 disables it.
- PanicSignal int
-
- // ProfileEnable is set to prepare the sandbox to be profiled.
- ProfileEnable bool
-
- // RestoreFile is the path to the saved container image
- RestoreFile string
-
- // NumNetworkChannels controls the number of AF_PACKET sockets that map
- // to the same underlying network device. This allows netstack to better
- // scale for high throughput use cases.
- NumNetworkChannels int
-
- // Rootless allows the sandbox to be started with a user that is not root.
- // Defense is depth measures are weaker with rootless. Specifically, the
- // sandbox and Gofer process run as root inside a user namespace with root
- // mapped to the caller's user.
- Rootless bool
-
- // AlsoLogToStderr allows to send log messages to stderr.
- AlsoLogToStderr bool
-
- // ReferenceLeakMode sets reference leak check mode
- ReferenceLeakMode refs.LeakMode
-
- // OverlayfsStaleRead instructs the sandbox to assume that the root mount
- // is on a Linux overlayfs mount, which does not necessarily preserve
- // coherence between read-only and subsequent writable file descriptors
- // representing the "same" file.
- OverlayfsStaleRead bool
-
- // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
- // tests. It allows runsc to start the sandbox process as the current
- // user, and without chrooting the sandbox process. This can be
- // necessary in test environments that have limited capabilities.
- TestOnlyAllowRunAsCurrentUserWithoutChroot bool
-
- // TestOnlyTestNameEnv should only be used in tests. It looks up for the
- // test name in the container environment variables and adds it to the debug
- // log file name. This is done to help identify the log with the test when
- // multiple tests are run in parallel, since there is no way to pass
- // parameters to the runtime from docker.
- TestOnlyTestNameEnv string
-
- // CPUNumFromQuota sets CPU number count to available CPU quota, using
- // least integer value greater than or equal to quota.
- //
- // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
- CPUNumFromQuota bool
-
- // Enables VFS2 (not plumbled through yet).
- VFS2 bool
-}
-
-// ToFlags returns a slice of flags that correspond to the given Config.
-func (c *Config) ToFlags() []string {
- f := []string{
- "--root=" + c.RootDir,
- "--debug=" + strconv.FormatBool(c.Debug),
- "--log=" + c.LogFilename,
- "--log-format=" + c.LogFormat,
- "--debug-log=" + c.DebugLog,
- "--panic-log=" + c.PanicLog,
- "--debug-log-format=" + c.DebugLogFormat,
- "--file-access=" + c.FileAccess.String(),
- "--overlay=" + strconv.FormatBool(c.Overlay),
- "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
- "--network=" + c.Network.String(),
- "--log-packets=" + strconv.FormatBool(c.LogPackets),
- "--platform=" + c.Platform,
- "--strace=" + strconv.FormatBool(c.Strace),
- "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","),
- "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)),
- "--watchdog-action=" + c.WatchdogAction.String(),
- "--panic-signal=" + strconv.Itoa(c.PanicSignal),
- "--profile=" + strconv.FormatBool(c.ProfileEnable),
- "--net-raw=" + strconv.FormatBool(c.EnableRaw),
- "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
- "--rootless=" + strconv.FormatBool(c.Rootless),
- "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
- "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
- "--gso=" + strconv.FormatBool(c.HardwareGSO),
- "--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
- "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
- "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
- "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
- "--qdisc=" + c.QDisc.String(),
- }
- if c.CPUNumFromQuota {
- f = append(f, "--cpu-num-from-quota")
- }
- // Only include these if set since it is never to be used by users.
- if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
- f = append(f, "--TESTONLY-unsafe-nonroot=true")
- }
- if len(c.TestOnlyTestNameEnv) != 0 {
- f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
- }
-
- if c.VFS2 {
- f = append(f, "--vfs2=true")
- }
-
- return f
-}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 8125d5061..894651519 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -33,6 +34,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot/pprof"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -101,14 +103,13 @@ const (
// Profiling related commands (see pprof.go for more details).
const (
- StartCPUProfile = "Profile.StartCPUProfile"
- StopCPUProfile = "Profile.StopCPUProfile"
- HeapProfile = "Profile.HeapProfile"
- GoroutineProfile = "Profile.GoroutineProfile"
- BlockProfile = "Profile.BlockProfile"
- MutexProfile = "Profile.MutexProfile"
- StartTrace = "Profile.StartTrace"
- StopTrace = "Profile.StopTrace"
+ StartCPUProfile = "Profile.StartCPUProfile"
+ StopCPUProfile = "Profile.StopCPUProfile"
+ HeapProfile = "Profile.HeapProfile"
+ BlockProfile = "Profile.BlockProfile"
+ MutexProfile = "Profile.MutexProfile"
+ StartTrace = "Profile.StartTrace"
+ StopTrace = "Profile.StopTrace"
)
// Logging related commands (see logging.go for more details).
@@ -129,42 +130,52 @@ type controller struct {
// manager holds the containerManager methods.
manager *containerManager
+
+ // pprop holds the profile instance if enabled. It may be nil.
+ pprof *control.Profile
}
// newController creates a new controller. The caller must call
// controller.srv.StartServing() to start the controller.
func newController(fd int, l *Loader) (*controller, error) {
- srv, err := server.CreateFromFD(fd)
+ ctrl := &controller{}
+ var err error
+ ctrl.srv, err = server.CreateFromFD(fd)
if err != nil {
return nil, err
}
- manager := &containerManager{
+ ctrl.manager = &containerManager{
startChan: make(chan struct{}),
startResultChan: make(chan error),
l: l,
}
- srv.Register(manager)
+ ctrl.srv.Register(ctrl.manager)
if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
net := &Network{
Stack: eps.Stack,
}
- srv.Register(net)
+ ctrl.srv.Register(net)
}
- srv.Register(&debug{})
- srv.Register(&control.Logging{})
- if l.conf.ProfileEnable {
- srv.Register(&control.Profile{
- Kernel: l.k,
- })
+ ctrl.srv.Register(&debug{})
+ ctrl.srv.Register(&control.Logging{})
+
+ if l.root.conf.ProfileEnable {
+ ctrl.pprof = &control.Profile{Kernel: l.k}
+ ctrl.srv.Register(ctrl.pprof)
}
- return &controller{
- srv: srv,
- manager: manager,
- }, nil
+ return ctrl, nil
+}
+
+func (c *controller) stop() {
+ if c.pprof != nil {
+ // These are noop if there is nothing being profiled.
+ _ = c.pprof.StopCPUProfile(nil, nil)
+ _ = c.pprof.StopTrace(nil, nil)
+ }
}
// containerManager manages sandbox containers.
@@ -211,7 +222,7 @@ type StartArgs struct {
Spec *specs.Spec
// Config is the runsc-specific configuration for the sandbox.
- Conf *Config
+ Conf *config.Config
// CID is the ID of the container to start.
CID string
@@ -247,13 +258,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
// All validation passed, logs the spec for debugging.
specutils.LogSpec(args.Spec)
- err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+ fds, err := fd.NewFromFiles(args.FilePayload.Files)
if err != nil {
+ return err
+ }
+ defer func() {
+ for _, fd := range fds {
+ _ = fd.Close()
+ }
+ }()
+ if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
return err
}
log.Debugf("Container %q started", args.CID)
-
return nil
}
@@ -333,7 +351,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Pause the kernel while we build a new one.
cm.l.k.Pause()
- p, err := createPlatform(cm.l.conf, deviceFile)
+ p, err := createPlatform(cm.l.root.conf, deviceFile)
if err != nil {
return fmt.Errorf("creating platform: %v", err)
}
@@ -349,8 +367,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
- renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+ mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
+ renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
}
@@ -368,7 +386,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("file cannot be empty")
}
- if cm.l.conf.ProfileEnable {
+ if cm.l.root.conf.ProfileEnable {
// pprof.Initialize opens /proc/self/maps, so has to be called before
// installing seccomp filters.
pprof.Initialize()
@@ -387,13 +405,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Since we have a new kernel we also must make a new watchdog.
dogOpts := watchdog.DefaultOpts
- dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+ dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
// Change the loader fields to reflect the changes made when restoring.
cm.l.k = k
cm.l.watchdog = dog
- cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+ cm.l.root.procArgs = kernel.CreateProcessArgs{}
cm.l.restore = true
// Reinitialize the sandbox ID and processes map. Note that it doesn't
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 60e33425f..6ac19668f 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -27,41 +27,30 @@ import (
// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_CLOCK_GETTIME: {},
- syscall.SYS_CLONE: []seccomp.Rule{
- {
- seccomp.AllowValue(
- syscall.CLONE_VM |
- syscall.CLONE_FS |
- syscall.CLONE_FILES |
- syscall.CLONE_SIGHAND |
- syscall.CLONE_SYSVSEM |
- syscall.CLONE_THREAD),
- },
- },
- syscall.SYS_CLOSE: {},
- syscall.SYS_DUP: {},
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
syscall.SYS_DUP3: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.O_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.O_CLOEXEC),
},
},
syscall.SYS_EPOLL_CREATE1: {},
syscall.SYS_EPOLL_CTL: {},
syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EVENTFD2: []seccomp.Rule{
{
- seccomp.AllowValue(0),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(0),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EXIT: {},
@@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_FCHMOD: {},
syscall.SYS_FCNTL: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_SETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_SETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFD),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFD),
},
},
syscall.SYS_FSTAT: {},
@@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_FTRUNCATE: {},
syscall.SYS_FUTEX: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
},
// Non-private variants are included for flipcall support. They are otherwise
// unncessary, as the sentry will use only private futexes internally.
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE),
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE),
+ seccomp.MatchAny{},
},
},
syscall.SYS_GETPID: {},
unix.SYS_GETRANDOM: {},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_DOMAIN),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_DOMAIN),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_TYPE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TYPE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_ERROR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_ERROR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
},
},
syscall.SYS_GETTID: {},
@@ -141,34 +130,34 @@ var allowedSyscalls = seccomp.SyscallRules{
// setting/getting termios and winsize.
syscall.SYS_IOCTL: []seccomp.Rule{
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCGETS),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCGETS),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETS),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETS),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETSF),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETSF),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETSW),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETSW),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TIOCSWINSZ),
- seccomp.AllowAny{}, /* winsize struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TIOCSWINSZ),
+ seccomp.MatchAny{}, /* winsize struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TIOCGWINSZ),
- seccomp.AllowAny{}, /* winsize struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TIOCGWINSZ),
+ seccomp.MatchAny{}, /* winsize struct */
},
},
syscall.SYS_LSEEK: {},
@@ -182,46 +171,46 @@ var allowedSyscalls = seccomp.SyscallRules{
// TODO(b/148688965): Remove once this is gone from Go.
syscall.SYS_MLOCK: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(4096),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4096),
},
},
syscall.SYS_MMAP: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_SHARED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_SHARED),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ),
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
},
},
syscall.SYS_MPROTECT: {},
@@ -237,32 +226,32 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_READ: {},
syscall.SYS_RECVMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
},
},
syscall.SYS_RECVMMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
- seccomp.AllowValue(syscall.MSG_DONTWAIT),
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(fdbased.MaxMsgsPerRecv),
+ seccomp.EqualTo(syscall.MSG_DONTWAIT),
+ seccomp.EqualTo(0),
},
},
unix.SYS_SENDMMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT),
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_RESTART_SYSCALL: {},
@@ -272,57 +261,50 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_SCHED_YIELD: {},
syscall.SYS_SENDMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
},
},
syscall.SYS_SETITIMER: {},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
// Used by fs/host to shutdown host sockets.
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)},
// Used by unet to shutdown connections.
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
},
syscall.SYS_SIGALTSTACK: {},
unix.SYS_STATX: {},
syscall.SYS_SYNC_FILE_RANGE: {},
syscall.SYS_TEE: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(1), /* len */
- seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(1), /* len */
+ seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
},
},
syscall.SYS_TGKILL: []seccomp.Rule{
{
- seccomp.AllowValue(uint64(os.Getpid())),
+ seccomp.EqualTo(uint64(os.Getpid())),
},
},
syscall.SYS_UTIMENSAT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(0), /* null pathname */
- seccomp.AllowAny{},
- seccomp.AllowValue(0), /* flags */
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0), /* null pathname */
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0), /* flags */
},
},
syscall.SYS_WRITE: {},
- // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
- // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
- // option is enabled for a packet socket.
+ // For rawfile.NonBlockingWriteIovec.
syscall.SYS_WRITEV: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(2),
- },
- {
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(3),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.GreaterThan(0),
},
},
}
@@ -332,10 +314,10 @@ func hostInetFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT4: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
},
},
syscall.SYS_BIND: {},
@@ -344,84 +326,84 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_GETSOCKNAME: {},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_TOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_TOS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_RECVTOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVTOS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_TCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_TCLASS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_V6ONLY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_V6ONLY),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_ERROR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_ERROR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_KEEPALIVE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_KEEPALIVE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_RCVBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_RCVBUF),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_REUSEADDR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_TYPE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TYPE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_LINGER),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_LINGER),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_NODELAY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_NODELAY),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_INFO),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_INFO),
},
},
syscall.SYS_IOCTL: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.TIOCOUTQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.TIOCOUTQ),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.TIOCINQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.TIOCINQ),
},
},
syscall.SYS_LISTEN: {},
@@ -432,103 +414,103 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_SENDTO: {},
syscall.SYS_SETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_V6ONLY),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_V6ONLY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_RCVBUF),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_RCVBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_REUSEADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_NODELAY),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_NODELAY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_TOS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_TOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_RECVTOS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVTOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_TCLASS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_TCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_RD),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_RD),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_WR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_WR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_RDWR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_RDWR),
},
},
syscall.SYS_SOCKET: []seccomp.Rule{
{
- seccomp.AllowValue(syscall.AF_INET),
- seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET),
+ seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET),
- seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET),
+ seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET6),
- seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET6),
+ seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET6),
- seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET6),
+ seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_WRITEV: {},
@@ -539,20 +521,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT: []seccomp.Rule{
{
- seccomp.AllowValue(fd),
+ seccomp.EqualTo(fd),
},
},
syscall.SYS_LISTEN: []seccomp.Rule{
{
- seccomp.AllowValue(fd),
- seccomp.AllowValue(16 /* unet.backlog */),
+ seccomp.EqualTo(fd),
+ seccomp.EqualTo(16 /* unet.backlog */),
},
},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_PEERCRED),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_PEERCRED),
},
},
}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 5335ff82c..cea5613b8 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -24,8 +24,41 @@ import (
)
func init() {
- allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
- seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
- seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
- )
+ allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+ // TODO(b/168828518): No longer used in Go 1.16+.
+ {seccomp.EqualTo(linux.ARCH_SET_FS)},
+ }
+
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ // parent_tidptr and child_tidptr are always 0 because neither
+ // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SETTLS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
+ {
+ // TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
+ }
}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
index 7fa9bbda3..37313f97f 100644
--- a/runsc/boot/filter/config_arm64.go
+++ b/runsc/boot/filter/config_arm64.go
@@ -16,6 +16,29 @@
package filter
-// Reserve for future customization.
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
func init() {
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ // These arguments are left uninitialized by the Go
+ // runtime, so they may be anything (and are unused by
+ // the host).
+ seccomp.MatchAny{}, // parent_tidptr
+ seccomp.MatchAny{}, // tls
+ seccomp.MatchAny{}, // child_tidptr
+ },
+ }
}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
index 194952a7b..7b8669595 100644
--- a/runsc/boot/filter/config_profile.go
+++ b/runsc/boot/filter/config_profile.go
@@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_OPENAT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
},
},
}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e83584b82..ddf288456 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,10 +29,12 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -47,6 +49,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -65,7 +68,7 @@ const (
// tmpfs has some extra supported options that we must pass through.
var tmpfsAllowedData = []string{"mode", "uid", "gid"}
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
// Upper layer uses the same flags as lower, but it must be read-write.
upperFlags := lowerFlags
upperFlags.ReadOnly = false
@@ -155,7 +158,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
}
// p9MountData creates a slice of p9 mount data.
-func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
+func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
opts := []string{
"trans=fd",
"rfdno=" + strconv.Itoa(fd),
@@ -166,7 +169,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
// enablement.
opts = append(opts, "privateunixsocket=true")
}
- if fa == FileAccessShared {
+ if fa == config.FileAccessShared {
opts = append(opts, "cache=remote_revalidating")
}
return opts
@@ -251,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
// addSubmountOverlay overlays the inode over a ramfs tree containing the given
// paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
// Construct a ramfs tree of mount points. The contents never
// change, so this can be fully caching. There's no real
// filesystem backing this tree, so we set the filesystem to
@@ -261,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
if err != nil {
return nil, fmt.Errorf("creating mount tree: %v", err)
}
- overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+ overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
if err != nil {
return nil, fmt.Errorf("adding mount overlay: %v", err)
}
@@ -280,7 +283,7 @@ func subtargets(root string, mnts []specs.Mount) []string {
return targets
}
-func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
if conf.VFS2 {
return setupContainerVFS2(ctx, conf, mntr, procArgs)
}
@@ -318,14 +321,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
}
type fdDispenser struct {
- fds []int
+ fds []*fd.FD
}
func (f *fdDispenser) remove() int {
if f.empty() {
panic("fdDispenser out of fds")
}
- rv := f.fds[0]
+ rv := f.fds[0].Release()
f.fds = f.fds[1:]
return rv
}
@@ -390,6 +393,10 @@ type mountHint struct {
// root is the inode where the volume is mounted. For mounts with 'pod' share
// the volume is mounted once and then bind mounted inside the containers.
root *fs.Inode
+
+ // vfsMount is the master mount for the volume. For mounts with 'pod' share
+ // the master volume is bind mounted inside the containers.
+ vfsMount *vfs.Mount
}
func (m *mountHint) setField(key, val string) error {
@@ -447,27 +454,27 @@ func (m *mountHint) isSupported() bool {
func (m *mountHint) checkCompatible(mount specs.Mount) error {
// Remove options that don't affect to mount's behavior.
masterOpts := filterUnsupportedOptions(m.mount)
- slaveOpts := filterUnsupportedOptions(mount)
+ replicaOpts := filterUnsupportedOptions(mount)
- if len(masterOpts) != len(slaveOpts) {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ if len(masterOpts) != len(replicaOpts) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
}
sort.Strings(masterOpts)
- sort.Strings(slaveOpts)
+ sort.Strings(replicaOpts)
for i, opt := range masterOpts {
- if opt != slaveOpts[i] {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ if opt != replicaOpts[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
}
}
return nil
}
-func (m *mountHint) fileAccessType() FileAccessType {
+func (m *mountHint) fileAccessType() config.FileAccessType {
if m.share == container {
- return FileAccessExclusive
+ return config.FileAccessExclusive
}
- return FileAccessShared
+ return config.FileAccessShared
}
func filterUnsupportedOptions(mount specs.Mount) []string {
@@ -558,7 +565,7 @@ type containerMounter struct {
hints *podMountHints
}
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
return &containerMounter{
root: spec.Root,
mounts: compileMounts(spec),
@@ -571,9 +578,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
// processHints processes annotations that container hints about how volumes
// should be mounted (e.g. a volume shared between containers). It must be
// called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
if conf.VFS2 {
- return nil
+ return c.processHintsVFS2(conf, creds)
}
ctx := c.k.SupervisorContext()
for _, hint := range c.hints.mounts {
@@ -595,7 +602,7 @@ func (c *containerMounter) processHints(conf *Config) error {
// setupFS is used to set up the file system for all containers. This is the
// main entry point method, with most of the other being internal only. It
// returns the mount namespace that is created for the container.
-func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
log.Infof("Configuring container's file system")
// Create context with root credentials to mount the filesystem (the current
@@ -621,7 +628,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA
return mns, nil
}
-func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
rootInode, err := c.createRootMount(ctx, conf)
if err != nil {
return nil, fmt.Errorf("creating filesystem for container: %v", err)
@@ -633,9 +640,9 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
return mns, nil
}
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, m := range c.mounts {
log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
@@ -669,7 +676,7 @@ func (c *containerMounter) checkDispenser() error {
// mountSharedMaster mounts the master of a volume that is shared among
// containers in a pod. It returns the root mount's inode.
-func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
// Map mount type to filesystem name, and parse out the options that we are
// capable of dealing with.
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
@@ -709,7 +716,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config,
}
// createRootMount creates the root filesystem.
-func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
// First construct the filesystem from the spec.Root.
mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
@@ -734,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
// for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
// mounted even if they are not in the spec.
submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
- rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
if err != nil {
return nil, fmt.Errorf("adding submount overlay: %v", err)
}
@@ -754,7 +761,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
// used for mounts.
-func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) {
var (
fsName string
opts []string
@@ -788,19 +795,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
return fsName, opts, useOverlay, nil
}
-func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
+func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType {
if hint := c.hints.findMount(mount); hint != nil {
return hint.fileAccessType()
}
// Non-root bind mounts are always shared if no hints were provided.
- return FileAccessShared
+ return config.FileAccessShared
}
// mountSubmount mounts volumes inside the container's root. Because mounts may
// be readonly, a lower ramfs overlay is added to create the mount point dir.
// Another overlay is added with tmpfs on top if Config.Overlay is true.
// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
// Map mount type to filesystem name, and parse out the options that we are
// capable of dealing with.
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
@@ -844,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
submounts := subtargets(m.Destination, c.mounts)
if len(submounts) > 0 {
log.Infof("Adding submount overlay over %q", m.Destination)
- inode, err = addSubmountOverlay(ctx, inode, submounts)
+ inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
if err != nil {
return fmt.Errorf("adding submount overlay: %v", err)
}
@@ -863,7 +870,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
if err != nil {
return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
}
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
if err := mns.Mount(ctx, dirent, inode); err != nil {
return fmt.Errorf("mount %q error: %v", m.Destination, err)
}
@@ -884,12 +891,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
if err != nil {
return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
}
- defer target.DecRef()
+ defer target.DecRef(ctx)
// Take a ref on the inode that is about to be (re)-mounted.
source.root.IncRef()
if err := mns.Mount(ctx, target, source.root); err != nil {
- source.root.DecRef()
+ source.root.DecRef(ctx)
return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
}
@@ -899,7 +906,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
// addRestoreMount adds a mount to the MountSources map used for restoring a
// checkpointed container.
-func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
if err != nil {
return err
@@ -924,7 +931,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron
// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
// the mounts to the environment.
-func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
renv := &fs.RestoreEnvironment{
MountSources: make(map[string][]fs.MountArgs),
}
@@ -979,7 +986,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
for _, m := range c.mounts {
if filepath.Clean(m.Destination) == "/tmp" {
log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
@@ -992,12 +999,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
switch err {
case nil:
// Found '/tmp' in filesystem, check if it's empty.
- defer tmp.DecRef()
+ defer tmp.DecRef(ctx)
f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
if err != nil {
return err
}
- defer f.DecRef()
+ defer f.DecRef(ctx)
serializer := &fs.CollectEntriesSerializer{}
if err := f.Readdir(ctx, serializer); err != nil {
return err
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 912037075..e986231e5 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -20,6 +20,7 @@ import (
"testing"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/runsc/config"
)
func TestPodMountHintsHappy(t *testing.T) {
@@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) {
for _, tst := range []struct {
name string
annotations map[string]string
- want FileAccessType
+ want config.FileAccessType
}{
{
name: "container=exclusive",
@@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "container",
},
- want: FileAccessExclusive,
+ want: config.FileAccessExclusive,
},
{
name: "pod=shared",
@@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "pod",
},
- want: FileAccessShared,
+ want: config.FileAccessShared,
},
{
name: "shared=shared",
@@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "shared",
},
- want: FileAccessShared,
+ want: config.FileAccessShared,
},
{
name: "default=shared",
@@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) {
MountPrefix + "mount1.type": "bind",
MountPrefix + "mount1.share": "container",
},
- want: FileAccessShared,
+ want: config.FileAccessShared,
},
} {
t.Run(tst.name, func(t *testing.T) {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index b5df1deb9..dee2c4fbb 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,22 +16,25 @@
package boot
import (
+ "errors"
"fmt"
mrand "math/rand"
"os"
"runtime"
"sync/atomic"
- "syscall"
gtime "time"
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -66,7 +69,9 @@ import (
"gvisor.dev/gvisor/runsc/boot/filter"
_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
"gvisor.dev/gvisor/runsc/boot/pprof"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
+ "gvisor.dev/gvisor/runsc/specutils/seccomp"
// Include supported socket providers.
"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
@@ -77,6 +82,22 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
+type containerInfo struct {
+ conf *config.Config
+
+ // spec is the base configuration for the root container.
+ spec *specs.Spec
+
+ // procArgs refers to the container's init task.
+ procArgs kernel.CreateProcessArgs
+
+ // stdioFDs contains stdin, stdout, and stderr.
+ stdioFDs []*fd.FD
+
+ // goferFDs are the FDs that attach the sandbox to the gofers.
+ goferFDs []*fd.FD
+}
+
// Loader keeps state needed to start the kernel and run the container..
type Loader struct {
// k is the kernel.
@@ -85,22 +106,11 @@ type Loader struct {
// ctrl is the control server.
ctrl *controller
- conf *Config
-
- // console is set to true if terminal is enabled.
- console bool
+ // root contains information about the root container in the sandbox.
+ root containerInfo
watchdog *watchdog.Watchdog
- // stdioFDs contains stdin, stdout, and stderr.
- stdioFDs []int
-
- // goferFDs are the FDs that attach the sandbox to the gofers.
- goferFDs []int
-
- // spec is the base configuration for the root container.
- spec *specs.Spec
-
// stopSignalForwarding disables forwarding of signals to the sandboxed
// container. It should be called when a sandbox is destroyed.
stopSignalForwarding func()
@@ -108,9 +118,6 @@ type Loader struct {
// restore is set to true if we are restoring a container.
restore bool
- // rootProcArgs refers to the root sandbox init task.
- rootProcArgs kernel.CreateProcessArgs
-
// sandboxID is the ID for the whole sandbox.
sandboxID string
@@ -162,7 +169,7 @@ type Args struct {
// Spec is the sandbox specification.
Spec *specs.Spec
// Conf is the system configuration.
- Conf *Config
+ Conf *config.Config
// ControllerFD is the FD to the URPC controller. The Loader takes ownership
// of this FD and may close it at any time.
ControllerFD int
@@ -175,8 +182,6 @@ type Args struct {
// StdioFDs is the stdio for the application. The Loader takes ownership of
// these FDs and may close them at any time.
StdioFDs []int
- // Console is set to true if using TTY.
- Console bool
// NumCPU is the number of CPUs to create inside the sandbox.
NumCPU int
// TotalMem is the initial amount of total memory to report back to the
@@ -187,7 +192,7 @@ type Args struct {
}
// make sure stdioFDs are always the same on initial start and on restore
-const startingStdioFD = 64
+const startingStdioFD = 256
// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
@@ -205,6 +210,10 @@ func New(args Args) (*Loader, error) {
// Is this a VFSv2 kernel?
if args.Conf.VFS2 {
kernel.VFS2Enabled = true
+ if args.Conf.FUSE {
+ kernel.FUSEEnabled = true
+ }
+
vfs2.Override()
}
@@ -227,9 +236,7 @@ func New(args Args) (*Loader, error) {
// Create VDSO.
//
// Pass k as the platform since it is savable, unlike the actual platform.
- //
- // FIXME(b/109889800): Use non-nil context.
- vdso, err := loader.PrepareVDSO(nil, k)
+ vdso, err := loader.PrepareVDSO(k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
@@ -275,6 +282,7 @@ func New(args Args) (*Loader, error) {
args.NumCPU = runtime.NumCPU()
}
log.Infof("CPUs: %d", args.NumCPU)
+ runtime.GOMAXPROCS(args.NumCPU)
if args.TotalMem > 0 {
// Adjust the total memory returned by the Sentry so that applications that
@@ -300,6 +308,12 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
+ if kernel.VFS2Enabled {
+ if err := registerFilesystems(k); err != nil {
+ return nil, fmt.Errorf("registering filesystems: %w", err)
+ }
+ }
+
if err := adjustDirentCache(k); err != nil {
return nil, err
}
@@ -318,7 +332,7 @@ func New(args Args) (*Loader, error) {
dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
- procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+ procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
if err != nil {
return nil, fmt.Errorf("creating init process for root container: %v", err)
}
@@ -338,7 +352,7 @@ func New(args Args) (*Loader, error) {
if err != nil {
return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
}
- defer hostFilesystem.DecRef()
+ defer hostFilesystem.DecRef(k.SupervisorContext())
hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
@@ -346,37 +360,45 @@ func New(args Args) (*Loader, error) {
k.SetHostMount(hostMount)
}
+ info := containerInfo{
+ conf: args.Conf,
+ spec: args.Spec,
+ procArgs: procArgs,
+ }
+
// Make host FDs stable between invocations. Host FDs must map to the exact
// same number when the sandbox is restored. Otherwise the wrong FD will be
// used.
- var stdioFDs []int
newfd := startingStdioFD
- for _, fd := range args.StdioFDs {
- err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
- if err != nil {
- return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+ for _, stdioFD := range args.StdioFDs {
+ // Check that newfd is unused to avoid clobbering over it.
+ if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+ if err != nil {
+ return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+ }
+ return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
}
- stdioFDs = append(stdioFDs, newfd)
- err = syscall.Close(fd)
+
+ err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
if err != nil {
- return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
+ return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
}
+ info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+ _ = unix.Close(stdioFD)
newfd++
}
+ for _, goferFD := range args.GoferFDs {
+ info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+ }
eid := execID{cid: args.ID}
l := &Loader{
- k: k,
- conf: args.Conf,
- console: args.Console,
- watchdog: dog,
- spec: args.Spec,
- goferFDs: args.GoferFDs,
- stdioFDs: stdioFDs,
- rootProcArgs: procArgs,
- sandboxID: args.ID,
- processes: map[execID]*execProcess{eid: {}},
- mountHints: mountHints,
+ k: k,
+ watchdog: dog,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
+ root: info,
}
// We don't care about child signals; some platforms can generate a
@@ -404,8 +426,8 @@ func New(args Args) (*Loader, error) {
return l, nil
}
-// newProcess creates a process that can be run with kernel.CreateProcess.
-func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+// createProcessArgs creates args that can be used with kernel.CreateProcess.
+func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
// Create initial limits.
ls, err := createLimitSet(spec)
if err != nil {
@@ -449,9 +471,19 @@ func (l *Loader) Destroy() {
l.stopSignalForwarding()
}
l.watchdog.Stop()
+
+ // In the success case, stdioFDs and goferFDs will only contain
+ // released/closed FDs that ownership has been passed over to host FDs and
+ // gofer sessions. Close them here in case on failure.
+ for _, fd := range l.root.stdioFDs {
+ _ = fd.Close()
+ }
+ for _, fd := range l.root.goferFDs {
+ _ = fd.Close()
+ }
}
-func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
+func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
p, err := platform.Lookup(conf.Platform)
if err != nil {
panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
@@ -478,14 +510,15 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return mf, nil
}
+// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
- if l.conf.DisableSeccomp {
+ if l.root.conf.DisableSeccomp {
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
} else {
opts := filter.Options{
Platform: l.k.Platform,
- HostNetwork: l.conf.Network == NetworkHost,
- ProfileEnable: l.conf.ProfileEnable,
+ HostNetwork: l.root.conf.Network == config.NetworkHost,
+ ProfileEnable: l.root.conf.ProfileEnable,
ControllerFD: l.ctrl.srv.FD(),
}
if err := filter.Install(opts); err != nil {
@@ -511,7 +544,7 @@ func (l *Loader) Run() error {
}
func (l *Loader) run() error {
- if l.conf.Network == NetworkHost {
+ if l.root.conf.Network == config.NetworkHost {
// Delay host network configuration to this point because network namespace
// is configured after the loader is created and before Run() is called.
log.Debugf("Configuring host network")
@@ -532,10 +565,8 @@ func (l *Loader) run() error {
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
- var ttyFile *host.TTYFileOperations
- var ttyFileVFS2 *hostvfs2.TTYFileDescription
if !l.restore {
- if l.conf.ProfileEnable {
+ if l.root.conf.ProfileEnable {
pprof.Initialize()
}
@@ -545,82 +576,30 @@ func (l *Loader) run() error {
return err
}
- // Create the FD map, which will set stdin, stdout, and stderr. If console
- // is true, then ioctl calls will be passed through to the host fd.
- ctx := l.rootProcArgs.NewContext(l.k)
- var err error
-
- // CreateProcess takes a reference on FDMap if successful. We won't need
- // ours either way.
- l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
- if err != nil {
- return fmt.Errorf("importing fds: %v", err)
- }
-
- // Setup the root container file system.
- l.startGoferMonitor(l.sandboxID, l.goferFDs)
-
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
- return err
- }
- if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
- return err
- }
-
- // Add the HOME enviroment variable if it is not already set.
- var envv []string
- if kernel.VFS2Enabled {
- envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-
- } else {
- envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
- }
- if err != nil {
- return err
- }
- l.rootProcArgs.Envv = envv
-
// Create the root container init task. It will begin running
// when the kernel is started.
- if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
- return fmt.Errorf("creating init process: %v", err)
+ if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+ return err
}
- // CreateProcess takes a reference on FDTable if successful.
- l.rootProcArgs.FDTable.DecRef()
}
ep.tg = l.k.GlobalInit()
- if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+ if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
ep.pidnsPath = ns.Path
}
- if l.console {
- // Set the foreground process group on the TTY to the global init process
- // group, since that is what we are about to start running.
- switch {
- case ttyFileVFS2 != nil:
- ep.ttyVFS2 = ttyFileVFS2
- ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- case ttyFile != nil:
- ep.tty = ttyFile
- ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- }
- }
// Handle signals by forwarding them to the root container process
// (except for panic signal, which should cause a panic).
l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
// Panic signal should cause a panic.
- if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+ if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
panic("Signal-induced panic")
}
// Otherwise forward to root container.
deliveryMode := DeliverToProcess
- if l.console {
+ if l.root.spec.Process.Terminal {
// Since we are running with a console, we should forward the signal to
// the foreground process group so that job control signals like ^C can
// be handled properly.
@@ -632,19 +611,6 @@ func (l *Loader) run() error {
}
})
- // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
- // either in createFDTable() during initial start or in descriptor.initAfterLoad()
- // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
- // passed FDs, so only close for VFS1.
- if !kernel.VFS2Enabled {
- for _, fd := range l.stdioFDs {
- err := syscall.Close(fd)
- if err != nil {
- return fmt.Errorf("close dup()ed stdioFDs: %v", err)
- }
- }
- }
-
log.Infof("Process should have started...")
l.watchdog.Start()
return l.k.Start()
@@ -664,9 +630,9 @@ func (l *Loader) createContainer(cid string) error {
}
// startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -676,8 +642,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
l.mu.Lock()
defer l.mu.Unlock()
- eid := execID{cid: cid}
- if _, ok := l.processes[eid]; !ok {
+ ep := l.processes[execID{cid: cid}]
+ if ep == nil {
return fmt.Errorf("trying to start a deleted container %q", cid)
}
@@ -711,88 +677,136 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
if pidns == nil {
pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
}
- l.processes[eid].pidnsPath = ns.Path
+ ep.pidnsPath = ns.Path
} else {
pidns = l.k.RootPIDNamespace()
}
- procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+
+ info := &containerInfo{
+ conf: conf,
+ spec: spec,
+ stdioFDs: files[:3],
+ goferFDs: files[3:],
+ }
+ info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
+ tg, err := l.createContainerProcess(false, cid, info, ep)
+ if err != nil {
+ return err
+ }
+
+ // Success!
+ l.k.StartProcess(tg)
+ ep.tg = tg
+ return nil
+}
- // setupContainerFS() dups stdioFDs, so we don't need to dup them here.
- var stdioFDs []int
- for _, f := range files[:3] {
- stdioFDs = append(stdioFDs, int(f.Fd()))
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
+ console := false
+ if root {
+ // Only root container supports terminal for now.
+ console = info.spec.Process.Terminal
}
// Create the FD map, which will set stdin, stdout, and stderr.
- ctx := procArgs.NewContext(l.k)
- fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
+ ctx := info.procArgs.NewContext(l.k)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
if err != nil {
- return fmt.Errorf("importing fds: %v", err)
- }
- // CreateProcess takes a reference on fdTable if successful. We won't
- // need ours either way.
- procArgs.FDTable = fdTable
-
- // Can't take ownership away from os.File. dup them to get a new FDs.
- var goferFDs []int
- for _, f := range files[3:] {
- fd, err := syscall.Dup(int(f.Fd()))
- if err != nil {
- return fmt.Errorf("failed to dup file: %v", err)
- }
- goferFDs = append(goferFDs, fd)
+ return nil, fmt.Errorf("importing fds: %v", err)
}
+ // CreateProcess takes a reference on fdTable if successful. We won't need
+ // ours either way.
+ info.procArgs.FDTable = fdTable
// Setup the child container file system.
- l.startGoferMonitor(cid, goferFDs)
+ l.startGoferMonitor(cid, info.goferFDs)
- mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
- if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
- return err
+ mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
+ if root {
+ if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
+ return nil, err
+ }
+ }
+ if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
+ return nil, err
}
// Add the HOME enviroment variable if it is not already set.
var envv []string
if kernel.VFS2Enabled {
- envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2,
- procArgs.Credentials.RealKUID, procArgs.Envv)
+ envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
} else {
- envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace,
- procArgs.Credentials.RealKUID, procArgs.Envv)
+ envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
}
if err != nil {
- return err
+ return nil, err
}
- procArgs.Envv = envv
+ info.procArgs.Envv = envv
// Create and start the new process.
- tg, _, err := l.k.CreateProcess(procArgs)
+ tg, _, err := l.k.CreateProcess(info.procArgs)
if err != nil {
- return fmt.Errorf("creating process: %v", err)
+ return nil, fmt.Errorf("creating process: %v", err)
}
- l.k.StartProcess(tg)
-
// CreateProcess takes a reference on FDTable if successful.
- procArgs.FDTable.DecRef()
+ info.procArgs.FDTable.DecRef(ctx)
- l.processes[eid].tg = tg
- return nil
+ // Set the foreground process group on the TTY to the global init process
+ // group, since that is what we are about to start running.
+ if root {
+ switch {
+ case ttyFileVFS2 != nil:
+ ep.ttyVFS2 = ttyFileVFS2
+ ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+ case ttyFile != nil:
+ ep.tty = ttyFile
+ ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+ }
+ }
+
+ // Install seccomp filters with the new task if there are any.
+ if info.conf.OCISeccomp {
+ if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+ program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
+ if err != nil {
+ return nil, fmt.Errorf("building seccomp program: %v", err)
+ }
+
+ if log.IsLogging(log.Debug) {
+ out, _ := bpf.DecodeProgram(program)
+ log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
+ }
+
+ task := tg.Leader()
+ // NOTE: It seems Flags are ignored by runc so we ignore them too.
+ if err := task.AppendSyscallFilter(program, true); err != nil {
+ return nil, fmt.Errorf("appending seccomp filters: %v", err)
+ }
+ }
+ } else {
+ if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+ log.Warningf("Seccomp spec is being ignored")
+ }
+ }
+
+ return tg, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
// disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
go func() {
log.Debugf("Monitoring gofer health for container %q", cid)
var events []unix.PollFd
- for _, fd := range goferFDs {
+ for _, goferFD := range goferFDs {
events = append(events, unix.PollFd{
- Fd: int32(fd),
+ Fd: int32(goferFD.FD()),
Events: unix.POLLHUP | unix.POLLRDHUP,
})
}
@@ -805,18 +819,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
}
- // Check if the gofer has stopped as part of normal container destruction.
- // This is done just to avoid sending an annoying error message to the log.
- // Note that there is a small race window in between mu.Unlock() and the
- // lock being reacquired in destroyContainer(), but it's harmless to call
- // destroyContainer() multiple times.
l.mu.Lock()
- _, ok := l.processes[execID{cid: cid}]
- l.mu.Unlock()
- if ok {
- log.Infof("Gofer socket disconnected, destroying container %q", cid)
- if err := l.destroyContainer(cid); err != nil {
- log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+ defer l.mu.Unlock()
+
+ // The gofer could have been stopped due to a normal container shutdown.
+ // Check if the container has not stopped yet.
+ if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+ log.Infof("Gofer socket disconnected, killing container %q", cid)
+ if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+ log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
}
}
}()
@@ -885,37 +896,42 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
return 0, fmt.Errorf("container %q not started", args.ContainerID)
}
- // Get the container MountNamespace from the Task.
+ // Get the container MountNamespace from the Task. Try to acquire ref may fail
+ // in case it raced with task exit.
if kernel.VFS2Enabled {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
- args.MountNamespaceVFS2.IncRef()
+ if !args.MountNamespaceVFS2.TryIncRef() {
+ return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+ }
} else {
+ var reffed bool
tg.Leader().WithMuLocked(func(t *kernel.Task) {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
args.MountNamespace = t.MountNamespace()
- args.MountNamespace.IncRef()
+ reffed = args.MountNamespace.TryIncRef()
})
+ if !reffed {
+ return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+ }
}
// Add the HOME environment variable if it is not already set.
if kernel.VFS2Enabled {
- defer args.MountNamespaceVFS2.DecRef()
-
root := args.MountNamespaceVFS2.Root()
- defer root.DecRef()
ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+ defer args.MountNamespaceVFS2.DecRef(ctx)
+ defer root.DecRef(ctx)
envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
if err != nil {
return 0, err
}
args.Envv = envv
} else {
- defer args.MountNamespace.DecRef()
-
root := args.MountNamespace.Root()
- defer root.DecRef()
ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+ defer args.MountNamespace.DecRef(ctx)
+ defer root.DecRef(ctx)
envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
if err != nil {
return 0, err
@@ -1012,20 +1028,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
// Wait for container.
l.k.WaitExited()
+ // Cleanup
+ l.ctrl.stop()
+
+ refs.OnExit()
+
return l.k.GlobalInit().ExitStatus()
}
-func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
+func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
// Create an empty network stack because the network namespace may be empty at
// this point. Netns is configured before Run() is called. Netstack is
// configured using a control uRPC message. Host network is configured inside
// Run().
switch conf.Network {
- case NetworkHost:
+ case config.NetworkHost:
// No network namespacing support for hostinet yet, hence creator is nil.
return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
- case NetworkNone, NetworkSandbox:
+ case config.NetworkNone, config.NetworkSandbox:
s, err := newEmptySandboxNetworkStack(clock, uniqueID)
if err != nil {
return nil, err
@@ -1043,8 +1064,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni
}
func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
- netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
- transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+ netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
+ transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
s := netstack.Stack{stack.New(stack.Options{
NetworkProtocols: netProtos,
TransportProtocols: transProtos,
@@ -1058,17 +1079,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
})}
// Enable SACK Recovery.
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
- return nil, fmt.Errorf("failed to enable SACK: %s", err)
+ {
+ opt := tcpip.TCPSACKEnabled(true)
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+ }
}
// Set default TTLs as required by socket/netstack.
- s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
- s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+ {
+ opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+ if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+ }
+ if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+ }
+ }
// Enable Receive Buffer Auto-Tuning.
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
- return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+ {
+ opt := tcpip.TCPModerateReceiveBufferOption(true)
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+ }
}
return &s, nil
@@ -1264,7 +1298,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
return ep.tty, ep.ttyVFS2, nil
}
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
if len(stdioFDs) != 3 {
return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
@@ -1273,7 +1307,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
fdTable := k.NewFDTable()
ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
if err != nil {
- fdTable.DecRef()
+ fdTable.DecRef(ctx)
return nil, nil, nil, err
}
return fdTable, ttyFile, ttyFileVFS2, nil
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e448fd773..1f49431a2 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
@@ -34,6 +35,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
+ "gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/fsgofer"
)
@@ -43,15 +45,19 @@ func init() {
if err := fsgofer.OpenProcSelfFD(); err != nil {
panic(err)
}
+ config.RegisterFlags()
}
-func testConfig() *Config {
- return &Config{
- RootDir: "unused_root_dir",
- Network: NetworkNone,
- DisableSeccomp: true,
- Platform: "ptrace",
+func testConfig() *config.Config {
+ conf, err := config.NewFromFlags()
+ if err != nil {
+ panic(err)
}
+ // Change test defaults.
+ conf.RootDir = "unused_root_dir"
+ conf.Network = config.NetworkNone
+ conf.DisableSeccomp = true
+ return conf
}
// testSpec returns a simple spec that can be used in tests.
@@ -258,7 +264,7 @@ type CreateMountTestcase struct {
expectedPaths []string
}
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
testCases := []*CreateMountTestcase{
&CreateMountTestcase{
// Only proc.
@@ -403,32 +409,26 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
Destination: "/proc",
Type: "tmpfs",
},
- // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
- // MkDirAt in VFS2 (and remove the reduntant append).
- // {
- // Destination: "/sys/bar",
- // Type: "tmpfs",
- // },
- //
+ {
+ Destination: "/sys/bar",
+ Type: "tmpfs",
+ },
+
{
Destination: "/tmp/baz",
Type: "tmpfs",
},
},
},
- expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+ expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
}
- if !vfs2 {
- vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
- vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
- }
return append(testCases, vfsCase)
}
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespace(t *testing.T) {
- for _, tc := range createMountTestcases(false /* vfs2 */) {
+ for _, tc := range createMountTestcases() {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
ctx := contexttest.Context(t)
@@ -439,7 +439,7 @@ func TestCreateMountNamespace(t *testing.T) {
}
defer cleanup()
- mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+ mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
mns, err := mntr.createMountNamespace(ctx, conf)
if err != nil {
t.Fatalf("failed to create mount namespace: %v", err)
@@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) {
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, p := range tc.expectedPaths {
maxTraversals := uint(0)
if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
} else {
- d.DecRef()
+ d.DecRef(ctx)
}
}
})
@@ -465,7 +465,7 @@ func TestCreateMountNamespace(t *testing.T) {
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespaceVFS2(t *testing.T) {
- for _, tc := range createMountTestcases(true /* vfs2 */) {
+ for _, tc := range createMountTestcases() {
t.Run(tc.name, func(t *testing.T) {
spec := testSpec()
spec.Mounts = tc.spec.Mounts
@@ -479,19 +479,19 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
defer l.Destroy()
defer loaderCleanup()
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
+ mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints)
+ if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil {
t.Fatalf("failed process hints: %v", err)
}
ctx := l.k.SupervisorContext()
- mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+ mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
if err != nil {
- t.Fatalf("failed to setupVFS2: %v", err)
+ t.Fatalf("mountAll: %v", err)
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, p := range tc.expectedPaths {
target := &vfs.PathOperation{
Root: root,
@@ -499,10 +499,10 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
Path: fspath.Parse(p),
}
- if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+ if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
} else {
- d.DecRef()
+ d.DecRef(ctx)
}
}
})
@@ -545,7 +545,7 @@ func TestRestoreEnvironment(t *testing.T) {
{
Dev: "9pfs-/",
Flags: fs.MountSourceFlags{ReadOnly: true},
- DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+ DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
},
},
"tmpfs": {
@@ -599,7 +599,7 @@ func TestRestoreEnvironment(t *testing.T) {
{
Dev: "9pfs-/",
Flags: fs.MountSourceFlags{ReadOnly: true},
- DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+ DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
},
{
Dev: "9pfs-/dev/fd-foo",
@@ -657,7 +657,7 @@ func TestRestoreEnvironment(t *testing.T) {
{
Dev: "9pfs-/",
Flags: fs.MountSourceFlags{ReadOnly: true},
- DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating",
+ DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true",
},
},
"tmpfs": {
@@ -697,7 +697,11 @@ func TestRestoreEnvironment(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
- mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+ var ioFDs []*fd.FD
+ for _, ioFD := range tc.ioFDs {
+ ioFDs = append(ioFDs, fd.New(ioFD))
+ }
+ mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
actualRenv, err := mntr.createRestoreEnvironment(conf)
if !tc.errorExpected && err != nil {
t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 14d2f56a5..988573640 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -32,6 +33,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
+ "gvisor.dev/gvisor/runsc/config"
)
var (
@@ -77,44 +79,6 @@ type DefaultRoute struct {
Name string
}
-// QueueingDiscipline is used to specify the kind of Queueing Discipline to
-// apply for a give FDBasedLink.
-type QueueingDiscipline int
-
-const (
- // QDiscNone disables any queueing for the underlying FD.
- QDiscNone QueueingDiscipline = iota
-
- // QDiscFIFO applies a simple fifo based queue to the underlying
- // FD.
- QDiscFIFO
-)
-
-// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
-// else returns an error.
-func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
- switch s {
- case "none":
- return QDiscNone, nil
- case "fifo":
- return QDiscFIFO, nil
- default:
- return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
- }
-}
-
-// String implements fmt.Stringer.
-func (q QueueingDiscipline) String() string {
- switch q {
- case QDiscNone:
- return "none"
- case QDiscFIFO:
- return "fifo"
- default:
- panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
- }
-}
-
// FDBasedLink configures an fd-based link.
type FDBasedLink struct {
Name string
@@ -126,7 +90,7 @@ type FDBasedLink struct {
TXChecksumOffload bool
RXChecksumOffload bool
LinkAddress net.HardwareAddr
- QDisc QueueingDiscipline
+ QDisc config.QueueingDiscipline
// NumChannels controls how many underlying FD's are to be used to
// create this endpoint.
@@ -246,12 +210,15 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
switch link.QDisc {
- case QDiscNone:
- case QDiscFIFO:
+ case config.QDiscNone:
+ case config.QDiscFIFO:
log.Infof("Enabling FIFO QDisc on %q", link.Name)
linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
}
+ // Enable support for AF_PACKET sockets to receive outgoing packets.
+ linkEP = packetsocket.New(linkEP)
+
log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go
index fbfd3b07c..c21648a32 100644
--- a/runsc/boot/strace.go
+++ b/runsc/boot/strace.go
@@ -15,10 +15,13 @@
package boot
import (
+ "strings"
+
"gvisor.dev/gvisor/pkg/sentry/strace"
+ "gvisor.dev/gvisor/runsc/config"
)
-func enableStrace(conf *Config) error {
+func enableStrace(conf *config.Config) error {
// We must initialize even if strace is not enabled.
strace.Initialize()
@@ -36,5 +39,5 @@ func enableStrace(conf *Config) error {
strace.EnableAll(strace.SinkTypeLog)
return nil
}
- return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog)
+ return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog)
}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index b68117867..e36664938 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -16,12 +16,12 @@ package boot
import (
"fmt"
- "path"
"sort"
"strings"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
@@ -37,13 +37,19 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/runsc/config"
)
-func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+func registerFilesystems(k *kernel.Kernel) error {
+ ctx := k.SupervisorContext()
+ creds := auth.NewRootCredentials(k.RootUserNamespace())
+ vfsObj := k.VFS()
+
vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserList: true,
// TODO(b/29356795): Users may mount this once the terminals are in a
@@ -73,6 +79,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
AllowUserMount: true,
AllowUserList: true,
})
+ vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
// Setup files in devtmpfs.
if err := memdev.Register(vfsObj); err != nil {
@@ -81,18 +91,24 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
if err := ttydev.Register(vfsObj); err != nil {
return fmt.Errorf("registering ttydev: %w", err)
}
-
- if err := fuse.Register(vfsObj); err != nil {
- return fmt.Errorf("registering fusedev: %w", err)
+ tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
+ if tunSupported {
+ if err := tundev.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering tundev: %v", err)
+ }
}
- if err := tundev.Register(vfsObj); err != nil {
- return fmt.Errorf("registering tundev: %v", err)
+
+ if kernel.FUSEEnabled {
+ if err := fuse.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering fusedev: %w", err)
+ }
}
+
a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
if err != nil {
return fmt.Errorf("creating devtmpfs accessor: %w", err)
}
- defer a.Release()
+ defer a.Release(ctx)
if err := a.UserspaceInit(ctx); err != nil {
return fmt.Errorf("initializing userspace: %w", err)
@@ -103,20 +119,23 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
}
- if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
- return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+ if tunSupported {
+ if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+ return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+ }
}
- if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
- return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+
+ if kernel.FUSEEnabled {
+ if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+ return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+ }
}
+
return nil
}
-func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
- if err := mntr.k.VFS().Init(); err != nil {
- return fmt.Errorf("failed to initialize VFS: %w", err)
- }
- mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+ mns, err := mntr.mountAll(conf, procArgs)
if err != nil {
return fmt.Errorf("failed to setupFS: %w", err)
}
@@ -131,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
return nil
}
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
log.Infof("Configuring container's file system with VFS2")
// Create context with root credentials to mount the filesystem (the current
@@ -144,36 +163,115 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
rootCtx := procArgs.NewContext(c.k)
- if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
- return nil, fmt.Errorf("register filesystems: %w", err)
- }
-
mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
if err != nil {
return nil, fmt.Errorf("creating mount namespace: %w", err)
}
rootProcArgs.MountNamespaceVFS2 = mns
+ root := mns.Root()
+ defer root.DecRef(rootCtx)
+ if root.Mount().ReadOnly() {
+ // Switch to ReadWrite while we setup submounts.
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+ return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+ }
+ // Restore back to ReadOnly at the end.
+ defer func() {
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+ panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+ }
+ }()
+ }
+
// Mount submounts.
if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
}
+
return mns, nil
}
-func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
+func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
fd := c.fds.remove()
- opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+ data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+ if conf.OverlayfsStaleRead {
+ // We can't check for overlayfs here because sandbox is chroot'ed and gofer
+ // can only send mount options for specs.Mounts (specs.Root is missing
+ // Options field). So assume root is always on top of overlayfs.
+ data = append(data, "overlayfs_stale_read")
+ }
log.Infof("Mounting root over 9P, ioFD: %d", fd)
- mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+ opts := &vfs.MountOptions{
+ ReadOnly: c.root.Readonly,
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ Data: strings.Join(data, ","),
+ },
+ InternalMount: true,
+ }
+
+ fsName := gofer.Name
+ if conf.Overlay && !c.root.Readonly {
+ log.Infof("Adding overlay on top of root")
+ var err error
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting root with overlay: %w", err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
if err != nil {
return nil, fmt.Errorf("setting up mount namespace: %w", err)
}
return mns, nil
}
-func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
+// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
+// layer using tmpfs, and return overlay mount options. "cleanup" must be called
+// after the options have been used to mount the overlay, to release refs on
+// lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+ // First copy options from lower layer to upper layer and overlay. Clear
+ // filesystem specific options.
+ upperOpts := *lowerOpts
+ upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ overlayOpts := *lowerOpts
+ overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ // Next mount upper and lower. Upper is a tmpfs mount to keep all
+ // modifications inside the sandbox.
+ upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+ }
+ cu := cleanup.Make(func() { upper.DecRef(ctx) })
+ defer cu.Clean()
+
+ // All writes go to the upper layer, be paranoid and make lower readonly.
+ lowerOpts.ReadOnly = true
+ lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+ if err != nil {
+ return nil, nil, err
+ }
+ cu.Add(func() { lower.DecRef(ctx) })
+
+ // Configure overlay with both layers.
+ overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+ UpperRoot: vfs.MakeVirtualDentry(upper, upper.Root()),
+ LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())},
+ }
+ return &overlayOpts, cu.Release(), nil
+}
+
+func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
mounts, err := c.prepareMountsVFS2()
if err != nil {
return err
@@ -182,8 +280,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
for i := range mounts {
submount := &mounts[i]
log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
- if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
- return err
+ var (
+ mnt *vfs.Mount
+ err error
+ )
+
+ if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+ mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
+ if err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+ }
+ } else {
+ mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
+ if err != nil {
+ return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+ }
+ }
+
+ if mnt != nil && mnt.ReadOnly() {
+ // Switch to ReadWrite while we setup submounts.
+ if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
+ return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
+ }
+ // Restore back to ReadOnly at the end.
+ defer func() {
+ if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
+ panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
+ }
+ }()
}
}
@@ -227,62 +351,83 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
return mounts, nil
}
-func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
- root := mns.Root()
- defer root.DecRef()
- target := &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(submount.Destination),
- }
- fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
if err != nil {
- return fmt.Errorf("mountOptions failed: %w", err)
+ return nil, fmt.Errorf("mountOptions failed: %w", err)
}
if len(fsName) == 0 {
// Filesystem is not supported (e.g. cgroup), just skip it.
- return nil
+ return nil, nil
}
- if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
- return err
+ if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
}
- if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
- return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of mount %q", submount.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(submount.Destination),
+ }
+ mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
+ if err != nil {
+ return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
}
log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
- return nil
+ return mnt, nil
}
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
// used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
- var (
- fsName string
- data []string
- )
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
+ fsName := m.Type
+ useOverlay := false
+ var data []string
// Find filesystem name and FS specific data field.
switch m.Type {
case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
- fsName = m.Type
+ // Nothing to do.
+
case nonefs:
fsName = sys.Name
- case tmpfs.Name:
- fsName = m.Type
+ case tmpfs.Name:
var err error
data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
if err != nil {
- return "", nil, err
+ return "", nil, false, err
}
case bind:
fsName = gofer.Name
+ if m.fd == 0 {
+ // Check that an FD was provided to fails fast. Technically FD=0 is valid,
+ // but unlikely to be correct in this context.
+ return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
+ }
data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
default:
log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ return "", nil, false, nil
}
opts := &vfs.MountOptions{
@@ -307,38 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
}
}
- if conf.Overlay {
- // All writes go to upper, be paranoid and make lower readonly.
- opts.ReadOnly = true
- }
- return fsName, opts, nil
-}
-
-func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
- target := &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(currentPath),
- }
- _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
- if err == nil {
- // Mount point exists, nothing else to do.
- return nil
- }
- if err != syserror.ENOENT {
- return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err)
- }
-
- // Recurse to ensure parent is created and then create the mount point.
- if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
- return err
- }
- log.Debugf("Creating dir %q for mount point", currentPath)
- mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
- if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
- return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err)
- }
- return nil
+ return fsName, opts, useOverlay, nil
}
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -350,7 +464,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
-func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
for _, m := range c.mounts {
// m.Destination has been cleaned, so it's to use equality here.
if m.Destination == "/tmp" {
@@ -360,28 +474,35 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
pop := vfs.PathOperation{
Root: root,
Start: root,
Path: fspath.Parse("/tmp"),
}
// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
- statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
+ fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
switch err {
case nil:
- // Found '/tmp' in filesystem, check if it's empty.
- if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
- // Not a dir?! Leave it be.
+ defer fd.DecRef(ctx)
+
+ err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name != "." && dirent.Name != ".." {
+ return syserror.ENOTEMPTY
+ }
return nil
- }
- if statx.Nlink > 2 {
+ }))
+ switch err {
+ case nil:
+ log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+ case syserror.ENOTEMPTY:
// If more than "." and ".." is found, skip internal tmpfs to prevent
// hiding existing files.
log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
return nil
+ default:
+ return err
}
- log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
fallthrough
case syserror.ENOENT:
@@ -394,9 +515,122 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
// another user. This is normally done for /tmp.
Options: []string{"mode=01777"},
}
- return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+ _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+ return err
+
+ case syserror.ENOTDIR:
+ // Not a dir?! Let it be.
+ return nil
default:
- return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
+ return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
+ }
+}
+
+// processHintsVFS2 processes annotations that container hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
+ ctx := c.k.SupervisorContext()
+ for _, hint := range c.hints.mounts {
+ // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+ // common gofer to mount all shared volumes.
+ if hint.mount.Type != tmpfs.Name {
+ continue
+ }
+
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.vfsMount = mnt
+ }
+ return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ mntFD := &mountAndFD{Mount: hint.mount}
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+ if err != nil {
+ return nil, err
+ }
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
+ if err := source.checkCompatible(mount); err != nil {
+ return nil, err
+ }
+
+ // Ignore data and useOverlay because these were already applied to
+ // the master mount.
+ _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+ if err != nil {
+ return nil, err
+ }
+ newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+ if err != nil {
+ return nil, err
+ }
+ defer newMnt.DecRef(ctx)
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(mount.Destination),
+ }
+
+ if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+ }
+
+ if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+ return nil, err
+ }
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return newMnt, nil
+}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dest),
+ }
+ // First check if mount point exists. When overlay is enabled, gofer doesn't
+ // allow changes to the FS, making MakeSytheticMountpoint() ineffective
+ // because MkdirAt fails with EROFS even if file exists.
+ vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+ if err == nil {
+ // File exists, we're done.
+ vd.DecRef(ctx)
+ return nil
}
+ return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
}