diff options
Diffstat (limited to 'runsc/boot')
-rw-r--r-- | runsc/boot/BUILD | 13 | ||||
-rw-r--r-- | runsc/boot/config.go | 329 | ||||
-rw-r--r-- | runsc/boot/controller.go | 80 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 502 | ||||
-rw-r--r-- | runsc/boot/filter/config_amd64.go | 41 | ||||
-rw-r--r-- | runsc/boot/filter/config_arm64.go | 25 | ||||
-rw-r--r-- | runsc/boot/filter/config_profile.go | 6 | ||||
-rw-r--r-- | runsc/boot/fs.go | 87 | ||||
-rw-r--r-- | runsc/boot/fs_test.go | 11 | ||||
-rw-r--r-- | runsc/boot/loader.go | 446 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 74 | ||||
-rw-r--r-- | runsc/boot/network.go | 49 | ||||
-rw-r--r-- | runsc/boot/strace.go | 7 | ||||
-rw-r--r-- | runsc/boot/vfs.go | 426 |
14 files changed, 1040 insertions, 1056 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index aad2a41de..2d9517f4a 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -8,7 +8,6 @@ go_library( "compat.go", "compat_amd64.go", "compat_arm64.go", - "config.go", "controller.go", "debug.go", "events.go", @@ -27,10 +26,13 @@ go_library( deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/memutil", @@ -90,6 +92,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/packetsocket", "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", @@ -104,9 +107,11 @@ go_library( "//runsc/boot/filter", "//runsc/boot/platforms", "//runsc/boot/pprof", + "//runsc/config", "//runsc/specutils", + "//runsc/specutils/seccomp", "@com_github_golang_protobuf//proto:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -122,6 +127,7 @@ go_test( library = ":boot", deps = [ "//pkg/control/server", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/p9", @@ -130,8 +136,9 @@ go_test( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/unet", + "//runsc/config", "//runsc/fsgofer", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/boot/config.go b/runsc/boot/config.go deleted file mode 100644 index bb01b8fb5..000000000 --- a/runsc/boot/config.go +++ /dev/null @@ -1,329 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "fmt" - "strconv" - "strings" - - "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/watchdog" -) - -// FileAccessType tells how the filesystem is accessed. -type FileAccessType int - -const ( - // FileAccessShared sends IO requests to a Gofer process that validates the - // requests and forwards them to the host. - FileAccessShared FileAccessType = iota - - // FileAccessExclusive is the same as FileAccessShared, but enables - // extra caching for improved performance. It should only be used if - // the sandbox has exclusive access to the filesystem. - FileAccessExclusive -) - -// MakeFileAccessType converts type from string. -func MakeFileAccessType(s string) (FileAccessType, error) { - switch s { - case "shared": - return FileAccessShared, nil - case "exclusive": - return FileAccessExclusive, nil - default: - return 0, fmt.Errorf("invalid file access type %q", s) - } -} - -func (f FileAccessType) String() string { - switch f { - case FileAccessShared: - return "shared" - case FileAccessExclusive: - return "exclusive" - default: - return fmt.Sprintf("unknown(%d)", f) - } -} - -// NetworkType tells which network stack to use. -type NetworkType int - -const ( - // NetworkSandbox uses internal network stack, isolated from the host. - NetworkSandbox NetworkType = iota - - // NetworkHost redirects network related syscalls to the host network. - NetworkHost - - // NetworkNone sets up just loopback using netstack. - NetworkNone -) - -// MakeNetworkType converts type from string. -func MakeNetworkType(s string) (NetworkType, error) { - switch s { - case "sandbox": - return NetworkSandbox, nil - case "host": - return NetworkHost, nil - case "none": - return NetworkNone, nil - default: - return 0, fmt.Errorf("invalid network type %q", s) - } -} - -func (n NetworkType) String() string { - switch n { - case NetworkSandbox: - return "sandbox" - case NetworkHost: - return "host" - case NetworkNone: - return "none" - default: - return fmt.Sprintf("unknown(%d)", n) - } -} - -// MakeWatchdogAction converts type from string. -func MakeWatchdogAction(s string) (watchdog.Action, error) { - switch strings.ToLower(s) { - case "log", "logwarning": - return watchdog.LogWarning, nil - case "panic": - return watchdog.Panic, nil - default: - return 0, fmt.Errorf("invalid watchdog action %q", s) - } -} - -// MakeRefsLeakMode converts type from string. -func MakeRefsLeakMode(s string) (refs.LeakMode, error) { - switch strings.ToLower(s) { - case "disabled": - return refs.NoLeakChecking, nil - case "log-names": - return refs.LeaksLogWarning, nil - case "log-traces": - return refs.LeaksLogTraces, nil - default: - return 0, fmt.Errorf("invalid refs leakmode %q", s) - } -} - -func refsLeakModeToString(mode refs.LeakMode) string { - switch mode { - // If not set, default it to disabled. - case refs.UninitializedLeakChecking, refs.NoLeakChecking: - return "disabled" - case refs.LeaksLogWarning: - return "log-names" - case refs.LeaksLogTraces: - return "log-traces" - default: - panic(fmt.Sprintf("Invalid leakmode: %d", mode)) - } -} - -// Config holds configuration that is not part of the runtime spec. -type Config struct { - // RootDir is the runtime root directory. - RootDir string - - // Debug indicates that debug logging should be enabled. - Debug bool - - // LogFilename is the filename to log to, if not empty. - LogFilename string - - // LogFormat is the log format. - LogFormat string - - // DebugLog is the path to log debug information to, if not empty. - DebugLog string - - // PanicLog is the path to log GO's runtime messages, if not empty. - PanicLog string - - // DebugLogFormat is the log format for debug. - DebugLogFormat string - - // FileAccess indicates how the filesystem is accessed. - FileAccess FileAccessType - - // Overlay is whether to wrap the root filesystem in an overlay. - Overlay bool - - // FSGoferHostUDS enables the gofer to mount a host UDS. - FSGoferHostUDS bool - - // Network indicates what type of network to use. - Network NetworkType - - // EnableRaw indicates whether raw sockets should be enabled. Raw - // sockets are disabled by stripping CAP_NET_RAW from the list of - // capabilities. - EnableRaw bool - - // HardwareGSO indicates that hardware segmentation offload is enabled. - HardwareGSO bool - - // SoftwareGSO indicates that software segmentation offload is enabled. - SoftwareGSO bool - - // TXChecksumOffload indicates that TX Checksum Offload is enabled. - TXChecksumOffload bool - - // RXChecksumOffload indicates that RX Checksum Offload is enabled. - RXChecksumOffload bool - - // QDisc indicates the type of queuening discipline to use by default - // for non-loopback interfaces. - QDisc QueueingDiscipline - - // LogPackets indicates that all network packets should be logged. - LogPackets bool - - // Platform is the platform to run on. - Platform string - - // Strace indicates that strace should be enabled. - Strace bool - - // StraceSyscalls is the set of syscalls to trace. If StraceEnable is - // true and this list is empty, then all syscalls will be traced. - StraceSyscalls []string - - // StraceLogSize is the max size of data blobs to display. - StraceLogSize uint - - // DisableSeccomp indicates whether seccomp syscall filters should be - // disabled. Pardon the double negation, but default to enabled is important. - DisableSeccomp bool - - // WatchdogAction sets what action the watchdog takes when triggered. - WatchdogAction watchdog.Action - - // PanicSignal registers signal handling that panics. Usually set to - // SIGUSR2(12) to troubleshoot hangs. -1 disables it. - PanicSignal int - - // ProfileEnable is set to prepare the sandbox to be profiled. - ProfileEnable bool - - // RestoreFile is the path to the saved container image - RestoreFile string - - // NumNetworkChannels controls the number of AF_PACKET sockets that map - // to the same underlying network device. This allows netstack to better - // scale for high throughput use cases. - NumNetworkChannels int - - // Rootless allows the sandbox to be started with a user that is not root. - // Defense is depth measures are weaker with rootless. Specifically, the - // sandbox and Gofer process run as root inside a user namespace with root - // mapped to the caller's user. - Rootless bool - - // AlsoLogToStderr allows to send log messages to stderr. - AlsoLogToStderr bool - - // ReferenceLeakMode sets reference leak check mode - ReferenceLeakMode refs.LeakMode - - // OverlayfsStaleRead instructs the sandbox to assume that the root mount - // is on a Linux overlayfs mount, which does not necessarily preserve - // coherence between read-only and subsequent writable file descriptors - // representing the "same" file. - OverlayfsStaleRead bool - - // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in - // tests. It allows runsc to start the sandbox process as the current - // user, and without chrooting the sandbox process. This can be - // necessary in test environments that have limited capabilities. - TestOnlyAllowRunAsCurrentUserWithoutChroot bool - - // TestOnlyTestNameEnv should only be used in tests. It looks up for the - // test name in the container environment variables and adds it to the debug - // log file name. This is done to help identify the log with the test when - // multiple tests are run in parallel, since there is no way to pass - // parameters to the runtime from docker. - TestOnlyTestNameEnv string - - // CPUNumFromQuota sets CPU number count to available CPU quota, using - // least integer value greater than or equal to quota. - // - // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. - CPUNumFromQuota bool - - // Enables VFS2 (not plumbled through yet). - VFS2 bool -} - -// ToFlags returns a slice of flags that correspond to the given Config. -func (c *Config) ToFlags() []string { - f := []string{ - "--root=" + c.RootDir, - "--debug=" + strconv.FormatBool(c.Debug), - "--log=" + c.LogFilename, - "--log-format=" + c.LogFormat, - "--debug-log=" + c.DebugLog, - "--panic-log=" + c.PanicLog, - "--debug-log-format=" + c.DebugLogFormat, - "--file-access=" + c.FileAccess.String(), - "--overlay=" + strconv.FormatBool(c.Overlay), - "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), - "--network=" + c.Network.String(), - "--log-packets=" + strconv.FormatBool(c.LogPackets), - "--platform=" + c.Platform, - "--strace=" + strconv.FormatBool(c.Strace), - "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), - "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), - "--watchdog-action=" + c.WatchdogAction.String(), - "--panic-signal=" + strconv.Itoa(c.PanicSignal), - "--profile=" + strconv.FormatBool(c.ProfileEnable), - "--net-raw=" + strconv.FormatBool(c.EnableRaw), - "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), - "--rootless=" + strconv.FormatBool(c.Rootless), - "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), - "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), - "--gso=" + strconv.FormatBool(c.HardwareGSO), - "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), - "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload), - "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload), - "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), - "--qdisc=" + c.QDisc.String(), - } - if c.CPUNumFromQuota { - f = append(f, "--cpu-num-from-quota") - } - // Only include these if set since it is never to be used by users. - if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { - f = append(f, "--TESTONLY-unsafe-nonroot=true") - } - if len(c.TestOnlyTestNameEnv) != 0 { - f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) - } - - if c.VFS2 { - f = append(f, "--vfs2=true") - } - - return f -} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 8125d5061..894651519 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,6 +22,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -33,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -101,14 +103,13 @@ const ( // Profiling related commands (see pprof.go for more details). const ( - StartCPUProfile = "Profile.StartCPUProfile" - StopCPUProfile = "Profile.StopCPUProfile" - HeapProfile = "Profile.HeapProfile" - GoroutineProfile = "Profile.GoroutineProfile" - BlockProfile = "Profile.BlockProfile" - MutexProfile = "Profile.MutexProfile" - StartTrace = "Profile.StartTrace" - StopTrace = "Profile.StopTrace" + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" + BlockProfile = "Profile.BlockProfile" + MutexProfile = "Profile.MutexProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" ) // Logging related commands (see logging.go for more details). @@ -129,42 +130,52 @@ type controller struct { // manager holds the containerManager methods. manager *containerManager + + // pprop holds the profile instance if enabled. It may be nil. + pprof *control.Profile } // newController creates a new controller. The caller must call // controller.srv.StartServing() to start the controller. func newController(fd int, l *Loader) (*controller, error) { - srv, err := server.CreateFromFD(fd) + ctrl := &controller{} + var err error + ctrl.srv, err = server.CreateFromFD(fd) if err != nil { return nil, err } - manager := &containerManager{ + ctrl.manager = &containerManager{ startChan: make(chan struct{}), startResultChan: make(chan error), l: l, } - srv.Register(manager) + ctrl.srv.Register(ctrl.manager) if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { net := &Network{ Stack: eps.Stack, } - srv.Register(net) + ctrl.srv.Register(net) } - srv.Register(&debug{}) - srv.Register(&control.Logging{}) - if l.conf.ProfileEnable { - srv.Register(&control.Profile{ - Kernel: l.k, - }) + ctrl.srv.Register(&debug{}) + ctrl.srv.Register(&control.Logging{}) + + if l.root.conf.ProfileEnable { + ctrl.pprof = &control.Profile{Kernel: l.k} + ctrl.srv.Register(ctrl.pprof) } - return &controller{ - srv: srv, - manager: manager, - }, nil + return ctrl, nil +} + +func (c *controller) stop() { + if c.pprof != nil { + // These are noop if there is nothing being profiled. + _ = c.pprof.StopCPUProfile(nil, nil) + _ = c.pprof.StopTrace(nil, nil) + } } // containerManager manages sandbox containers. @@ -211,7 +222,7 @@ type StartArgs struct { Spec *specs.Spec // Config is the runsc-specific configuration for the sandbox. - Conf *Config + Conf *config.Config // CID is the ID of the container to start. CID string @@ -247,13 +258,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) + fds, err := fd.NewFromFiles(args.FilePayload.Files) if err != nil { + return err + } + defer func() { + for _, fd := range fds { + _ = fd.Close() + } + }() + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) return err } log.Debugf("Container %q started", args.CID) - return nil } @@ -333,7 +351,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Pause the kernel while we build a new one. cm.l.k.Pause() - p, err := createPlatform(cm.l.conf, deviceFile) + p, err := createPlatform(cm.l.root.conf, deviceFile) if err != nil { return fmt.Errorf("creating platform: %v", err) } @@ -349,8 +367,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints) - renv, err := mntr.createRestoreEnvironment(cm.l.conf) + mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints) + renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) if err != nil { return fmt.Errorf("creating RestoreEnvironment: %v", err) } @@ -368,7 +386,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("file cannot be empty") } - if cm.l.conf.ProfileEnable { + if cm.l.root.conf.ProfileEnable { // pprof.Initialize opens /proc/self/maps, so has to be called before // installing seccomp filters. pprof.Initialize() @@ -387,13 +405,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Since we have a new kernel we also must make a new watchdog. dogOpts := watchdog.DefaultOpts - dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction + dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction dog := watchdog.New(k, dogOpts) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k cm.l.watchdog = dog - cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.root.procArgs = kernel.CreateProcessArgs{} cm.l.restore = true // Reinitialize the sandbox ID and processes map. Note that it doesn't diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 60e33425f..6ac19668f 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -27,41 +27,30 @@ import ( // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: []seccomp.Rule{ - { - seccomp.AllowValue( - syscall.CLONE_VM | - syscall.CLONE_FS | - syscall.CLONE_FILES | - syscall.CLONE_SIGHAND | - syscall.CLONE_SYSVSEM | - syscall.CLONE_THREAD), - }, - }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, syscall.SYS_DUP3: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_CLOEXEC), }, }, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_EVENTFD2: []seccomp.Rule{ { - seccomp.AllowValue(0), - seccomp.AllowValue(0), + seccomp.EqualTo(0), + seccomp.EqualTo(0), }, }, syscall.SYS_EXIT: {}, @@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FCHMOD: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_SETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_SETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFD), }, }, syscall.SYS_FSTAT: {}, @@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FTRUNCATE: {}, syscall.SYS_FUTEX: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, }, // Non-private variants are included for flipcall support. They are otherwise // unncessary, as the sentry will use only private futexes internally. { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE), + seccomp.MatchAny{}, }, }, syscall.SYS_GETPID: {}, unix.SYS_GETRANDOM: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_DOMAIN), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_DOMAIN), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, }, syscall.SYS_GETTID: {}, @@ -141,34 +130,34 @@ var allowedSyscalls = seccomp.SyscallRules{ // setting/getting termios and winsize. syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCGETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCGETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSF), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSF), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSW), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSW), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCSWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCSWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCGWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCGWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, }, syscall.SYS_LSEEK: {}, @@ -182,46 +171,46 @@ var allowedSyscalls = seccomp.SyscallRules{ // TODO(b/148688965): Remove once this is gone from Go. syscall.SYS_MLOCK: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(4096), + seccomp.MatchAny{}, + seccomp.EqualTo(4096), }, }, syscall.SYS_MMAP: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_SHARED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_SHARED), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ), - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ), + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, syscall.SYS_MPROTECT: {}, @@ -237,32 +226,32 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_READ: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), }, }, syscall.SYS_RECVMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(fdbased.MaxMsgsPerRecv), - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(fdbased.MaxMsgsPerRecv), + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, unix.SYS_SENDMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, syscall.SYS_RESTART_SYSCALL: {}, @@ -272,57 +261,50 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SCHED_YIELD: {}, syscall.SYS_SENDMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), }, }, syscall.SYS_SETITIMER: {}, syscall.SYS_SHUTDOWN: []seccomp.Rule{ // Used by fs/host to shutdown host sockets. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)}, // Used by unet to shutdown connections. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TEE: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(1), /* len */ - seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */ + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(1), /* len */ + seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, }, syscall.SYS_TGKILL: []seccomp.Rule{ { - seccomp.AllowValue(uint64(os.Getpid())), + seccomp.EqualTo(uint64(os.Getpid())), }, }, syscall.SYS_UTIMENSAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* null pathname */ - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* flags */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* null pathname */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* flags */ }, }, syscall.SYS_WRITE: {}, - // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with - // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR - // option is enabled for a packet socket. + // For rawfile.NonBlockingWriteIovec. syscall.SYS_WRITEV: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(2), - }, - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(3), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.GreaterThan(0), }, }, } @@ -332,10 +314,10 @@ func hostInetFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT4: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), }, }, syscall.SYS_BIND: {}, @@ -344,84 +326,84 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_GETSOCKNAME: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_KEEPALIVE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_KEEPALIVE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_LINGER), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_LINGER), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_INFO), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_INFO), }, }, syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCOUTQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCOUTQ), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCINQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCINQ), }, }, syscall.SYS_LISTEN: {}, @@ -432,103 +414,103 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_SENDTO: {}, syscall.SYS_SETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RD), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_WR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_WR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RDWR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RDWR), }, }, syscall.SYS_SOCKET: []seccomp.Rule{ { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, }, syscall.SYS_WRITEV: {}, @@ -539,20 +521,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT: []seccomp.Rule{ { - seccomp.AllowValue(fd), + seccomp.EqualTo(fd), }, }, syscall.SYS_LISTEN: []seccomp.Rule{ { - seccomp.AllowValue(fd), - seccomp.AllowValue(16 /* unet.backlog */), + seccomp.EqualTo(fd), + seccomp.EqualTo(16 /* unet.backlog */), }, }, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_PEERCRED), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_PEERCRED), }, }, } diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go index 5335ff82c..cea5613b8 100644 --- a/runsc/boot/filter/config_amd64.go +++ b/runsc/boot/filter/config_amd64.go @@ -24,8 +24,41 @@ import ( ) func init() { - allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], - seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, - seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, - ) + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ + // TODO(b/168828518): No longer used in Go 1.16+. + {seccomp.EqualTo(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SETTLS | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + { + // TODO(b/168828518): No longer used in Go 1.16+ (on amd64). + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + } } diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go index 7fa9bbda3..37313f97f 100644 --- a/runsc/boot/filter/config_arm64.go +++ b/runsc/boot/filter/config_arm64.go @@ -16,6 +16,29 @@ package filter -// Reserve for future customization. +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + func init() { + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + // These arguments are left uninitialized by the Go + // runtime, so they may be anything (and are unused by + // the host). + seccomp.MatchAny{}, // parent_tidptr + seccomp.MatchAny{}, // tls + seccomp.MatchAny{}, // child_tidptr + }, + } } diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go index 194952a7b..7b8669595 100644 --- a/runsc/boot/filter/config_profile.go +++ b/runsc/boot/filter/config_profile.go @@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_OPENAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), }, }, } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e83584b82..ddf288456 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -29,10 +29,12 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" + "gvisor.dev/gvisor/pkg/sentry/vfs" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" @@ -47,6 +49,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -65,7 +68,7 @@ const ( // tmpfs has some extra supported options that we must pass through. var tmpfsAllowedData = []string{"mode", "uid", "gid"} -func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { +func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. upperFlags := lowerFlags upperFlags.ReadOnly = false @@ -155,7 +158,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { } // p9MountData creates a slice of p9 mount data. -func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { +func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), @@ -166,7 +169,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { // enablement. opts = append(opts, "privateunixsocket=true") } - if fa == FileAccessShared { + if fa == config.FileAccessShared { opts = append(opts, "cache=remote_revalidating") } return opts @@ -251,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem { // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. -func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) { // Construct a ramfs tree of mount points. The contents never // change, so this can be fully caching. There's no real // filesystem backing this tree, so we set the filesystem to @@ -261,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string if err != nil { return nil, fmt.Errorf("creating mount tree: %v", err) } - overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf) if err != nil { return nil, fmt.Errorf("adding mount overlay: %v", err) } @@ -280,7 +283,7 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { +func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { if conf.VFS2 { return setupContainerVFS2(ctx, conf, mntr, procArgs) } @@ -318,14 +321,14 @@ func adjustDirentCache(k *kernel.Kernel) error { } type fdDispenser struct { - fds []int + fds []*fd.FD } func (f *fdDispenser) remove() int { if f.empty() { panic("fdDispenser out of fds") } - rv := f.fds[0] + rv := f.fds[0].Release() f.fds = f.fds[1:] return rv } @@ -390,6 +393,10 @@ type mountHint struct { // root is the inode where the volume is mounted. For mounts with 'pod' share // the volume is mounted once and then bind mounted inside the containers. root *fs.Inode + + // vfsMount is the master mount for the volume. For mounts with 'pod' share + // the master volume is bind mounted inside the containers. + vfsMount *vfs.Mount } func (m *mountHint) setField(key, val string) error { @@ -447,27 +454,27 @@ func (m *mountHint) isSupported() bool { func (m *mountHint) checkCompatible(mount specs.Mount) error { // Remove options that don't affect to mount's behavior. masterOpts := filterUnsupportedOptions(m.mount) - slaveOpts := filterUnsupportedOptions(mount) + replicaOpts := filterUnsupportedOptions(mount) - if len(masterOpts) != len(slaveOpts) { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if len(masterOpts) != len(replicaOpts) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } sort.Strings(masterOpts) - sort.Strings(slaveOpts) + sort.Strings(replicaOpts) for i, opt := range masterOpts { - if opt != slaveOpts[i] { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if opt != replicaOpts[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } } return nil } -func (m *mountHint) fileAccessType() FileAccessType { +func (m *mountHint) fileAccessType() config.FileAccessType { if m.share == container { - return FileAccessExclusive + return config.FileAccessExclusive } - return FileAccessShared + return config.FileAccessShared } func filterUnsupportedOptions(mount specs.Mount) []string { @@ -558,7 +565,7 @@ type containerMounter struct { hints *podMountHints } -func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { +func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter { return &containerMounter{ root: spec.Root, mounts: compileMounts(spec), @@ -571,9 +578,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // processHints processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. -func (c *containerMounter) processHints(conf *Config) error { +func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error { if conf.VFS2 { - return nil + return c.processHintsVFS2(conf, creds) } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { @@ -595,7 +602,7 @@ func (c *containerMounter) processHints(conf *Config) error { // setupFS is used to set up the file system for all containers. This is the // main entry point method, with most of the other being internal only. It // returns the mount namespace that is created for the container. -func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { +func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { log.Infof("Configuring container's file system") // Create context with root credentials to mount the filesystem (the current @@ -621,7 +628,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA return mns, nil } -func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) { +func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) { rootInode, err := c.createRootMount(ctx, conf) if err != nil { return nil, fmt.Errorf("creating filesystem for container: %v", err) @@ -633,9 +640,9 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi return mns, nil } -func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, m := range c.mounts { log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) @@ -669,7 +676,7 @@ func (c *containerMounter) checkDispenser() error { // mountSharedMaster mounts the master of a volume that is shared among // containers in a pod. It returns the root mount's inode. -func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) { +func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) @@ -709,7 +716,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, } // createRootMount creates the root filesystem. -func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { +func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} @@ -734,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") - rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf) if err != nil { return nil, fmt.Errorf("adding submount overlay: %v", err) } @@ -754,7 +761,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) { +func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) { var ( fsName string opts []string @@ -788,19 +795,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( return fsName, opts, useOverlay, nil } -func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { +func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType { if hint := c.hints.findMount(mount); hint != nil { return hint.fileAccessType() } // Non-root bind mounts are always shared if no hints were provided. - return FileAccessShared + return config.FileAccessShared } // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. // 'm.Destination' must be an absolute path with '..' and symlinks resolved. -func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { +func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) @@ -844,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns submounts := subtargets(m.Destination, c.mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) + inode, err = addSubmountOverlay(ctx, inode, submounts, mf) if err != nil { return fmt.Errorf("adding submount overlay: %v", err) } @@ -863,7 +870,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns if err != nil { return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) } - defer dirent.DecRef() + defer dirent.DecRef(ctx) if err := mns.Mount(ctx, dirent, inode); err != nil { return fmt.Errorf("mount %q error: %v", m.Destination, err) } @@ -884,12 +891,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun if err != nil { return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) } - defer target.DecRef() + defer target.DecRef(ctx) // Take a ref on the inode that is about to be (re)-mounted. source.root.IncRef() if err := mns.Mount(ctx, target, source.root); err != nil { - source.root.DecRef() + source.root.DecRef(ctx) return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) } @@ -899,7 +906,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. -func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { +func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error { fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) if err != nil { return err @@ -924,7 +931,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding // the mounts to the environment. -func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { +func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) { renv := &fs.RestoreEnvironment{ MountSources: make(map[string][]fs.MountArgs), } @@ -979,7 +986,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { +func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error { for _, m := range c.mounts { if filepath.Clean(m.Destination) == "/tmp" { log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) @@ -992,12 +999,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M switch err { case nil: // Found '/tmp' in filesystem, check if it's empty. - defer tmp.DecRef() + defer tmp.DecRef(ctx) f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) if err != nil { return err } - defer f.DecRef() + defer f.DecRef(ctx) serializer := &fs.CollectEntriesSerializer{} if err := f.Readdir(ctx, serializer); err != nil { return err diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index 912037075..e986231e5 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -20,6 +20,7 @@ import ( "testing" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/runsc/config" ) func TestPodMountHintsHappy(t *testing.T) { @@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) { for _, tst := range []struct { name string annotations map[string]string - want FileAccessType + want config.FileAccessType }{ { name: "container=exclusive", @@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "container", }, - want: FileAccessExclusive, + want: config.FileAccessExclusive, }, { name: "pod=shared", @@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "pod", }, - want: FileAccessShared, + want: config.FileAccessShared, }, { name: "shared=shared", @@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "shared", }, - want: FileAccessShared, + want: config.FileAccessShared, }, { name: "default=shared", @@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "container", }, - want: FileAccessShared, + want: config.FileAccessShared, }, } { t.Run(tst.name, func(t *testing.T) { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index b5df1deb9..dee2c4fbb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -16,22 +16,25 @@ package boot import ( + "errors" "fmt" mrand "math/rand" "os" "runtime" "sync/atomic" - "syscall" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fdimport" @@ -66,7 +69,9 @@ import ( "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + "gvisor.dev/gvisor/runsc/specutils/seccomp" // Include supported socket providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" @@ -77,6 +82,22 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) +type containerInfo struct { + conf *config.Config + + // spec is the base configuration for the root container. + spec *specs.Spec + + // procArgs refers to the container's init task. + procArgs kernel.CreateProcessArgs + + // stdioFDs contains stdin, stdout, and stderr. + stdioFDs []*fd.FD + + // goferFDs are the FDs that attach the sandbox to the gofers. + goferFDs []*fd.FD +} + // Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. @@ -85,22 +106,11 @@ type Loader struct { // ctrl is the control server. ctrl *controller - conf *Config - - // console is set to true if terminal is enabled. - console bool + // root contains information about the root container in the sandbox. + root containerInfo watchdog *watchdog.Watchdog - // stdioFDs contains stdin, stdout, and stderr. - stdioFDs []int - - // goferFDs are the FDs that attach the sandbox to the gofers. - goferFDs []int - - // spec is the base configuration for the root container. - spec *specs.Spec - // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -108,9 +118,6 @@ type Loader struct { // restore is set to true if we are restoring a container. restore bool - // rootProcArgs refers to the root sandbox init task. - rootProcArgs kernel.CreateProcessArgs - // sandboxID is the ID for the whole sandbox. sandboxID string @@ -162,7 +169,7 @@ type Args struct { // Spec is the sandbox specification. Spec *specs.Spec // Conf is the system configuration. - Conf *Config + Conf *config.Config // ControllerFD is the FD to the URPC controller. The Loader takes ownership // of this FD and may close it at any time. ControllerFD int @@ -175,8 +182,6 @@ type Args struct { // StdioFDs is the stdio for the application. The Loader takes ownership of // these FDs and may close them at any time. StdioFDs []int - // Console is set to true if using TTY. - Console bool // NumCPU is the number of CPUs to create inside the sandbox. NumCPU int // TotalMem is the initial amount of total memory to report back to the @@ -187,7 +192,7 @@ type Args struct { } // make sure stdioFDs are always the same on initial start and on restore -const startingStdioFD = 64 +const startingStdioFD = 256 // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. @@ -205,6 +210,10 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true + if args.Conf.FUSE { + kernel.FUSEEnabled = true + } + vfs2.Override() } @@ -227,9 +236,7 @@ func New(args Args) (*Loader, error) { // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. - // - // FIXME(b/109889800): Use non-nil context. - vdso, err := loader.PrepareVDSO(nil, k) + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } @@ -275,6 +282,7 @@ func New(args Args) (*Loader, error) { args.NumCPU = runtime.NumCPU() } log.Infof("CPUs: %d", args.NumCPU) + runtime.GOMAXPROCS(args.NumCPU) if args.TotalMem > 0 { // Adjust the total memory returned by the Sentry so that applications that @@ -300,6 +308,12 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if kernel.VFS2Enabled { + if err := registerFilesystems(k); err != nil { + return nil, fmt.Errorf("registering filesystems: %w", err) + } + } + if err := adjustDirentCache(k); err != nil { return nil, err } @@ -318,7 +332,7 @@ func New(args Args) (*Loader, error) { dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction dog := watchdog.New(k, dogOpts) - procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) + procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) if err != nil { return nil, fmt.Errorf("creating init process for root container: %v", err) } @@ -338,7 +352,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) } - defer hostFilesystem.DecRef() + defer hostFilesystem.DecRef(k.SupervisorContext()) hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { return nil, fmt.Errorf("failed to create hostfs mount: %v", err) @@ -346,37 +360,45 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } + info := containerInfo{ + conf: args.Conf, + spec: args.Spec, + procArgs: procArgs, + } + // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. - var stdioFDs []int newfd := startingStdioFD - for _, fd := range args.StdioFDs { - err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) - if err != nil { - return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + for _, stdioFD := range args.StdioFDs { + // Check that newfd is unused to avoid clobbering over it. + if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { + if err != nil { + return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) + } + return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) } - stdioFDs = append(stdioFDs, newfd) - err = syscall.Close(fd) + + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) if err != nil { - return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) } + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) newfd++ } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } eid := execID{cid: args.ID} l := &Loader{ - k: k, - conf: args.Conf, - console: args.Console, - watchdog: dog, - spec: args.Spec, - goferFDs: args.GoferFDs, - stdioFDs: stdioFDs, - rootProcArgs: procArgs, - sandboxID: args.ID, - processes: map[execID]*execProcess{eid: {}}, - mountHints: mountHints, + k: k, + watchdog: dog, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + root: info, } // We don't care about child signals; some platforms can generate a @@ -404,8 +426,8 @@ func New(args Args) (*Loader, error) { return l, nil } -// newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { +// createProcessArgs creates args that can be used with kernel.CreateProcess. +func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -449,9 +471,19 @@ func (l *Loader) Destroy() { l.stopSignalForwarding() } l.watchdog.Stop() + + // In the success case, stdioFDs and goferFDs will only contain + // released/closed FDs that ownership has been passed over to host FDs and + // gofer sessions. Close them here in case on failure. + for _, fd := range l.root.stdioFDs { + _ = fd.Close() + } + for _, fd := range l.root.goferFDs { + _ = fd.Close() + } } -func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { +func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) { p, err := platform.Lookup(conf.Platform) if err != nil { panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) @@ -478,14 +510,15 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } +// installSeccompFilters installs sandbox seccomp filters with the host. func (l *Loader) installSeccompFilters() error { - if l.conf.DisableSeccomp { + if l.root.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. Running in less secure mode.") } else { opts := filter.Options{ Platform: l.k.Platform, - HostNetwork: l.conf.Network == NetworkHost, - ProfileEnable: l.conf.ProfileEnable, + HostNetwork: l.root.conf.Network == config.NetworkHost, + ProfileEnable: l.root.conf.ProfileEnable, ControllerFD: l.ctrl.srv.FD(), } if err := filter.Install(opts); err != nil { @@ -511,7 +544,7 @@ func (l *Loader) Run() error { } func (l *Loader) run() error { - if l.conf.Network == NetworkHost { + if l.root.conf.Network == config.NetworkHost { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") @@ -532,10 +565,8 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. - var ttyFile *host.TTYFileOperations - var ttyFileVFS2 *hostvfs2.TTYFileDescription if !l.restore { - if l.conf.ProfileEnable { + if l.root.conf.ProfileEnable { pprof.Initialize() } @@ -545,82 +576,30 @@ func (l *Loader) run() error { return err } - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. - ctx := l.rootProcArgs.NewContext(l.k) - var err error - - // CreateProcess takes a reference on FDMap if successful. We won't need - // ours either way. - l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) - if err != nil { - return fmt.Errorf("importing fds: %v", err) - } - - // Setup the root container file system. - l.startGoferMonitor(l.sandboxID, l.goferFDs) - - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { - return err - } - if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { - return err - } - - // Add the HOME enviroment variable if it is not already set. - var envv []string - if kernel.VFS2Enabled { - envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - - } else { - envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, - l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) - } - if err != nil { - return err - } - l.rootProcArgs.Envv = envv - // Create the root container init task. It will begin running // when the kernel is started. - if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { - return fmt.Errorf("creating init process: %v", err) + if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + return err } - // CreateProcess takes a reference on FDTable if successful. - l.rootProcArgs.FDTable.DecRef() } ep.tg = l.k.GlobalInit() - if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok { + if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok { ep.pidnsPath = ns.Path } - if l.console { - // Set the foreground process group on the TTY to the global init process - // group, since that is what we are about to start running. - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) - } - } // Handle signals by forwarding them to the root container process // (except for panic signal, which should cause a panic). l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { // Panic signal should cause a panic. - if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) { panic("Signal-induced panic") } // Otherwise forward to root container. deliveryMode := DeliverToProcess - if l.console { + if l.root.spec.Process.Terminal { // Since we are running with a console, we should forward the signal to // the foreground process group so that job control signals like ^C can // be handled properly. @@ -632,19 +611,6 @@ func (l *Loader) run() error { } }) - // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again - // either in createFDTable() during initial start or in descriptor.initAfterLoad() - // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the - // passed FDs, so only close for VFS1. - if !kernel.VFS2Enabled { - for _, fd := range l.stdioFDs { - err := syscall.Close(fd) - if err != nil { - return fmt.Errorf("close dup()ed stdioFDs: %v", err) - } - } - } - log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() @@ -664,9 +630,9 @@ func (l *Loader) createContainer(cid string) error { } // startContainer starts a child container. It returns the thread group ID of -// the newly created process. Caller owns 'files' and may close them after -// this method returns. -func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { +// the newly created process. Used FDs are either closed or released. It's safe +// for the caller to close any remaining files upon return. +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -676,8 +642,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file l.mu.Lock() defer l.mu.Unlock() - eid := execID{cid: cid} - if _, ok := l.processes[eid]; !ok { + ep := l.processes[execID{cid: cid}] + if ep == nil { return fmt.Errorf("trying to start a deleted container %q", cid) } @@ -711,88 +677,136 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file if pidns == nil { pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) } - l.processes[eid].pidnsPath = ns.Path + ep.pidnsPath = ns.Path } else { pidns = l.k.RootPIDNamespace() } - procArgs, err := newProcess(cid, spec, creds, l.k, pidns) + + info := &containerInfo{ + conf: conf, + spec: spec, + stdioFDs: files[:3], + goferFDs: files[3:], + } + info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } + tg, err := l.createContainerProcess(false, cid, info, ep) + if err != nil { + return err + } + + // Success! + l.k.StartProcess(tg) + ep.tg = tg + return nil +} - // setupContainerFS() dups stdioFDs, so we don't need to dup them here. - var stdioFDs []int - for _, f := range files[:3] { - stdioFDs = append(stdioFDs, int(f.Fd())) +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { + console := false + if root { + // Only root container supports terminal for now. + console = info.spec.Process.Terminal } // Create the FD map, which will set stdin, stdout, and stderr. - ctx := procArgs.NewContext(l.k) - fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) + ctx := info.procArgs.NewContext(l.k) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) if err != nil { - return fmt.Errorf("importing fds: %v", err) - } - // CreateProcess takes a reference on fdTable if successful. We won't - // need ours either way. - procArgs.FDTable = fdTable - - // Can't take ownership away from os.File. dup them to get a new FDs. - var goferFDs []int - for _, f := range files[3:] { - fd, err := syscall.Dup(int(f.Fd())) - if err != nil { - return fmt.Errorf("failed to dup file: %v", err) - } - goferFDs = append(goferFDs, fd) + return nil, fmt.Errorf("importing fds: %v", err) } + // CreateProcess takes a reference on fdTable if successful. We won't need + // ours either way. + info.procArgs.FDTable = fdTable // Setup the child container file system. - l.startGoferMonitor(cid, goferFDs) + l.startGoferMonitor(cid, info.goferFDs) - mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) - if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { - return err + mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) + if root { + if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { + return nil, err + } + } + if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { + return nil, err } // Add the HOME enviroment variable if it is not already set. var envv []string if kernel.VFS2Enabled { - envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2, - procArgs.Credentials.RealKUID, procArgs.Envv) + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } else { - envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace, - procArgs.Credentials.RealKUID, procArgs.Envv) + envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace, + info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } if err != nil { - return err + return nil, err } - procArgs.Envv = envv + info.procArgs.Envv = envv // Create and start the new process. - tg, _, err := l.k.CreateProcess(procArgs) + tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return fmt.Errorf("creating process: %v", err) + return nil, fmt.Errorf("creating process: %v", err) } - l.k.StartProcess(tg) - // CreateProcess takes a reference on FDTable if successful. - procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef(ctx) - l.processes[eid].tg = tg - return nil + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + if root { + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) + } + } + + // Install seccomp filters with the new task if there are any. + if info.conf.OCISeccomp { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) + if err != nil { + return nil, fmt.Errorf("building seccomp program: %v", err) + } + + if log.IsLogging(log.Debug) { + out, _ := bpf.DecodeProgram(program) + log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) + } + + task := tg.Leader() + // NOTE: It seems Flags are ignored by runc so we ignore them too. + if err := task.AppendSyscallFilter(program, true); err != nil { + return nil, fmt.Errorf("appending seccomp filters: %v", err) + } + } + } else { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + } + + return tg, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on -// the gofer FDs looking for disconnects, and destroys the container if a +// the gofer FDs looking for disconnects, and kills the container processes if a // disconnect occurs in any of the gofer FDs. -func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { +func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) { go func() { log.Debugf("Monitoring gofer health for container %q", cid) var events []unix.PollFd - for _, fd := range goferFDs { + for _, goferFD := range goferFDs { events = append(events, unix.PollFd{ - Fd: int32(fd), + Fd: int32(goferFD.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }) } @@ -805,18 +819,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err)) } - // Check if the gofer has stopped as part of normal container destruction. - // This is done just to avoid sending an annoying error message to the log. - // Note that there is a small race window in between mu.Unlock() and the - // lock being reacquired in destroyContainer(), but it's harmless to call - // destroyContainer() multiple times. l.mu.Lock() - _, ok := l.processes[execID{cid: cid}] - l.mu.Unlock() - if ok { - log.Infof("Gofer socket disconnected, destroying container %q", cid) - if err := l.destroyContainer(cid); err != nil { - log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err) + defer l.mu.Unlock() + + // The gofer could have been stopped due to a normal container shutdown. + // Check if the container has not stopped yet. + if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil { + log.Infof("Gofer socket disconnected, killing container %q", cid) + if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { + log.Warningf("Error killing container %q after gofer stopped: %v", cid, err) } } }() @@ -885,37 +896,42 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("container %q not started", args.ContainerID) } - // Get the container MountNamespace from the Task. + // Get the container MountNamespace from the Task. Try to acquire ref may fail + // in case it raced with task exit. if kernel.VFS2Enabled { // task.MountNamespace() does not take a ref, so we must do so ourselves. args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() - args.MountNamespaceVFS2.IncRef() + if !args.MountNamespaceVFS2.TryIncRef() { + return 0, fmt.Errorf("container %q has stopped", args.ContainerID) + } } else { + var reffed bool tg.Leader().WithMuLocked(func(t *kernel.Task) { // task.MountNamespace() does not take a ref, so we must do so ourselves. args.MountNamespace = t.MountNamespace() - args.MountNamespace.IncRef() + reffed = args.MountNamespace.TryIncRef() }) + if !reffed { + return 0, fmt.Errorf("container %q has stopped", args.ContainerID) + } } // Add the HOME environment variable if it is not already set. if kernel.VFS2Enabled { - defer args.MountNamespaceVFS2.DecRef() - root := args.MountNamespaceVFS2.Root() - defer root.DecRef() ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespaceVFS2.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) if err != nil { return 0, err } args.Envv = envv } else { - defer args.MountNamespace.DecRef() - root := args.MountNamespace.Root() - defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespace.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err @@ -1012,20 +1028,25 @@ func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. l.k.WaitExited() + // Cleanup + l.ctrl.stop() + + refs.OnExit() + return l.k.GlobalInit().ExitStatus() } -func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { +func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). switch conf.Network { - case NetworkHost: + case config.NetworkHost: // No network namespacing support for hostinet yet, hence creator is nil. return inet.NewRootNamespace(hostinet.NewStack(), nil), nil - case NetworkNone, NetworkSandbox: + case config.NetworkNone, config.NetworkSandbox: s, err := newEmptySandboxNetworkStack(clock, uniqueID) if err != nil { return nil, err @@ -1043,8 +1064,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni } func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} + transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, @@ -1058,17 +1079,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in })} // Enable SACK Recovery. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %s", err) + { + opt := tcpip.TCPSACKEnabled(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } // Set default TTLs as required by socket/netstack. - s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) - s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + { + opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) + if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) + } + if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) + } + } // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPModerateReceiveBufferOption(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } return &s, nil @@ -1264,7 +1298,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2 return ep.tty, ep.ttyVFS2, nil } -func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if len(stdioFDs) != 3 { return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } @@ -1273,7 +1307,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable := k.NewFDTable() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) if err != nil { - fdTable.DecRef() + fdTable.DecRef(ctx) return nil, nil, nil, err } return fdTable, ttyFile, ttyFileVFS2, nil diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e448fd773..1f49431a2 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" @@ -34,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/fsgofer" ) @@ -43,15 +45,19 @@ func init() { if err := fsgofer.OpenProcSelfFD(); err != nil { panic(err) } + config.RegisterFlags() } -func testConfig() *Config { - return &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - DisableSeccomp: true, - Platform: "ptrace", +func testConfig() *config.Config { + conf, err := config.NewFromFlags() + if err != nil { + panic(err) } + // Change test defaults. + conf.RootDir = "unused_root_dir" + conf.Network = config.NetworkNone + conf.DisableSeccomp = true + return conf } // testSpec returns a simple spec that can be used in tests. @@ -258,7 +264,7 @@ type CreateMountTestcase struct { expectedPaths []string } -func createMountTestcases(vfs2 bool) []*CreateMountTestcase { +func createMountTestcases() []*CreateMountTestcase { testCases := []*CreateMountTestcase{ &CreateMountTestcase{ // Only proc. @@ -403,32 +409,26 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { Destination: "/proc", Type: "tmpfs", }, - // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports - // MkDirAt in VFS2 (and remove the reduntant append). - // { - // Destination: "/sys/bar", - // Type: "tmpfs", - // }, - // + { + Destination: "/sys/bar", + Type: "tmpfs", + }, + { Destination: "/tmp/baz", Type: "tmpfs", }, }, }, - expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, + expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"}, } - if !vfs2 { - vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) - vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") - } return append(testCases, vfsCase) } // Test that MountNamespace can be created with various specs. func TestCreateMountNamespace(t *testing.T) { - for _, tc := range createMountTestcases(false /* vfs2 */) { + for _, tc := range createMountTestcases() { t.Run(tc.name, func(t *testing.T) { conf := testConfig() ctx := contexttest.Context(t) @@ -439,7 +439,7 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{}) + mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}) mns, err := mntr.createMountNamespace(ctx, conf) if err != nil { t.Fatalf("failed to create mount namespace: %v", err) @@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { maxTraversals := uint(0) if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) @@ -465,7 +465,7 @@ func TestCreateMountNamespace(t *testing.T) { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespaceVFS2(t *testing.T) { - for _, tc := range createMountTestcases(true /* vfs2 */) { + for _, tc := range createMountTestcases() { t.Run(tc.name, func(t *testing.T) { spec := testSpec() spec.Mounts = tc.spec.Mounts @@ -479,19 +479,19 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer l.Destroy() defer loaderCleanup() - mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { + mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } ctx := l.k.SupervisorContext() - mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs) if err != nil { - t.Fatalf("failed to setupVFS2: %v", err) + t.Fatalf("mountAll: %v", err) } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { target := &vfs.PathOperation{ Root: root, @@ -499,10 +499,10 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { Path: fspath.Parse(p), } - if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) @@ -545,7 +545,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, }, "tmpfs": { @@ -599,7 +599,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, { Dev: "9pfs-/dev/fd-foo", @@ -657,7 +657,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, }, "tmpfs": { @@ -697,7 +697,11 @@ func TestRestoreEnvironment(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { conf := testConfig() - mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{}) + var ioFDs []*fd.FD + for _, ioFD := range tc.ioFDs { + ioFDs = append(ioFDs, fd.New(ioFD)) + } + mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 14d2f56a5..988573640 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" @@ -32,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/config" ) var ( @@ -77,44 +79,6 @@ type DefaultRoute struct { Name string } -// QueueingDiscipline is used to specify the kind of Queueing Discipline to -// apply for a give FDBasedLink. -type QueueingDiscipline int - -const ( - // QDiscNone disables any queueing for the underlying FD. - QDiscNone QueueingDiscipline = iota - - // QDiscFIFO applies a simple fifo based queue to the underlying - // FD. - QDiscFIFO -) - -// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s -// else returns an error. -func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) { - switch s { - case "none": - return QDiscNone, nil - case "fifo": - return QDiscFIFO, nil - default: - return 0, fmt.Errorf("unsupported qdisc specified: %q", s) - } -} - -// String implements fmt.Stringer. -func (q QueueingDiscipline) String() string { - switch q { - case QDiscNone: - return "none" - case QDiscFIFO: - return "fifo" - default: - panic(fmt.Sprintf("Invalid queueing discipline: %d", q)) - } -} - // FDBasedLink configures an fd-based link. type FDBasedLink struct { Name string @@ -126,7 +90,7 @@ type FDBasedLink struct { TXChecksumOffload bool RXChecksumOffload bool LinkAddress net.HardwareAddr - QDisc QueueingDiscipline + QDisc config.QueueingDiscipline // NumChannels controls how many underlying FD's are to be used to // create this endpoint. @@ -246,12 +210,15 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } switch link.QDisc { - case QDiscNone: - case QDiscFIFO: + case config.QDiscNone: + case config.QDiscFIFO: log.Infof("Enabling FIFO QDisc on %q", link.Name) linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } + // Enable support for AF_PACKET sockets to receive outgoing packets. + linkEP = packetsocket.New(linkEP) + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index fbfd3b07c..c21648a32 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -15,10 +15,13 @@ package boot import ( + "strings" + "gvisor.dev/gvisor/pkg/sentry/strace" + "gvisor.dev/gvisor/runsc/config" ) -func enableStrace(conf *Config) error { +func enableStrace(conf *config.Config) error { // We must initialize even if strace is not enabled. strace.Initialize() @@ -36,5 +39,5 @@ func enableStrace(conf *Config) error { strace.EnableAll(strace.SinkTypeLog) return nil } - return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog) + return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index b68117867..e36664938 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -16,12 +16,12 @@ package boot import ( "fmt" - "path" "sort" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" @@ -37,13 +37,19 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/config" ) -func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { +func registerFilesystems(k *kernel.Kernel) error { + ctx := k.SupervisorContext() + creds := auth.NewRootCredentials(k.RootUserNamespace()) + vfsObj := k.VFS() + vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a @@ -73,6 +79,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { @@ -81,18 +91,24 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre if err := ttydev.Register(vfsObj); err != nil { return fmt.Errorf("registering ttydev: %w", err) } - - if err := fuse.Register(vfsObj); err != nil { - return fmt.Errorf("registering fusedev: %w", err) + tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx)) + if tunSupported { + if err := tundev.Register(vfsObj); err != nil { + return fmt.Errorf("registering tundev: %v", err) + } } - if err := tundev.Register(vfsObj); err != nil { - return fmt.Errorf("registering tundev: %v", err) + + if kernel.FUSEEnabled { + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } } + a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name) if err != nil { return fmt.Errorf("creating devtmpfs accessor: %w", err) } - defer a.Release() + defer a.Release(ctx) if err := a.UserspaceInit(ctx); err != nil { return fmt.Errorf("initializing userspace: %w", err) @@ -103,20 +119,23 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil { return fmt.Errorf("creating ttydev devtmpfs files: %w", err) } - if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { - return fmt.Errorf("creating tundev devtmpfs files: %v", err) + if tunSupported { + if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating tundev devtmpfs files: %v", err) + } } - if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { - return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + + if kernel.FUSEEnabled { + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } } + return nil } -func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - if err := mntr.k.VFS().Init(); err != nil { - return fmt.Errorf("failed to initialize VFS: %w", err) - } - mns, err := mntr.setupVFS2(ctx, conf, procArgs) +func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + mns, err := mntr.mountAll(conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) } @@ -131,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte return nil } -func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { +func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { log.Infof("Configuring container's file system with VFS2") // Create context with root credentials to mount the filesystem (the current @@ -144,36 +163,115 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := procArgs.NewContext(c.k) - if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil { - return nil, fmt.Errorf("register filesystems: %w", err) - } - mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { return nil, fmt.Errorf("creating mount namespace: %w", err) } rootProcArgs.MountNamespaceVFS2 = mns + root := mns.Root() + defer root.DecRef(rootCtx) + if root.Mount().ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil { + return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { + panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err)) + } + }() + } + // Mount submounts. if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts vfs2: %w", err) } + return mns, nil } -func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { +// createMountNamespaceVFS2 creates the container's root mount and namespace. +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",") + data := p9MountData(fd, conf.FileAccess, true /* vfs2 */) + + if conf.OverlayfsStaleRead { + // We can't check for overlayfs here because sandbox is chroot'ed and gofer + // can only send mount options for specs.Mounts (specs.Root is missing + // Options field). So assume root is always on top of overlayfs. + data = append(data, "overlayfs_stale_read") + } log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts}) + opts := &vfs.MountOptions{ + ReadOnly: c.root.Readonly, + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + }, + InternalMount: true, + } + + fsName := gofer.Name + if conf.Overlay && !c.root.Readonly { + log.Infof("Adding overlay on top of root") + var err error + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting root with overlay: %w", err) + } + defer cleanup() + fsName = overlay.Name + } + + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } return mns, nil } -func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { +// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper +// layer using tmpfs, and return overlay mount options. "cleanup" must be called +// after the options have been used to mount the overlay, to release refs on +// lower and upper mounts. +func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) { + // First copy options from lower layer to upper layer and overlay. Clear + // filesystem specific options. + upperOpts := *lowerOpts + upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + overlayOpts := *lowerOpts + overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + // Next mount upper and lower. Upper is a tmpfs mount to keep all + // modifications inside the sandbox. + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu := cleanup.Make(func() { upper.DecRef(ctx) }) + defer cu.Clean() + + // All writes go to the upper layer, be paranoid and make lower readonly. + lowerOpts.ReadOnly = true + lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) + if err != nil { + return nil, nil, err + } + cu.Add(func() { lower.DecRef(ctx) }) + + // Configure overlay with both layers. + overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{ + UpperRoot: vfs.MakeVirtualDentry(upper, upper.Root()), + LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())}, + } + return &overlayOpts, cu.Release(), nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { mounts, err := c.prepareMountsVFS2() if err != nil { return err @@ -182,8 +280,34 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { - return err + var ( + mnt *vfs.Mount + err error + ) + + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { + mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint) + if err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + } + } else { + mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount) + if err != nil { + return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + } + } + + if mnt != nil && mnt.ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { + return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { + panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) + } + }() } } @@ -227,62 +351,83 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { return mounts, nil } -func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { - root := mns.Root() - defer root.DecRef() - target := &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(submount.Destination), - } - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) { + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount) if err != nil { - return fmt.Errorf("mountOptions failed: %w", err) + return nil, fmt.Errorf("mountOptions failed: %w", err) } if len(fsName) == 0 { // Filesystem is not supported (e.g. cgroup), just skip it. - return nil + return nil, nil } - if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { - return err + if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err) } - if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { - return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + + if useOverlay { + log.Infof("Adding overlay on top of mount %q", submount.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + + root := mns.Root() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) + if err != nil { + return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) - return nil + return mnt, nil } // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { - var ( - fsName string - data []string - ) +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) { + fsName := m.Type + useOverlay := false + var data []string // Find filesystem name and FS specific data field. switch m.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: - fsName = m.Type + // Nothing to do. + case nonefs: fsName = sys.Name - case tmpfs.Name: - fsName = m.Type + case tmpfs.Name: var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { - return "", nil, err + return "", nil, false, err } case bind: fsName = gofer.Name + if m.fd == 0 { + // Check that an FD was provided to fails fast. Technically FD=0 is valid, + // but unlikely to be correct in this context. + return "", nil, false, fmt.Errorf("9P mount requires a connection FD") + } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + default: log.Warningf("ignoring unknown filesystem type %q", m.Type) + return "", nil, false, nil } opts := &vfs.MountOptions{ @@ -307,38 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF } } - if conf.Overlay { - // All writes go to upper, be paranoid and make lower readonly. - opts.ReadOnly = true - } - return fsName, opts, nil -} - -func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { - target := &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(currentPath), - } - _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) - if err == nil { - // Mount point exists, nothing else to do. - return nil - } - if err != syserror.ENOENT { - return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err) - } - - // Recurse to ensure parent is created and then create the mount point. - if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { - return err - } - log.Debugf("Creating dir %q for mount point", currentPath) - mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} - if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { - return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err) - } - return nil + return fsName, opts, useOverlay, nil } // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. @@ -350,7 +464,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { +func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { for _, m := range c.mounts { // m.Destination has been cleaned, so it's to use equality here. if m.Destination == "/tmp" { @@ -360,28 +474,35 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse("/tmp"), } // TODO(gvisor.dev/issue/2782): Use O_PATH when available. - statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{}) + fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) switch err { case nil: - // Found '/tmp' in filesystem, check if it's empty. - if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory { - // Not a dir?! Leave it be. + defer fd.DecRef(ctx) + + err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name != "." && dirent.Name != ".." { + return syserror.ENOTEMPTY + } return nil - } - if statx.Nlink > 2 { + })) + switch err { + case nil: + log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) + case syserror.ENOTEMPTY: // If more than "." and ".." is found, skip internal tmpfs to prevent // hiding existing files. log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) return nil + default: + return err } - log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) fallthrough case syserror.ENOENT: @@ -394,9 +515,122 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + return err + + case syserror.ENOTDIR: + // Not a dir?! Let it be. + return nil default: - return fmt.Errorf(`stating "/tmp" inside container: %w`, err) + return fmt.Errorf(`opening "/tmp" inside container: %w`, err) + } +} + +// processHintsVFS2 processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs.Name { + continue + } + + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.vfsMount = mnt + } + return nil +} + +// mountSharedMasterVFS2 mounts the master of a volume that is shared among +// containers in a pod. +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + mntFD := &mountAndFD{Mount: hint.mount} + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + + if useOverlay { + log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) { + if err := source.checkCompatible(mount); err != nil { + return nil, err + } + + // Ignore data and useOverlay because these were already applied to + // the master mount. + _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + if err != nil { + return nil, err + } + newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) + if err != nil { + return nil, err + } + defer newMnt.DecRef(ctx) + + root := mns.Root() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mount.Destination), + } + + if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err) + } + + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { + return nil, err + } + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return newMnt, nil +} + +func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error { + root := mns.Root() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dest), + } + // First check if mount point exists. When overlay is enabled, gofer doesn't + // allow changes to the FS, making MakeSytheticMountpoint() ineffective + // because MkdirAt fails with EROFS even if file exists. + vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{}) + if err == nil { + // File exists, we're done. + vd.DecRef(ctx) + return nil } + return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) } |