diff options
Diffstat (limited to 'runsc/boot')
-rw-r--r-- | runsc/boot/BUILD | 11 | ||||
-rw-r--r-- | runsc/boot/compat.go | 2 | ||||
-rw-r--r-- | runsc/boot/config.go | 336 | ||||
-rw-r--r-- | runsc/boot/controller.go | 34 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 497 | ||||
-rw-r--r-- | runsc/boot/filter/config_amd64.go | 41 | ||||
-rw-r--r-- | runsc/boot/filter/config_arm64.go | 25 | ||||
-rw-r--r-- | runsc/boot/filter/config_profile.go | 6 | ||||
-rw-r--r-- | runsc/boot/fs.go | 68 | ||||
-rw-r--r-- | runsc/boot/fs_test.go | 11 | ||||
-rw-r--r-- | runsc/boot/loader.go | 216 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 106 | ||||
-rw-r--r-- | runsc/boot/network.go | 45 | ||||
-rw-r--r-- | runsc/boot/strace.go | 7 | ||||
-rw-r--r-- | runsc/boot/vfs.go | 338 |
15 files changed, 838 insertions, 905 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 9f52438c2..b97dc3c47 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -8,7 +8,6 @@ go_library( "compat.go", "compat_amd64.go", "compat_arm64.go", - "config.go", "controller.go", "debug.go", "events.go", @@ -27,15 +26,19 @@ go_library( deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/memutil", "//pkg/rand", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", @@ -105,9 +108,11 @@ go_library( "//runsc/boot/filter", "//runsc/boot/platforms", "//runsc/boot/pprof", + "//runsc/config", "//runsc/specutils", - "@com_github_golang_protobuf//proto:go_default_library", + "//runsc/specutils/seccomp", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + "@org_golang_google_protobuf//proto:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -123,6 +128,7 @@ go_test( library = ":boot", deps = [ "//pkg/control/server", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/p9", @@ -131,6 +137,7 @@ go_test( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/unet", + "//runsc/config", "//runsc/fsgofer", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 84c67cbc2..7076ae2e2 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -19,7 +19,7 @@ import ( "os" "syscall" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" diff --git a/runsc/boot/config.go b/runsc/boot/config.go deleted file mode 100644 index 80da8b3e6..000000000 --- a/runsc/boot/config.go +++ /dev/null @@ -1,336 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "fmt" - "strconv" - "strings" - - "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/watchdog" -) - -// FileAccessType tells how the filesystem is accessed. -type FileAccessType int - -const ( - // FileAccessShared sends IO requests to a Gofer process that validates the - // requests and forwards them to the host. - FileAccessShared FileAccessType = iota - - // FileAccessExclusive is the same as FileAccessShared, but enables - // extra caching for improved performance. It should only be used if - // the sandbox has exclusive access to the filesystem. - FileAccessExclusive -) - -// MakeFileAccessType converts type from string. -func MakeFileAccessType(s string) (FileAccessType, error) { - switch s { - case "shared": - return FileAccessShared, nil - case "exclusive": - return FileAccessExclusive, nil - default: - return 0, fmt.Errorf("invalid file access type %q", s) - } -} - -func (f FileAccessType) String() string { - switch f { - case FileAccessShared: - return "shared" - case FileAccessExclusive: - return "exclusive" - default: - return fmt.Sprintf("unknown(%d)", f) - } -} - -// NetworkType tells which network stack to use. -type NetworkType int - -const ( - // NetworkSandbox uses internal network stack, isolated from the host. - NetworkSandbox NetworkType = iota - - // NetworkHost redirects network related syscalls to the host network. - NetworkHost - - // NetworkNone sets up just loopback using netstack. - NetworkNone -) - -// MakeNetworkType converts type from string. -func MakeNetworkType(s string) (NetworkType, error) { - switch s { - case "sandbox": - return NetworkSandbox, nil - case "host": - return NetworkHost, nil - case "none": - return NetworkNone, nil - default: - return 0, fmt.Errorf("invalid network type %q", s) - } -} - -func (n NetworkType) String() string { - switch n { - case NetworkSandbox: - return "sandbox" - case NetworkHost: - return "host" - case NetworkNone: - return "none" - default: - return fmt.Sprintf("unknown(%d)", n) - } -} - -// MakeWatchdogAction converts type from string. -func MakeWatchdogAction(s string) (watchdog.Action, error) { - switch strings.ToLower(s) { - case "log", "logwarning": - return watchdog.LogWarning, nil - case "panic": - return watchdog.Panic, nil - default: - return 0, fmt.Errorf("invalid watchdog action %q", s) - } -} - -// MakeRefsLeakMode converts type from string. -func MakeRefsLeakMode(s string) (refs.LeakMode, error) { - switch strings.ToLower(s) { - case "disabled": - return refs.NoLeakChecking, nil - case "log-names": - return refs.LeaksLogWarning, nil - case "log-traces": - return refs.LeaksLogTraces, nil - default: - return 0, fmt.Errorf("invalid refs leakmode %q", s) - } -} - -func refsLeakModeToString(mode refs.LeakMode) string { - switch mode { - // If not set, default it to disabled. - case refs.UninitializedLeakChecking, refs.NoLeakChecking: - return "disabled" - case refs.LeaksLogWarning: - return "log-names" - case refs.LeaksLogTraces: - return "log-traces" - default: - panic(fmt.Sprintf("Invalid leakmode: %d", mode)) - } -} - -// Config holds configuration that is not part of the runtime spec. -type Config struct { - // RootDir is the runtime root directory. - RootDir string - - // Debug indicates that debug logging should be enabled. - Debug bool - - // LogFilename is the filename to log to, if not empty. - LogFilename string - - // LogFormat is the log format. - LogFormat string - - // DebugLog is the path to log debug information to, if not empty. - DebugLog string - - // PanicLog is the path to log GO's runtime messages, if not empty. - PanicLog string - - // DebugLogFormat is the log format for debug. - DebugLogFormat string - - // FileAccess indicates how the filesystem is accessed. - FileAccess FileAccessType - - // Overlay is whether to wrap the root filesystem in an overlay. - Overlay bool - - // FSGoferHostUDS enables the gofer to mount a host UDS. - FSGoferHostUDS bool - - // Network indicates what type of network to use. - Network NetworkType - - // EnableRaw indicates whether raw sockets should be enabled. Raw - // sockets are disabled by stripping CAP_NET_RAW from the list of - // capabilities. - EnableRaw bool - - // HardwareGSO indicates that hardware segmentation offload is enabled. - HardwareGSO bool - - // SoftwareGSO indicates that software segmentation offload is enabled. - SoftwareGSO bool - - // TXChecksumOffload indicates that TX Checksum Offload is enabled. - TXChecksumOffload bool - - // RXChecksumOffload indicates that RX Checksum Offload is enabled. - RXChecksumOffload bool - - // QDisc indicates the type of queuening discipline to use by default - // for non-loopback interfaces. - QDisc QueueingDiscipline - - // LogPackets indicates that all network packets should be logged. - LogPackets bool - - // Platform is the platform to run on. - Platform string - - // Strace indicates that strace should be enabled. - Strace bool - - // StraceSyscalls is the set of syscalls to trace. If StraceEnable is - // true and this list is empty, then all syscalls will be traced. - StraceSyscalls []string - - // StraceLogSize is the max size of data blobs to display. - StraceLogSize uint - - // DisableSeccomp indicates whether seccomp syscall filters should be - // disabled. Pardon the double negation, but default to enabled is important. - DisableSeccomp bool - - // WatchdogAction sets what action the watchdog takes when triggered. - WatchdogAction watchdog.Action - - // PanicSignal registers signal handling that panics. Usually set to - // SIGUSR2(12) to troubleshoot hangs. -1 disables it. - PanicSignal int - - // ProfileEnable is set to prepare the sandbox to be profiled. - ProfileEnable bool - - // RestoreFile is the path to the saved container image - RestoreFile string - - // NumNetworkChannels controls the number of AF_PACKET sockets that map - // to the same underlying network device. This allows netstack to better - // scale for high throughput use cases. - NumNetworkChannels int - - // Rootless allows the sandbox to be started with a user that is not root. - // Defense is depth measures are weaker with rootless. Specifically, the - // sandbox and Gofer process run as root inside a user namespace with root - // mapped to the caller's user. - Rootless bool - - // AlsoLogToStderr allows to send log messages to stderr. - AlsoLogToStderr bool - - // ReferenceLeakMode sets reference leak check mode - ReferenceLeakMode refs.LeakMode - - // OverlayfsStaleRead instructs the sandbox to assume that the root mount - // is on a Linux overlayfs mount, which does not necessarily preserve - // coherence between read-only and subsequent writable file descriptors - // representing the "same" file. - OverlayfsStaleRead bool - - // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in - // tests. It allows runsc to start the sandbox process as the current - // user, and without chrooting the sandbox process. This can be - // necessary in test environments that have limited capabilities. - TestOnlyAllowRunAsCurrentUserWithoutChroot bool - - // TestOnlyTestNameEnv should only be used in tests. It looks up for the - // test name in the container environment variables and adds it to the debug - // log file name. This is done to help identify the log with the test when - // multiple tests are run in parallel, since there is no way to pass - // parameters to the runtime from docker. - TestOnlyTestNameEnv string - - // CPUNumFromQuota sets CPU number count to available CPU quota, using - // least integer value greater than or equal to quota. - // - // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. - CPUNumFromQuota bool - - // Enables VFS2 (not plumbled through yet). - VFS2 bool - - // Enables FUSE usage (not plumbled through yet). - FUSE bool -} - -// ToFlags returns a slice of flags that correspond to the given Config. -func (c *Config) ToFlags() []string { - f := []string{ - "--root=" + c.RootDir, - "--debug=" + strconv.FormatBool(c.Debug), - "--log=" + c.LogFilename, - "--log-format=" + c.LogFormat, - "--debug-log=" + c.DebugLog, - "--panic-log=" + c.PanicLog, - "--debug-log-format=" + c.DebugLogFormat, - "--file-access=" + c.FileAccess.String(), - "--overlay=" + strconv.FormatBool(c.Overlay), - "--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS), - "--network=" + c.Network.String(), - "--log-packets=" + strconv.FormatBool(c.LogPackets), - "--platform=" + c.Platform, - "--strace=" + strconv.FormatBool(c.Strace), - "--strace-syscalls=" + strings.Join(c.StraceSyscalls, ","), - "--strace-log-size=" + strconv.Itoa(int(c.StraceLogSize)), - "--watchdog-action=" + c.WatchdogAction.String(), - "--panic-signal=" + strconv.Itoa(c.PanicSignal), - "--profile=" + strconv.FormatBool(c.ProfileEnable), - "--net-raw=" + strconv.FormatBool(c.EnableRaw), - "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels), - "--rootless=" + strconv.FormatBool(c.Rootless), - "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), - "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), - "--gso=" + strconv.FormatBool(c.HardwareGSO), - "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), - "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload), - "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload), - "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), - "--qdisc=" + c.QDisc.String(), - } - if c.CPUNumFromQuota { - f = append(f, "--cpu-num-from-quota") - } - // Only include these if set since it is never to be used by users. - if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { - f = append(f, "--TESTONLY-unsafe-nonroot=true") - } - if len(c.TestOnlyTestNameEnv) != 0 { - f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv) - } - - if c.VFS2 { - f = append(f, "--vfs2=true") - } - - if c.FUSE { - f = append(f, "--fuse=true") - } - - return f -} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 626a3816e..4e0f0d57a 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,6 +22,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -29,10 +30,12 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/state" "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -220,7 +223,7 @@ type StartArgs struct { Spec *specs.Spec // Config is the runsc-specific configuration for the sandbox. - Conf *Config + Conf *config.Config // CID is the ID of the container to start. CID string @@ -256,13 +259,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) + fds, err := fd.NewFromFiles(args.FilePayload.Files) if err != nil { + return err + } + defer func() { + for _, fd := range fds { + _ = fd.Close() + } + }() + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) return err } log.Debugf("Container %q started", args.CID) - return nil } @@ -358,12 +368,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. + ctx := k.SupervisorContext() mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints) - renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) - if err != nil { - return fmt.Errorf("creating RestoreEnvironment: %v", err) + if kernel.VFS2Enabled { + ctx, err = mntr.configureRestore(ctx, cm.l.root.conf) + if err != nil { + return fmt.Errorf("configuring filesystem restore: %v", err) + } + } else { + renv, err := mntr.createRestoreEnvironment(cm.l.root.conf) + if err != nil { + return fmt.Errorf("creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) } - fs.SetRestoreEnvironment(*renv) // Prepare to load from the state file. if eps, ok := networkStack.(*netstack.Stack); ok { @@ -390,7 +408,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Load the state. loadOpts := state.LoadOpts{Source: specFile} - if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil { + if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil { return err } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 149eb0b1b..a7c4ebb0c 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -27,41 +27,30 @@ import ( // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: []seccomp.Rule{ - { - seccomp.AllowValue( - syscall.CLONE_VM | - syscall.CLONE_FS | - syscall.CLONE_FILES | - syscall.CLONE_SIGHAND | - syscall.CLONE_SYSVSEM | - syscall.CLONE_THREAD), - }, - }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, syscall.SYS_DUP3: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_CLOEXEC), }, }, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_EVENTFD2: []seccomp.Rule{ { - seccomp.AllowValue(0), - seccomp.AllowValue(0), + seccomp.EqualTo(0), + seccomp.EqualTo(0), }, }, syscall.SYS_EXIT: {}, @@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FCHMOD: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_SETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_SETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFD), }, }, syscall.SYS_FSTAT: {}, @@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FTRUNCATE: {}, syscall.SYS_FUTEX: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, }, // Non-private variants are included for flipcall support. They are otherwise // unncessary, as the sentry will use only private futexes internally. { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE), + seccomp.MatchAny{}, }, }, syscall.SYS_GETPID: {}, unix.SYS_GETRANDOM: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_DOMAIN), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_DOMAIN), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, }, syscall.SYS_GETTID: {}, @@ -141,38 +130,44 @@ var allowedSyscalls = seccomp.SyscallRules{ // setting/getting termios and winsize. syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCGETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCGETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSF), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSF), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSW), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSW), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCSWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCSWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCGWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCGWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, }, syscall.SYS_LSEEK: {}, syscall.SYS_MADVISE: {}, + unix.SYS_MEMBARRIER: []seccomp.Rule{ + { + seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL), + seccomp.EqualTo(0), + }, + }, syscall.SYS_MINCORE: {}, // Used by the Go runtime as a temporarily workaround for a Linux // 5.2-5.4 bug. @@ -182,46 +177,46 @@ var allowedSyscalls = seccomp.SyscallRules{ // TODO(b/148688965): Remove once this is gone from Go. syscall.SYS_MLOCK: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(4096), + seccomp.MatchAny{}, + seccomp.EqualTo(4096), }, }, syscall.SYS_MMAP: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_SHARED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_SHARED), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ), - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ), + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, syscall.SYS_MPROTECT: {}, @@ -237,32 +232,32 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_READ: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), }, }, syscall.SYS_RECVMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(fdbased.MaxMsgsPerRecv), - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(fdbased.MaxMsgsPerRecv), + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, unix.SYS_SENDMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, syscall.SYS_RESTART_SYSCALL: {}, @@ -272,49 +267,49 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SCHED_YIELD: {}, syscall.SYS_SENDMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), }, }, syscall.SYS_SETITIMER: {}, syscall.SYS_SHUTDOWN: []seccomp.Rule{ // Used by fs/host to shutdown host sockets. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)}, // Used by unet to shutdown connections. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TEE: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(1), /* len */ - seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */ + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(1), /* len */ + seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, }, syscall.SYS_TGKILL: []seccomp.Rule{ { - seccomp.AllowValue(uint64(os.Getpid())), + seccomp.EqualTo(uint64(os.Getpid())), }, }, syscall.SYS_UTIMENSAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* null pathname */ - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* flags */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* null pathname */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* flags */ }, }, syscall.SYS_WRITE: {}, // For rawfile.NonBlockingWriteIovec. syscall.SYS_WRITEV: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, seccomp.GreaterThan(0), }, }, @@ -325,10 +320,10 @@ func hostInetFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT4: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), }, }, syscall.SYS_BIND: {}, @@ -337,84 +332,84 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_GETSOCKNAME: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_KEEPALIVE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_KEEPALIVE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_LINGER), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_LINGER), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_INFO), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_INFO), }, }, syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCOUTQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCOUTQ), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCINQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCINQ), }, }, syscall.SYS_LISTEN: {}, @@ -425,103 +420,103 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_SENDTO: {}, syscall.SYS_SETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RD), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_WR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_WR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RDWR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RDWR), }, }, syscall.SYS_SOCKET: []seccomp.Rule{ { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, }, syscall.SYS_WRITEV: {}, @@ -532,20 +527,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT: []seccomp.Rule{ { - seccomp.AllowValue(fd), + seccomp.EqualTo(fd), }, }, syscall.SYS_LISTEN: []seccomp.Rule{ { - seccomp.AllowValue(fd), - seccomp.AllowValue(16 /* unet.backlog */), + seccomp.EqualTo(fd), + seccomp.EqualTo(16 /* unet.backlog */), }, }, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_PEERCRED), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_PEERCRED), }, }, } diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go index 5335ff82c..cea5613b8 100644 --- a/runsc/boot/filter/config_amd64.go +++ b/runsc/boot/filter/config_amd64.go @@ -24,8 +24,41 @@ import ( ) func init() { - allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], - seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, - seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, - ) + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ + // TODO(b/168828518): No longer used in Go 1.16+. + {seccomp.EqualTo(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SETTLS | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + { + // TODO(b/168828518): No longer used in Go 1.16+ (on amd64). + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + } } diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go index 7fa9bbda3..37313f97f 100644 --- a/runsc/boot/filter/config_arm64.go +++ b/runsc/boot/filter/config_arm64.go @@ -16,6 +16,29 @@ package filter -// Reserve for future customization. +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + func init() { + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + // These arguments are left uninitialized by the Go + // runtime, so they may be anything (and are unused by + // the host). + seccomp.MatchAny{}, // parent_tidptr + seccomp.MatchAny{}, // tls + seccomp.MatchAny{}, // child_tidptr + }, + } } diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go index 194952a7b..7b8669595 100644 --- a/runsc/boot/filter/config_profile.go +++ b/runsc/boot/filter/config_profile.go @@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_OPENAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), }, }, } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index a30fa198e..6b6ae98d7 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -34,6 +34,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" @@ -48,6 +49,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) @@ -66,7 +68,7 @@ const ( // tmpfs has some extra supported options that we must pass through. var tmpfsAllowedData = []string{"mode", "uid", "gid"} -func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { +func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. upperFlags := lowerFlags upperFlags.ReadOnly = false @@ -163,7 +165,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { } // p9MountData creates a slice of p9 mount data. -func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { +func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), @@ -174,7 +176,7 @@ func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { // enablement. opts = append(opts, "privateunixsocket=true") } - if fa == FileAccessShared { + if fa == config.FileAccessShared { opts = append(opts, "cache=remote_revalidating") } return opts @@ -259,7 +261,7 @@ func mustFindFilesystem(name string) fs.Filesystem { // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. -func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) { // Construct a ramfs tree of mount points. The contents never // change, so this can be fully caching. There's no real // filesystem backing this tree, so we set the filesystem to @@ -269,7 +271,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string if err != nil { return nil, fmt.Errorf("creating mount tree: %v", err) } - overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf) if err != nil { return nil, fmt.Errorf("adding mount overlay: %v", err) } @@ -288,7 +290,7 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { +func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { if conf.VFS2 { return setupContainerVFS2(ctx, conf, mntr, procArgs) } @@ -326,14 +328,14 @@ func adjustDirentCache(k *kernel.Kernel) error { } type fdDispenser struct { - fds []int + fds []*fd.FD } func (f *fdDispenser) remove() int { if f.empty() { panic("fdDispenser out of fds") } - rv := f.fds[0] + rv := f.fds[0].Release() f.fds = f.fds[1:] return rv } @@ -459,27 +461,27 @@ func (m *mountHint) isSupported() bool { func (m *mountHint) checkCompatible(mount specs.Mount) error { // Remove options that don't affect to mount's behavior. masterOpts := filterUnsupportedOptions(m.mount) - slaveOpts := filterUnsupportedOptions(mount) + replicaOpts := filterUnsupportedOptions(mount) - if len(masterOpts) != len(slaveOpts) { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if len(masterOpts) != len(replicaOpts) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } sort.Strings(masterOpts) - sort.Strings(slaveOpts) + sort.Strings(replicaOpts) for i, opt := range masterOpts { - if opt != slaveOpts[i] { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if opt != replicaOpts[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } } return nil } -func (m *mountHint) fileAccessType() FileAccessType { +func (m *mountHint) fileAccessType() config.FileAccessType { if m.share == container { - return FileAccessExclusive + return config.FileAccessExclusive } - return FileAccessShared + return config.FileAccessShared } func filterUnsupportedOptions(mount specs.Mount) []string { @@ -570,7 +572,7 @@ type containerMounter struct { hints *podMountHints } -func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { +func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter { return &containerMounter{ root: spec.Root, mounts: compileMounts(spec), @@ -583,7 +585,7 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // processHints processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. -func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error { +func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error { if conf.VFS2 { return c.processHintsVFS2(conf, creds) } @@ -607,7 +609,7 @@ func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) e // setupFS is used to set up the file system for all containers. This is the // main entry point method, with most of the other being internal only. It // returns the mount namespace that is created for the container. -func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { +func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { log.Infof("Configuring container's file system") // Create context with root credentials to mount the filesystem (the current @@ -633,7 +635,7 @@ func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessA return mns, nil } -func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) { +func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) { rootInode, err := c.createRootMount(ctx, conf) if err != nil { return nil, fmt.Errorf("creating filesystem for container: %v", err) @@ -645,7 +647,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi return mns, nil } -func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error { root := mns.Root() defer root.DecRef(ctx) @@ -681,7 +683,7 @@ func (c *containerMounter) checkDispenser() error { // mountSharedMaster mounts the master of a volume that is shared among // containers in a pod. It returns the root mount's inode. -func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) { +func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount) @@ -721,7 +723,7 @@ func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, } // createRootMount creates the root filesystem. -func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { +func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} @@ -746,7 +748,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") - rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf) if err != nil { return nil, fmt.Errorf("adding submount overlay: %v", err) } @@ -766,7 +768,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) { +func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m specs.Mount) (string, []string, bool, error) { var ( fsName string opts []string @@ -800,19 +802,19 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( return fsName, opts, useOverlay, nil } -func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType { +func (c *containerMounter) getMountAccessType(mount specs.Mount) config.FileAccessType { if hint := c.hints.findMount(mount); hint != nil { return hint.fileAccessType() } // Non-root bind mounts are always shared if no hints were provided. - return FileAccessShared + return config.FileAccessShared } // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. // 'm.Destination' must be an absolute path with '..' and symlinks resolved. -func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { +func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) @@ -856,7 +858,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns submounts := subtargets(m.Destination, c.mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) + inode, err = addSubmountOverlay(ctx, inode, submounts, mf) if err != nil { return fmt.Errorf("adding submount overlay: %v", err) } @@ -911,7 +913,7 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. -func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { +func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m specs.Mount) error { fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) if err != nil { return err @@ -936,7 +938,7 @@ func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnviron // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding // the mounts to the environment. -func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { +func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) { renv := &fs.RestoreEnvironment{ MountSources: make(map[string][]fs.MountArgs), } @@ -991,7 +993,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { +func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error { for _, m := range c.mounts { if filepath.Clean(m.Destination) == "/tmp" { log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go index 912037075..e986231e5 100644 --- a/runsc/boot/fs_test.go +++ b/runsc/boot/fs_test.go @@ -20,6 +20,7 @@ import ( "testing" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/runsc/config" ) func TestPodMountHintsHappy(t *testing.T) { @@ -196,7 +197,7 @@ func TestGetMountAccessType(t *testing.T) { for _, tst := range []struct { name string annotations map[string]string - want FileAccessType + want config.FileAccessType }{ { name: "container=exclusive", @@ -205,7 +206,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "container", }, - want: FileAccessExclusive, + want: config.FileAccessExclusive, }, { name: "pod=shared", @@ -214,7 +215,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "pod", }, - want: FileAccessShared, + want: config.FileAccessShared, }, { name: "shared=shared", @@ -223,7 +224,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "shared", }, - want: FileAccessShared, + want: config.FileAccessShared, }, { name: "default=shared", @@ -232,7 +233,7 @@ func TestGetMountAccessType(t *testing.T) { MountPrefix + "mount1.type": "bind", MountPrefix + "mount1.share": "container", }, - want: FileAccessShared, + want: config.FileAccessShared, }, } { t.Run(tst.name, func(t *testing.T) { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 40c6f99fd..8c6ab213d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -27,12 +27,15 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fdimport" @@ -67,7 +70,9 @@ import ( "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + "gvisor.dev/gvisor/runsc/specutils/seccomp" // Include supported socket providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" @@ -79,7 +84,7 @@ import ( ) type containerInfo struct { - conf *Config + conf *config.Config // spec is the base configuration for the root container. spec *specs.Spec @@ -88,10 +93,10 @@ type containerInfo struct { procArgs kernel.CreateProcessArgs // stdioFDs contains stdin, stdout, and stderr. - stdioFDs []int + stdioFDs []*fd.FD // goferFDs are the FDs that attach the sandbox to the gofers. - goferFDs []int + goferFDs []*fd.FD } // Loader keeps state needed to start the kernel and run the container.. @@ -165,7 +170,7 @@ type Args struct { // Spec is the sandbox specification. Spec *specs.Spec // Conf is the system configuration. - Conf *Config + Conf *config.Config // ControllerFD is the FD to the URPC controller. The Loader takes ownership // of this FD and may close it at any time. ControllerFD int @@ -278,6 +283,7 @@ func New(args Args) (*Loader, error) { args.NumCPU = runtime.NumCPU() } log.Infof("CPUs: %d", args.NumCPU) + runtime.GOMAXPROCS(args.NumCPU) if args.TotalMem > 0 { // Adjust the total memory returned by the Sentry so that applications that @@ -355,12 +361,17 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } + info := containerInfo{ + conf: args.Conf, + spec: args.Spec, + procArgs: procArgs, + } + // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. - var stdioFDs []int newfd := startingStdioFD - for _, fd := range args.StdioFDs { + for _, stdioFD := range args.StdioFDs { // Check that newfd is unused to avoid clobbering over it. if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { if err != nil { @@ -369,14 +380,17 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) } - err := unix.Dup3(fd, newfd, unix.O_CLOEXEC) + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) if err != nil { - return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) } - stdioFDs = append(stdioFDs, newfd) - _ = unix.Close(fd) + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) newfd++ } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } eid := execID{cid: args.ID} l := &Loader{ @@ -385,13 +399,7 @@ func New(args Args) (*Loader, error) { sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, mountHints: mountHints, - root: containerInfo{ - conf: args.Conf, - stdioFDs: stdioFDs, - goferFDs: args.GoferFDs, - spec: args.Spec, - procArgs: procArgs, - }, + root: info, } // We don't care about child signals; some platforms can generate a @@ -465,13 +473,28 @@ func (l *Loader) Destroy() { } l.watchdog.Stop() - for i, fd := range l.root.stdioFDs { - _ = unix.Close(fd) - l.root.stdioFDs[i] = -1 + // Release all kernel resources. This is only safe after we can no longer + // save/restore. + l.k.Release() + + // All sentry-created resources should have been released at this point; + // check for reference leaks. + if refsvfs2.LeakCheckEnabled() { + refsvfs2.DoLeakCheck() + } + + // In the success case, stdioFDs and goferFDs will only contain + // released/closed FDs that ownership has been passed over to host FDs and + // gofer sessions. Close them here in case of failure. + for _, fd := range l.root.stdioFDs { + _ = fd.Close() + } + for _, fd := range l.root.goferFDs { + _ = fd.Close() } } -func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { +func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) { p, err := platform.Lookup(conf.Platform) if err != nil { panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) @@ -498,13 +521,14 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } +// installSeccompFilters installs sandbox seccomp filters with the host. func (l *Loader) installSeccompFilters() error { if l.root.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. Running in less secure mode.") } else { opts := filter.Options{ Platform: l.k.Platform, - HostNetwork: l.root.conf.Network == NetworkHost, + HostNetwork: l.root.conf.Network == config.NetworkHost, ProfileEnable: l.root.conf.ProfileEnable, ControllerFD: l.ctrl.srv.FD(), } @@ -531,7 +555,7 @@ func (l *Loader) Run() error { } func (l *Loader) run() error { - if l.root.conf.Network == NetworkHost { + if l.root.conf.Network == config.NetworkHost { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") @@ -568,6 +592,7 @@ func (l *Loader) run() error { if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { return err } + } ep.tg = l.k.GlobalInit() @@ -597,17 +622,6 @@ func (l *Loader) run() error { } }) - // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again - // either in createFDTable() during initial start or in descriptor.initAfterLoad() - // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the - // passed FDs, so only close for VFS1. - if !kernel.VFS2Enabled { - for i, fd := range l.root.stdioFDs { - _ = unix.Close(fd) - l.root.stdioFDs[i] = -1 - } - } - log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() @@ -627,9 +641,9 @@ func (l *Loader) createContainer(cid string) error { } // startContainer starts a child container. It returns the thread group ID of -// the newly created process. Caller owns 'files' and may close them after -// this method returns. -func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { +// the newly created process. Used FDs are either closed or released. It's safe +// for the caller to close any remaining files upon return. +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -680,28 +694,15 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file } info := &containerInfo{ - conf: conf, - spec: spec, + conf: conf, + spec: spec, + stdioFDs: files[:3], + goferFDs: files[3:], } info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } - - // setupContainerFS() dups stdioFDs, so we don't need to dup them here. - for _, f := range files[:3] { - info.stdioFDs = append(info.stdioFDs, int(f.Fd())) - } - - // Can't take ownership away from os.File. dup them to get a new FDs. - for _, f := range files[3:] { - fd, err := unix.Dup(int(f.Fd())) - if err != nil { - return fmt.Errorf("failed to dup file: %v", err) - } - info.goferFDs = append(info.goferFDs, fd) - } - tg, err := l.createContainerProcess(false, cid, info, ep) if err != nil { return err @@ -743,7 +744,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn return nil, err } - // Add the HOME enviroment variable if it is not already set. + // Add the HOME environment variable if it is not already set. var envv []string if kernel.VFS2Enabled { envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2, @@ -779,19 +780,44 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } } + // Install seccomp filters with the new task if there are any. + if info.conf.OCISeccomp { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) + if err != nil { + return nil, fmt.Errorf("building seccomp program: %v", err) + } + + if log.IsLogging(log.Debug) { + out, _ := bpf.DecodeProgram(program) + log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) + } + + task := tg.Leader() + // NOTE: It seems Flags are ignored by runc so we ignore them too. + if err := task.AppendSyscallFilter(program, true); err != nil { + return nil, fmt.Errorf("appending seccomp filters: %v", err) + } + } + } else { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + } + return tg, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on -// the gofer FDs looking for disconnects, and destroys the container if a +// the gofer FDs looking for disconnects, and kills the container processes if a // disconnect occurs in any of the gofer FDs. -func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { +func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) { go func() { log.Debugf("Monitoring gofer health for container %q", cid) var events []unix.PollFd - for _, fd := range goferFDs { + for _, goferFD := range goferFDs { events = append(events, unix.PollFd{ - Fd: int32(fd), + Fd: int32(goferFD.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }) } @@ -804,18 +830,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err)) } - // Check if the gofer has stopped as part of normal container destruction. - // This is done just to avoid sending an annoying error message to the log. - // Note that there is a small race window in between mu.Unlock() and the - // lock being reacquired in destroyContainer(), but it's harmless to call - // destroyContainer() multiple times. l.mu.Lock() - _, ok := l.processes[execID{cid: cid}] - l.mu.Unlock() - if ok { - log.Infof("Gofer socket disconnected, destroying container %q", cid) - if err := l.destroyContainer(cid); err != nil { - log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err) + defer l.mu.Unlock() + + // The gofer could have been stopped due to a normal container shutdown. + // Check if the container has not stopped yet. + if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil { + log.Infof("Gofer socket disconnected, killing container %q", cid) + if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { + log.Warningf("Error killing container %q after gofer stopped: %v", cid, err) } } }() @@ -884,17 +907,24 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("container %q not started", args.ContainerID) } - // Get the container MountNamespace from the Task. + // Get the container MountNamespace from the Task. Try to acquire ref may fail + // in case it raced with task exit. if kernel.VFS2Enabled { - // task.MountNamespace() does not take a ref, so we must do so ourselves. + // task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves. args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() - args.MountNamespaceVFS2.IncRef() + if !args.MountNamespaceVFS2.TryIncRef() { + return 0, fmt.Errorf("container %q has stopped", args.ContainerID) + } } else { + var reffed bool tg.Leader().WithMuLocked(func(t *kernel.Task) { // task.MountNamespace() does not take a ref, so we must do so ourselves. args.MountNamespace = t.MountNamespace() - args.MountNamespace.IncRef() + reffed = args.MountNamespace.TryIncRef() }) + if !reffed { + return 0, fmt.Errorf("container %q has stopped", args.ContainerID) + } } // Add the HOME environment variable if it is not already set. @@ -902,7 +932,6 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { root := args.MountNamespaceVFS2.Root() ctx := vfs.WithRoot(l.k.SupervisorContext(), root) defer args.MountNamespaceVFS2.DecRef(ctx) - defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) if err != nil { return 0, err @@ -1017,17 +1046,17 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { +func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). switch conf.Network { - case NetworkHost: + case config.NetworkHost: // No network namespacing support for hostinet yet, hence creator is nil. return inet.NewRootNamespace(hostinet.NewStack(), nil), nil - case NetworkNone, NetworkSandbox: + case config.NetworkNone, config.NetworkSandbox: s, err := newEmptySandboxNetworkStack(clock, uniqueID) if err != nil { return nil, err @@ -1045,8 +1074,8 @@ func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.Uni } func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} + transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, @@ -1060,17 +1089,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in })} // Enable SACK Recovery. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %s", err) + { + opt := tcpip.TCPSACKEnabled(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } // Set default TTLs as required by socket/netstack. - s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) - s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + { + opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) + if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) + } + if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) + } + } // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPModerateReceiveBufferOption(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } return &s, nil @@ -1266,7 +1308,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2 return ep.tty, ep.ttyVFS2, nil } -func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if len(stdioFDs) != 3 { return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index aa3fdf96c..b77b4762e 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" @@ -34,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/fsgofer" ) @@ -43,15 +45,19 @@ func init() { if err := fsgofer.OpenProcSelfFD(); err != nil { panic(err) } + config.RegisterFlags() } -func testConfig() *Config { - return &Config{ - RootDir: "unused_root_dir", - Network: NetworkNone, - DisableSeccomp: true, - Platform: "ptrace", +func testConfig() *config.Config { + conf, err := config.NewFromFlags() + if err != nil { + panic(err) } + // Change test defaults. + conf.RootDir = "unused_root_dir" + conf.Network = config.NetworkNone + conf.DisableSeccomp = true + return conf } // testSpec returns a simple spec that can be used in tests. @@ -258,9 +264,9 @@ type CreateMountTestcase struct { expectedPaths []string } -func createMountTestcases(vfs2 bool) []*CreateMountTestcase { +func createMountTestcases() []*CreateMountTestcase { testCases := []*CreateMountTestcase{ - &CreateMountTestcase{ + { // Only proc. name: "only proc mount", spec: specs.Spec{ @@ -298,11 +304,10 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { }, }, }, - // /some/deep/path should be mounted, along with /proc, - // /dev, and /sys. + // /some/deep/path should be mounted, along with /proc, /dev, and /sys. expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - &CreateMountTestcase{ + { // Mounts are nested inside each other. name: "nested mounts", spec: specs.Spec{ @@ -346,7 +351,7 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - &CreateMountTestcase{ + { name: "mount inside /dev", spec: specs.Spec{ Root: &specs.Root{ @@ -389,46 +394,42 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { }, expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, }, - } - - vfsCase := &CreateMountTestcase{ - name: "mounts inside mandatory mounts", - spec: specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, - }, - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", + { + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, }, - // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports - // MkDirAt in VFS2 (and remove the reduntant append). - // { - // Destination: "/sys/bar", - // Type: "tmpfs", - // }, - // - { - Destination: "/tmp/baz", - Type: "tmpfs", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/sys/bar", + Type: "tmpfs", + }, + { + Destination: "/tmp/baz", + Type: "tmpfs", + }, + { + Destination: "/dev/goo", + Type: "tmpfs", + }, }, }, + expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz", "/dev/goo"}, }, - expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, } - if !vfs2 { - vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) - vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") - } - return append(testCases, vfsCase) + return testCases } // Test that MountNamespace can be created with various specs. func TestCreateMountNamespace(t *testing.T) { - for _, tc := range createMountTestcases(false /* vfs2 */) { + for _, tc := range createMountTestcases() { t.Run(tc.name, func(t *testing.T) { conf := testConfig() ctx := contexttest.Context(t) @@ -439,7 +440,7 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{}) + mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}) mns, err := mntr.createMountNamespace(ctx, conf) if err != nil { t.Fatalf("failed to create mount namespace: %v", err) @@ -465,7 +466,7 @@ func TestCreateMountNamespace(t *testing.T) { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespaceVFS2(t *testing.T) { - for _, tc := range createMountTestcases(true /* vfs2 */) { + for _, tc := range createMountTestcases() { t.Run(tc.name, func(t *testing.T) { spec := testSpec() spec.Mounts = tc.spec.Mounts @@ -485,12 +486,13 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { } ctx := l.k.SupervisorContext() - mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs) + mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs) if err != nil { - t.Fatalf("failed to setupVFS2: %v", err) + t.Fatalf("mountAll: %v", err) } root := mns.Root() + root.IncRef() defer root.DecRef(ctx) for _, p := range tc.expectedPaths { target := &vfs.PathOperation{ @@ -545,7 +547,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, }, "tmpfs": { @@ -599,7 +601,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, { Dev: "9pfs-/dev/fd-foo", @@ -657,7 +659,7 @@ func TestRestoreEnvironment(t *testing.T) { { Dev: "9pfs-/", Flags: fs.MountSourceFlags{ReadOnly: true}, - DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true,cache=remote_revalidating", + DataString: "trans=fd,rfdno=0,wfdno=0,privateunixsocket=true", }, }, "tmpfs": { @@ -697,7 +699,11 @@ func TestRestoreEnvironment(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { conf := testConfig() - mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{}) + var ioFDs []*fd.FD + for _, ioFD := range tc.ioFDs { + ioFDs = append(ioFDs, fd.New(ioFD)) + } + mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 4e1fa7665..988573640 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -33,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/config" ) var ( @@ -78,44 +79,6 @@ type DefaultRoute struct { Name string } -// QueueingDiscipline is used to specify the kind of Queueing Discipline to -// apply for a give FDBasedLink. -type QueueingDiscipline int - -const ( - // QDiscNone disables any queueing for the underlying FD. - QDiscNone QueueingDiscipline = iota - - // QDiscFIFO applies a simple fifo based queue to the underlying - // FD. - QDiscFIFO -) - -// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s -// else returns an error. -func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) { - switch s { - case "none": - return QDiscNone, nil - case "fifo": - return QDiscFIFO, nil - default: - return 0, fmt.Errorf("unsupported qdisc specified: %q", s) - } -} - -// String implements fmt.Stringer. -func (q QueueingDiscipline) String() string { - switch q { - case QDiscNone: - return "none" - case QDiscFIFO: - return "fifo" - default: - panic(fmt.Sprintf("Invalid queueing discipline: %d", q)) - } -} - // FDBasedLink configures an fd-based link. type FDBasedLink struct { Name string @@ -127,7 +90,7 @@ type FDBasedLink struct { TXChecksumOffload bool RXChecksumOffload bool LinkAddress net.HardwareAddr - QDisc QueueingDiscipline + QDisc config.QueueingDiscipline // NumChannels controls how many underlying FD's are to be used to // create this endpoint. @@ -247,8 +210,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } switch link.QDisc { - case QDiscNone: - case QDiscFIFO: + case config.QDiscNone: + case config.QDiscFIFO: log.Infof("Enabling FIFO QDisc on %q", link.Name) linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } diff --git a/runsc/boot/strace.go b/runsc/boot/strace.go index fbfd3b07c..c21648a32 100644 --- a/runsc/boot/strace.go +++ b/runsc/boot/strace.go @@ -15,10 +15,13 @@ package boot import ( + "strings" + "gvisor.dev/gvisor/pkg/sentry/strace" + "gvisor.dev/gvisor/runsc/config" ) -func enableStrace(conf *Config) error { +func enableStrace(conf *config.Config) error { // We must initialize even if strace is not enabled. strace.Initialize() @@ -36,5 +39,5 @@ func enableStrace(conf *Config) error { strace.EnableAll(strace.SinkTypeLog) return nil } - return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog) + return strace.Enable(strings.Split(conf.StraceSyscalls, ","), strace.SinkTypeLog) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 08dce8b6c..b157387ef 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -16,12 +16,12 @@ package boot import ( "fmt" - "path" "sort" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" @@ -42,6 +42,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/runsc/config" ) func registerFilesystems(k *kernel.Kernel) error { @@ -133,8 +134,8 @@ func registerFilesystems(k *kernel.Kernel) error { return nil } -func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - mns, err := mntr.setupVFS2(ctx, conf, procArgs) +func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + mns, err := mntr.mountAll(conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) } @@ -149,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte return nil } -func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { +func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { log.Infof("Configuring container's file system with VFS2") // Create context with root credentials to mount the filesystem (the current @@ -168,35 +169,141 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs } rootProcArgs.MountNamespaceVFS2 = mns + root := mns.Root() + root.IncRef() + defer root.DecRef(rootCtx) + if root.Mount().ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil { + return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { + panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err)) + } + }() + } + // Mount submounts. if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts vfs2: %w", err) } + return mns, nil } -func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { +// createMountNamespaceVFS2 creates the container's root mount and namespace. +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */) + data := p9MountData(fd, conf.FileAccess, true /* vfs2 */) if conf.OverlayfsStaleRead { // We can't check for overlayfs here because sandbox is chroot'ed and gofer // can only send mount options for specs.Mounts (specs.Root is missing // Options field). So assume root is always on top of overlayfs. - opts = append(opts, "overlayfs_stale_read") + data = append(data, "overlayfs_stale_read") } log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{ - Data: strings.Join(opts, ","), - }) + opts := &vfs.MountOptions{ + ReadOnly: c.root.Readonly, + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + InternalData: gofer.InternalFilesystemOptions{ + UniqueID: "/", + }, + }, + InternalMount: true, + } + + fsName := gofer.Name + if conf.Overlay && !c.root.Readonly { + log.Infof("Adding overlay on top of root") + var err error + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting root with overlay: %w", err) + } + defer cleanup() + fsName = overlay.Name + } + + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } return mns, nil } -func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { +// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper +// layer using tmpfs, and return overlay mount options. "cleanup" must be called +// after the options have been used to mount the overlay, to release refs on +// lower and upper mounts. +func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) { + // First copy options from lower layer to upper layer and overlay. Clear + // filesystem specific options. + upperOpts := *lowerOpts + upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + overlayOpts := *lowerOpts + overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + // Next mount upper and lower. Upper is a tmpfs mount to keep all + // modifications inside the sandbox. + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu := cleanup.Make(func() { upper.DecRef(ctx) }) + defer cu.Clean() + + // All writes go to the upper layer, be paranoid and make lower readonly. + lowerOpts.ReadOnly = true + lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) + if err != nil { + return nil, nil, err + } + cu.Add(func() { lower.DecRef(ctx) }) + + // Propagate the lower layer's root's owner, group, and mode to the upper + // layer's root for consistency with VFS1. + upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) + lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root()) + stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRootVD, + Start: lowerRootVD, + }, &vfs.StatOptions{ + Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE, + }) + if err != nil { + return nil, nil, err + } + err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{ + Root: upperRootVD, + Start: upperRootVD, + }, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask, + UID: stat.UID, + GID: stat.GID, + Mode: stat.Mode, + }, + }) + if err != nil { + return nil, nil, err + } + + // Configure overlay with both layers. + overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{ + UpperRoot: upperRootVD, + LowerRoots: []vfs.VirtualDentry{lowerRootVD}, + } + return &overlayOpts, cu.Release(), nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { mounts, err := c.prepareMountsVFS2() if err != nil { return err @@ -205,15 +312,35 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + var ( + mnt *vfs.Mount + err error + ) + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { - if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil { + mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint) + if err != nil { return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) } } else { - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { + mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount) + if err != nil { return fmt.Errorf("mount submount %q: %w", submount.Destination, err) } } + + if mnt != nil && mnt.ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { + return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { + panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) + } + }() + } } if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil { @@ -256,38 +383,54 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { return mounts, nil } -func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) { + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount) + if err != nil { + return nil, fmt.Errorf("mountOptions failed: %w", err) + } + if len(fsName) == 0 { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil, nil + } + + if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err) + } + + if useOverlay { + log.Infof("Adding overlay on top of mount %q", submount.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + root := mns.Root() + root.IncRef() defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(submount.Destination), } - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) + mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) if err != nil { - return fmt.Errorf("mountOptions failed: %w", err) - } - if len(fsName) == 0 { - // Filesystem is not supported (e.g. cgroup), just skip it. - return nil - } - - if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { - return err - } - if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { - return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data) - return nil + return mnt, nil } // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) { fsName := m.Type + useOverlay := false var data []string + var iopts interface{} // Find filesystem name and FS specific data field. switch m.Type { @@ -301,7 +444,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { - return "", nil, err + return "", nil, false, err } case bind: @@ -309,18 +452,25 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF if m.fd == 0 { // Check that an FD was provided to fails fast. Technically FD=0 is valid, // but unlikely to be correct in this context. - return "", nil, fmt.Errorf("9P mount requires a connection FD") + return "", nil, false, fmt.Errorf("9P mount requires a connection FD") } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + iopts = gofer.InternalFilesystemOptions{ + UniqueID: m.Destination, + } + + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: log.Warningf("ignoring unknown filesystem type %q", m.Type) - return "", nil, nil + return "", nil, false, nil } opts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ - Data: strings.Join(data, ","), + Data: strings.Join(data, ","), + InternalData: iopts, }, InternalMount: true, } @@ -340,38 +490,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF } } - if conf.Overlay { - // All writes go to upper, be paranoid and make lower readonly. - opts.ReadOnly = true - } - return fsName, opts, nil -} - -func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { - target := &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(currentPath), - } - _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) - if err == nil { - log.Debugf("Mount point %q already exists", currentPath) - return nil - } - if err != syserror.ENOENT { - return fmt.Errorf("stat failed for %q during mount point creation: %w", currentPath, err) - } - - // Recurse to ensure parent is created and then create the mount point. - if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { - return err - } - log.Debugf("Creating dir %q for mount point", currentPath) - mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} - if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { - return fmt.Errorf("failed to create directory %q for mount: %w", currentPath, err) - } - return nil + return fsName, opts, useOverlay, nil } // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. @@ -383,7 +502,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { +func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { for _, m := range c.mounts { // m.Destination has been cleaned, so it's to use equality here. if m.Destination == "/tmp" { @@ -393,6 +512,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds } root := mns.Root() + root.IncRef() defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, @@ -434,7 +554,8 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } - return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + _, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + return err case syserror.ENOTDIR: // Not a dir?! Let it be. @@ -448,7 +569,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds // processHintsVFS2 processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. -func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error { +func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error { ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a @@ -469,51 +590,106 @@ func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credential // mountSharedMasterVFS2 mounts the master of a volume that is shared among // containers in a pod. -func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. mntFD := &mountAndFD{Mount: hint.mount} - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD) if err != nil { return nil, err } if len(fsName) == 0 { return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) } + + if useOverlay { + log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) } // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. -func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error { +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) { if err := source.checkCompatible(mount); err != nil { - return err + return nil, err } - _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + // Ignore data and useOverlay because these were already applied to + // the master mount. + _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) if err != nil { - return err + return nil, err } newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) if err != nil { - return err + return nil, err } defer newMnt.DecRef(ctx) root := mns.Root() + root.IncRef() defer root.DecRef(ctx) - if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { - return err - } - target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(mount.Destination), } + + if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err) + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { - return err + return nil, err } log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) - return nil + return newMnt, nil +} + +func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error { + root := mns.Root() + root.IncRef() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dest), + } + // First check if mount point exists. When overlay is enabled, gofer doesn't + // allow changes to the FS, making MakeSytheticMountpoint() ineffective + // because MkdirAt fails with EROFS even if file exists. + vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{}) + if err == nil { + // File exists, we're done. + vd.DecRef(ctx) + return nil + } + return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) +} + +// configureRestore returns an updated context.Context including filesystem +// state used by restore defined by conf. +func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) { + fdmap := make(map[string]int) + fdmap["/"] = c.fds.remove() + mounts, err := c.prepareMountsVFS2() + if err != nil { + return ctx, err + } + for i := range c.mounts { + submount := &mounts[i] + if submount.fd >= 0 { + fdmap[submount.Destination] = submount.fd + } + } + return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil } |