diff options
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/boot/config.go | 14 | ||||
-rw-r--r-- | runsc/boot/controller.go | 4 | ||||
-rw-r--r-- | runsc/boot/filter/config.go | 10 | ||||
-rw-r--r-- | runsc/boot/fs.go | 16 | ||||
-rw-r--r-- | runsc/boot/loader.go | 2 | ||||
-rw-r--r-- | runsc/boot/network.go | 14 | ||||
-rw-r--r-- | runsc/cmd/gofer.go | 29 | ||||
-rw-r--r-- | runsc/container/container.go | 6 | ||||
-rw-r--r-- | runsc/container/container_test.go | 37 | ||||
-rw-r--r-- | runsc/fsgofer/filter/config.go | 12 | ||||
-rw-r--r-- | runsc/fsgofer/fsgofer.go | 105 | ||||
-rw-r--r-- | runsc/main.go | 32 | ||||
-rw-r--r-- | runsc/sandbox/BUILD | 1 | ||||
-rw-r--r-- | runsc/sandbox/network.go | 12 | ||||
-rw-r--r-- | runsc/specutils/BUILD | 1 | ||||
-rw-r--r-- | runsc/specutils/cri.go | 110 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 58 | ||||
-rw-r--r-- | runsc/testutil/BUILD | 1 | ||||
-rw-r--r-- | runsc/testutil/testutil.go | 4 |
19 files changed, 335 insertions, 133 deletions
diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 38278d0a2..72a33534f 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -178,8 +178,11 @@ type Config struct { // capabilities. EnableRaw bool - // GSO indicates that generic segmentation offload is enabled. - GSO bool + // HardwareGSO indicates that hardware segmentation offload is enabled. + HardwareGSO bool + + // SoftwareGSO indicates that software segmentation offload is enabled. + SoftwareGSO bool // LogPackets indicates that all network packets should be logged. LogPackets bool @@ -231,6 +234,10 @@ type Config struct { // ReferenceLeakMode sets reference leak check mode ReferenceLeakMode refs.LeakMode + // OverlayfsStaleRead causes cached FDs to reopen after a file is opened for + // write to workaround overlayfs limitation on kernels before 4.19. + OverlayfsStaleRead bool + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be @@ -271,6 +278,9 @@ func (c *Config) ToFlags() []string { "--rootless=" + strconv.FormatBool(c.Rootless), "--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr), "--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode), + "--gso=" + strconv.FormatBool(c.HardwareGSO), + "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), + "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), } // Only include these if set since it is never to be used by users. if c.TestOnlyAllowRunAsCurrentUserWithoutChroot { diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index a73c593ea..5f644b57e 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/specutils" ) const ( @@ -237,6 +238,9 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") } + // All validation passed, logs the spec for debugging. + specutils.LogSpec(args.Spec) + err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a2ecc6bcb..5ad108261 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -44,6 +44,7 @@ var allowedSyscalls = seccomp.SyscallRules{ }, syscall.SYS_CLOSE: {}, syscall.SYS_DUP: {}, + syscall.SYS_DUP2: {}, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ @@ -242,6 +243,15 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(0), }, }, + unix.SYS_SENDMMSG: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(syscall.MSG_DONTWAIT), + seccomp.AllowValue(0), + }, + }, syscall.SYS_RESTART_SYSCALL: {}, syscall.SYS_RT_SIGACTION: {}, syscall.SYS_RT_SIGPROCMASK: {}, diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 393c2a88b..76036c147 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -703,6 +703,14 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* log.Infof("Mounting root over 9P, ioFD: %d", fd) p9FS := mustFindFilesystem("9p") opts := p9MountOptions(fd, conf.FileAccess) + + if conf.OverlayfsStaleRead { + // We can't check for overlayfs here because sandbox is chroot'ed and gofer + // can only send mount options for specs.Mounts (specs.Root is missing + // Options field). So assume root is always on top of overlayfs. + opts = append(opts, "overlayfs_stale_read") + } + rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating root mount point: %v", err) @@ -737,7 +745,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( fsName string opts []string useOverlay bool - err error ) switch m.Type { @@ -747,7 +754,12 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( fsName = sysfs case tmpfs: fsName = m.Type + + var err error opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + if err != nil { + return "", nil, false, err + } case bind: fd := c.fds.remove() @@ -763,7 +775,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( // for now. log.Warningf("ignoring unknown filesystem type %q", m.Type) } - return fsName, opts, useOverlay, err + return fsName, opts, useOverlay, nil } // mountSubmount mounts volumes inside the container's root. Because mounts may diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c8e5e86ee..0c0eba99e 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -922,7 +922,7 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. - UnassociatedFactory: raw.EndpointFactory{}, + RawFactory: raw.EndpointFactory{}, })} // Enable SACK Recovery. diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 32cba5ac1..f98c5fd36 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -50,12 +50,13 @@ type DefaultRoute struct { // FDBasedLink configures an fd-based link. type FDBasedLink struct { - Name string - MTU int - Addresses []net.IP - Routes []Route - GSOMaxSize uint32 - LinkAddress net.HardwareAddr + Name string + MTU int + Addresses []net.IP + Routes []Route + GSOMaxSize uint32 + SoftwareGSOEnabled bool + LinkAddress net.HardwareAddr // NumChannels controls how many underlying FD's are to be used to // create this endpoint. @@ -163,6 +164,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct Address: mac, PacketDispatchMode: fdbased.RecvMMsg, GSOMaxSize: link.GSOMaxSize, + SoftwareGSOEnabled: link.SoftwareGSOEnabled, RXChecksumOffload: true, }) if err != nil { diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 4c2fb80bf..4831210c0 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -27,6 +27,7 @@ import ( "flag" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/unet" @@ -135,7 +136,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // // Note that all mount points have been mounted in the proper location in // setupRootFS(). - cleanMounts, err := resolveMounts(spec.Mounts, root) + cleanMounts, err := resolveMounts(conf, spec.Mounts, root) if err != nil { Fatalf("Failure to resolve mounts: %v", err) } @@ -380,7 +381,7 @@ func setupMounts(mounts []specs.Mount, root string) error { // Otherwise, it may follow symlinks to locations that would be overwritten // with another mount point and return the wrong location. In short, make sure // setupMounts() has been called before. -func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) { +func resolveMounts(conf *boot.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) { cleanMounts := make([]specs.Mount, 0, len(mounts)) for _, m := range mounts { if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { @@ -395,8 +396,15 @@ func resolveMounts(mounts []specs.Mount, root string) ([]specs.Mount, error) { if err != nil { panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err)) } + + opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options) + if err != nil { + return nil, err + } + cpy := m cpy.Destination = filepath.Join("/", relDst) + cpy.Options = opts cleanMounts = append(cleanMounts, cpy) } return cleanMounts, nil @@ -453,3 +461,20 @@ func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, erro } return base, nil } + +// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs. +func adjustMountOptions(conf *boot.Config, path string, opts []string) ([]string, error) { + rv := make([]string, len(opts)) + copy(rv, opts) + + if conf.OverlayfsStaleRead { + statfs := syscall.Statfs_t{} + if err := syscall.Statfs(path, &statfs); err != nil { + return nil, err + } + if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC { + rv = append(rv, "overlayfs_stale_read") + } + } + return rv, nil +} diff --git a/runsc/container/container.go b/runsc/container/container.go index a721c1c31..32510d427 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -1149,7 +1149,7 @@ func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, err } func isRoot(spec *specs.Spec) bool { - return specutils.ShouldCreateSandbox(spec) + return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer } // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute @@ -1198,7 +1198,7 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) // Get the lowest score for all containers. var lowScore int scoreFound := false - if len(containers) == 1 && len(containers[0].Spec.Annotations[specutils.ContainerdContainerTypeAnnotation]) == 0 { + if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified { // This is a single-container sandbox. Set the oom_score_adj to // the value specified in the OCI bundle. if containers[0].Spec.Process.OOMScoreAdj != nil { @@ -1214,7 +1214,7 @@ func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) // // We will use OOMScoreAdj in the single-container case where the // containerd container-type annotation is not present. - if container.Spec.Annotations[specutils.ContainerdContainerTypeAnnotation] == specutils.ContainerdContainerTypeSandbox { + if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { continue } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 519f5ed9b..c4c56b2e0 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -2074,6 +2074,43 @@ func TestNetRaw(t *testing.T) { } } +// TestOverlayfsStaleRead most basic test that '--overlayfs-stale-read' works. +func TestOverlayfsStaleRead(t *testing.T) { + conf := testutil.TestConfig() + conf.OverlayfsStaleRead = true + + in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in") + if err != nil { + t.Fatalf("ioutil.TempFile() failed: %v", err) + } + defer in.Close() + if _, err := in.WriteString("stale data"); err != nil { + t.Fatalf("in.Write() failed: %v", err) + } + + out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out") + if err != nil { + t.Fatalf("ioutil.TempFile() failed: %v", err) + } + defer out.Close() + + const want = "foobar" + cmd := fmt.Sprintf("cat %q && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name()) + spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd) + if err := run(spec, conf); err != nil { + t.Fatalf("Error running container: %v", err) + } + + gotBytes, err := ioutil.ReadAll(out) + if err != nil { + t.Fatalf("out.Read() failed: %v", err) + } + got := strings.TrimSpace(string(gotBytes)) + if want != got { + t.Errorf("Wrong content in out file, got: %q. want: %q", got, want) + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 0bf7507b7..2ea95f8fb 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -220,6 +220,18 @@ var udsSyscalls = seccomp.SyscallRules{ syscall.SYS_SOCKET: []seccomp.Rule{ { seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_STREAM), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_DGRAM), + seccomp.AllowValue(0), + }, + { + seccomp.AllowValue(syscall.AF_UNIX), + seccomp.AllowValue(syscall.SOCK_SEQPACKET), + seccomp.AllowValue(0), }, }, syscall.SYS_CONNECT: []seccomp.Rule{ diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 29a82138e..3fceecb3d 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -21,7 +21,6 @@ package fsgofer import ( - "errors" "fmt" "io" "math" @@ -126,13 +125,6 @@ func NewAttachPoint(prefix string, c Config) (p9.Attacher, error) { // Attach implements p9.Attacher. func (a *attachPoint) Attach() (p9.File, error) { - // dirFD (1st argument) is ignored because 'prefix' is always absolute. - stat, err := statAt(-1, a.prefix) - if err != nil { - return nil, fmt.Errorf("stat file %q, err: %v", a.prefix, err) - } - - // Acquire the attach point lock. a.attachedMu.Lock() defer a.attachedMu.Unlock() @@ -140,47 +132,24 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix) } - // Hold the file descriptor we are converting into a p9.File. - var f *fd.FD - - // Apply the S_IFMT bitmask so we can detect file type appropriately. - switch fmtStat := stat.Mode & syscall.S_IFMT; fmtStat { - case syscall.S_IFSOCK: - // Check to see if the CLI option has been set to allow the UDS mount. - if !a.conf.HostUDS { - return nil, errors.New("host UDS support is disabled") - } - - // Attempt to open a connection. Bubble up the failures. - f, err = fd.DialUnix(a.prefix) - if err != nil { - return nil, err - } - - default: - // Default to Read/Write permissions. - mode := syscall.O_RDWR - - // If the configuration is Read Only or the mount point is a directory, - // set the mode to Read Only. - if a.conf.ROMount || fmtStat == syscall.S_IFDIR { - mode = syscall.O_RDONLY - } + f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) { + return fd.Open(a.prefix, openFlags|mode, 0) + }) + if err != nil { + return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err) + } - // Open the mount point & capture the FD. - f, err = fd.Open(a.prefix, openFlags|mode, 0) - if err != nil { - return nil, fmt.Errorf("unable to open file %q, err: %v", a.prefix, err) - } + stat, err := stat(f.FD()) + if err != nil { + return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } - // Return a localFile object to the caller with the UDS FD included. - rv, err := newLocalFile(a, f, a.prefix, stat) + lf, err := newLocalFile(a, f, a.prefix, stat) if err != nil { - return nil, err + return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err) } a.attached = true - return rv, nil + return lf, nil } // makeQID returns a unique QID for the given stat buffer. @@ -296,10 +265,10 @@ func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, erro // actual file open and is customizable by the caller. func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) { // Attempt to open file in the following mode in order: - // 1. RDONLY | NONBLOCK: for all files, works for directories and ro mounts too. - // Use non-blocking to prevent getting stuck inside open(2) for FIFOs. This option - // has no effect on regular files. - // 2. PATH: for symlinks + // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. + // Use non-blocking to prevent getting stuck inside open(2) for + // FIFOs. This option has no effect on regular files. + // 2. PATH: for symlinks, sockets. modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH} var err error @@ -1063,12 +1032,48 @@ func (l *localFile) Flush() error { } // Connect implements p9.File. -func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { - // Check to see if the CLI option has been set to allow the UDS mount. +func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) { if !l.attachPoint.conf.HostUDS { return nil, syscall.ECONNREFUSED } - return fd.DialUnix(l.hostPath) + + // TODO(gvisor.dev/issue/1003): Due to different app vs replacement + // mappings, the app path may have fit in the sockaddr, but we can't + // fit f.path in our sockaddr. We'd need to redirect through a shorter + // path in order to actually connect to this socket. + if len(l.hostPath) > linux.UnixPathMax { + return nil, syscall.ECONNREFUSED + } + + var stype int + switch flags { + case p9.StreamSocket: + stype = syscall.SOCK_STREAM + case p9.DgramSocket: + stype = syscall.SOCK_DGRAM + case p9.SeqpacketSocket: + stype = syscall.SOCK_SEQPACKET + default: + return nil, syscall.ENXIO + } + + f, err := syscall.Socket(syscall.AF_UNIX, stype, 0) + if err != nil { + return nil, err + } + + if err := syscall.SetNonblock(f, true); err != nil { + syscall.Close(f) + return nil, err + } + + sa := syscall.SockaddrUnix{Name: l.hostPath} + if err := syscall.Connect(f, &sa); err != nil { + syscall.Close(f) + return nil, err + } + + return fd.New(f), nil } // Close implements p9.File. diff --git a/runsc/main.go b/runsc/main.go index 7dce9dc00..ae906c661 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -41,35 +41,37 @@ import ( var ( // Although these flags are not part of the OCI spec, they are used by // Docker, and thus should not be changed. - rootDir = flag.String("root", "", "root directory for storage of container state") - logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout") - logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s") - debug = flag.Bool("debug", false, "enable debug logging") - showVersion = flag.Bool("version", false, "show version and exit") + rootDir = flag.String("root", "", "root directory for storage of container state.") + logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout.") + logFormat = flag.String("log-format", "text", "log format: text (default), json, or json-k8s.") + debug = flag.Bool("debug", false, "enable debug logging.") + showVersion = flag.Bool("version", false, "show version and exit.") // These flags are unique to runsc, and are used to configure parts of the // system that are not covered by the runtime spec. // Debugging flags. debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") - logPackets = flag.Bool("log-packets", false, "enable network packet logging") + logPackets = flag.Bool("log-packets", false, "enable network packet logging.") logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.") - debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s") - alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr") + debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") + alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr.") // Debugging flags: strace related - strace = flag.Bool("strace", false, "enable strace") + strace = flag.Bool("strace", false, "enable strace.") straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") - straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") + straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.") // Flags that control sandbox runtime behavior. - platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.") network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") - gso = flag.Bool("gso", true, "enable generic segmenation offload") + hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") + softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") - fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "Allow the gofer to mount Unix Domain Sockets.") + fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") + overlayfsStaleRead = flag.Bool("overlayfs-stale-read", false, "reopen cached FDs after a file is opened for write to workaround overlayfs limitation on kernels before 4.19.") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") @@ -199,7 +201,8 @@ func main() { FSGoferHostUDS: *fsGoferHostUDS, Overlay: *overlay, Network: netType, - GSO: *gso, + HardwareGSO: *hardwareGSO, + SoftwareGSO: *softwareGSO, LogPackets: *logPackets, Platform: platformType, Strace: *strace, @@ -212,6 +215,7 @@ func main() { Rootless: *rootless, AlsoLogToStderr: *alsoLogToStderr, ReferenceLeakMode: refsLeakMode, + OverlayfsStaleRead: *overlayfsStaleRead, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 7fdceaab6..27459e6d1 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", + "//pkg/tcpip/stack", "//pkg/urpc", "//runsc/boot", "//runsc/boot/platforms", diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 5634f0707..d42de0176 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -28,6 +28,7 @@ import ( "github.com/vishvananda/netlink" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" @@ -61,7 +62,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -136,7 +137,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -232,7 +233,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO // Create the socket for the device. for i := 0; i < link.NumChannels; i++ { log.Debugf("Creating Channel %d", i) - socketEntry, err := createSocket(iface, ifaceLink, enableGSO) + socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO) if err != nil { return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err) } @@ -246,6 +247,11 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO } args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } + if link.GSOMaxSize == 0 && softwareGSO { + // Hardware GSO is disabled. Let's enable software GSO. + link.GSOMaxSize = stack.SoftwareGSOMaxSize + link.SoftwareGSOEnabled = true + } // Collect the addresses for the interface, enable forwarding, // and remove them from the host. diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index fa58313a0..205638803 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "specutils", srcs = [ + "cri.go", "fs.go", "namespace.go", "specutils.go", diff --git a/runsc/specutils/cri.go b/runsc/specutils/cri.go new file mode 100644 index 000000000..9c5877cd5 --- /dev/null +++ b/runsc/specutils/cri.go @@ -0,0 +1,110 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package specutils + +import ( + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + // ContainerdContainerTypeAnnotation is the OCI annotation set by + // containerd to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" + // ContainerdContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + ContainerdContainerTypeContainer = "container" + // ContainerdContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + ContainerdContainerTypeSandbox = "sandbox" + + // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" + + // CRIOContainerTypeAnnotation is the OCI annotation set by + // CRI-O to indicate whether the container to create should have + // its own sandbox or a container within an existing sandbox. + CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType" + + // CRIOContainerTypeContainer is the container type value + // indicating the container should be created in an existing sandbox. + CRIOContainerTypeContainer = "container" + // CRIOContainerTypeSandbox is the container type value + // indicating the container should be created in a new sandbox. + CRIOContainerTypeSandbox = "sandbox" + + // CRIOSandboxIDAnnotation is the OCI annotation set to indicate + // which sandbox the container should be created in when the container + // is not the first container in the sandbox. + CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID" +) + +// ContainerType represents the type of container requested by the calling container manager. +type ContainerType int + +const ( + // ContainerTypeUnspecified indicates that no known container type + // annotation was found in the spec. + ContainerTypeUnspecified ContainerType = iota + // ContainerTypeUnknown indicates that a container type was specified + // but is unknown to us. + ContainerTypeUnknown + // ContainerTypeSandbox indicates that the container should be run in a + // new sandbox. + ContainerTypeSandbox + // ContainerTypeContainer indicates that the container should be run in + // an existing sandbox. + ContainerTypeContainer +) + +// SpecContainerType tries to determine the type of container specified by the +// container manager using well-known container annotations. +func SpecContainerType(spec *specs.Spec) ContainerType { + if t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]; ok { + switch t { + case ContainerdContainerTypeSandbox: + return ContainerTypeSandbox + case ContainerdContainerTypeContainer: + return ContainerTypeContainer + default: + return ContainerTypeUnknown + } + } + if t, ok := spec.Annotations[CRIOContainerTypeAnnotation]; ok { + switch t { + case CRIOContainerTypeSandbox: + return ContainerTypeSandbox + case CRIOContainerTypeContainer: + return ContainerTypeContainer + default: + return ContainerTypeUnknown + } + } + return ContainerTypeUnspecified +} + +// SandboxID returns the ID of the sandbox to join and whether an ID was found +// in the spec. +func SandboxID(spec *specs.Spec) (string, bool) { + if id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]; ok { + return id, true + } + if id, ok := spec.Annotations[CRIOSandboxIDAnnotation]; ok { + return id, true + } + return "", false +} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 3d9ced1b6..d3c2e4e78 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -108,23 +108,18 @@ func ValidateSpec(spec *specs.Spec) error { } } - // Two annotations are use by containerd to support multi-container pods. - // "io.kubernetes.cri.container-type" - // "io.kubernetes.cri.sandbox-id" - containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation] - _, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation] - switch { - // Non-containerd use won't set a container type. - case !hasContainerType: - case containerType == ContainerdContainerTypeSandbox: - // When starting a container in an existing sandbox, the sandbox ID - // must be set. - case containerType == ContainerdContainerTypeContainer: - if !hasSandboxID { - return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType) + // CRI specifies whether a container should start a new sandbox, or run + // another container in an existing sandbox. + switch SpecContainerType(spec) { + case ContainerTypeContainer: + // When starting a container in an existing sandbox, the + // sandbox ID must be set. + if _, ok := SandboxID(spec); !ok { + return fmt.Errorf("spec has container-type of container, but no sandbox ID set") } + case ContainerTypeUnknown: + return fmt.Errorf("unknown container-type") default: - return fmt.Errorf("unknown container-type: %s", containerType) } return nil @@ -338,39 +333,6 @@ func IsSupportedDevMount(m specs.Mount) bool { return true } -const ( - // ContainerdContainerTypeAnnotation is the OCI annotation set by - // containerd to indicate whether the container to create should have - // its own sandbox or a container within an existing sandbox. - ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" - // ContainerdContainerTypeContainer is the container type value - // indicating the container should be created in an existing sandbox. - ContainerdContainerTypeContainer = "container" - // ContainerdContainerTypeSandbox is the container type value - // indicating the container should be created in a new sandbox. - ContainerdContainerTypeSandbox = "sandbox" - - // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate - // which sandbox the container should be created in when the container - // is not the first container in the sandbox. - ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" -) - -// ShouldCreateSandbox returns true if the spec indicates that a new sandbox -// should be created for the container. If false, the container should be -// started in an existing sandbox. -func ShouldCreateSandbox(spec *specs.Spec) bool { - t, ok := spec.Annotations[ContainerdContainerTypeAnnotation] - return !ok || t == ContainerdContainerTypeSandbox -} - -// SandboxID returns the ID of the sandbox to join and whether an ID was found -// in the spec. -func SandboxID(spec *specs.Spec) (string, bool) { - id, ok := spec.Annotations[ContainerdSandboxIDAnnotation] - return id, ok -} - // WaitForReady waits for a process to become ready. The process is ready when // the 'ready' function returns true. It continues to wait if 'ready' returns // false. It returns error on timeout, if the process stops or if 'ready' fails. diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index d44ebc906..c96ca2eb6 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -9,6 +9,7 @@ go_library( importpath = "gvisor.dev/gvisor/runsc/testutil", visibility = ["//:sandbox"], deps = [ + "//pkg/log", "//runsc/boot", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go index edf8b126c..26467bdc7 100644 --- a/runsc/testutil/testutil.go +++ b/runsc/testutil/testutil.go @@ -25,7 +25,6 @@ import ( "fmt" "io" "io/ioutil" - "log" "math" "math/rand" "net/http" @@ -42,6 +41,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" ) @@ -286,7 +286,7 @@ func WaitForHTTP(port int, timeout time.Duration) error { url := fmt.Sprintf("http://localhost:%d/", port) resp, err := c.Get(url) if err != nil { - log.Printf("Waiting %s: %v", url, err) + log.Infof("Waiting %s: %v", url, err) return err } resp.Body.Close() |