diff options
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/boot/BUILD | 8 | ||||
-rw-r--r-- | runsc/boot/config.go | 5 | ||||
-rw-r--r-- | runsc/boot/fds.go | 108 | ||||
-rw-r--r-- | runsc/boot/fs.go | 13 | ||||
-rw-r--r-- | runsc/boot/loader.go | 216 | ||||
-rw-r--r-- | runsc/boot/loader_amd64.go | 26 | ||||
-rw-r--r-- | runsc/boot/loader_arm64.go | 26 | ||||
-rw-r--r-- | runsc/boot/loader_test.go | 19 | ||||
-rw-r--r-- | runsc/boot/network.go | 50 | ||||
-rw-r--r-- | runsc/boot/vfs.go | 57 | ||||
-rw-r--r-- | runsc/cmd/syscalls.go | 23 | ||||
-rw-r--r-- | runsc/container/BUILD | 2 | ||||
-rw-r--r-- | runsc/container/container.go | 23 | ||||
-rw-r--r-- | runsc/container/container_test.go | 147 | ||||
-rw-r--r-- | runsc/container/multi_container_test.go | 2 | ||||
-rw-r--r-- | runsc/main.go | 8 | ||||
-rw-r--r-- | runsc/sandbox/network.go | 5 | ||||
-rw-r--r-- | runsc/specutils/specutils.go | 14 |
18 files changed, 370 insertions, 382 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index ed3c8f546..a907c103b 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -12,12 +12,9 @@ go_library( "controller.go", "debug.go", "events.go", - "fds.go", "fs.go", "limits.go", "loader.go", - "loader_amd64.go", - "loader_arm64.go", "network.go", "strace.go", "vfs.go", @@ -43,6 +40,7 @@ go_library( "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", "//pkg/sentry/devices/memdev", + "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", "//pkg/sentry/fs/gofer", @@ -53,6 +51,7 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/fs/tty", "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/devpts", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/gofer", "//pkg/sentry/fsimpl/host", @@ -76,7 +75,6 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/state", "//pkg/sentry/strace", - "//pkg/sentry/syscalls/linux", "//pkg/sentry/syscalls/linux/vfs2", "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", @@ -88,6 +86,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/link/fdbased", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/qdisc/fifo", "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/arp", "//pkg/tcpip/network/ipv4", @@ -124,7 +123,6 @@ go_test( "//pkg/p9", "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/kernel", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/unet", diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 715a19112..6d6a705f8 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -187,6 +187,10 @@ type Config struct { // SoftwareGSO indicates that software segmentation offload is enabled. SoftwareGSO bool + // QDisc indicates the type of queuening discipline to use by default + // for non-loopback interfaces. + QDisc QueueingDiscipline + // LogPackets indicates that all network packets should be logged. LogPackets bool @@ -294,6 +298,7 @@ func (c *Config) ToFlags() []string { "--gso=" + strconv.FormatBool(c.HardwareGSO), "--software-gso=" + strconv.FormatBool(c.SoftwareGSO), "--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead), + "--qdisc=" + c.QDisc.String(), } if c.CPUNumFromQuota { f = append(f, "--cpu-num-from-quota") diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go deleted file mode 100644 index 7e7a31fbd..000000000 --- a/runsc/boot/fds.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/host" - vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" - "gvisor.dev/gvisor/pkg/sentry/kernel" -) - -// createFDTable creates an FD table that contains stdin, stdout, and stderr. -// If console is true, then ioctl calls will be passed through to the host FD. -// Upon success, createFDMap dups then closes stdioFDs. -func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { - if len(stdioFDs) != 3 { - return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) - } - - if kernel.VFS2Enabled { - return createFDTableVFS2(ctx, console, stdioFDs) - } - - k := kernel.KernelFromContext(ctx) - fdTable := k.NewFDTable() - defer fdTable.DecRef() - - var ttyFile *fs.File - for appFD, hostFD := range stdioFDs { - var appFile *fs.File - - if console && appFD < 3 { - // Import the file as a host TTY file. - if ttyFile == nil { - var err error - appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) - if err != nil { - return nil, err - } - defer appFile.DecRef() - - // Remember this in the TTY file, as we will - // use it for the other stdio FDs. - ttyFile = appFile - } else { - // Re-use the existing TTY file, as all three - // stdio FDs must point to the same fs.File in - // order to share TTY state, specifically the - // foreground process group id. - appFile = ttyFile - } - } else { - // Import the file as a regular host file. - var err error - appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) - if err != nil { - return nil, err - } - defer appFile.DecRef() - } - - // Add the file to the FD map. - if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { - return nil, err - } - } - - fdTable.IncRef() - return fdTable, nil -} - -func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { - k := kernel.KernelFromContext(ctx) - fdTable := k.NewFDTable() - defer fdTable.DecRef() - - for appFD, hostFD := range stdioFDs { - // TODO(gvisor.dev/issue/1482): Add TTY support. - appFile, err := vfshost.ImportFD(ctx, k.HostMount(), hostFD, false) - if err != nil { - return nil, err - } - - if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { - appFile.DecRef() - return nil, err - } - appFile.DecRef() - } - - fdTable.IncRef() - return fdTable, nil -} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 98cce60af..4875452e2 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -219,6 +219,9 @@ func mountFlags(opts []string) fs.MountSourceFlags { mf.NoAtime = true case "noexec": mf.NoExec = true + case "bind", "rbind": + // When options include either "bind" or "rbind", + // it's converted to a 9P mount. default: log.Warningf("ignoring unknown mount option %q", o) } @@ -765,6 +768,16 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( useOverlay bool ) + for _, opt := range m.Options { + // When options include either "bind" or "rbind", this behaves as + // bind mount even if the mount type is equal to a filesystem supported + // on runsc. + if opt == "bind" || opt == "rbind" { + m.Type = bind + break + } + } + switch m.Type { case devpts, devtmpfs, proc, sysfs: fsName = m.Type diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f6ea4c102..8c8bad11c 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -27,16 +27,18 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fs/user" - vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -75,8 +77,6 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) -var syscallTable *kernel.SyscallTable - // Loader keeps state needed to start the kernel and run the container.. type Loader struct { // k is the kernel. @@ -143,6 +143,9 @@ type execProcess struct { // tty will be nil if the process is not attached to a terminal. tty *host.TTYFileOperations + // tty will be nil if the process is not attached to a terminal. + ttyVFS2 *hostvfs2.TTYFileDescription + // pidnsPath is the pid namespace path in spec pidnsPath string } @@ -199,14 +202,12 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("setting up memory usage: %v", err) } - // Patch the syscall table. - kernel.VFS2Enabled = args.Conf.VFS2 - if kernel.VFS2Enabled { - vfs2.Override(syscallTable.Table) + // Is this a VFSv2 kernel? + if args.Conf.VFS2 { + kernel.VFS2Enabled = true + vfs2.Override() } - kernel.RegisterSyscallTable(syscallTable) - // Create kernel and platform. p, err := createPlatform(args.Conf, args.Device) if err != nil { @@ -333,7 +334,7 @@ func New(args Args) (*Loader, error) { if kernel.VFS2Enabled { // Set up host mount that will be used for imported fds. - hostFilesystem := vfs2host.NewFilesystem(k.VFS()) + hostFilesystem := hostvfs2.NewFilesystem(k.VFS()) defer hostFilesystem.DecRef() hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { @@ -528,6 +529,8 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. + var ttyFile *host.TTYFileOperations + var ttyFileVFS2 *hostvfs2.TTYFileDescription if !l.restore { if l.conf.ProfileEnable { pprof.Initialize() @@ -542,13 +545,14 @@ func (l *Loader) run() error { // Create the FD map, which will set stdin, stdout, and stderr. If console // is true, then ioctl calls will be passed through to the host fd. ctx := l.rootProcArgs.NewContext(l.k) - fdTable, err := createFDTable(ctx, l.console, l.stdioFDs) + var err error + + // CreateProcess takes a reference on FDMap if successful. We won't need + // ours either way. + l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } - // CreateProcess takes a reference on FDMap if successful. We won't need - // ours either way. - l.rootProcArgs.FDTable = fdTable // Setup the root container file system. l.startGoferMonitor(l.sandboxID, l.goferFDs) @@ -591,14 +595,16 @@ func (l *Loader) run() error { ep.pidnsPath = ns.Path } if l.console { - ttyFile, _ := l.rootProcArgs.FDTable.Get(0) - defer ttyFile.DecRef() - ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) - - // Set the foreground process group on the TTY to the global - // init process group, since that is what we are about to - // start running. - ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + } } // Handle signals by forwarding them to the root container process @@ -719,7 +725,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file // Create the FD map, which will set stdin, stdout, and stderr. ctx := procArgs.NewContext(l.k) - fdTable, err := createFDTable(ctx, false, stdioFDs) + fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } @@ -804,14 +810,14 @@ func (l *Loader) destroyContainer(cid string) error { l.mu.Lock() defer l.mu.Unlock() - _, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid}) + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) if err != nil { // Container doesn't exist. return err } - // The container exists, has it been started? - if started { + // The container exists, but has it been started? + if tg != nil { if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { return fmt.Errorf("sending SIGKILL to all container processes: %v", err) } @@ -853,48 +859,65 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { l.mu.Lock() defer l.mu.Unlock() - tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID}) + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) if err != nil { return 0, err } - if !started { + if tg == nil { return 0, fmt.Errorf("container %q not started", args.ContainerID) } - // TODO(gvisor.dev/issue/1623): Add VFS2 support - // Get the container MountNamespace from the Task. - tg.Leader().WithMuLocked(func(t *kernel.Task) { + if kernel.VFS2Enabled { // task.MountNamespace() does not take a ref, so we must do so ourselves. - args.MountNamespace = t.MountNamespace() - args.MountNamespace.IncRef() - }) - if args.MountNamespace != nil { - defer args.MountNamespace.DecRef() + args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() + args.MountNamespaceVFS2.IncRef() + } else { + tg.Leader().WithMuLocked(func(t *kernel.Task) { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespace = t.MountNamespace() + args.MountNamespace.IncRef() + }) } // Add the HOME environment variable if it is not already set. - root := args.MountNamespace.Root() - defer root.DecRef() - ctx := fs.WithRoot(l.k.SupervisorContext(), root) - envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) - if err != nil { - return 0, err + if kernel.VFS2Enabled { + defer args.MountNamespaceVFS2.DecRef() + + root := args.MountNamespaceVFS2.Root() + defer root.DecRef() + ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } else { + defer args.MountNamespace.DecRef() + + root := args.MountNamespace.Root() + defer root.DecRef() + ctx := fs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv } - args.Envv = envv // Start the process. proc := control.Proc{Kernel: l.k} args.PIDNamespace = tg.PIDNamespace() - newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args) + newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args) if err != nil { return 0, err } eid := execID{cid: args.ContainerID, pid: tgid} l.processes[eid] = &execProcess{ - tg: newTG, - tty: ttyFile, + tg: newTG, + tty: ttyFile, + ttyVFS2: ttyFileVFS2, } log.Debugf("updated processes: %v", l.processes) @@ -905,7 +928,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. - tg, _, err := l.threadGroupFromID(execID{cid: cid}) + tg, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { return fmt.Errorf("can't wait for container %q: %v", cid, err) } @@ -924,7 +947,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e // Try to find a process that was exec'd eid := execID{cid: cid, pid: tgid} - execTG, _, err := l.threadGroupFromID(eid) + execTG, err := l.threadGroupFromID(eid) if err == nil { ws := l.wait(execTG) *waitStatus = ws @@ -938,7 +961,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e // The caller may be waiting on a process not started directly via exec. // In this case, find the process in the container's PID namespace. - initTG, _, err := l.threadGroupFromID(execID{cid: cid}) + initTG, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { return fmt.Errorf("waiting for PID %d: %v", tgid, err) } @@ -1089,8 +1112,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) } // Check that the container has actually started before signaling it. - _, _, err := l.threadGroupFromID(execID{cid: cid}) - if err != nil { + if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil { return err } if err := l.signalAllProcesses(cid, signo); err != nil { @@ -1104,7 +1126,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e } func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { - execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) if err == nil { // Send signal directly to the identified process. return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo}) @@ -1113,7 +1135,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er // The caller may be signaling a process not started directly via exec. // In this case, find the process in the container's PID namespace and // signal it. - initTG, _, err := l.threadGroupFromID(execID{cid: cid}) + initTG, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { return fmt.Errorf("no thread group found: %v", err) } @@ -1127,17 +1149,35 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) } +// signalForegrondProcessGroup looks up foreground process group from the TTY +// for the given "tgid" inside container "cid", and send the signal to it. func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { - // Lookup foreground process group from the TTY for the given process, - // and send the signal to it. - tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + l.mu.Lock() + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid}) + if err != nil { + l.mu.Unlock() + return fmt.Errorf("no thread group found: %v", err) + } + if tg == nil { + l.mu.Unlock() + return fmt.Errorf("container %q not started", cid) + } + + tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid}) + l.mu.Unlock() if err != nil { return fmt.Errorf("no thread group found: %v", err) } - if tty == nil { + + var pg *kernel.ProcessGroup + switch { + case ttyVFS2 != nil: + pg = ttyVFS2.ForegroundProcessGroup() + case tty != nil: + pg = tty.ForegroundProcessGroup() + default: return fmt.Errorf("no TTY attached") } - pg := tty.ForegroundProcessGroup() if pg == nil { // No foreground process group has been set. Signal the // original thread group. @@ -1168,33 +1208,57 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error { return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) } -// threadGroupFromID same as threadGroupFromIDLocked except that it acquires -// mutex before calling it. -func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) { +// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it +// acquires mutex before calling it and fails in case container hasn't started +// yet. +func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) { l.mu.Lock() defer l.mu.Unlock() - tg, tty, ok, err := l.threadGroupFromIDLocked(key) + tg, err := l.tryThreadGroupFromIDLocked(key) if err != nil { - return nil, nil, err + return nil, err } - if !ok { - return nil, nil, fmt.Errorf("container %q not started", key.cid) + if tg == nil { + return nil, fmt.Errorf("container %q not started", key.cid) } - return tg, tty, nil + return tg, nil } -// threadGroupFromIDLocked returns the thread group and TTY for the given -// execution ID. TTY may be nil if the process is not attached to a terminal. -// Also returns a boolean indicating whether the container has already started. -// Returns error if execution ID is invalid or if the container cannot be -// found (maybe it has been deleted). Caller must hold 'mu'. -func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) { +// tryThreadGroupFromIDLocked returns the thread group for the given execution +// ID. It may return nil in case the container has not started yet. Returns +// error if execution ID is invalid or if the container cannot be found (maybe +// it has been deleted). Caller must hold 'mu'. +func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) { ep := l.processes[key] if ep == nil { - return nil, nil, false, fmt.Errorf("container %q not found", key.cid) + return nil, fmt.Errorf("container %q not found", key.cid) } - if ep.tg == nil { - return nil, nil, false, nil + return ep.tg, nil +} + +// ttyFromIDLocked returns the TTY files for the given execution ID. It may +// return nil in case the container has not started yet. Returns error if +// execution ID is invalid or if the container cannot be found (maybe it has +// been deleted). Caller must hold 'mu'. +func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + ep := l.processes[key] + if ep == nil { + return nil, nil, fmt.Errorf("container %q not found", key.cid) + } + return ep.tty, ep.ttyVFS2, nil +} + +func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + if len(stdioFDs) != 3 { + return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) + } + + k := kernel.KernelFromContext(ctx) + fdTable := k.NewFDTable() + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) + if err != nil { + fdTable.DecRef() + return nil, nil, nil, err } - return ep.tg, ep.tty, true, nil + return fdTable, ttyFile, ttyFileVFS2, nil } diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go deleted file mode 100644 index 78df86611..000000000 --- a/runsc/boot/loader_amd64.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package boot - -import ( - "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" -) - -func init() { - // Set the global syscall table. - syscallTable = linux.AMD64 -} diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go deleted file mode 100644 index 250785010..000000000 --- a/runsc/boot/loader_arm64.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package boot - -import ( - "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" -) - -func init() { - // Set the global syscall table. - syscallTable = linux.ARM64 -} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 55d27a632..e448fd773 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -31,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" @@ -69,11 +68,6 @@ func testSpec() *specs.Spec { } } -func resetSyscallTable() { - kernel.VFS2Enabled = false - kernel.FlushSyscallTablesTestOnly() -} - // startGofer starts a new gofer routine serving 'root' path. It returns the // sandbox side of the connection, and a function that when called will stop the // gofer. @@ -150,13 +144,11 @@ func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) { // TestRun runs a simple application in a sandbox and checks that it succeeds. func TestRun(t *testing.T) { - defer resetSyscallTable() doRun(t, false) } // TestRunVFS2 runs TestRun in VFSv2. func TestRunVFS2(t *testing.T) { - defer resetSyscallTable() doRun(t, true) } @@ -199,13 +191,11 @@ func doRun(t *testing.T, vfsEnabled bool) { // TestStartSignal tests that the controller Start message will cause // WaitForStartSignal to return. func TestStartSignal(t *testing.T) { - defer resetSyscallTable() doStartSignal(t, false) } // TestStartSignalVFS2 does TestStartSignal with VFS2. func TestStartSignalVFS2(t *testing.T) { - defer resetSyscallTable() doStartSignal(t, true) } @@ -438,7 +428,6 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespace(t *testing.T) { - for _, tc := range createMountTestcases(false /* vfs2 */) { t.Run(tc.name, func(t *testing.T) { conf := testConfig() @@ -476,15 +465,13 @@ func TestCreateMountNamespace(t *testing.T) { // Test that MountNamespace can be created with various specs. func TestCreateMountNamespaceVFS2(t *testing.T) { - for _, tc := range createMountTestcases(true /* vfs2 */) { t.Run(tc.name, func(t *testing.T) { - defer resetSyscallTable() - spec := testSpec() spec.Mounts = tc.spec.Mounts spec.Root = tc.spec.Root + t.Logf("Using root: %q", spec.Root.Path) l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec) if err != nil { t.Fatalf("failed to create loader: %v", err) @@ -497,7 +484,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { t.Fatalf("failed process hints: %v", err) } - ctx := l.rootProcArgs.NewContext(l.k) + ctx := l.k.SupervisorContext() mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) if err != nil { t.Fatalf("failed to setupVFS2: %v", err) @@ -506,7 +493,6 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { root := mns.Root() defer root.DecRef() for _, p := range tc.expectedPaths { - target := &vfs.PathOperation{ Root: root, Start: root, @@ -518,7 +504,6 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { } else { d.DecRef() } - } }) } diff --git a/runsc/boot/network.go b/runsc/boot/network.go index bee6ee336..0af30456e 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "net" + "runtime" "strings" "syscall" @@ -24,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -75,6 +77,44 @@ type DefaultRoute struct { Name string } +// QueueingDiscipline is used to specify the kind of Queueing Discipline to +// apply for a give FDBasedLink. +type QueueingDiscipline int + +const ( + // QDiscNone disables any queueing for the underlying FD. + QDiscNone QueueingDiscipline = iota + + // QDiscFIFO applies a simple fifo based queue to the underlying + // FD. + QDiscFIFO +) + +// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s +// else returns an error. +func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) { + switch s { + case "none": + return QDiscNone, nil + case "fifo": + return QDiscFIFO, nil + default: + return 0, fmt.Errorf("unsupported qdisc specified: %q", s) + } +} + +// String implements fmt.Stringer. +func (q QueueingDiscipline) String() string { + switch q { + case QDiscNone: + return "none" + case QDiscFIFO: + return "fifo" + default: + panic(fmt.Sprintf("Invalid queueing discipline: %d", q)) + } +} + // FDBasedLink configures an fd-based link. type FDBasedLink struct { Name string @@ -84,6 +124,7 @@ type FDBasedLink struct { GSOMaxSize uint32 SoftwareGSOEnabled bool LinkAddress net.HardwareAddr + QDisc QueueingDiscipline // NumChannels controls how many underlying FD's are to be used to // create this endpoint. @@ -185,6 +226,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct } mac := tcpip.LinkAddress(link.LinkAddress) + log.Infof("gso max size is: %d", link.GSOMaxSize) + linkEP, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), @@ -199,6 +242,13 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct return err } + switch link.QDisc { + case QDiscNone: + case QDiscFIFO: + log.Infof("Enabling FIFO QDisc on %q", link.Name) + linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) + } + log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { return err diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 0b9b0b436..d1397ed2c 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/devices/memdev" "gvisor.dev/gvisor/pkg/sentry/fs" + devpts2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" @@ -41,37 +42,28 @@ import ( ) func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { - - vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserList: true, - }) - - vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(devpts2.Name, &devpts2.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, + // TODO(b/29356795): Users may mount this once the terminals are in a + // usable state. + AllowUserMount: false, }) - - vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - AllowUserList: true, - }) - - vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(devtmpfsimpl.Name, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - AllowUserList: true, + vfsObj.MustRegisterFilesystemType(goferimpl.Name, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(procimpl.Name, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(sysimpl.Name, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(tmpfsimpl.Name, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) @@ -108,7 +100,6 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte } func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { - exe := procArgs.Argv[0] // Absolute paths can be used directly. @@ -120,11 +111,9 @@ func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessAr // Paths with '/' in them should be joined to the working directory, or // to the root if working directory is not set. if strings.IndexByte(exe, '/') > 0 { - if !path.IsAbs(procArgs.WorkingDirectory) { return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory) } - procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) return nil } @@ -144,21 +133,17 @@ func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessAr creds := procArgs.Credentials for _, p := range paths { - binPath := path.Join(p, exe) - pop := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(binPath), FollowFinalSymlink: true, } - opts := &vfs.OpenOptions{ FileExec: true, Flags: linux.O_RDONLY, } - dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) if err == syserror.ENOENT || err == syserror.EACCES { // Didn't find it here. @@ -181,35 +166,32 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs // Create context with root credentials to mount the filesystem (the current // user may not be privileged enough). + rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace) rootProcArgs := *procArgs rootProcArgs.WorkingDirectory = "/" - rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Credentials = rootCreds rootProcArgs.Umask = 0022 rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := procArgs.NewContext(c.k) - creds := procArgs.Credentials - if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil { + if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil { return nil, fmt.Errorf("register filesystems: %w", err) } - mns, err := c.createMountNamespaceVFS2(ctx, conf, creds) + mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { return nil, fmt.Errorf("creating mount namespace: %w", err) } - rootProcArgs.MountNamespaceVFS2 = mns // Mount submounts. - if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil { + if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts vfs2: %w", err) } - return mns, nil } func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { - fd := c.fds.remove() opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") @@ -222,7 +204,6 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *C } func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { - c.prepareMountsVFS2() for _, submount := range c.mounts { @@ -256,7 +237,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, if err != nil { return fmt.Errorf("mountOptions failed: %w", err) } - if fsName == "" { // Filesystem is not supported (e.g. cgroup), just skip it. return nil @@ -277,7 +257,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // All writes go to upper, be paranoid and make lower readonly. opts.ReadOnly = useOverlay - if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { + if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts) @@ -336,7 +316,6 @@ func p9MountOptionsVFS2(fd int, fa FileAccessType) []string { } func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { - target := &vfs.PathOperation{ Root: root, Start: root, @@ -345,12 +324,10 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) switch { - case err == syserror.ENOENT: if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { return err } - mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { return fmt.Errorf("failed to makedir for mount %+v: %w", target, err) diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go index 7072547be..a37d66139 100644 --- a/runsc/cmd/syscalls.go +++ b/runsc/cmd/syscalls.go @@ -32,9 +32,10 @@ import ( // Syscalls implements subcommands.Command for the "syscalls" command. type Syscalls struct { - output string - os string - arch string + format string + os string + arch string + filename string } // CompatibilityInfo is a map of system and architecture to compatibility doc. @@ -95,16 +96,17 @@ func (*Syscalls) Usage() string { // SetFlags implements subcommands.Command.SetFlags. func (s *Syscalls) SetFlags(f *flag.FlagSet) { - f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).") + f.StringVar(&s.format, "format", "table", "Output format (table, csv, json).") f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)") f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).") + f.StringVar(&s.filename, "filename", "", "Output filename (otherwise stdout).") } // Execute implements subcommands.Command.Execute. func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - out, ok := outputMap[s.output] + out, ok := outputMap[s.format] if !ok { - Fatalf("Unsupported output format %q", s.output) + Fatalf("Unsupported output format %q", s.format) } // Build map of all supported architectures. @@ -124,7 +126,14 @@ func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface Fatalf("%v", err) } - if err := out(os.Stdout, info); err != nil { + w := os.Stdout // Default. + if s.filename != "" { + w, err = os.OpenFile(s.filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + if err != nil { + Fatalf("Error opening %q: %v", s.filename, err) + } + } + if err := out(w, info); err != nil { Fatalf("Error writing output: %v", err) } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 331b8e866..46154df60 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -15,8 +15,10 @@ go_library( "//test:__subpackages__", ], deps = [ + "//pkg/abi/linux", "//pkg/log", "//pkg/sentry/control", + "//pkg/sentry/sighandling", "//pkg/sync", "//runsc/boot", "//runsc/cgroup", diff --git a/runsc/container/container.go b/runsc/container/container.go index e0153832c..8539f252d 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -22,7 +22,6 @@ import ( "io/ioutil" "os" "os/exec" - "os/signal" "regexp" "strconv" "strings" @@ -31,8 +30,10 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/sighandling" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/sandbox" @@ -621,21 +622,15 @@ func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error { // forwarding signals. func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() { log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh) - go func() { - for s := range sigCh { - log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", s, c.ID, pid, fgProcess) - if err := c.Sandbox.SignalProcess(c.ID, pid, s.(syscall.Signal), fgProcess); err != nil { - log.Warningf("error forwarding signal %d to container %q: %v", s, c.ID, err) - } + stop := sighandling.StartSignalForwarding(func(sig linux.Signal) { + log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", sig, c.ID, pid, fgProcess) + if err := c.Sandbox.SignalProcess(c.ID, pid, syscall.Signal(sig), fgProcess); err != nil { + log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err) } - log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) - }() - + }) return func() { - signal.Stop(sigCh) - close(sigCh) + log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess) + stop() } } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index a1d4d3b7e..acf988aa0 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -256,6 +256,8 @@ var ( func configs(t *testing.T, opts ...configOption) map[string]*boot.Config { // Always load the default config. cs := make(map[string]*boot.Config) + cs["default"] = testutil.TestConfig(t) + for _, o := range opts { switch o { case overlay: @@ -281,7 +283,7 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config { return cs } -func configsWithVFS2(t *testing.T, opts []configOption) map[string]*boot.Config { +func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config { vfs1 := configs(t, opts...) vfs2 := configs(t, opts...) @@ -302,7 +304,7 @@ func TestLifecycle(t *testing.T) { childReaper.Start() defer childReaper.Stop() - for name, conf := range configsWithVFS2(t, all) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { // The container will just sleep for a long time. We will kill it before // it finishes sleeping. @@ -476,7 +478,7 @@ func TestExePath(t *testing.T) { t.Fatalf("error making directory: %v", err) } - for name, conf := range configsWithVFS2(t, []configOption{overlay}) { + for name, conf := range configsWithVFS2(t, overlay) { t.Run(name, func(t *testing.T) { for _, test := range []struct { path string @@ -1297,7 +1299,7 @@ func TestCapabilities(t *testing.T) { // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { - for name, conf := range configs(t, noOverlay...) { + for name, conf := range configsWithVFS2(t, noOverlay...) { t.Run(name, func(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/true") @@ -1341,7 +1343,7 @@ func TestRunNonRoot(t *testing.T) { // TestMountNewDir checks that runsc will create destination directory if it // doesn't exit. func TestMountNewDir(t *testing.T) { - for name, conf := range configsWithVFS2(t, []configOption{overlay}) { + for name, conf := range configsWithVFS2(t, overlay) { t.Run(name, func(t *testing.T) { root, err := ioutil.TempDir(testutil.TmpDir(), "root") if err != nil { @@ -1370,7 +1372,7 @@ func TestMountNewDir(t *testing.T) { } func TestReadonlyRoot(t *testing.T) { - for name, conf := range configsWithVFS2(t, []configOption{overlay}) { + for name, conf := range configsWithVFS2(t, overlay) { t.Run(name, func(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") spec.Root.Readonly = true @@ -1488,7 +1490,7 @@ func TestUIDMap(t *testing.T) { } func TestReadonlyMount(t *testing.T) { - for name, conf := range configsWithVFS2(t, []configOption{overlay}) { + for name, conf := range configsWithVFS2(t, overlay) { t.Run(name, func(t *testing.T) { dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) @@ -1535,6 +1537,28 @@ func TestReadonlyMount(t *testing.T) { } } +func TestBindMountByOption(t *testing.T) { + for _, conf := range configs(t, overlay) { + t.Logf("Running test with conf: %+v", conf) + + dir, err := ioutil.TempDir(testutil.TmpDir(), "bind-mount") + spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "none", + Options: []string{"rw", "bind"}, + }) + + if err := run(spec, conf); err != nil { + t.Fatalf("error running sandbox: %v", err) + } + } +} + // TestAbbreviatedIDs checks that runsc supports using abbreviated container // IDs in place of full IDs. func TestAbbreviatedIDs(t *testing.T) { @@ -1742,7 +1766,7 @@ func TestUserLog(t *testing.T) { } func TestWaitOnExitedSandbox(t *testing.T) { - for name, conf := range configsWithVFS2(t, all) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { // Run a shell that sleeps for 1 second and then exits with a // non-zero code. @@ -2167,63 +2191,70 @@ func TestTTYField(t *testing.T) { } for _, test := range testCases { - t.Run(test.name, func(t *testing.T) { - conf := testutil.TestConfig(t) - - // We will run /bin/sleep, possibly with an open TTY. - cmd := []string{"/bin/sleep", "10000"} - if test.useTTY { - // Run inside the "pty-runner". - cmd = append([]string{testApp, "pty-runner"}, cmd...) - } - - spec := testutil.NewSpecWithArgs(cmd...) - _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer cleanup() + for _, vfs2 := range []bool{false, true} { + name := test.name + if vfs2 { + name += "-vfs2" + } + t.Run(name, func(t *testing.T) { + conf := testutil.TestConfig(t) + conf.VFS2 = vfs2 + + // We will run /bin/sleep, possibly with an open TTY. + cmd := []string{"/bin/sleep", "10000"} + if test.useTTY { + // Run inside the "pty-runner". + cmd = append([]string{testApp, "pty-runner"}, cmd...) + } - // Create and start the container. - args := Args{ - ID: testutil.RandomContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + spec := testutil.NewSpecWithArgs(cmd...) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Wait for sleep to be running, and check the TTY - // field. - var gotTTYField string - cb := func() error { - ps, err := c.Processes() + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) if err != nil { - err = fmt.Errorf("error getting process data from container: %v", err) - return &backoff.PermanentError{Err: err} + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) } - for _, p := range ps { - if strings.Contains(p.Cmd, "sleep") { - gotTTYField = p.TTY - return nil + + // Wait for sleep to be running, and check the TTY + // field. + var gotTTYField string + cb := func() error { + ps, err := c.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} + } + for _, p := range ps { + if strings.Contains(p.Cmd, "sleep") { + gotTTYField = p.TTY + return nil + } } + return fmt.Errorf("sleep not running") + } + if err := testutil.Poll(cb, 30*time.Second); err != nil { + t.Fatalf("error waiting for sleep process: %v", err) } - return fmt.Errorf("sleep not running") - } - if err := testutil.Poll(cb, 30*time.Second); err != nil { - t.Fatalf("error waiting for sleep process: %v", err) - } - if gotTTYField != test.wantTTYField { - t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) - } - }) + if gotTTYField != test.wantTTYField { + t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) + } + }) + } } } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index e3704b453..f6861b1dd 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -1394,7 +1394,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { Destination: "/mydir/test", Source: "/some/dir", Type: "tmpfs", - Options: []string{"rw", "rbind", "relatime"}, + Options: []string{"rw", "relatime"}, } podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) diff --git a/runsc/main.go b/runsc/main.go index 8e594c58e..0625a06e0 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -72,6 +72,7 @@ var ( network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.") softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.") + qDisc = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.") fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") @@ -198,6 +199,11 @@ func main() { cmd.Fatalf("%v", err) } + queueingDiscipline, err := boot.MakeQueueingDiscipline(*qDisc) + if err != nil { + cmd.Fatalf("%s", err) + } + // Sets the reference leak check mode. Also set it in config below to // propagate it to child processes. refs.SetLeakMode(refsLeakMode) @@ -232,7 +238,7 @@ func main() { OverlayfsStaleRead: *overlayfsStaleRead, CPUNumFromQuota: *cpuNumFromQuota, VFS2: *vfs2Enabled, - + QDisc: queueingDiscipline, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, } diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index bc093fba5..209bfdb20 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -62,7 +62,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels); err != nil { + if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels, conf.QDisc); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case boot.NetworkHost: @@ -115,7 +115,7 @@ func isRootNS() (bool, error) { // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int) error { +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { @@ -201,6 +201,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG MTU: iface.MTU, Routes: routes, NumChannels: numNetworkChannels, + QDisc: qDisc, } // Get the link for the interface. diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 837d5e238..202518b58 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -311,7 +311,19 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth. // Is9PMount returns true if the given mount can be mounted as an external gofer. func Is9PMount(m specs.Mount) bool { - return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m) + var isBind bool + switch m.Type { + case "bind": + isBind = true + default: + for _, opt := range m.Options { + if opt == "bind" || opt == "rbind" { + isBind = true + break + } + } + } + return isBind && m.Source != "" && IsSupportedDevMount(m) } // IsSupportedDevMount returns true if the mount is a supported /dev mount. |