summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot')
-rw-r--r--runsc/boot/BUILD8
-rw-r--r--runsc/boot/config.go5
-rw-r--r--runsc/boot/fds.go108
-rw-r--r--runsc/boot/fs.go13
-rw-r--r--runsc/boot/loader.go216
-rw-r--r--runsc/boot/loader_amd64.go26
-rw-r--r--runsc/boot/loader_arm64.go26
-rw-r--r--runsc/boot/loader_test.go19
-rw-r--r--runsc/boot/network.go50
-rw-r--r--runsc/boot/vfs.go57
10 files changed, 230 insertions, 298 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index ed3c8f546..a907c103b 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -12,12 +12,9 @@ go_library(
"controller.go",
"debug.go",
"events.go",
- "fds.go",
"fs.go",
"limits.go",
"loader.go",
- "loader_amd64.go",
- "loader_arm64.go",
"network.go",
"strace.go",
"vfs.go",
@@ -43,6 +40,7 @@ go_library(
"//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/control",
"//pkg/sentry/devices/memdev",
+ "//pkg/sentry/fdimport",
"//pkg/sentry/fs",
"//pkg/sentry/fs/dev",
"//pkg/sentry/fs/gofer",
@@ -53,6 +51,7 @@ go_library(
"//pkg/sentry/fs/tmpfs",
"//pkg/sentry/fs/tty",
"//pkg/sentry/fs/user",
+ "//pkg/sentry/fsimpl/devpts",
"//pkg/sentry/fsimpl/devtmpfs",
"//pkg/sentry/fsimpl/gofer",
"//pkg/sentry/fsimpl/host",
@@ -76,7 +75,6 @@ go_library(
"//pkg/sentry/socket/unix",
"//pkg/sentry/state",
"//pkg/sentry/strace",
- "//pkg/sentry/syscalls/linux",
"//pkg/sentry/syscalls/linux/vfs2",
"//pkg/sentry/time",
"//pkg/sentry/unimpl:unimplemented_syscall_go_proto",
@@ -88,6 +86,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/link/fdbased",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/link/qdisc/fifo",
"//pkg/tcpip/link/sniffer",
"//pkg/tcpip/network/arp",
"//pkg/tcpip/network/ipv4",
@@ -124,7 +123,6 @@ go_test(
"//pkg/p9",
"//pkg/sentry/contexttest",
"//pkg/sentry/fs",
- "//pkg/sentry/kernel",
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/unet",
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 715a19112..6d6a705f8 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -187,6 +187,10 @@ type Config struct {
// SoftwareGSO indicates that software segmentation offload is enabled.
SoftwareGSO bool
+ // QDisc indicates the type of queuening discipline to use by default
+ // for non-loopback interfaces.
+ QDisc QueueingDiscipline
+
// LogPackets indicates that all network packets should be logged.
LogPackets bool
@@ -294,6 +298,7 @@ func (c *Config) ToFlags() []string {
"--gso=" + strconv.FormatBool(c.HardwareGSO),
"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
+ "--qdisc=" + c.QDisc.String(),
}
if c.CPUNumFromQuota {
f = append(f, "--cpu-num-from-quota")
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
deleted file mode 100644
index 7e7a31fbd..000000000
--- a/runsc/boot/fds.go
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package boot
-
-import (
- "fmt"
-
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/fs/host"
- vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
-)
-
-// createFDTable creates an FD table that contains stdin, stdout, and stderr.
-// If console is true, then ioctl calls will be passed through to the host FD.
-// Upon success, createFDMap dups then closes stdioFDs.
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
- if len(stdioFDs) != 3 {
- return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
- }
-
- if kernel.VFS2Enabled {
- return createFDTableVFS2(ctx, console, stdioFDs)
- }
-
- k := kernel.KernelFromContext(ctx)
- fdTable := k.NewFDTable()
- defer fdTable.DecRef()
-
- var ttyFile *fs.File
- for appFD, hostFD := range stdioFDs {
- var appFile *fs.File
-
- if console && appFD < 3 {
- // Import the file as a host TTY file.
- if ttyFile == nil {
- var err error
- appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */)
- if err != nil {
- return nil, err
- }
- defer appFile.DecRef()
-
- // Remember this in the TTY file, as we will
- // use it for the other stdio FDs.
- ttyFile = appFile
- } else {
- // Re-use the existing TTY file, as all three
- // stdio FDs must point to the same fs.File in
- // order to share TTY state, specifically the
- // foreground process group id.
- appFile = ttyFile
- }
- } else {
- // Import the file as a regular host file.
- var err error
- appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */)
- if err != nil {
- return nil, err
- }
- defer appFile.DecRef()
- }
-
- // Add the file to the FD map.
- if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
- return nil, err
- }
- }
-
- fdTable.IncRef()
- return fdTable, nil
-}
-
-func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) {
- k := kernel.KernelFromContext(ctx)
- fdTable := k.NewFDTable()
- defer fdTable.DecRef()
-
- for appFD, hostFD := range stdioFDs {
- // TODO(gvisor.dev/issue/1482): Add TTY support.
- appFile, err := vfshost.ImportFD(ctx, k.HostMount(), hostFD, false)
- if err != nil {
- return nil, err
- }
-
- if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
- appFile.DecRef()
- return nil, err
- }
- appFile.DecRef()
- }
-
- fdTable.IncRef()
- return fdTable, nil
-}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 98cce60af..4875452e2 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -219,6 +219,9 @@ func mountFlags(opts []string) fs.MountSourceFlags {
mf.NoAtime = true
case "noexec":
mf.NoExec = true
+ case "bind", "rbind":
+ // When options include either "bind" or "rbind",
+ // it's converted to a 9P mount.
default:
log.Warningf("ignoring unknown mount option %q", o)
}
@@ -765,6 +768,16 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
useOverlay bool
)
+ for _, opt := range m.Options {
+ // When options include either "bind" or "rbind", this behaves as
+ // bind mount even if the mount type is equal to a filesystem supported
+ // on runsc.
+ if opt == "bind" || opt == "rbind" {
+ m.Type = bind
+ break
+ }
+ }
+
switch m.Type {
case devpts, devtmpfs, proc, sysfs:
fsName = m.Type
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index f6ea4c102..8c8bad11c 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -27,16 +27,18 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/fdimport"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/fs/user"
- vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+ hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -75,8 +77,6 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
-var syscallTable *kernel.SyscallTable
-
// Loader keeps state needed to start the kernel and run the container..
type Loader struct {
// k is the kernel.
@@ -143,6 +143,9 @@ type execProcess struct {
// tty will be nil if the process is not attached to a terminal.
tty *host.TTYFileOperations
+ // tty will be nil if the process is not attached to a terminal.
+ ttyVFS2 *hostvfs2.TTYFileDescription
+
// pidnsPath is the pid namespace path in spec
pidnsPath string
}
@@ -199,14 +202,12 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("setting up memory usage: %v", err)
}
- // Patch the syscall table.
- kernel.VFS2Enabled = args.Conf.VFS2
- if kernel.VFS2Enabled {
- vfs2.Override(syscallTable.Table)
+ // Is this a VFSv2 kernel?
+ if args.Conf.VFS2 {
+ kernel.VFS2Enabled = true
+ vfs2.Override()
}
- kernel.RegisterSyscallTable(syscallTable)
-
// Create kernel and platform.
p, err := createPlatform(args.Conf, args.Device)
if err != nil {
@@ -333,7 +334,7 @@ func New(args Args) (*Loader, error) {
if kernel.VFS2Enabled {
// Set up host mount that will be used for imported fds.
- hostFilesystem := vfs2host.NewFilesystem(k.VFS())
+ hostFilesystem := hostvfs2.NewFilesystem(k.VFS())
defer hostFilesystem.DecRef()
hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
if err != nil {
@@ -528,6 +529,8 @@ func (l *Loader) run() error {
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
+ var ttyFile *host.TTYFileOperations
+ var ttyFileVFS2 *hostvfs2.TTYFileDescription
if !l.restore {
if l.conf.ProfileEnable {
pprof.Initialize()
@@ -542,13 +545,14 @@ func (l *Loader) run() error {
// Create the FD map, which will set stdin, stdout, and stderr. If console
// is true, then ioctl calls will be passed through to the host fd.
ctx := l.rootProcArgs.NewContext(l.k)
- fdTable, err := createFDTable(ctx, l.console, l.stdioFDs)
+ var err error
+
+ // CreateProcess takes a reference on FDMap if successful. We won't need
+ // ours either way.
+ l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
- // CreateProcess takes a reference on FDMap if successful. We won't need
- // ours either way.
- l.rootProcArgs.FDTable = fdTable
// Setup the root container file system.
l.startGoferMonitor(l.sandboxID, l.goferFDs)
@@ -591,14 +595,16 @@ func (l *Loader) run() error {
ep.pidnsPath = ns.Path
}
if l.console {
- ttyFile, _ := l.rootProcArgs.FDTable.Get(0)
- defer ttyFile.DecRef()
- ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations)
-
- // Set the foreground process group on the TTY to the global
- // init process group, since that is what we are about to
- // start running.
- ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+ // Set the foreground process group on the TTY to the global init process
+ // group, since that is what we are about to start running.
+ switch {
+ case ttyFileVFS2 != nil:
+ ep.ttyVFS2 = ttyFileVFS2
+ ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+ case ttyFile != nil:
+ ep.tty = ttyFile
+ ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
+ }
}
// Handle signals by forwarding them to the root container process
@@ -719,7 +725,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
// Create the FD map, which will set stdin, stdout, and stderr.
ctx := procArgs.NewContext(l.k)
- fdTable, err := createFDTable(ctx, false, stdioFDs)
+ fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
@@ -804,14 +810,14 @@ func (l *Loader) destroyContainer(cid string) error {
l.mu.Lock()
defer l.mu.Unlock()
- _, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid})
+ tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
if err != nil {
// Container doesn't exist.
return err
}
- // The container exists, has it been started?
- if started {
+ // The container exists, but has it been started?
+ if tg != nil {
if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
}
@@ -853,48 +859,65 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
l.mu.Lock()
defer l.mu.Unlock()
- tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+ tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
if err != nil {
return 0, err
}
- if !started {
+ if tg == nil {
return 0, fmt.Errorf("container %q not started", args.ContainerID)
}
- // TODO(gvisor.dev/issue/1623): Add VFS2 support
-
// Get the container MountNamespace from the Task.
- tg.Leader().WithMuLocked(func(t *kernel.Task) {
+ if kernel.VFS2Enabled {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
- args.MountNamespace = t.MountNamespace()
- args.MountNamespace.IncRef()
- })
- if args.MountNamespace != nil {
- defer args.MountNamespace.DecRef()
+ args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
+ args.MountNamespaceVFS2.IncRef()
+ } else {
+ tg.Leader().WithMuLocked(func(t *kernel.Task) {
+ // task.MountNamespace() does not take a ref, so we must do so ourselves.
+ args.MountNamespace = t.MountNamespace()
+ args.MountNamespace.IncRef()
+ })
}
// Add the HOME environment variable if it is not already set.
- root := args.MountNamespace.Root()
- defer root.DecRef()
- ctx := fs.WithRoot(l.k.SupervisorContext(), root)
- envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
- if err != nil {
- return 0, err
+ if kernel.VFS2Enabled {
+ defer args.MountNamespaceVFS2.DecRef()
+
+ root := args.MountNamespaceVFS2.Root()
+ defer root.DecRef()
+ ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+ envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
+ if err != nil {
+ return 0, err
+ }
+ args.Envv = envv
+ } else {
+ defer args.MountNamespace.DecRef()
+
+ root := args.MountNamespace.Root()
+ defer root.DecRef()
+ ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+ envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+ if err != nil {
+ return 0, err
+ }
+ args.Envv = envv
}
- args.Envv = envv
// Start the process.
proc := control.Proc{Kernel: l.k}
args.PIDNamespace = tg.PIDNamespace()
- newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
+ newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
if err != nil {
return 0, err
}
eid := execID{cid: args.ContainerID, pid: tgid}
l.processes[eid] = &execProcess{
- tg: newTG,
- tty: ttyFile,
+ tg: newTG,
+ tty: ttyFile,
+ ttyVFS2: ttyFileVFS2,
}
log.Debugf("updated processes: %v", l.processes)
@@ -905,7 +928,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
// Don't defer unlock, as doing so would make it impossible for
// multiple clients to wait on the same container.
- tg, _, err := l.threadGroupFromID(execID{cid: cid})
+ tg, err := l.threadGroupFromID(execID{cid: cid})
if err != nil {
return fmt.Errorf("can't wait for container %q: %v", cid, err)
}
@@ -924,7 +947,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
// Try to find a process that was exec'd
eid := execID{cid: cid, pid: tgid}
- execTG, _, err := l.threadGroupFromID(eid)
+ execTG, err := l.threadGroupFromID(eid)
if err == nil {
ws := l.wait(execTG)
*waitStatus = ws
@@ -938,7 +961,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e
// The caller may be waiting on a process not started directly via exec.
// In this case, find the process in the container's PID namespace.
- initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+ initTG, err := l.threadGroupFromID(execID{cid: cid})
if err != nil {
return fmt.Errorf("waiting for PID %d: %v", tgid, err)
}
@@ -1089,8 +1112,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
}
// Check that the container has actually started before signaling it.
- _, _, err := l.threadGroupFromID(execID{cid: cid})
- if err != nil {
+ if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
return err
}
if err := l.signalAllProcesses(cid, signo); err != nil {
@@ -1104,7 +1126,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e
}
func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
- execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+ execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
if err == nil {
// Send signal directly to the identified process.
return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
@@ -1113,7 +1135,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
// The caller may be signaling a process not started directly via exec.
// In this case, find the process in the container's PID namespace and
// signal it.
- initTG, _, err := l.threadGroupFromID(execID{cid: cid})
+ initTG, err := l.threadGroupFromID(execID{cid: cid})
if err != nil {
return fmt.Errorf("no thread group found: %v", err)
}
@@ -1127,17 +1149,35 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er
return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
}
+// signalForegrondProcessGroup looks up foreground process group from the TTY
+// for the given "tgid" inside container "cid", and send the signal to it.
func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
- // Lookup foreground process group from the TTY for the given process,
- // and send the signal to it.
- tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
+ l.mu.Lock()
+ tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
+ if err != nil {
+ l.mu.Unlock()
+ return fmt.Errorf("no thread group found: %v", err)
+ }
+ if tg == nil {
+ l.mu.Unlock()
+ return fmt.Errorf("container %q not started", cid)
+ }
+
+ tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
+ l.mu.Unlock()
if err != nil {
return fmt.Errorf("no thread group found: %v", err)
}
- if tty == nil {
+
+ var pg *kernel.ProcessGroup
+ switch {
+ case ttyVFS2 != nil:
+ pg = ttyVFS2.ForegroundProcessGroup()
+ case tty != nil:
+ pg = tty.ForegroundProcessGroup()
+ default:
return fmt.Errorf("no TTY attached")
}
- pg := tty.ForegroundProcessGroup()
if pg == nil {
// No foreground process group has been set. Signal the
// original thread group.
@@ -1168,33 +1208,57 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
}
-// threadGroupFromID same as threadGroupFromIDLocked except that it acquires
-// mutex before calling it.
-func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
+// acquires mutex before calling it and fails in case container hasn't started
+// yet.
+func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
l.mu.Lock()
defer l.mu.Unlock()
- tg, tty, ok, err := l.threadGroupFromIDLocked(key)
+ tg, err := l.tryThreadGroupFromIDLocked(key)
if err != nil {
- return nil, nil, err
+ return nil, err
}
- if !ok {
- return nil, nil, fmt.Errorf("container %q not started", key.cid)
+ if tg == nil {
+ return nil, fmt.Errorf("container %q not started", key.cid)
}
- return tg, tty, nil
+ return tg, nil
}
-// threadGroupFromIDLocked returns the thread group and TTY for the given
-// execution ID. TTY may be nil if the process is not attached to a terminal.
-// Also returns a boolean indicating whether the container has already started.
-// Returns error if execution ID is invalid or if the container cannot be
-// found (maybe it has been deleted). Caller must hold 'mu'.
-func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) {
+// tryThreadGroupFromIDLocked returns the thread group for the given execution
+// ID. It may return nil in case the container has not started yet. Returns
+// error if execution ID is invalid or if the container cannot be found (maybe
+// it has been deleted). Caller must hold 'mu'.
+func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
ep := l.processes[key]
if ep == nil {
- return nil, nil, false, fmt.Errorf("container %q not found", key.cid)
+ return nil, fmt.Errorf("container %q not found", key.cid)
}
- if ep.tg == nil {
- return nil, nil, false, nil
+ return ep.tg, nil
+}
+
+// ttyFromIDLocked returns the TTY files for the given execution ID. It may
+// return nil in case the container has not started yet. Returns error if
+// execution ID is invalid or if the container cannot be found (maybe it has
+// been deleted). Caller must hold 'mu'.
+func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+ ep := l.processes[key]
+ if ep == nil {
+ return nil, nil, fmt.Errorf("container %q not found", key.cid)
+ }
+ return ep.tty, ep.ttyVFS2, nil
+}
+
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+ if len(stdioFDs) != 3 {
+ return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
+ }
+
+ k := kernel.KernelFromContext(ctx)
+ fdTable := k.NewFDTable()
+ ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
+ if err != nil {
+ fdTable.DecRef()
+ return nil, nil, nil, err
}
- return ep.tg, ep.tty, true, nil
+ return fdTable, ttyFile, ttyFileVFS2, nil
}
diff --git a/runsc/boot/loader_amd64.go b/runsc/boot/loader_amd64.go
deleted file mode 100644
index 78df86611..000000000
--- a/runsc/boot/loader_amd64.go
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build amd64
-
-package boot
-
-import (
- "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-)
-
-func init() {
- // Set the global syscall table.
- syscallTable = linux.AMD64
-}
diff --git a/runsc/boot/loader_arm64.go b/runsc/boot/loader_arm64.go
deleted file mode 100644
index 250785010..000000000
--- a/runsc/boot/loader_arm64.go
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build arm64
-
-package boot
-
-import (
- "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
-)
-
-func init() {
- // Set the global syscall table.
- syscallTable = linux.ARM64
-}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 55d27a632..e448fd773 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -31,7 +31,6 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
@@ -69,11 +68,6 @@ func testSpec() *specs.Spec {
}
}
-func resetSyscallTable() {
- kernel.VFS2Enabled = false
- kernel.FlushSyscallTablesTestOnly()
-}
-
// startGofer starts a new gofer routine serving 'root' path. It returns the
// sandbox side of the connection, and a function that when called will stop the
// gofer.
@@ -150,13 +144,11 @@ func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) {
// TestRun runs a simple application in a sandbox and checks that it succeeds.
func TestRun(t *testing.T) {
- defer resetSyscallTable()
doRun(t, false)
}
// TestRunVFS2 runs TestRun in VFSv2.
func TestRunVFS2(t *testing.T) {
- defer resetSyscallTable()
doRun(t, true)
}
@@ -199,13 +191,11 @@ func doRun(t *testing.T, vfsEnabled bool) {
// TestStartSignal tests that the controller Start message will cause
// WaitForStartSignal to return.
func TestStartSignal(t *testing.T) {
- defer resetSyscallTable()
doStartSignal(t, false)
}
// TestStartSignalVFS2 does TestStartSignal with VFS2.
func TestStartSignalVFS2(t *testing.T) {
- defer resetSyscallTable()
doStartSignal(t, true)
}
@@ -438,7 +428,6 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespace(t *testing.T) {
-
for _, tc := range createMountTestcases(false /* vfs2 */) {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
@@ -476,15 +465,13 @@ func TestCreateMountNamespace(t *testing.T) {
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespaceVFS2(t *testing.T) {
-
for _, tc := range createMountTestcases(true /* vfs2 */) {
t.Run(tc.name, func(t *testing.T) {
- defer resetSyscallTable()
-
spec := testSpec()
spec.Mounts = tc.spec.Mounts
spec.Root = tc.spec.Root
+ t.Logf("Using root: %q", spec.Root.Path)
l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec)
if err != nil {
t.Fatalf("failed to create loader: %v", err)
@@ -497,7 +484,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
t.Fatalf("failed process hints: %v", err)
}
- ctx := l.rootProcArgs.NewContext(l.k)
+ ctx := l.k.SupervisorContext()
mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
if err != nil {
t.Fatalf("failed to setupVFS2: %v", err)
@@ -506,7 +493,6 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
root := mns.Root()
defer root.DecRef()
for _, p := range tc.expectedPaths {
-
target := &vfs.PathOperation{
Root: root,
Start: root,
@@ -518,7 +504,6 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
} else {
d.DecRef()
}
-
}
})
}
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index bee6ee336..0af30456e 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -17,6 +17,7 @@ package boot
import (
"fmt"
"net"
+ "runtime"
"strings"
"syscall"
@@ -24,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
@@ -75,6 +77,44 @@ type DefaultRoute struct {
Name string
}
+// QueueingDiscipline is used to specify the kind of Queueing Discipline to
+// apply for a give FDBasedLink.
+type QueueingDiscipline int
+
+const (
+ // QDiscNone disables any queueing for the underlying FD.
+ QDiscNone QueueingDiscipline = iota
+
+ // QDiscFIFO applies a simple fifo based queue to the underlying
+ // FD.
+ QDiscFIFO
+)
+
+// MakeQueueingDiscipline if possible the equivalent QueuingDiscipline for s
+// else returns an error.
+func MakeQueueingDiscipline(s string) (QueueingDiscipline, error) {
+ switch s {
+ case "none":
+ return QDiscNone, nil
+ case "fifo":
+ return QDiscFIFO, nil
+ default:
+ return 0, fmt.Errorf("unsupported qdisc specified: %q", s)
+ }
+}
+
+// String implements fmt.Stringer.
+func (q QueueingDiscipline) String() string {
+ switch q {
+ case QDiscNone:
+ return "none"
+ case QDiscFIFO:
+ return "fifo"
+ default:
+ panic(fmt.Sprintf("Invalid queueing discipline: %d", q))
+ }
+}
+
// FDBasedLink configures an fd-based link.
type FDBasedLink struct {
Name string
@@ -84,6 +124,7 @@ type FDBasedLink struct {
GSOMaxSize uint32
SoftwareGSOEnabled bool
LinkAddress net.HardwareAddr
+ QDisc QueueingDiscipline
// NumChannels controls how many underlying FD's are to be used to
// create this endpoint.
@@ -185,6 +226,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
mac := tcpip.LinkAddress(link.LinkAddress)
+ log.Infof("gso max size is: %d", link.GSOMaxSize)
+
linkEP, err := fdbased.New(&fdbased.Options{
FDs: FDs,
MTU: uint32(link.MTU),
@@ -199,6 +242,13 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
return err
}
+ switch link.QDisc {
+ case QDiscNone:
+ case QDiscFIFO:
+ log.Infof("Enabling FIFO QDisc on %q", link.Name)
+ linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
+ }
+
log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 0b9b0b436..d1397ed2c 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ devpts2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
@@ -41,37 +42,28 @@ import (
)
func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
-
- vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
- AllowUserList: true,
- })
-
- vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ vfsObj.MustRegisterFilesystemType(devpts2.Name, &devpts2.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserList: true,
+ // TODO(b/29356795): Users may mount this once the terminals are in a
+ // usable state.
+ AllowUserMount: false,
})
-
- vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
- AllowUserMount: true,
- AllowUserList: true,
- })
-
- vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ vfsObj.MustRegisterFilesystemType(devtmpfsimpl.Name, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
AllowUserList: true,
})
- vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
- AllowUserMount: true,
- AllowUserList: true,
+ vfsObj.MustRegisterFilesystemType(goferimpl.Name, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserList: true,
})
- vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ vfsObj.MustRegisterFilesystemType(procimpl.Name, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
AllowUserList: true,
})
- vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ vfsObj.MustRegisterFilesystemType(sysimpl.Name, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
AllowUserList: true,
})
- vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ vfsObj.MustRegisterFilesystemType(tmpfsimpl.Name, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
AllowUserList: true,
})
@@ -108,7 +100,6 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte
}
func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
-
exe := procArgs.Argv[0]
// Absolute paths can be used directly.
@@ -120,11 +111,9 @@ func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessAr
// Paths with '/' in them should be joined to the working directory, or
// to the root if working directory is not set.
if strings.IndexByte(exe, '/') > 0 {
-
if !path.IsAbs(procArgs.WorkingDirectory) {
return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory)
}
-
procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
return nil
}
@@ -144,21 +133,17 @@ func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessAr
creds := procArgs.Credentials
for _, p := range paths {
-
binPath := path.Join(p, exe)
-
pop := &vfs.PathOperation{
Root: root,
Start: root,
Path: fspath.Parse(binPath),
FollowFinalSymlink: true,
}
-
opts := &vfs.OpenOptions{
FileExec: true,
Flags: linux.O_RDONLY,
}
-
dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
if err == syserror.ENOENT || err == syserror.EACCES {
// Didn't find it here.
@@ -181,35 +166,32 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
// Create context with root credentials to mount the filesystem (the current
// user may not be privileged enough).
+ rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
rootProcArgs := *procArgs
rootProcArgs.WorkingDirectory = "/"
- rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+ rootProcArgs.Credentials = rootCreds
rootProcArgs.Umask = 0022
rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
rootCtx := procArgs.NewContext(c.k)
- creds := procArgs.Credentials
- if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil {
+ if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
return nil, fmt.Errorf("register filesystems: %w", err)
}
- mns, err := c.createMountNamespaceVFS2(ctx, conf, creds)
+ mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
if err != nil {
return nil, fmt.Errorf("creating mount namespace: %w", err)
}
-
rootProcArgs.MountNamespaceVFS2 = mns
// Mount submounts.
- if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil {
+ if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
}
-
return mns, nil
}
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
-
fd := c.fds.remove()
opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",")
@@ -222,7 +204,6 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *C
}
func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
-
c.prepareMountsVFS2()
for _, submount := range c.mounts {
@@ -256,7 +237,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
if err != nil {
return fmt.Errorf("mountOptions failed: %w", err)
}
-
if fsName == "" {
// Filesystem is not supported (e.g. cgroup), just skip it.
return nil
@@ -277,7 +257,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
// All writes go to upper, be paranoid and make lower readonly.
opts.ReadOnly = useOverlay
- if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil {
+ if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
}
log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts)
@@ -336,7 +316,6 @@ func p9MountOptionsVFS2(fd int, fa FileAccessType) []string {
}
func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
-
target := &vfs.PathOperation{
Root: root,
Start: root,
@@ -345,12 +324,10 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
switch {
-
case err == syserror.ENOENT:
if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil {
return err
}
-
mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}
if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil {
return fmt.Errorf("failed to makedir for mount %+v: %w", target, err)