summary | refs | log | tree | commit | diff | homepage
path: root/runsc/boot
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot')
-rw-r--r-- runsc/boot/BUILD 8
-rw-r--r-- runsc/boot/config.go 7
-rw-r--r-- runsc/boot/controller.go 14
-rw-r--r-- runsc/boot/filter/config.go 11
-rw-r--r-- runsc/boot/fs.go 9
-rw-r--r-- runsc/boot/loader.go 264
-rw-r--r-- runsc/boot/loader_test.go 8
-rw-r--r-- runsc/boot/network.go 4
-rw-r--r-- runsc/boot/vfs.go 164
9 files changed, 307 insertions, 182 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 2b1e6b13e..9f52438c2 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -40,6 +40,8 @@ go_library(
"//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/control",
"//pkg/sentry/devices/memdev",
+ "//pkg/sentry/devices/ttydev",
+ "//pkg/sentry/devices/tundev",
"//pkg/sentry/fdimport",
"//pkg/sentry/fs",
"//pkg/sentry/fs/dev",
@@ -53,6 +55,7 @@ go_library(
"//pkg/sentry/fs/user",
"//pkg/sentry/fsimpl/devpts",
"//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/fuse",
"//pkg/sentry/fsimpl/gofer",
"//pkg/sentry/fsimpl/host",
"//pkg/sentry/fsimpl/overlay",
@@ -87,6 +90,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/link/fdbased",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/link/packetsocket",
"//pkg/tcpip/link/qdisc/fifo",
"//pkg/tcpip/link/sniffer",
"//pkg/tcpip/network/arp",
@@ -103,7 +107,7 @@ go_library(
"//runsc/boot/pprof",
"//runsc/specutils",
"@com_github_golang_protobuf//proto:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
@@ -128,7 +132,7 @@ go_test(
"//pkg/sync",
"//pkg/unet",
"//runsc/fsgofer",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index bb01b8fb5..80da8b3e6 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -274,6 +274,9 @@ type Config struct {
// Enables VFS2 (not plumbled through yet).
VFS2 bool
+
+ // Enables FUSE usage (not plumbed through yet).
+ FUSE bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -325,5 +328,9 @@ func (c *Config) ToFlags() []string {
f = append(f, "--vfs2=true")
}
+ if c.FUSE {
+ f = append(f, "--fuse=true")
+ }
+
return f
}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 8125d5061..3e5e4c22f 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -155,7 +155,7 @@ func newController(fd int, l *Loader) (*controller, error) {
srv.Register(&debug{})
srv.Register(&control.Logging{})
- if l.conf.ProfileEnable {
+ if l.root.conf.ProfileEnable {
srv.Register(&control.Profile{
Kernel: l.k,
})
@@ -333,7 +333,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Pause the kernel while we build a new one.
cm.l.k.Pause()
- p, err := createPlatform(cm.l.conf, deviceFile)
+ p, err := createPlatform(cm.l.root.conf, deviceFile)
if err != nil {
return fmt.Errorf("creating platform: %v", err)
}
@@ -349,8 +349,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
- renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+ mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
+ renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
}
@@ -368,7 +368,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("file cannot be empty")
}
- if cm.l.conf.ProfileEnable {
+ if cm.l.root.conf.ProfileEnable {
// pprof.Initialize opens /proc/self/maps, so has to be called before
// installing seccomp filters.
pprof.Initialize()
@@ -387,13 +387,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Since we have a new kernel we also must make a new watchdog.
dogOpts := watchdog.DefaultOpts
- dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+ dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
// Change the loader fields to reflect the changes made when restoring.
cm.l.k = k
cm.l.watchdog = dog
- cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+ cm.l.root.procArgs = kernel.CreateProcessArgs{}
cm.l.restore = true
// Reinitialize the sandbox ID and processes map. Note that it doesn't
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 60e33425f..149eb0b1b 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -310,19 +310,12 @@ var allowedSyscalls = seccomp.SyscallRules{
},
},
syscall.SYS_WRITE: {},
- // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
- // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
- // option is enabled for a packet socket.
+ // For rawfile.NonBlockingWriteIovec.
syscall.SYS_WRITEV: []seccomp.Rule{
{
seccomp.AllowAny{},
seccomp.AllowAny{},
- seccomp.AllowValue(2),
- },
- {
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(3),
+ seccomp.GreaterThan(0),
},
},
}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e83584b82..59639ba19 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,6 +29,7 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -390,6 +391,10 @@ type mountHint struct {
// root is the inode where the volume is mounted. For mounts with 'pod' share
// the volume is mounted once and then bind mounted inside the containers.
root *fs.Inode
+
+ // vfsMount is the master mount for the volume. For mounts with 'pod' share
+ // the master volume is bind mounted inside the containers.
+ vfsMount *vfs.Mount
}
func (m *mountHint) setField(key, val string) error {
@@ -571,9 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
// processHints processes annotations that container hints about how volumes
// should be mounted (e.g. a volume shared between containers). It must be
// called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
if conf.VFS2 {
- return nil
+ return c.processHintsVFS2(conf, creds)
}
ctx := c.k.SupervisorContext()
for _, hint := range c.hints.mounts {
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 081db39c1..9cd9c5909 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -77,29 +77,34 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
-// Loader keeps state needed to start the kernel and run the container..
-type Loader struct {
- // k is the kernel.
- k *kernel.Kernel
-
- // ctrl is the control server.
- ctrl *controller
-
+type containerInfo struct {
conf *Config
- // console is set to true if terminal is enabled.
- console bool
+ // spec is the base configuration for the container.
+ spec *specs.Spec
- watchdog *watchdog.Watchdog
+ // procArgs refers to the container's init task.
+ procArgs kernel.CreateProcessArgs
// stdioFDs contains stdin, stdout, and stderr.
stdioFDs []int
// goferFDs are the FDs that attach the sandbox to the gofers.
goferFDs []int
+}
- // spec is the base configuration for the root container.
- spec *specs.Spec
+// Loader keeps state needed to start the kernel and run the container.
+type Loader struct {
+ // k is the kernel.
+ k *kernel.Kernel
+
+ // ctrl is the control server.
+ ctrl *controller
+
+ // root contains information about the root container in the sandbox.
+ root containerInfo
+
+ watchdog *watchdog.Watchdog
// stopSignalForwarding disables forwarding of signals to the sandboxed
// container. It should be called when a sandbox is destroyed.
@@ -108,9 +113,6 @@ type Loader struct {
// restore is set to true if we are restoring a container.
restore bool
- // rootProcArgs refers to the root sandbox init task.
- rootProcArgs kernel.CreateProcessArgs
-
// sandboxID is the ID for the whole sandbox.
sandboxID string
@@ -175,8 +177,6 @@ type Args struct {
// StdioFDs is the stdio for the application. The Loader takes ownership of
// these FDs and may close them at any time.
StdioFDs []int
- // Console is set to true if using TTY.
- Console bool
// NumCPU is the number of CPUs to create inside the sandbox.
NumCPU int
// TotalMem is the initial amount of total memory to report back to the
@@ -205,6 +205,10 @@ func New(args Args) (*Loader, error) {
// Is this a VFSv2 kernel?
if args.Conf.VFS2 {
kernel.VFS2Enabled = true
+ if args.Conf.FUSE {
+ kernel.FUSEEnabled = true
+ }
+
vfs2.Override()
}
@@ -227,9 +231,7 @@ func New(args Args) (*Loader, error) {
// Create VDSO.
//
// Pass k as the platform since it is savable, unlike the actual platform.
- //
- // FIXME(b/109889800): Use non-nil context.
- vdso, err := loader.PrepareVDSO(nil, k)
+ vdso, err := loader.PrepareVDSO(k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
@@ -300,6 +302,12 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
+ if kernel.VFS2Enabled {
+ if err := registerFilesystems(k); err != nil {
+ return nil, fmt.Errorf("registering filesystems: %w", err)
+ }
+ }
+
if err := adjustDirentCache(k); err != nil {
return nil, err
}
@@ -318,7 +326,7 @@ func New(args Args) (*Loader, error) {
dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
- procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+ procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
if err != nil {
return nil, fmt.Errorf("creating init process for root container: %v", err)
}
@@ -366,17 +374,18 @@ func New(args Args) (*Loader, error) {
eid := execID{cid: args.ID}
l := &Loader{
- k: k,
- conf: args.Conf,
- console: args.Console,
- watchdog: dog,
- spec: args.Spec,
- goferFDs: args.GoferFDs,
- stdioFDs: stdioFDs,
- rootProcArgs: procArgs,
- sandboxID: args.ID,
- processes: map[execID]*execProcess{eid: {}},
- mountHints: mountHints,
+ k: k,
+ watchdog: dog,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
+ root: containerInfo{
+ conf: args.Conf,
+ stdioFDs: stdioFDs,
+ goferFDs: args.GoferFDs,
+ spec: args.Spec,
+ procArgs: procArgs,
+ },
}
// We don't care about child signals; some platforms can generate a
@@ -404,8 +413,8 @@ func New(args Args) (*Loader, error) {
return l, nil
}
-// newProcess creates a process that can be run with kernel.CreateProcess.
-func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+// createProcessArgs creates args that can be used with kernel.CreateProcess.
+func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
// Create initial limits.
ls, err := createLimitSet(spec)
if err != nil {
@@ -479,13 +488,13 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
}
func (l *Loader) installSeccompFilters() error {
- if l.conf.DisableSeccomp {
+ if l.root.conf.DisableSeccomp {
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
} else {
opts := filter.Options{
Platform: l.k.Platform,
- HostNetwork: l.conf.Network == NetworkHost,
- ProfileEnable: l.conf.ProfileEnable,
+ HostNetwork: l.root.conf.Network == NetworkHost,
+ ProfileEnable: l.root.conf.ProfileEnable,
ControllerFD: l.ctrl.srv.FD(),
}
if err := filter.Install(opts); err != nil {
@@ -511,7 +520,7 @@ func (l *Loader) Run() error {
}
func (l *Loader) run() error {
- if l.conf.Network == NetworkHost {
+ if l.root.conf.Network == NetworkHost {
// Delay host network configuration to this point because network namespace
// is configured after the loader is created and before Run() is called.
log.Debugf("Configuring host network")
@@ -532,10 +541,8 @@ func (l *Loader) run() error {
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
- var ttyFile *host.TTYFileOperations
- var ttyFileVFS2 *hostvfs2.TTYFileDescription
if !l.restore {
- if l.conf.ProfileEnable {
+ if l.root.conf.ProfileEnable {
pprof.Initialize()
}
@@ -545,82 +552,29 @@ func (l *Loader) run() error {
return err
}
- // Create the FD map, which will set stdin, stdout, and stderr. If console
- // is true, then ioctl calls will be passed through to the host fd.
- ctx := l.rootProcArgs.NewContext(l.k)
- var err error
-
- // CreateProcess takes a reference on FDMap if successful. We won't need
- // ours either way.
- l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
- if err != nil {
- return fmt.Errorf("importing fds: %v", err)
- }
-
- // Setup the root container file system.
- l.startGoferMonitor(l.sandboxID, l.goferFDs)
-
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
- return err
- }
- if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
- return err
- }
-
- // Add the HOME enviroment variable if it is not already set.
- var envv []string
- if kernel.VFS2Enabled {
- envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-
- } else {
- envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
- }
- if err != nil {
- return err
- }
- l.rootProcArgs.Envv = envv
-
// Create the root container init task. It will begin running
// when the kernel is started.
- if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
- return fmt.Errorf("creating init process: %v", err)
+ if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+ return err
}
-
- // CreateProcess takes a reference on FDTable if successful.
- l.rootProcArgs.FDTable.DecRef()
}
ep.tg = l.k.GlobalInit()
- if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+ if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
ep.pidnsPath = ns.Path
}
- if l.console {
- // Set the foreground process group on the TTY to the global init process
- // group, since that is what we are about to start running.
- switch {
- case ttyFileVFS2 != nil:
- ep.ttyVFS2 = ttyFileVFS2
- ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- case ttyFile != nil:
- ep.tty = ttyFile
- ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- }
- }
// Handle signals by forwarding them to the root container process
// (except for panic signal, which should cause a panic).
l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
// Panic signal should cause a panic.
- if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+ if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
panic("Signal-induced panic")
}
// Otherwise forward to root container.
deliveryMode := DeliverToProcess
- if l.console {
+ if l.root.spec.Process.Terminal {
// Since we are running with a console, we should forward the signal to
// the foreground process group so that job control signals like ^C can
// be handled properly.
@@ -637,7 +591,7 @@ func (l *Loader) run() error {
// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
// passed FDs, so only close for VFS1.
if !kernel.VFS2Enabled {
- for _, fd := range l.stdioFDs {
+ for _, fd := range l.root.stdioFDs {
err := syscall.Close(fd)
if err != nil {
return fmt.Errorf("close dup()ed stdioFDs: %v", err)
@@ -676,8 +630,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
l.mu.Lock()
defer l.mu.Unlock()
- eid := execID{cid: cid}
- if _, ok := l.processes[eid]; !ok {
+ ep := l.processes[execID{cid: cid}]
+ if ep == nil {
return fmt.Errorf("trying to start a deleted container %q", cid)
}
@@ -711,76 +665,112 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
if pidns == nil {
pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
}
- l.processes[eid].pidnsPath = ns.Path
+ ep.pidnsPath = ns.Path
} else {
pidns = l.k.RootPIDNamespace()
}
- procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+
+ info := &containerInfo{
+ conf: conf,
+ spec: spec,
+ }
+ info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
- var stdioFDs []int
for _, f := range files[:3] {
- stdioFDs = append(stdioFDs, int(f.Fd()))
+ info.stdioFDs = append(info.stdioFDs, int(f.Fd()))
}
- // Create the FD map, which will set stdin, stdout, and stderr.
- ctx := procArgs.NewContext(l.k)
- fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
- if err != nil {
- return fmt.Errorf("importing fds: %v", err)
- }
- // CreateProcess takes a reference on fdTable if successful. We won't
- // need ours either way.
- procArgs.FDTable = fdTable
-
// Can't take ownership away from os.File. dup them to get a new FDs.
- var goferFDs []int
for _, f := range files[3:] {
fd, err := syscall.Dup(int(f.Fd()))
if err != nil {
return fmt.Errorf("failed to dup file: %v", err)
}
- goferFDs = append(goferFDs, fd)
+ info.goferFDs = append(info.goferFDs, fd)
+ }
+
+ tg, err := l.createContainerProcess(false, cid, info, ep)
+ if err != nil {
+ return err
+ }
+
+ // Success!
+ l.k.StartProcess(tg)
+ ep.tg = tg
+ return nil
+}
+
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
+ console := false
+ if root {
+ // Only root container supports terminal for now.
+ console = info.spec.Process.Terminal
}
+ // Create the FD map, which will set stdin, stdout, and stderr.
+ ctx := info.procArgs.NewContext(l.k)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
+ if err != nil {
+ return nil, fmt.Errorf("importing fds: %v", err)
+ }
+ // CreateProcess takes a reference on fdTable if successful. We won't need
+ // ours either way.
+ info.procArgs.FDTable = fdTable
+
// Setup the child container file system.
- l.startGoferMonitor(cid, goferFDs)
+ l.startGoferMonitor(cid, info.goferFDs)
- mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
- if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
- return err
+ mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
+ if root {
+ if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
+ return nil, err
+ }
+ }
+ if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
+ return nil, err
}
// Add the HOME enviroment variable if it is not already set.
var envv []string
if kernel.VFS2Enabled {
- envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2,
- procArgs.Credentials.RealKUID, procArgs.Envv)
+ envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
} else {
- envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace,
- procArgs.Credentials.RealKUID, procArgs.Envv)
+ envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
}
if err != nil {
- return err
+ return nil, err
}
- procArgs.Envv = envv
+ info.procArgs.Envv = envv
// Create and start the new process.
- tg, _, err := l.k.CreateProcess(procArgs)
+ tg, _, err := l.k.CreateProcess(info.procArgs)
if err != nil {
- return fmt.Errorf("creating process: %v", err)
+ return nil, fmt.Errorf("creating process: %v", err)
}
- l.k.StartProcess(tg)
-
// CreateProcess takes a reference on FDTable if successful.
- procArgs.FDTable.DecRef()
+ info.procArgs.FDTable.DecRef()
- l.processes[eid].tg = tg
- return nil
+ // Set the foreground process group on the TTY to the global init process
+ // group, since that is what we are about to start running.
+ if root {
+ switch {
+ case ttyFileVFS2 != nil:
+ ep.ttyVFS2 = ttyFileVFS2
+ ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+ case ttyFile != nil:
+ ep.tty = ttyFile
+ ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+ }
+ }
+
+ return tg, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
@@ -1058,7 +1048,7 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
})}
// Enable SACK Recovery.
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.StackSACKEnabled(true)); err != nil {
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
return nil, fmt.Errorf("failed to enable SACK: %s", err)
}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e448fd773..8e6fe57e1 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -479,13 +479,13 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
defer l.Destroy()
defer loaderCleanup()
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
+ mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints)
+ if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil {
t.Fatalf("failed process hints: %v", err)
}
ctx := l.k.SupervisorContext()
- mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+ mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
if err != nil {
t.Fatalf("failed to setupVFS2: %v", err)
}
@@ -499,7 +499,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
Path: fspath.Parse(p),
}
- if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+ if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
} else {
d.DecRef()
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 14d2f56a5..4e1fa7665 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -252,6 +253,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
}
+ // Enable support for AF_PACKET sockets to receive outgoing packets.
+ linkEP = packetsocket.New(linkEP)
+
log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index d1653b279..cfe2d36aa 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -26,9 +26,12 @@ import (
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
+ "gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
+ "gvisor.dev/gvisor/pkg/sentry/devices/tundev"
"gvisor.dev/gvisor/pkg/sentry/fs/user"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
@@ -40,7 +43,11 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
-func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+func registerFilesystems(k *kernel.Kernel) error {
+ ctx := k.SupervisorContext()
+ creds := auth.NewRootCredentials(k.RootUserNamespace())
+ vfsObj := k.VFS()
+
vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserList: true,
// TODO(b/29356795): Users may mount this once the terminals are in a
@@ -70,11 +77,28 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
AllowUserMount: true,
AllowUserList: true,
})
+ vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
// Setup files in devtmpfs.
if err := memdev.Register(vfsObj); err != nil {
return fmt.Errorf("registering memdev: %w", err)
}
+ if err := ttydev.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering ttydev: %w", err)
+ }
+
+ if kernel.FUSEEnabled {
+ if err := fuse.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering fusedev: %w", err)
+ }
+ }
+
+ if err := tundev.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering tundev: %v", err)
+ }
a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
if err != nil {
return fmt.Errorf("creating devtmpfs accessor: %w", err)
@@ -85,15 +109,25 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
return fmt.Errorf("initializing userspace: %w", err)
}
if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
- return fmt.Errorf("creating devtmpfs files: %w", err)
+ return fmt.Errorf("creating memdev devtmpfs files: %w", err)
+ }
+ if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
+ return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
+ }
+ if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+ return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+ }
+
+ if kernel.FUSEEnabled {
+ if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+ return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+ }
}
+
return nil
}
func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
- if err := mntr.k.VFS().Init(); err != nil {
- return fmt.Errorf("failed to initialize VFS: %w", err)
- }
mns, err := mntr.setupVFS2(ctx, conf, procArgs)
if err != nil {
return fmt.Errorf("failed to setupFS: %w", err)
@@ -122,10 +156,6 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
rootCtx := procArgs.NewContext(c.k)
- if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
- return nil, fmt.Errorf("register filesystems: %w", err)
- }
-
mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
if err != nil {
return nil, fmt.Errorf("creating mount namespace: %w", err)
@@ -141,10 +171,19 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
fd := c.fds.remove()
- opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+ opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+ if conf.OverlayfsStaleRead {
+ // We can't check for overlayfs here because sandbox is chroot'ed and gofer
+ // can only send mount options for specs.Mounts (specs.Root is missing
+ // Options field). So assume root is always on top of overlayfs.
+ opts = append(opts, "overlayfs_stale_read")
+ }
log.Infof("Mounting root over 9P, ioFD: %d", fd)
- mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+ mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
+ Data: strings.Join(opts, ","),
+ })
if err != nil {
return nil, fmt.Errorf("setting up mount namespace: %w", err)
}
@@ -160,8 +199,14 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
for i := range mounts {
submount := &mounts[i]
log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
- if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
- return err
+ if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+ if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+ }
+ } else {
+ if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+ return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+ }
}
}
@@ -235,20 +280,18 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
// used for mounts.
func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
- var (
- fsName string
- data []string
- )
+ fsName := m.Type
+ var data []string
// Find filesystem name and FS specific data field.
switch m.Type {
case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
- fsName = m.Type
+ // Nothing to do.
+
case nonefs:
fsName = sys.Name
- case tmpfs.Name:
- fsName = m.Type
+ case tmpfs.Name:
var err error
data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
if err != nil {
@@ -257,10 +300,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
case bind:
fsName = gofer.Name
+ if m.fd == 0 {
+ // Check that an FD was provided to fail fast. Technically FD=0 is valid,
+ // but unlikely to be correct in this context.
+ return "", nil, fmt.Errorf("9P mount requires a connection FD")
+ }
data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
default:
log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ return "", nil, nil
}
opts := &vfs.MountOptions{
@@ -300,7 +349,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
}
_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
if err == nil {
- // Mount point exists, nothing else to do.
+ log.Debugf("Mount point %q already exists", currentPath)
return nil
}
if err != syserror.ENOENT {
@@ -378,3 +427,76 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
}
}
+
+// processHintsVFS2 processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). For every hint
+// whose mount type is tmpfs it mounts the master copy of the volume and stores
+// the resulting mount in the hint's vfsMount field; hints of any other type
+// are skipped. It must be called for the root container only.
+//
+// It returns an error, wrapping the underlying cause, as soon as mounting one
+// of the masters fails.
+func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+		// common gofer to mount all shared volumes.
+		if hint.mount.Type != tmpfs.Name {
+			continue
+		}
+
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+		if err != nil {
+			// Wrap with %w, like the other error paths in this file, so that
+			// callers can still inspect the cause with errors.Is/errors.As.
+			return fmt.Errorf("mounting shared master %q: %w", hint.name, err)
+		}
+		hint.vfsMount = mnt
+	}
+	return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod.
+//
+// The filesystem name and mount options are derived from hint.mount via
+// getMountNameAndOptionsVFS2. On success it returns the mount produced by
+// VirtualFilesystem.MountDisconnected. It returns an error if the options
+// cannot be derived or if the mount type maps to no filesystem name.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ // No connection FD is set on mntFD, so a bind-type hint is rejected by
+ // getMountNameAndOptionsVFS2 with "9P mount requires a connection FD".
+ mntFD := &mountAndFD{Mount: hint.mount}
+ fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+ if err != nil {
+ return nil, err
+ }
+ // An empty name with a nil error is what getMountNameAndOptionsVFS2 returns
+ // for a filesystem type it does not recognize.
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+ return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmountVFS2 bind mounts a previously mounted volume that is
+// shared among containers in the same pod.
+//
+// It builds a new mount over the filesystem and root of the master mount
+// stored in source.vfsMount and connects it at mount.Destination, resolved
+// from the root of mns. Any error from the intermediate steps is returned
+// unchanged.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+ // NOTE(review): checkCompatible presumably verifies that this mount agrees
+ // with the hint's settings; its body is not visible here.
+ if err := source.checkCompatible(mount); err != nil {
+ return err
+ }
+
+ // Only the options are needed; the filesystem itself comes from the master
+ // mount, so the returned name is discarded.
+ _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+ if err != nil {
+ return err
+ }
+ newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+ if err != nil {
+ return err
+ }
+ // Drop this function's reference on return, on both success and failure.
+ defer newMnt.DecRef()
+
+ root := mns.Root()
+ defer root.DecRef()
+ // makeSyntheticMount returns early when the destination already exists;
+ // presumably it creates the mount point otherwise (TODO confirm).
+ if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
+ return err
+ }
+
+ // Resolve the destination relative to the container's root.
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(mount.Destination),
+ }
+ if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+ return err
+ }
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return nil
+}