Diffstat (limited to 'runsc/boot/loader.go')
-rw-r--r-- | runsc/boot/loader.go | 1264
1 file changed, 1264 insertions, 0 deletions
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go new file mode 100644 index 000000000..8c8bad11c --- /dev/null +++ b/runsc/boot/loader.go @@ -0,0 +1,1264 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package boot loads the kernel and runs a container. +package boot + +import ( + "fmt" + mrand "math/rand" + "os" + "runtime" + "sync/atomic" + "syscall" + gtime "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fdimport" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fs/user" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/sighandling" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" + "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/raw" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/runsc/boot/filter" + _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. + "gvisor.dev/gvisor/runsc/boot/pprof" + "gvisor.dev/gvisor/runsc/specutils" + + // Include supported socket providers. + "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" + _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" +) + +// Loader keeps state needed to start the kernel and run the container.. +type Loader struct { + // k is the kernel. + k *kernel.Kernel + + // ctrl is the control server. + ctrl *controller + + conf *Config + + // console is set to true if terminal is enabled. + console bool + + watchdog *watchdog.Watchdog + + // stdioFDs contains stdin, stdout, and stderr. + stdioFDs []int + + // goferFDs are the FDs that attach the sandbox to the gofers. 
+ goferFDs []int + + // spec is the base configuration for the root container. + spec *specs.Spec + + // stopSignalForwarding disables forwarding of signals to the sandboxed + // container. It should be called when a sandbox is destroyed. + stopSignalForwarding func() + + // restore is set to true if we are restoring a container. + restore bool + + // rootProcArgs refers to the root sandbox init task. + rootProcArgs kernel.CreateProcessArgs + + // sandboxID is the ID for the whole sandbox. + sandboxID string + + // mu guards processes. + mu sync.Mutex + + // processes maps containers init process and invocation of exec. Root + // processes are keyed with container ID and pid=0, while exec invocations + // have the corresponding pid set. + // + // processes is guardded by mu. + processes map[execID]*execProcess + + // mountHints provides extra information about mounts for containers that + // apply to the entire pod. + mountHints *podMountHints +} + +// execID uniquely identifies a sentry process that is executed in a container. +type execID struct { + cid string + pid kernel.ThreadID +} + +// execProcess contains the thread group and host TTY of a sentry process. +type execProcess struct { + // tg will be nil for containers that haven't started yet. + tg *kernel.ThreadGroup + + // tty will be nil if the process is not attached to a terminal. + tty *host.TTYFileOperations + + // tty will be nil if the process is not attached to a terminal. + ttyVFS2 *hostvfs2.TTYFileDescription + + // pidnsPath is the pid namespace path in spec + pidnsPath string +} + +func init() { + // Initialize the random number generator. + mrand.Seed(gtime.Now().UnixNano()) +} + +// Args are the arguments for New(). +type Args struct { + // Id is the sandbox ID. + ID string + // Spec is the sandbox specification. + Spec *specs.Spec + // Conf is the system configuration. + Conf *Config + // ControllerFD is the FD to the URPC controller. The Loader takes ownership + // of this FD and may close it at any time. + ControllerFD int + // Device is an optional argument that is passed to the platform. The Loader + // takes ownership of this file and may close it at any time. + Device *os.File + // GoferFDs is an array of FDs used to connect with the Gofer. The Loader + // takes ownership of these FDs and may close them at any time. + GoferFDs []int + // StdioFDs is the stdio for the application. The Loader takes ownership of + // these FDs and may close them at any time. + StdioFDs []int + // Console is set to true if using TTY. + Console bool + // NumCPU is the number of CPUs to create inside the sandbox. + NumCPU int + // TotalMem is the initial amount of total memory to report back to the + // container. + TotalMem uint64 + // UserLogFD is the file descriptor to write user logs to. + UserLogFD int +} + +// make sure stdioFDs are always the same on initial start and on restore +const startingStdioFD = 64 + +// New initializes a new kernel loader configured by spec. +// New also handles setting up a kernel for restoring a container. +func New(args Args) (*Loader, error) { + // We initialize the rand package now to make sure /dev/urandom is pre-opened + // on kernels that do not support getrandom(2). + if err := rand.Init(); err != nil { + return nil, fmt.Errorf("setting up rand: %v", err) + } + + if err := usage.Init(); err != nil { + return nil, fmt.Errorf("setting up memory usage: %v", err) + } + + // Is this a VFSv2 kernel? 
+ if args.Conf.VFS2 { + kernel.VFS2Enabled = true + vfs2.Override() + } + + // Create kernel and platform. + p, err := createPlatform(args.Conf, args.Device) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + + // Create memory file. + mf, err := createMemoryFile() + if err != nil { + return nil, fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + + // Create VDSO. + // + // Pass k as the platform since it is savable, unlike the actual platform. + // + // FIXME(b/109889800): Use non-nil context. + vdso, err := loader.PrepareVDSO(nil, k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + if err := enableStrace(args.Conf); err != nil { + return nil, fmt.Errorf("enabling strace: %v", err) + } + + // Create root network namespace/stack. + netns, err := newRootNetworkNamespace(args.Conf, k, k) + if err != nil { + return nil, fmt.Errorf("creating network: %v", err) + } + + // Create capabilities. + caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("converting capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids)) + for _, GID := range args.Spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. + creds := auth.NewUserCredentials( + auth.KUID(args.Spec.Process.User.UID), + auth.KGID(args.Spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) + + if args.NumCPU == 0 { + args.NumCPU = runtime.NumCPU() + } + log.Infof("CPUs: %d", args.NumCPU) + + if args.TotalMem > 0 { + // Adjust the total memory returned by the Sentry so that applications that + // use /proc/meminfo can make allocations based on this limit. + usage.MinimumTotalMemoryBytes = args.TotalMem + log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30)) + } + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + RootNetworkNamespace: netns, + ApplicationCores: uint(args.NumCPU), + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + if err := adjustDirentCache(k); err != nil { + return nil, err + } + + // Turn on packet logging if enabled. + if args.Conf.LogPackets { + log.Infof("Packet logging enabled") + atomic.StoreUint32(&sniffer.LogPackets, 1) + } else { + log.Infof("Packet logging disabled") + atomic.StoreUint32(&sniffer.LogPackets, 0) + } + + // Create a watchdog. 
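Editorial sketch: the watchdog wired up just below guards against sandbox tasks getting stuck and takes a configurable action when they do. The stand-alone analogue here only illustrates the heartbeat-and-action idea; it is not gVisor's watchdog package, and the check/action functions are placeholders.

package main

import (
	"log"
	"time"
)

// watchdog periodically runs check; if check reports trouble, it runs action.
// Illustrative stand-in for the idea behind watchdog.New(k, opts) with a
// TaskTimeoutAction; not the real implementation.
type watchdog struct {
	interval time.Duration
	check    func() bool // returns true while everything looks healthy
	action   func()      // invoked when a stall is detected
	stop     chan struct{}
}

func (w *watchdog) Start() {
	go func() {
		t := time.NewTicker(w.interval)
		defer t.Stop()
		for {
			select {
			case <-t.C:
				if !w.check() {
					w.action()
				}
			case <-w.stop:
				return
			}
		}
	}()
}

func (w *watchdog) Stop() { close(w.stop) }

func main() {
	w := &watchdog{
		interval: time.Second,
		check:    func() bool { return true }, // e.g. "did any task make progress?"
		action:   func() { log.Println("watchdog: tasks appear stuck") },
		stop:     make(chan struct{}),
	}
	w.Start()
	time.Sleep(3 * time.Second)
	w.Stop()
}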
+ dogOpts := watchdog.DefaultOpts + dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction + dog := watchdog.New(k, dogOpts) + + procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace()) + if err != nil { + return nil, fmt.Errorf("creating init process for root container: %v", err) + } + + if err := initCompatLogs(args.UserLogFD); err != nil { + return nil, fmt.Errorf("initializing compat logs: %v", err) + } + + mountHints, err := newPodMountHints(args.Spec) + if err != nil { + return nil, fmt.Errorf("creating pod mount hints: %v", err) + } + + if kernel.VFS2Enabled { + // Set up host mount that will be used for imported fds. + hostFilesystem := hostvfs2.NewFilesystem(k.VFS()) + defer hostFilesystem.DecRef() + hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs mount: %v", err) + } + k.SetHostMount(hostMount) + } + + // Make host FDs stable between invocations. Host FDs must map to the exact + // same number when the sandbox is restored. Otherwise the wrong FD will be + // used. + var stdioFDs []int + newfd := startingStdioFD + for _, fd := range args.StdioFDs { + err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + } + stdioFDs = append(stdioFDs, newfd) + err = syscall.Close(fd) + if err != nil { + return nil, fmt.Errorf("close original stdioFDs failed: %v", err) + } + newfd++ + } + + eid := execID{cid: args.ID} + l := &Loader{ + k: k, + conf: args.Conf, + console: args.Console, + watchdog: dog, + spec: args.Spec, + goferFDs: args.GoferFDs, + stdioFDs: stdioFDs, + rootProcArgs: procArgs, + sandboxID: args.ID, + processes: map[execID]*execProcess{eid: {}}, + mountHints: mountHints, + } + + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("ignore child stop signals failed: %v", err) + } + + // Create the control server using the provided FD. + // + // This must be done *after* we have initialized the kernel since the + // controller is used to configure the kernel's network stack. + ctrl, err := newController(args.ControllerFD, l) + if err != nil { + return nil, fmt.Errorf("creating control server: %v", err) + } + l.ctrl = ctrl + + // Only start serving after Loader is set to controller and controller is set + // to Loader, because they are both used in the urpc methods. + if err := ctrl.srv.StartServing(); err != nil { + return nil, fmt.Errorf("starting control server: %v", err) + } + + return l, nil +} + +// newProcess creates a process that can be run with kernel.CreateProcess. +func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { + // Create initial limits. + ls, err := createLimitSet(spec) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err) + } + + wd := spec.Process.Cwd + if wd == "" { + wd = "/" + } + + // Create the process arguments. 
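Editorial sketch: the dup3 loop above pins stdin/stdout/stderr to fixed descriptor numbers (startingStdioFD = 64) so that a restored sandbox sees the same FD numbers. The helper below restates that pattern in isolation; remapStdio is a hypothetical name, not part of this change.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// remapStdio duplicates each FD in fds to a fixed, predictable number starting
// at base, then closes the original. Mirrors the loop in New() above.
func remapStdio(fds []int, base int) ([]int, error) {
	var out []int
	next := base
	for _, fd := range fds {
		// O_CLOEXEC keeps the duplicate from leaking into exec'd children.
		if err := unix.Dup3(fd, next, unix.O_CLOEXEC); err != nil {
			return nil, fmt.Errorf("dup3(%d, %d): %w", fd, next, err)
		}
		if err := unix.Close(fd); err != nil {
			return nil, fmt.Errorf("close(%d): %w", fd, err)
		}
		out = append(out, next)
		next++
	}
	return out, nil
}

func main() {
	// Work on duplicates of the real stdio so this demo does not disturb them.
	var fds []int
	for _, fd := range []int{0, 1, 2} {
		d, err := unix.Dup(fd)
		if err != nil {
			panic(err)
		}
		fds = append(fds, d)
	}
	stdio, err := remapStdio(fds, 64)
	if err != nil {
		panic(err)
	}
	fmt.Println("stdio copies now at:", stdio)
}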
+ procArgs := kernel.CreateProcessArgs{ + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: wd, + Credentials: creds, + Umask: 0022, + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: k.RootUTSNamespace(), + IPCNamespace: k.RootIPCNamespace(), + AbstractSocketNamespace: k.RootAbstractSocketNamespace(), + ContainerID: id, + PIDNamespace: pidns, + } + + return procArgs, nil +} + +// Destroy cleans up all resources used by the loader. +// +// Note that this will block until all open control server connections have +// been closed. For that reason, this should NOT be called in a defer, because +// a panic in a control server rpc would then hang forever. +func (l *Loader) Destroy() { + if l.ctrl != nil { + l.ctrl.srv.Stop() + } + if l.stopSignalForwarding != nil { + l.stopSignalForwarding() + } + l.watchdog.Stop() +} + +func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) { + p, err := platform.Lookup(conf.Platform) + if err != nil { + panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err)) + } + log.Infof("Platform: %s", conf.Platform) + return p.New(deviceFile) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "runsc-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if + // there are memory cgroups specified, because at this point we're already + // in a mount namespace in which the relevant cgroupfs is not visible. + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} + +func (l *Loader) installSeccompFilters() error { + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. Running in less secure mode.") + } else { + opts := filter.Options{ + Platform: l.k.Platform, + HostNetwork: l.conf.Network == NetworkHost, + ProfileEnable: l.conf.ProfileEnable, + ControllerFD: l.ctrl.srv.FD(), + } + if err := filter.Install(opts); err != nil { + return fmt.Errorf("installing seccomp filters: %v", err) + } + } + return nil +} + +// Run runs the root container. +func (l *Loader) Run() error { + err := l.run() + l.ctrl.manager.startResultChan <- err + if err != nil { + // Give the controller some time to send the error to the + // runtime. If we return too quickly here the process will exit + // and the control connection will be closed before the error + // is returned. + gtime.Sleep(2 * gtime.Second) + return err + } + return nil +} + +func (l *Loader) run() error { + if l.conf.Network == NetworkHost { + // Delay host network configuration to this point because network namespace + // is configured after the loader is created and before Run() is called. + log.Debugf("Configuring host network") + stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) + if err := stack.Configure(); err != nil { + return err + } + } + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: l.sandboxID} + ep, ok := l.processes[eid] + if !ok { + return fmt.Errorf("trying to start deleted container %q", l.sandboxID) + } + + // If we are restoring, we do not want to create a process. + // l.restore is set by the container manager when a restore call is made. 
+ var ttyFile *host.TTYFileOperations + var ttyFileVFS2 *hostvfs2.TTYFileDescription + if !l.restore { + if l.conf.ProfileEnable { + pprof.Initialize() + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if err := l.installSeccompFilters(); err != nil { + return err + } + + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. + ctx := l.rootProcArgs.NewContext(l.k) + var err error + + // CreateProcess takes a reference on FDMap if successful. We won't need + // ours either way. + l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + + // Setup the root container file system. + l.startGoferMonitor(l.sandboxID, l.goferFDs) + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf); err != nil { + return err + } + if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { + return err + } + + // Add the HOME enviroment variable if it is not already set. + var envv []string + if kernel.VFS2Enabled { + envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + + } else { + envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, + l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv) + } + if err != nil { + return err + } + l.rootProcArgs.Envv = envv + + // Create the root container init task. It will begin running + // when the kernel is started. + if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { + return fmt.Errorf("creating init process: %v", err) + } + + // CreateProcess takes a reference on FDTable if successful. + l.rootProcArgs.FDTable.DecRef() + } + + ep.tg = l.k.GlobalInit() + if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok { + ep.pidnsPath = ns.Path + } + if l.console { + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + } + } + + // Handle signals by forwarding them to the root container process + // (except for panic signal, which should cause a panic). + l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { + // Panic signal should cause a panic. + if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) { + panic("Signal-induced panic") + } + + // Otherwise forward to root container. + deliveryMode := DeliverToProcess + if l.console { + // Since we are running with a console, we should forward the signal to + // the foreground process group so that job control signals like ^C can + // be handled properly. 
+ deliveryMode = DeliverToForegroundProcessGroup + } + log.Infof("Received external signal %d, mode: %v", sig, deliveryMode) + if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { + log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err) + } + }) + + // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again + // either in createFDTable() during initial start or in descriptor.initAfterLoad() + // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the + // passed FDs, so only close for VFS1. + if !kernel.VFS2Enabled { + for _, fd := range l.stdioFDs { + err := syscall.Close(fd) + if err != nil { + return fmt.Errorf("close dup()ed stdioFDs: %v", err) + } + } + } + + log.Infof("Process should have started...") + l.watchdog.Start() + return l.k.Start() +} + +// createContainer creates a new container inside the sandbox. +func (l *Loader) createContainer(cid string) error { + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; ok { + return fmt.Errorf("container %q already exists", cid) + } + l.processes[eid] = &execProcess{} + return nil +} + +// startContainer starts a child container. It returns the thread group ID of +// the newly created process. Caller owns 'files' and may close them after +// this method returns. +func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { + // Create capabilities. + caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) + if err != nil { + return fmt.Errorf("creating capabilities: %v", err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + eid := execID{cid: cid} + if _, ok := l.processes[eid]; !ok { + return fmt.Errorf("trying to start a deleted container %q", cid) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. We reuse the root user namespace because the + // sentry currently supports only 1 mount namespace, which is tied to a + // single user namespace. Thus we must run in the same user namespace + // to access mounts. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + l.k.RootUserNamespace()) + + var pidns *kernel.PIDNamespace + if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok { + if ns.Path != "" { + for _, p := range l.processes { + if ns.Path == p.pidnsPath { + pidns = p.tg.PIDNamespace() + break + } + } + } + if pidns == nil { + pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) + } + l.processes[eid].pidnsPath = ns.Path + } else { + pidns = l.k.RootPIDNamespace() + } + procArgs, err := newProcess(cid, spec, creds, l.k, pidns) + if err != nil { + return fmt.Errorf("creating new process: %v", err) + } + + // setupContainerFS() dups stdioFDs, so we don't need to dup them here. + var stdioFDs []int + for _, f := range files[:3] { + stdioFDs = append(stdioFDs, int(f.Fd())) + } + + // Create the FD map, which will set stdin, stdout, and stderr. + ctx := procArgs.NewContext(l.k) + fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on fdTable if successful. We won't + // need ours either way. 
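Editorial sketch: run() above installs a handler that forwards external signals to the root container, or to its foreground process group when a console is attached, with one configurable signal promoted to a panic. The program below shows the same forwarding shape using only the standard library; the child command and signal set are placeholders.

package main

import (
	"os"
	"os/exec"
	"os/signal"
	"syscall"
)

func main() {
	cmd := exec.Command("sleep", "30")
	if err := cmd.Start(); err != nil {
		panic(err)
	}

	// Forward a few common signals to the child, the way run() forwards
	// external signals to the root container.
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP)
	go func() {
		for sig := range sigCh {
			// A designated "panic signal" could be special-cased here,
			// as l.conf.PanicSignal is above.
			_ = cmd.Process.Signal(sig)
		}
	}()

	_ = cmd.Wait()
	signal.Stop(sigCh)
}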
+ procArgs.FDTable = fdTable + + // Can't take ownership away from os.File. dup them to get a new FDs. + var goferFDs []int + for _, f := range files[3:] { + fd, err := syscall.Dup(int(f.Fd())) + if err != nil { + return fmt.Errorf("failed to dup file: %v", err) + } + goferFDs = append(goferFDs, fd) + } + + // Setup the child container file system. + l.startGoferMonitor(cid, goferFDs) + + mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints) + if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil { + return err + } + + // Create and start the new process. + tg, _, err := l.k.CreateProcess(procArgs) + if err != nil { + return fmt.Errorf("creating process: %v", err) + } + l.k.StartProcess(tg) + + // CreateProcess takes a reference on FDTable if successful. + procArgs.FDTable.DecRef() + + l.processes[eid].tg = tg + return nil +} + +// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on +// the gofer FDs looking for disconnects, and destroys the container if a +// disconnect occurs in any of the gofer FDs. +func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { + go func() { + log.Debugf("Monitoring gofer health for container %q", cid) + var events []unix.PollFd + for _, fd := range goferFDs { + events = append(events, unix.PollFd{ + Fd: int32(fd), + Events: unix.POLLHUP | unix.POLLRDHUP, + }) + } + _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { + // Use ppoll instead of poll because it's already whilelisted in seccomp. + n, err := unix.Ppoll(events, nil, nil) + return uintptr(n), 0, err + }) + if err != nil { + panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err)) + } + + // Check if the gofer has stopped as part of normal container destruction. + // This is done just to avoid sending an annoying error message to the log. + // Note that there is a small race window in between mu.Unlock() and the + // lock being reacquired in destroyContainer(), but it's harmless to call + // destroyContainer() multiple times. + l.mu.Lock() + _, ok := l.processes[execID{cid: cid}] + l.mu.Unlock() + if ok { + log.Infof("Gofer socket disconnected, destroying container %q", cid) + if err := l.destroyContainer(cid); err != nil { + log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err) + } + } + }() +} + +// destroyContainer stops a container if it is still running and cleans up its +// filesystem. +func (l *Loader) destroyContainer(cid string) error { + l.mu.Lock() + defer l.mu.Unlock() + + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) + if err != nil { + // Container doesn't exist. + return err + } + + // The container exists, but has it been started? + if tg != nil { + if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { + return fmt.Errorf("sending SIGKILL to all container processes: %v", err) + } + // Wait for all processes that belong to the container to exit (including + // exec'd processes). + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + + // At this point, all processes inside of the container have exited, + // releasing all references to the container's MountNamespace and + // causing all submounts and overlays to be unmounted. + // + // Since the container's MountNamespace has been released, + // MountNamespace.destroy() will have executed, but that function may + // trigger async close operations. 
We must wait for those to complete + // before returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + fs.AsyncBarrier() + } + + // No more failure from this point on. Remove all container thread groups + // from the map. + for key := range l.processes { + if key.cid == cid { + delete(l.processes, key) + } + } + + log.Debugf("Container destroyed %q", cid) + return nil +} + +func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { + // Hold the lock for the entire operation to ensure that exec'd process is + // added to 'processes' in case it races with destroyContainer(). + l.mu.Lock() + defer l.mu.Unlock() + + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) + if err != nil { + return 0, err + } + if tg == nil { + return 0, fmt.Errorf("container %q not started", args.ContainerID) + } + + // Get the container MountNamespace from the Task. + if kernel.VFS2Enabled { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() + args.MountNamespaceVFS2.IncRef() + } else { + tg.Leader().WithMuLocked(func(t *kernel.Task) { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespace = t.MountNamespace() + args.MountNamespace.IncRef() + }) + } + + // Add the HOME environment variable if it is not already set. + if kernel.VFS2Enabled { + defer args.MountNamespaceVFS2.DecRef() + + root := args.MountNamespaceVFS2.Root() + defer root.DecRef() + ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } else { + defer args.MountNamespace.DecRef() + + root := args.MountNamespace.Root() + defer root.DecRef() + ctx := fs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } + + // Start the process. + proc := control.Proc{Kernel: l.k} + args.PIDNamespace = tg.PIDNamespace() + newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args) + if err != nil { + return 0, err + } + + eid := execID{cid: args.ContainerID, pid: tgid} + l.processes[eid] = &execProcess{ + tg: newTG, + tty: ttyFile, + ttyVFS2: ttyFileVFS2, + } + log.Debugf("updated processes: %v", l.processes) + + return tgid, nil +} + +// waitContainer waits for the init process of a container to exit. +func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { + // Don't defer unlock, as doing so would make it impossible for + // multiple clients to wait on the same container. + tg, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("can't wait for container %q: %v", cid, err) + } + + // If the thread either has already exited or exits during waiting, + // consider the container exited. 
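Editorial sketch: waitContainer and waitPID below hand the container's exit state back as a numeric wait status. Assuming that value follows the usual Linux wait(2) encoding (exit code in bits 8-15, terminating signal in the low bits), it can be unpacked as shown; describeWaitStatus is a hypothetical helper.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// describeWaitStatus decodes a wait(2)-style status word of the kind written
// into *waitStatus by waitContainer/waitPID.
func describeWaitStatus(ws uint32) string {
	s := unix.WaitStatus(ws)
	switch {
	case s.Exited():
		return fmt.Sprintf("exited with code %d", s.ExitStatus())
	case s.Signaled():
		return fmt.Sprintf("killed by signal %v", s.Signal())
	default:
		return fmt.Sprintf("unrecognized status %#x", ws)
	}
}

func main() {
	fmt.Println(describeWaitStatus(0))                    // exited with code 0
	fmt.Println(describeWaitStatus(1 << 8))               // exited with code 1
	fmt.Println(describeWaitStatus(uint32(unix.SIGKILL))) // killed by signal
}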
+ ws := l.wait(tg) + *waitStatus = ws + return nil +} + +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { + if tgid <= 0 { + return fmt.Errorf("PID (%d) must be positive", tgid) + } + + // Try to find a process that was exec'd + eid := execID{cid: cid, pid: tgid} + execTG, err := l.threadGroupFromID(eid) + if err == nil { + ws := l.wait(execTG) + *waitStatus = ws + + l.mu.Lock() + delete(l.processes, eid) + log.Debugf("updated processes (removal): %v", l.processes) + l.mu.Unlock() + return nil + } + + // The caller may be waiting on a process not started directly via exec. + // In this case, find the process in the container's PID namespace. + initTG, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("waiting for PID %d: %v", tgid, err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("waiting for PID %d: no such process", tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) + } + ws := l.wait(tg) + *waitStatus = ws + return nil +} + +// wait waits for the process with TGID 'tgid' in a container's PID namespace +// to exit. +func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { + tg.WaitExited() + return tg.ExitStatus().Status() +} + +// WaitForStartSignal waits for a start signal from the control server. +func (l *Loader) WaitForStartSignal() { + <-l.ctrl.manager.startChan +} + +// WaitExit waits for the root container to exit, and returns its exit status. +func (l *Loader) WaitExit() kernel.ExitStatus { + // Wait for container. + l.k.WaitExited() + + return l.k.GlobalInit().ExitStatus() +} + +func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). + switch conf.Network { + case NetworkHost: + // No network namespacing support for hostinet yet, hence creator is nil. + return inet.NewRootNamespace(hostinet.NewStack(), nil), nil + + case NetworkNone, NetworkSandbox: + s, err := newEmptySandboxNetworkStack(clock, uniqueID) + if err != nil { + return nil, err + } + creator := &sandboxNetstackCreator{ + clock: clock, + uniqueID: uniqueID, + } + return inet.NewRootNamespace(s, creator), nil + + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } + +} + +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := netstack.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: netstack.Metrics, + HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. + RawFactory: raw.EndpointFactory{}, + UniqueID: uniqueID, + })} + + // Enable SACK Recovery. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %v", err) + } + + // Set default TTLs as required by socket/netstack. 
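Editorial sketch: the two SetNetworkProtocolOption calls just below discard their error results. A checked variant of the same calls, shown only as a fragment in the context of newEmptySandboxNetworkStack:

// Sketch: same TTL defaults as the calls below, with the errors surfaced.
for _, proto := range []tcpip.NetworkProtocolNumber{ipv4.ProtocolNumber, ipv6.ProtocolNumber} {
	if err := s.Stack.SetNetworkProtocolOption(proto, tcpip.DefaultTTLOption(netstack.DefaultTTL)); err != nil {
		return nil, fmt.Errorf("setting default TTL: %v", err)
	}
}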
+ s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + + // Enable Receive Buffer Auto-Tuning. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) + } + + s.FillDefaultIPTables() + + return &s, nil +} + +// sandboxNetstackCreator implements kernel.NetworkStackCreator. +// +// +stateify savable +type sandboxNetstackCreator struct { + clock tcpip.Clock + uniqueID stack.UniqueID +} + +// CreateStack implements kernel.NetworkStackCreator.CreateStack. +func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + if err != nil { + return nil, err + } + + // Setup loopback. + n := &Network{Stack: s.(*netstack.Stack).Stack} + nicID := tcpip.NICID(f.uniqueID.UniqueID()) + link := DefaultLoopbackLink + linkEP := loopback.New() + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return nil, err + } + + return s, nil +} + +// signal sends a signal to one or more processes in a container. If PID is 0, +// then the container init process is used. Depending on the SignalDeliveryMode +// option, the signal may be sent directly to the indicated process, to all +// processes in the container, or to the foreground process group. +func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { + if pid < 0 { + return fmt.Errorf("PID (%d) must be positive", pid) + } + + switch mode { + case DeliverToProcess: + if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err) + } + return nil + + case DeliverToForegroundProcessGroup: + if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil { + return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err) + } + return nil + + case DeliverToAllProcesses: + if pid != 0 { + return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) + } + // Check that the container has actually started before signaling it. + if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil { + return err + } + if err := l.signalAllProcesses(cid, signo); err != nil { + return fmt.Errorf("signaling all processes in container %q: %v", cid, err) + } + return nil + + default: + panic(fmt.Sprintf("unknown signal delivery mode %v", mode)) + } +} + +func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { + execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + if err == nil { + // Send signal directly to the identified process. + return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo}) + } + + // The caller may be signaling a process not started directly via exec. + // In this case, find the process in the container's PID namespace and + // signal it. 
+ initTG, err := l.threadGroupFromID(execID{cid: cid}) + if err != nil { + return fmt.Errorf("no thread group found: %v", err) + } + tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) + if tg == nil { + return fmt.Errorf("no such process with PID %d", tgid) + } + if tg.Leader().ContainerID() != cid { + return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) + } + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) +} + +// signalForegrondProcessGroup looks up foreground process group from the TTY +// for the given "tgid" inside container "cid", and send the signal to it. +func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { + l.mu.Lock() + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid}) + if err != nil { + l.mu.Unlock() + return fmt.Errorf("no thread group found: %v", err) + } + if tg == nil { + l.mu.Unlock() + return fmt.Errorf("container %q not started", cid) + } + + tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid}) + l.mu.Unlock() + if err != nil { + return fmt.Errorf("no thread group found: %v", err) + } + + var pg *kernel.ProcessGroup + switch { + case ttyVFS2 != nil: + pg = ttyVFS2.ForegroundProcessGroup() + case tty != nil: + pg = tty.ForegroundProcessGroup() + default: + return fmt.Errorf("no TTY attached") + } + if pg == nil { + // No foreground process group has been set. Signal the + // original thread group. + log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) + return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) + } + // Send the signal to all processes in the process group. + var lastErr error + for _, tg := range l.k.TaskSet().Root.ThreadGroups() { + if tg.ProcessGroup() != pg { + continue + } + if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil { + lastErr = err + } + } + return lastErr +} + +// signalAllProcesses that belong to specified container. It's a noop if the +// container hasn't started or has exited. +func (l *Loader) signalAllProcesses(cid string, signo int32) error { + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) +} + +// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it +// acquires mutex before calling it and fails in case container hasn't started +// yet. +func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) { + l.mu.Lock() + defer l.mu.Unlock() + tg, err := l.tryThreadGroupFromIDLocked(key) + if err != nil { + return nil, err + } + if tg == nil { + return nil, fmt.Errorf("container %q not started", key.cid) + } + return tg, nil +} + +// tryThreadGroupFromIDLocked returns the thread group for the given execution +// ID. It may return nil in case the container has not started yet. Returns +// error if execution ID is invalid or if the container cannot be found (maybe +// it has been deleted). Caller must hold 'mu'. 
+func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
+	ep := l.processes[key]
+	if ep == nil {
+		return nil, fmt.Errorf("container %q not found", key.cid)
+	}
+	return ep.tg, nil
+}
+
+// ttyFromIDLocked returns the TTY files for the given execution ID. It may
+// return nil in case the container has not started yet. Returns error if
+// execution ID is invalid or if the container cannot be found (maybe it has
+// been deleted). Caller must hold 'mu'.
+func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	ep := l.processes[key]
+	if ep == nil {
+		return nil, nil, fmt.Errorf("container %q not found", key.cid)
+	}
+	return ep.tty, ep.ttyVFS2, nil
+}
+
+func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+	if len(stdioFDs) != 3 {
+		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
+	}
+
+	k := kernel.KernelFromContext(ctx)
+	fdTable := k.NewFDTable()
+	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
+	if err != nil {
+		fdTable.DecRef()
+		return nil, nil, nil, err
+	}
+	return fdTable, ttyFile, ttyFileVFS2, nil
+}
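Editorial sketch: taken together, the exported methods above suggest how a caller drives a sandbox from boot to teardown. bootSandbox is a hypothetical wrapper (the real wiring lives in runsc's boot command); it reuses this file's log package and calls Destroy directly rather than deferring it, per the comment on Destroy.

// Sketch of a caller driving a Loader, based only on methods defined in this
// file. Argument values and the function name are placeholders.
func bootSandbox(args Args) error {
	l, err := New(args) // kernel, netstack, controller and watchdog setup
	if err != nil {
		return err
	}

	// Block until the control server receives the start request.
	l.WaitForStartSignal()

	// Start the root container; errors are also reported back over the
	// controller inside Run.
	if err := l.Run(); err != nil {
		l.Destroy()
		return err
	}

	// Wait for the root container to exit, then tear everything down.
	status := l.WaitExit()
	l.Destroy()
	log.Infof("root container exited with status %+v", status)
	return nil
}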