summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot')
-rw-r--r--runsc/boot/BUILD2
-rw-r--r--runsc/boot/compat.go2
-rw-r--r--runsc/boot/controller.go120
-rw-r--r--runsc/boot/filter/config.go60
-rw-r--r--runsc/boot/fs.go51
-rw-r--r--runsc/boot/loader.go98
-rw-r--r--runsc/boot/loader_test.go57
-rw-r--r--runsc/boot/network.go39
-rw-r--r--runsc/boot/vfs.go98
9 files changed, 373 insertions, 154 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 248f77c34..8c73dc5dc 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -38,6 +38,7 @@ go_library(
"//pkg/memutil",
"//pkg/rand",
"//pkg/refs",
+ "//pkg/refsvfs2",
"//pkg/sentry/arch",
"//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/control",
@@ -74,6 +75,7 @@ go_library(
"//pkg/sentry/platform",
"//pkg/sentry/sighandling",
"//pkg/sentry/socket/hostinet",
+ "//pkg/sentry/socket/netfilter",
"//pkg/sentry/socket/netlink",
"//pkg/sentry/socket/netlink/route",
"//pkg/sentry/socket/netlink/uevent",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 7076ae2e2..a3a76b609 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -53,7 +53,7 @@ type compatEmitter struct {
func newCompatEmitter(logFD int) (*compatEmitter, error) {
nameMap, ok := getSyscallNameMap()
if !ok {
- return nil, fmt.Errorf("Linux syscall table not found")
+ return nil, fmt.Errorf("syscall table not found")
}
c := &compatEmitter{
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 894651519..865126ac5 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -30,6 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
"gvisor.dev/gvisor/pkg/sentry/state"
"gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
@@ -195,7 +196,7 @@ type containerManager struct {
// StartRoot will start the root container process.
func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
- log.Debugf("containerManager.StartRoot %q", *cid)
+ log.Debugf("containerManager.StartRoot, cid: %s", *cid)
// Tell the root container to start and wait for the result.
cm.startChan <- struct{}{}
if err := <-cm.startResultChan; err != nil {
@@ -206,14 +207,35 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
// Processes retrieves information about processes running in the sandbox.
func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error {
- log.Debugf("containerManager.Processes: %q", *cid)
+ log.Debugf("containerManager.Processes, cid: %s", *cid)
return control.Processes(cm.l.k, *cid, out)
}
+// CreateArgs contains arguments to the Create method.
+type CreateArgs struct {
+ // CID is the ID of the container to start.
+ CID string
+
+ // FilePayload may contain a TTY file for the terminal, if enabled.
+ urpc.FilePayload
+}
+
// Create creates a container within a sandbox.
-func (cm *containerManager) Create(cid *string, _ *struct{}) error {
- log.Debugf("containerManager.Create: %q", *cid)
- return cm.l.createContainer(*cid)
+func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error {
+ log.Debugf("containerManager.Create: %s", args.CID)
+
+ if len(args.Files) > 1 {
+ return fmt.Errorf("start arguments must have at most 1 files for TTY")
+ }
+ var tty *fd.FD
+ if len(args.Files) == 1 {
+ var err error
+ tty, err = fd.NewFromFile(args.Files[0])
+ if err != nil {
+ return fmt.Errorf("error dup'ing TTY file: %w", err)
+ }
+ }
+ return cm.l.createContainer(args.CID, tty)
}
// StartArgs contains arguments to the Start method.
@@ -228,20 +250,18 @@ type StartArgs struct {
CID string
// FilePayload contains, in order:
- // * stdin, stdout, and stderr.
- // * the file descriptor over which the sandbox will
- // request files from its root filesystem.
+ // * stdin, stdout, and stderr (optional: if terminal is disabled).
+ // * file descriptors to connect to gofer to serve the root filesystem.
urpc.FilePayload
}
// Start runs a created container within a sandbox.
func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
- log.Debugf("containerManager.Start: %+v", args)
-
// Validate arguments.
if args == nil {
return errors.New("start missing arguments")
}
+ log.Debugf("containerManager.Start, cid: %s, args: %+v", args.CID, args)
if args.Spec == nil {
return errors.New("start arguments missing spec")
}
@@ -251,44 +271,66 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
if args.CID == "" {
return errors.New("start argument missing container ID")
}
- if len(args.FilePayload.Files) < 4 {
- return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
+ if len(args.Files) < 1 {
+ return fmt.Errorf("start arguments must contain at least one file for the container root gofer")
}
// All validation passed, logs the spec for debugging.
specutils.LogSpec(args.Spec)
- fds, err := fd.NewFromFiles(args.FilePayload.Files)
+ goferFiles := args.Files
+ var stdios []*fd.FD
+ if !args.Spec.Process.Terminal {
+ // When not using a terminal, stdios come as the first 3 files in the
+ // payload.
+ if l := len(args.Files); l < 4 {
+ return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l)
+ }
+ var err error
+ stdios, err = fd.NewFromFiles(goferFiles[:3])
+ if err != nil {
+ return fmt.Errorf("error dup'ing stdio files: %w", err)
+ }
+ goferFiles = goferFiles[3:]
+ }
+ defer func() {
+ for _, fd := range stdios {
+ _ = fd.Close()
+ }
+ }()
+
+ goferFDs, err := fd.NewFromFiles(goferFiles)
if err != nil {
- return err
+ return fmt.Errorf("error dup'ing gofer files: %w", err)
}
defer func() {
- for _, fd := range fds {
+ for _, fd := range goferFDs {
_ = fd.Close()
}
}()
- if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
- log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
+
+ if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil {
+ log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err)
return err
}
- log.Debugf("Container %q started", args.CID)
+ log.Debugf("Container started, cid: %s", args.CID)
return nil
}
// Destroy stops a container if it is still running and cleans up its
// filesystem.
func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
- log.Debugf("containerManager.destroy %q", *cid)
+ log.Debugf("containerManager.destroy, cid: %s", *cid)
return cm.l.destroyContainer(*cid)
}
// ExecuteAsync starts running a command on a created or running sandbox. It
// returns the PID of the new process.
func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error {
- log.Debugf("containerManager.ExecuteAsync: %+v", args)
+ log.Debugf("containerManager.ExecuteAsync, cid: %s, args: %+v", args.ContainerID, args)
tgid, err := cm.l.executeAsync(args)
if err != nil {
- log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err)
+ log.Debugf("containerManager.ExecuteAsync failed, cid: %s, args: %+v, err: %v", args.ContainerID, args, err)
return err
}
*pid = int32(tgid)
@@ -330,18 +372,18 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
log.Debugf("containerManager.Restore")
var specFile, deviceFile *os.File
- switch numFiles := len(o.FilePayload.Files); numFiles {
+ switch numFiles := len(o.Files); numFiles {
case 2:
// The device file is donated to the platform.
// Can't take ownership away from os.File. dup them to get a new FD.
- fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd()))
+ fd, err := syscall.Dup(int(o.Files[1].Fd()))
if err != nil {
return fmt.Errorf("failed to dup file: %v", err)
}
deviceFile = os.NewFile(uintptr(fd), "platform device")
fallthrough
case 1:
- specFile = o.FilePayload.Files[0]
+ specFile = o.Files[0]
case 0:
return fmt.Errorf("at least one file must be passed to Restore")
default:
@@ -367,12 +409,20 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
+ ctx := k.SupervisorContext()
mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
- renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
- if err != nil {
- return fmt.Errorf("creating RestoreEnvironment: %v", err)
+ if kernel.VFS2Enabled {
+ ctx, err = mntr.configureRestore(ctx, cm.l.root.conf)
+ if err != nil {
+ return fmt.Errorf("configuring filesystem restore: %v", err)
+ }
+ } else {
+ renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
+ if err != nil {
+ return fmt.Errorf("creating RestoreEnvironment: %v", err)
+ }
+ fs.SetRestoreEnvironment(*renv)
}
- fs.SetRestoreEnvironment(*renv)
// Prepare to load from the state file.
if eps, ok := networkStack.(*netstack.Stack); ok {
@@ -399,7 +449,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Load the state.
loadOpts := state.LoadOpts{Source: specFile}
- if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
+ if err := loadOpts.Load(ctx, k, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
return err
}
@@ -444,9 +494,9 @@ func (cm *containerManager) Resume(_, _ *struct{}) error {
// Wait waits for the init process in the given container.
func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
- log.Debugf("containerManager.Wait")
+ log.Debugf("containerManager.Wait, cid: %s", *cid)
err := cm.l.waitContainer(*cid, waitStatus)
- log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err)
+ log.Debugf("containerManager.Wait returned, cid: %s, waitStatus: %#x, err: %v", *cid, *waitStatus, err)
return err
}
@@ -461,8 +511,10 @@ type WaitPIDArgs struct {
// WaitPID waits for the process with PID 'pid' in the sandbox.
func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
- log.Debugf("containerManager.Wait")
- return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
+ log.Debugf("containerManager.Wait, cid: %s, pid: %d", args.CID, args.PID)
+ err := cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
+ log.Debugf("containerManager.Wait, cid: %s, pid: %d, waitStatus: %#x, err: %v", args.CID, args.PID, *waitStatus, err)
+ return err
}
// SignalDeliveryMode enumerates different signal delivery modes.
@@ -519,6 +571,6 @@ type SignalArgs struct {
// indicated process, to all processes in the container, or to the foreground
// process group.
func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
- log.Debugf("containerManager.Signal %+v", args)
+ log.Debugf("containerManager.Signal: cid: %s, PID: %d, signal: %d, mode: %v", args.CID, args.PID, args.Signo, args.Mode)
return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode)
}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index a7c4ebb0c..4e3bb9ac7 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -343,6 +343,16 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_PKTINFO),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_IPV6),
seccomp.EqualTo(syscall.IPV6_TCLASS),
},
@@ -358,6 +368,11 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_SOCKET),
seccomp.EqualTo(syscall.SO_ERROR),
},
@@ -393,6 +408,11 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TIMESTAMP),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_TCP),
seccomp.EqualTo(syscall.TCP_NODELAY),
},
@@ -401,6 +421,11 @@ func hostInetFilters() seccomp.SyscallRules {
seccomp.EqualTo(syscall.SOL_TCP),
seccomp.EqualTo(syscall.TCP_INFO),
},
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(linux.TCP_INQ),
+ },
},
syscall.SYS_IOCTL: []seccomp.Rule{
{
@@ -449,6 +474,13 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TIMESTAMP),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_TCP),
seccomp.EqualTo(syscall.TCP_NODELAY),
seccomp.MatchAny{},
@@ -456,6 +488,13 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(linux.TCP_INQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_IP),
seccomp.EqualTo(syscall.IP_TOS),
seccomp.MatchAny{},
@@ -470,6 +509,20 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_PKTINFO),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_IPV6),
seccomp.EqualTo(syscall.IPV6_TCLASS),
seccomp.MatchAny{},
@@ -482,6 +535,13 @@ func hostInetFilters() seccomp.SyscallRules {
seccomp.MatchAny{},
seccomp.EqualTo(4),
},
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
{
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index ddf288456..2b0d2cd51 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -22,15 +22,6 @@ import (
"strings"
"syscall"
- // Include filesystem types that OCI spec might mount.
- _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
-
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -48,9 +39,18 @@ import (
tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
+
+ // Include filesystem types that OCI spec might mount.
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
)
const (
@@ -105,33 +105,28 @@ func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name
// mandatory mounts that are required by the OCI specification.
func compileMounts(spec *specs.Spec) []specs.Mount {
// Keep track of whether proc and sys were mounted.
- var procMounted, sysMounted bool
+ var procMounted, sysMounted, devMounted, devptsMounted bool
var mounts []specs.Mount
- // Always mount /dev.
- mounts = append(mounts, specs.Mount{
- Type: devtmpfs.Name,
- Destination: "/dev",
- })
-
- mounts = append(mounts, specs.Mount{
- Type: devpts.Name,
- Destination: "/dev/pts",
- })
-
// Mount all submounts from the spec.
for _, m := range spec.Mounts {
if !specutils.IsSupportedDevMount(m) {
log.Warningf("ignoring dev mount at %q", m.Destination)
continue
}
- mounts = append(mounts, m)
switch filepath.Clean(m.Destination) {
case "/proc":
procMounted = true
case "/sys":
sysMounted = true
+ case "/dev":
+ m.Type = devtmpfs.Name
+ devMounted = true
+ case "/dev/pts":
+ m.Type = devpts.Name
+ devptsMounted = true
}
+ mounts = append(mounts, m)
}
// Mount proc and sys even if the user did not ask for it, as the spec
@@ -149,6 +144,18 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
Destination: "/sys",
})
}
+ if !devMounted {
+ mandatoryMounts = append(mandatoryMounts, specs.Mount{
+ Type: devtmpfs.Name,
+ Destination: "/dev",
+ })
+ }
+ if !devptsMounted {
+ mandatoryMounts = append(mandatoryMounts, specs.Mount{
+ Type: devpts.Name,
+ Destination: "/dev/pts",
+ })
+ }
// The mandatory mounts should be ordered right after the root, in case
// there are submounts of these mandatory mounts already in the spec.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 8ad000497..3df013d34 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -35,6 +35,7 @@ import (
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -49,6 +50,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/sighandling"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
"gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -155,6 +157,11 @@ type execProcess struct {
// pidnsPath is the pid namespace path in spec
pidnsPath string
+
+ // hostTTY is present when creating a sub-container with terminal enabled.
+ // TTY file is passed during container create and must be saved until
+ // container start.
+ hostTTY *fd.FD
}
func init() {
@@ -476,6 +483,10 @@ func (l *Loader) Destroy() {
// save/restore.
l.k.Release()
+ // All sentry-created resources should have been released at this point;
+ // check for reference leaks.
+ refsvfs2.DoLeakCheck()
+
// In the success case, stdioFDs and goferFDs will only contain
// released/closed FDs that ownership has been passed over to host FDs and
// gofer sessions. Close them here in case of failure.
@@ -582,7 +593,9 @@ func (l *Loader) run() error {
// Create the root container init task. It will begin running
// when the kernel is started.
- if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+ var err error
+ _, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root)
+ if err != nil {
return err
}
@@ -621,7 +634,7 @@ func (l *Loader) run() error {
}
// createContainer creates a new container inside the sandbox.
-func (l *Loader) createContainer(cid string) error {
+func (l *Loader) createContainer(cid string, tty *fd.FD) error {
l.mu.Lock()
defer l.mu.Unlock()
@@ -629,14 +642,14 @@ func (l *Loader) createContainer(cid string) error {
if _, ok := l.processes[eid]; ok {
return fmt.Errorf("container %q already exists", cid)
}
- l.processes[eid] = &execProcess{}
+ l.processes[eid] = &execProcess{hostTTY: tty}
return nil
}
// startContainer starts a child container. It returns the thread group ID of
// the newly created process. Used FDs are either closed or released. It's safe
// for the caller to close any remaining files upon return.
-func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -689,36 +702,41 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
info := &containerInfo{
conf: conf,
spec: spec,
- stdioFDs: files[:3],
- goferFDs: files[3:],
+ goferFDs: goferFDs,
}
info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
- tg, err := l.createContainerProcess(false, cid, info, ep)
+
+ // Use stdios or TTY depending on the spec configuration.
+ if spec.Process.Terminal {
+ if len(stdioFDs) > 0 {
+ return fmt.Errorf("using TTY, stdios not expected: %v", stdioFDs)
+ }
+ if ep.hostTTY == nil {
+ return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
+ }
+ info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
+ ep.hostTTY = nil
+ } else {
+ info.stdioFDs = stdioFDs
+ }
+
+ ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info)
if err != nil {
return err
}
-
- // Success!
- l.k.StartProcess(tg)
- ep.tg = tg
+ l.k.StartProcess(ep.tg)
return nil
}
-func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
- console := false
- if root {
- // Only root container supports terminal for now.
- console = info.spec.Process.Terminal
- }
-
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
// Create the FD map, which will set stdin, stdout, and stderr.
ctx := info.procArgs.NewContext(l.k)
- fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs)
if err != nil {
- return nil, fmt.Errorf("importing fds: %v", err)
+ return nil, nil, nil, fmt.Errorf("importing fds: %v", err)
}
// CreateProcess takes a reference on fdTable if successful. We won't need
// ours either way.
@@ -730,14 +748,14 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
if root {
if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
- return nil, err
+ return nil, nil, nil, err
}
}
if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
- return nil, err
+ return nil, nil, nil, err
}
- // Add the HOME enviroment variable if it is not already set.
+ // Add the HOME environment variable if it is not already set.
var envv []string
if kernel.VFS2Enabled {
envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
@@ -748,29 +766,25 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
}
if err != nil {
- return nil, err
+ return nil, nil, nil, err
}
info.procArgs.Envv = envv
// Create and start the new process.
tg, _, err := l.k.CreateProcess(info.procArgs)
if err != nil {
- return nil, fmt.Errorf("creating process: %v", err)
+ return nil, nil, nil, fmt.Errorf("creating process: %v", err)
}
// CreateProcess takes a reference on FDTable if successful.
info.procArgs.FDTable.DecRef(ctx)
// Set the foreground process group on the TTY to the global init process
// group, since that is what we are about to start running.
- if root {
- switch {
- case ttyFileVFS2 != nil:
- ep.ttyVFS2 = ttyFileVFS2
- ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
- case ttyFile != nil:
- ep.tty = ttyFile
- ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
- }
+ switch {
+ case ttyFileVFS2 != nil:
+ ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+ case ttyFile != nil:
+ ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
}
// Install seccomp filters with the new task if there are any.
@@ -778,7 +792,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
if err != nil {
- return nil, fmt.Errorf("building seccomp program: %v", err)
+ return nil, nil, nil, fmt.Errorf("building seccomp program: %v", err)
}
if log.IsLogging(log.Debug) {
@@ -789,7 +803,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
task := tg.Leader()
// NOTE: It seems Flags are ignored by runc so we ignore them too.
if err := task.AppendSyscallFilter(program, true); err != nil {
- return nil, fmt.Errorf("appending seccomp filters: %v", err)
+ return nil, nil, nil, fmt.Errorf("appending seccomp filters: %v", err)
}
}
} else {
@@ -798,7 +812,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
}
}
- return tg, nil
+ return tg, ttyFile, ttyFileVFS2, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
@@ -882,7 +896,7 @@ func (l *Loader) destroyContainer(cid string) error {
}
}
- log.Debugf("Container destroyed %q", cid)
+ log.Debugf("Container destroyed, cid: %s", cid)
return nil
}
@@ -1068,7 +1082,12 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st
func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
- transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
+ transProtos := []stack.TransportProtocolFactory{
+ tcp.NewProtocol,
+ udp.NewProtocol,
+ icmp.NewProtocol4,
+ icmp.NewProtocol6,
+ }
s := netstack.Stack{stack.New(stack.Options{
NetworkProtocols: netProtos,
TransportProtocols: transProtos,
@@ -1079,6 +1098,7 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
// privileges.
RawFactory: raw.EndpointFactory{},
UniqueID: uniqueID,
+ IPTables: netfilter.DefaultLinuxTables(),
})}
// Enable SACK Recovery.
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e376f944b..b77b4762e 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -266,7 +266,7 @@ type CreateMountTestcase struct {
func createMountTestcases() []*CreateMountTestcase {
testCases := []*CreateMountTestcase{
- &CreateMountTestcase{
+ {
// Only proc.
name: "only proc mount",
spec: specs.Spec{
@@ -304,11 +304,10 @@ func createMountTestcases() []*CreateMountTestcase {
},
},
},
- // /some/deep/path should be mounted, along with /proc,
- // /dev, and /sys.
+ // /some/deep/path should be mounted, along with /proc, /dev, and /sys.
expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"},
},
- &CreateMountTestcase{
+ {
// Mounts are nested inside each other.
name: "nested mounts",
spec: specs.Spec{
@@ -352,7 +351,7 @@ func createMountTestcases() []*CreateMountTestcase {
expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux",
"/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"},
},
- &CreateMountTestcase{
+ {
name: "mount inside /dev",
spec: specs.Spec{
Root: &specs.Root{
@@ -395,35 +394,37 @@ func createMountTestcases() []*CreateMountTestcase {
},
expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"},
},
- }
-
- vfsCase := &CreateMountTestcase{
- name: "mounts inside mandatory mounts",
- spec: specs.Spec{
- Root: &specs.Root{
- Path: os.TempDir(),
- Readonly: true,
- },
- Mounts: []specs.Mount{
- {
- Destination: "/proc",
- Type: "tmpfs",
- },
- {
- Destination: "/sys/bar",
- Type: "tmpfs",
+ {
+ name: "mounts inside mandatory mounts",
+ spec: specs.Spec{
+ Root: &specs.Root{
+ Path: os.TempDir(),
+ Readonly: true,
},
-
- {
- Destination: "/tmp/baz",
- Type: "tmpfs",
+ Mounts: []specs.Mount{
+ {
+ Destination: "/proc",
+ Type: "tmpfs",
+ },
+ {
+ Destination: "/sys/bar",
+ Type: "tmpfs",
+ },
+ {
+ Destination: "/tmp/baz",
+ Type: "tmpfs",
+ },
+ {
+ Destination: "/dev/goo",
+ Type: "tmpfs",
+ },
},
},
+ expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz", "/dev/goo"},
},
- expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
}
- return append(testCases, vfsCase)
+ return testCases
}
// Test that MountNamespace can be created with various specs.
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 988573640..3d3a813df 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -28,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
- "gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -41,9 +40,9 @@ var (
// "::1/8" on "lo" interface.
DefaultLoopbackLink = LoopbackLink{
Name: "lo",
- Addresses: []net.IP{
- net.IP("\x7f\x00\x00\x01"),
- net.IPv6loopback,
+ Addresses: []IPWithPrefix{
+ {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8},
+ {Address: net.IPv6loopback, PrefixLen: 128},
},
Routes: []Route{
{
@@ -83,7 +82,7 @@ type DefaultRoute struct {
type FDBasedLink struct {
Name string
MTU int
- Addresses []net.IP
+ Addresses []IPWithPrefix
Routes []Route
GSOMaxSize uint32
SoftwareGSOEnabled bool
@@ -100,7 +99,7 @@ type FDBasedLink struct {
// LoopbackLink configures a loopback li nk.
type LoopbackLink struct {
Name string
- Addresses []net.IP
+ Addresses []IPWithPrefix
Routes []Route
}
@@ -118,6 +117,19 @@ type CreateLinksAndRoutesArgs struct {
Defaultv6Gateway DefaultRoute
}
+// IPWithPrefix is an address with its subnet prefix length.
+type IPWithPrefix struct {
+ // Address is a network address.
+ Address net.IP
+
+ // PrefixLen is the subnet prefix length.
+ PrefixLen int
+}
+
+func (ip IPWithPrefix) String() string {
+ return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen)
+}
+
// Empty returns true if route hasn't been set.
func (r *Route) Empty() bool {
return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
@@ -265,20 +277,19 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
// createNICWithAddrs creates a NIC in the network stack and adds the given
// addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []IPWithPrefix) error {
opts := stack.NICOptions{Name: name}
if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil {
return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
}
- // Always start with an arp address for the NIC.
- if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
- return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err)
- }
-
for _, addr := range addrs {
- proto, tcpipAddr := ipToAddressAndProto(addr)
- if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil {
+ proto, tcpipAddr := ipToAddressAndProto(addr.Address)
+ ap := tcpip.AddressWithPrefix{
+ Address: tcpipAddr,
+ PrefixLen: addr.PrefixLen,
+ }
+ if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil {
return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err)
}
}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 004da5b40..3fd28e516 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -210,6 +210,9 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *c
ReadOnly: c.root.Readonly,
GetFilesystemOptions: vfs.GetFilesystemOptions{
Data: strings.Join(data, ","),
+ InternalData: gofer.InternalFilesystemOptions{
+ UniqueID: "/",
+ },
},
InternalMount: true,
}
@@ -247,36 +250,76 @@ func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Cre
overlayOpts := *lowerOpts
overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
- // Next mount upper and lower. Upper is a tmpfs mount to keep all
- // modifications inside the sandbox.
- upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
- if err != nil {
- return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
- }
- cu := cleanup.Make(func() { upper.DecRef(ctx) })
- defer cu.Clean()
-
// All writes go to the upper layer, be paranoid and make lower readonly.
lowerOpts.ReadOnly = true
lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
if err != nil {
return nil, nil, err
}
- cu.Add(func() { lower.DecRef(ctx) })
+ cu := cleanup.Make(func() { lower.DecRef(ctx) })
+ defer cu.Clean()
- // Propagate the lower layer's root's owner, group, and mode to the upper
- // layer's root for consistency with VFS1.
- upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
+ // Determine the lower layer's root's type.
lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
Root: lowerRootVD,
Start: lowerRootVD,
}, &vfs.StatOptions{
- Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE,
+ Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
})
if err != nil {
- return nil, nil, err
+ return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
+ }
+ if stat.Mask&linux.STATX_TYPE == 0 {
+ return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
+ }
+ rootType := stat.Mode & linux.S_IFMT
+ if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
+ return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
+ }
+
+ // Upper is a tmpfs mount to keep all modifications inside the sandbox.
+ upperOpts.GetFilesystemOptions.InternalData = tmpfs.FilesystemOpts{
+ RootFileType: uint16(rootType),
+ }
+ upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+ }
+ cu.Add(func() { upper.DecRef(ctx) })
+
+ // If the overlay mount consists of a regular file, copy up its contents
+ // from the lower layer, since in the overlay the otherwise-empty upper
+ // layer file will take precedence.
+ upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
+ if rootType == linux.S_IFREG {
+ lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerRootVD,
+ Start: lowerRootVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
+ }
+ defer lowerFD.DecRef(ctx)
+ upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: upperRootVD,
+ Start: upperRootVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY,
+ })
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
+ }
+ defer upperFD.DecRef(ctx)
+ if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
+ return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
+ }
}
+
+ // Propagate the lower layer's root's owner, group, and mode to the upper
+ // layer's root for consistency with VFS1.
err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
Root: upperRootVD,
Start: upperRootVD,
@@ -427,6 +470,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
fsName := m.Type
useOverlay := false
var data []string
+ var iopts interface{}
// Find filesystem name and FS specific data field.
switch m.Type {
@@ -451,6 +495,9 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
}
data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+ iopts = gofer.InternalFilesystemOptions{
+ UniqueID: m.Destination,
+ }
// If configured, add overlay to all writable mounts.
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
@@ -462,7 +509,8 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
opts := &vfs.MountOptions{
GetFilesystemOptions: vfs.GetFilesystemOptions{
- Data: strings.Join(data, ","),
+ Data: strings.Join(data, ","),
+ InternalData: iopts,
},
InternalMount: true,
}
@@ -667,3 +715,21 @@ func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Crede
}
return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
}
+
+// configureRestore returns an updated context.Context including filesystem
+// state used by restore defined by conf.
+func (c *containerMounter) configureRestore(ctx context.Context, conf *config.Config) (context.Context, error) {
+ fdmap := make(map[string]int)
+ fdmap["/"] = c.fds.remove()
+ mounts, err := c.prepareMountsVFS2()
+ if err != nil {
+ return ctx, err
+ }
+ for i := range c.mounts {
+ submount := &mounts[i]
+ if submount.fd >= 0 {
+ fdmap[submount.Destination] = submount.fd
+ }
+ }
+ return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
+}