summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
Diffstat (limited to 'runsc')
-rw-r--r--runsc/BUILD12
-rw-r--r--runsc/boot/BUILD9
-rw-r--r--runsc/boot/config.go15
-rw-r--r--runsc/boot/controller.go65
-rw-r--r--runsc/boot/filter/config.go19
-rw-r--r--runsc/boot/filter/extra_filters_msan.go2
-rw-r--r--runsc/boot/fs.go74
-rw-r--r--runsc/boot/loader.go323
-rw-r--r--runsc/boot/loader_test.go16
-rw-r--r--runsc/boot/network.go9
-rw-r--r--runsc/boot/vfs.go396
-rw-r--r--runsc/cgroup/BUILD8
-rw-r--r--runsc/cgroup/cgroup.go71
-rw-r--r--runsc/cgroup/cgroup_test.go582
-rw-r--r--runsc/cmd/BUILD10
-rw-r--r--runsc/cmd/boot.go8
-rw-r--r--runsc/cmd/debug.go40
-rw-r--r--runsc/cmd/gofer.go15
-rw-r--r--runsc/cmd/spec.go222
-rw-r--r--runsc/cmd/statefile.go12
-rw-r--r--runsc/container/BUILD8
-rw-r--r--runsc/container/console_test.go3
-rw-r--r--runsc/container/container.go21
-rw-r--r--runsc/container/container_test.go319
-rw-r--r--runsc/container/multi_container_test.go337
-rwxr-xr-xrunsc/debian/postinst.sh9
-rw-r--r--runsc/fsgofer/BUILD3
-rw-r--r--runsc/fsgofer/filter/config.go1
-rw-r--r--runsc/fsgofer/fsgofer.go305
-rw-r--r--runsc/fsgofer/fsgofer_test.go169
-rw-r--r--runsc/main.go8
-rw-r--r--runsc/sandbox/BUILD3
-rw-r--r--runsc/sandbox/network.go17
-rw-r--r--runsc/sandbox/sandbox.go27
-rw-r--r--runsc/specutils/BUILD5
-rw-r--r--runsc/specutils/namespace.go16
-rw-r--r--runsc/specutils/specutils.go84
37 files changed, 2160 insertions, 1083 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
index 757f6d44c..96f697a5f 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -62,18 +62,22 @@ go_binary(
)
pkg_tar(
- name = "runsc-bin",
- srcs = [":runsc"],
+ name = "debian-bin",
+ srcs = [
+ ":runsc",
+ "//shim/v1:gvisor-containerd-shim",
+ "//shim/v2:containerd-shim-runsc-v1",
+ ],
mode = "0755",
package_dir = "/usr/bin",
- strip_prefix = "/runsc/linux_amd64_pure_stripped",
)
pkg_tar(
name = "debian-data",
extension = "tar.gz",
deps = [
- ":runsc-bin",
+ ":debian-bin",
+ "//shim:config",
],
)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index a907c103b..9f52438c2 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -40,6 +40,8 @@ go_library(
"//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/control",
"//pkg/sentry/devices/memdev",
+ "//pkg/sentry/devices/ttydev",
+ "//pkg/sentry/devices/tundev",
"//pkg/sentry/fdimport",
"//pkg/sentry/fs",
"//pkg/sentry/fs/dev",
@@ -53,8 +55,10 @@ go_library(
"//pkg/sentry/fs/user",
"//pkg/sentry/fsimpl/devpts",
"//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/fuse",
"//pkg/sentry/fsimpl/gofer",
"//pkg/sentry/fsimpl/host",
+ "//pkg/sentry/fsimpl/overlay",
"//pkg/sentry/fsimpl/proc",
"//pkg/sentry/fsimpl/sys",
"//pkg/sentry/fsimpl/tmpfs",
@@ -86,6 +90,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/link/fdbased",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/link/packetsocket",
"//pkg/tcpip/link/qdisc/fifo",
"//pkg/tcpip/link/sniffer",
"//pkg/tcpip/network/arp",
@@ -102,7 +107,7 @@ go_library(
"//runsc/boot/pprof",
"//runsc/specutils",
"@com_github_golang_protobuf//proto:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
@@ -127,7 +132,7 @@ go_test(
"//pkg/sync",
"//pkg/unet",
"//runsc/fsgofer",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index bcec7e4db..80da8b3e6 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -187,6 +187,12 @@ type Config struct {
// SoftwareGSO indicates that software segmentation offload is enabled.
SoftwareGSO bool
+ // TXChecksumOffload indicates that TX Checksum Offload is enabled.
+ TXChecksumOffload bool
+
+ // RXChecksumOffload indicates that RX Checksum Offload is enabled.
+ RXChecksumOffload bool
+
// QDisc indicates the type of queuening discipline to use by default
// for non-loopback interfaces.
QDisc QueueingDiscipline
@@ -268,6 +274,9 @@ type Config struct {
// Enables VFS2 (not plumbled through yet).
VFS2 bool
+
+ // Enables FUSE usage (not plumbled through yet).
+ FUSE bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -299,6 +308,8 @@ func (c *Config) ToFlags() []string {
"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
"--gso=" + strconv.FormatBool(c.HardwareGSO),
"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
+ "--rx-checksum-offload=" + strconv.FormatBool(c.RXChecksumOffload),
+ "--tx-checksum-offload=" + strconv.FormatBool(c.TXChecksumOffload),
"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
"--qdisc=" + c.QDisc.String(),
}
@@ -317,5 +328,9 @@ func (c *Config) ToFlags() []string {
f = append(f, "--vfs2=true")
}
+ if c.FUSE {
+ f = append(f, "--fuse=true")
+ }
+
return f
}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 8125d5061..626a3816e 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -101,14 +101,13 @@ const (
// Profiling related commands (see pprof.go for more details).
const (
- StartCPUProfile = "Profile.StartCPUProfile"
- StopCPUProfile = "Profile.StopCPUProfile"
- HeapProfile = "Profile.HeapProfile"
- GoroutineProfile = "Profile.GoroutineProfile"
- BlockProfile = "Profile.BlockProfile"
- MutexProfile = "Profile.MutexProfile"
- StartTrace = "Profile.StartTrace"
- StopTrace = "Profile.StopTrace"
+ StartCPUProfile = "Profile.StartCPUProfile"
+ StopCPUProfile = "Profile.StopCPUProfile"
+ HeapProfile = "Profile.HeapProfile"
+ BlockProfile = "Profile.BlockProfile"
+ MutexProfile = "Profile.MutexProfile"
+ StartTrace = "Profile.StartTrace"
+ StopTrace = "Profile.StopTrace"
)
// Logging related commands (see logging.go for more details).
@@ -129,42 +128,52 @@ type controller struct {
// manager holds the containerManager methods.
manager *containerManager
+
+ // pprop holds the profile instance if enabled. It may be nil.
+ pprof *control.Profile
}
// newController creates a new controller. The caller must call
// controller.srv.StartServing() to start the controller.
func newController(fd int, l *Loader) (*controller, error) {
- srv, err := server.CreateFromFD(fd)
+ ctrl := &controller{}
+ var err error
+ ctrl.srv, err = server.CreateFromFD(fd)
if err != nil {
return nil, err
}
- manager := &containerManager{
+ ctrl.manager = &containerManager{
startChan: make(chan struct{}),
startResultChan: make(chan error),
l: l,
}
- srv.Register(manager)
+ ctrl.srv.Register(ctrl.manager)
if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
net := &Network{
Stack: eps.Stack,
}
- srv.Register(net)
+ ctrl.srv.Register(net)
}
- srv.Register(&debug{})
- srv.Register(&control.Logging{})
- if l.conf.ProfileEnable {
- srv.Register(&control.Profile{
- Kernel: l.k,
- })
+ ctrl.srv.Register(&debug{})
+ ctrl.srv.Register(&control.Logging{})
+
+ if l.root.conf.ProfileEnable {
+ ctrl.pprof = &control.Profile{Kernel: l.k}
+ ctrl.srv.Register(ctrl.pprof)
}
- return &controller{
- srv: srv,
- manager: manager,
- }, nil
+ return ctrl, nil
+}
+
+func (c *controller) stop() {
+ if c.pprof != nil {
+ // These are noop if there is nothing being profiled.
+ _ = c.pprof.StopCPUProfile(nil, nil)
+ _ = c.pprof.StopTrace(nil, nil)
+ }
}
// containerManager manages sandbox containers.
@@ -333,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Pause the kernel while we build a new one.
cm.l.k.Pause()
- p, err := createPlatform(cm.l.conf, deviceFile)
+ p, err := createPlatform(cm.l.root.conf, deviceFile)
if err != nil {
return fmt.Errorf("creating platform: %v", err)
}
@@ -349,8 +358,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
- renv, err := mntr.createRestoreEnvironment(cm.l.conf)
+ mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints)
+ renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
}
@@ -368,7 +377,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("file cannot be empty")
}
- if cm.l.conf.ProfileEnable {
+ if cm.l.root.conf.ProfileEnable {
// pprof.Initialize opens /proc/self/maps, so has to be called before
// installing seccomp filters.
pprof.Initialize()
@@ -387,13 +396,13 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
// Since we have a new kernel we also must make a new watchdog.
dogOpts := watchdog.DefaultOpts
- dogOpts.TaskTimeoutAction = cm.l.conf.WatchdogAction
+ dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
// Change the loader fields to reflect the changes made when restoring.
cm.l.k = k
cm.l.watchdog = dog
- cm.l.rootProcArgs = kernel.CreateProcessArgs{}
+ cm.l.root.procArgs = kernel.CreateProcessArgs{}
cm.l.restore = true
// Reinitialize the sandbox ID and processes map. Note that it doesn't
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 98cdd90dd..149eb0b1b 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -288,6 +288,14 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_SIGALTSTACK: {},
unix.SYS_STATX: {},
syscall.SYS_SYNC_FILE_RANGE: {},
+ syscall.SYS_TEE: []seccomp.Rule{
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowAny{},
+ seccomp.AllowValue(1), /* len */
+ seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+ },
+ },
syscall.SYS_TGKILL: []seccomp.Rule{
{
seccomp.AllowValue(uint64(os.Getpid())),
@@ -302,19 +310,12 @@ var allowedSyscalls = seccomp.SyscallRules{
},
},
syscall.SYS_WRITE: {},
- // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with
- // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR
- // option is enabled for a packet socket.
+ // For rawfile.NonBlockingWriteIovec.
syscall.SYS_WRITEV: []seccomp.Rule{
{
seccomp.AllowAny{},
seccomp.AllowAny{},
- seccomp.AllowValue(2),
- },
- {
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(3),
+ seccomp.GreaterThan(0),
},
},
}
diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go
index 5e5a3c998..209e646a7 100644
--- a/runsc/boot/filter/extra_filters_msan.go
+++ b/runsc/boot/filter/extra_filters_msan.go
@@ -26,6 +26,8 @@ import (
func instrumentationFilters() seccomp.SyscallRules {
Report("MSAN is enabled: syscall filters less restrictive!")
return seccomp.SyscallRules{
+ syscall.SYS_CLONE: {},
+ syscall.SYS_MMAP: {},
syscall.SYS_SCHED_GETAFFINITY: {},
syscall.SYS_SET_ROBUST_LIST: {},
}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index e1181271a..9dd5b0184 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -29,6 +29,7 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -37,6 +38,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.dev/gvisor/pkg/sentry/fs/user"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
@@ -62,7 +64,7 @@ const (
)
// tmpfs has some extra supported options that we must pass through.
-var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+var tmpfsAllowedData = []string{"mode", "uid", "gid"}
func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
// Upper layer uses the same flags as lower, but it must be read-write.
@@ -153,8 +155,8 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
return mounts
}
-// p9MountOptions creates a slice of options for a p9 mount.
-func p9MountOptions(fd int, fa FileAccessType, vfs2 bool) []string {
+// p9MountData creates a slice of p9 mount data.
+func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
opts := []string{
"trans=fd",
"rfdno=" + strconv.Itoa(fd),
@@ -221,9 +223,6 @@ func mountFlags(opts []string) fs.MountSourceFlags {
mf.NoAtime = true
case "noexec":
mf.NoExec = true
- case "bind", "rbind":
- // When options include either "bind" or "rbind",
- // it's converted to a 9P mount.
default:
log.Warningf("ignoring unknown mount option %q", o)
}
@@ -237,7 +236,7 @@ func isSupportedMountFlag(fstype, opt string) bool {
return true
}
if fstype == tmpfsvfs2.Name {
- ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
+ ok, err := parseMountOption(opt, tmpfsAllowedData...)
return ok && err == nil
}
return false
@@ -294,19 +293,12 @@ func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter,
// Set namespace here so that it can be found in ctx.
procArgs.MountNamespace = mns
- return setExecutablePath(ctx, procArgs)
-}
-
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
- paths := fs.GetPath(procArgs.Envv)
- exe := procArgs.Argv[0]
- f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
+ // Resolve the executable path from working dir and environment.
+ resolved, err := user.ResolveExecutablePath(ctx, procArgs)
if err != nil {
- return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+ return err
}
- procArgs.Filename = f
+ procArgs.Filename = resolved
return nil
}
@@ -399,6 +391,10 @@ type mountHint struct {
// root is the inode where the volume is mounted. For mounts with 'pod' share
// the volume is mounted once and then bind mounted inside the containers.
root *fs.Inode
+
+ // vfsMount is the master mount for the volume. For mounts with 'pod' share
+ // the master volume is bind mounted inside the containers.
+ vfsMount *vfs.Mount
}
func (m *mountHint) setField(key, val string) error {
@@ -580,9 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
// processHints processes annotations that container hints about how volumes
// should be mounted (e.g. a volume shared between containers). It must be
// called for the root container only.
-func (c *containerMounter) processHints(conf *Config) error {
+func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
if conf.VFS2 {
- return nil
+ return c.processHintsVFS2(conf, creds)
}
ctx := c.k.SupervisorContext()
for _, hint := range c.hints.mounts {
@@ -644,7 +640,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, m := range c.mounts {
log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
@@ -725,7 +721,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
fd := c.fds.remove()
log.Infof("Mounting root over 9P, ioFD: %d", fd)
p9FS := mustFindFilesystem("9p")
- opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */)
+ opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
if conf.OverlayfsStaleRead {
// We can't check for overlayfs here because sandbox is chroot'ed and gofer
@@ -770,10 +766,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
useOverlay bool
)
- if isBindMount(m) {
- m.Type = bind
- }
-
switch m.Type {
case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
fsName = m.Type
@@ -783,7 +775,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
fsName = m.Type
var err error
- opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+ opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
if err != nil {
return "", nil, false, err
}
@@ -791,7 +783,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
case bind:
fd := c.fds.remove()
fsName = gofervfs2.Name
- opts = p9MountOptions(fd, c.getMountAccessType(m), conf.VFS2)
+ opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2)
// If configured, add overlay to all writable mounts.
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
@@ -801,18 +793,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
return fsName, opts, useOverlay, nil
}
-func isBindMount(m specs.Mount) bool {
- for _, opt := range m.Options {
- // When options include either "bind" or "rbind", this behaves as
- // bind mount even if the mount type is equal to a filesystem supported
- // on runsc.
- if opt == "bind" || opt == "rbind" {
- return true
- }
- }
- return false
-}
-
func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
if hint := c.hints.findMount(mount); hint != nil {
return hint.fileAccessType()
@@ -888,7 +868,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
if err != nil {
return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
}
- defer dirent.DecRef()
+ defer dirent.DecRef(ctx)
if err := mns.Mount(ctx, dirent, inode); err != nil {
return fmt.Errorf("mount %q error: %v", m.Destination, err)
}
@@ -909,12 +889,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
if err != nil {
return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
}
- defer target.DecRef()
+ defer target.DecRef(ctx)
// Take a ref on the inode that is about to be (re)-mounted.
source.root.IncRef()
if err := mns.Mount(ctx, target, source.root); err != nil {
- source.root.DecRef()
+ source.root.DecRef(ctx)
return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
}
@@ -956,7 +936,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
// Add root mount.
fd := c.fds.remove()
- opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */)
+ opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
mf := fs.MountSourceFlags{}
if c.root.Readonly || conf.Overlay {
@@ -1017,12 +997,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
switch err {
case nil:
// Found '/tmp' in filesystem, check if it's empty.
- defer tmp.DecRef()
+ defer tmp.DecRef(ctx)
f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
if err != nil {
return err
}
- defer f.DecRef()
+ defer f.DecRef(ctx)
serializer := &fs.CollectEntriesSerializer{}
if err := f.Readdir(ctx, serializer); err != nil {
return err
@@ -1044,7 +1024,7 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
Destination: "/tmp",
// Sticky bit is added to prevent accidental deletion of files from
// another user. This is normally done for /tmp.
- Options: []string{"mode=1777"},
+ Options: []string{"mode=01777"},
}
return c.mountSubmount(ctx, conf, mns, root, tmpMount)
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index f802bc9fb..40c6f99fd 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,12 +16,12 @@
package boot
import (
+ "errors"
"fmt"
mrand "math/rand"
"os"
"runtime"
"sync/atomic"
- "syscall"
gtime "time"
specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -32,6 +32,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
@@ -77,29 +78,34 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
-// Loader keeps state needed to start the kernel and run the container..
-type Loader struct {
- // k is the kernel.
- k *kernel.Kernel
-
- // ctrl is the control server.
- ctrl *controller
-
+type containerInfo struct {
conf *Config
- // console is set to true if terminal is enabled.
- console bool
+ // spec is the base configuration for the root container.
+ spec *specs.Spec
- watchdog *watchdog.Watchdog
+ // procArgs refers to the container's init task.
+ procArgs kernel.CreateProcessArgs
// stdioFDs contains stdin, stdout, and stderr.
stdioFDs []int
// goferFDs are the FDs that attach the sandbox to the gofers.
goferFDs []int
+}
- // spec is the base configuration for the root container.
- spec *specs.Spec
+// Loader keeps state needed to start the kernel and run the container..
+type Loader struct {
+ // k is the kernel.
+ k *kernel.Kernel
+
+ // ctrl is the control server.
+ ctrl *controller
+
+ // root contains information about the root container in the sandbox.
+ root containerInfo
+
+ watchdog *watchdog.Watchdog
// stopSignalForwarding disables forwarding of signals to the sandboxed
// container. It should be called when a sandbox is destroyed.
@@ -108,9 +114,6 @@ type Loader struct {
// restore is set to true if we are restoring a container.
restore bool
- // rootProcArgs refers to the root sandbox init task.
- rootProcArgs kernel.CreateProcessArgs
-
// sandboxID is the ID for the whole sandbox.
sandboxID string
@@ -175,8 +178,6 @@ type Args struct {
// StdioFDs is the stdio for the application. The Loader takes ownership of
// these FDs and may close them at any time.
StdioFDs []int
- // Console is set to true if using TTY.
- Console bool
// NumCPU is the number of CPUs to create inside the sandbox.
NumCPU int
// TotalMem is the initial amount of total memory to report back to the
@@ -187,7 +188,7 @@ type Args struct {
}
// make sure stdioFDs are always the same on initial start and on restore
-const startingStdioFD = 64
+const startingStdioFD = 256
// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
@@ -205,6 +206,10 @@ func New(args Args) (*Loader, error) {
// Is this a VFSv2 kernel?
if args.Conf.VFS2 {
kernel.VFS2Enabled = true
+ if args.Conf.FUSE {
+ kernel.FUSEEnabled = true
+ }
+
vfs2.Override()
}
@@ -227,9 +232,7 @@ func New(args Args) (*Loader, error) {
// Create VDSO.
//
// Pass k as the platform since it is savable, unlike the actual platform.
- //
- // FIXME(b/109889800): Use non-nil context.
- vdso, err := loader.PrepareVDSO(nil, k)
+ vdso, err := loader.PrepareVDSO(k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
@@ -300,6 +303,12 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
+ if kernel.VFS2Enabled {
+ if err := registerFilesystems(k); err != nil {
+ return nil, fmt.Errorf("registering filesystems: %w", err)
+ }
+ }
+
if err := adjustDirentCache(k); err != nil {
return nil, err
}
@@ -318,7 +327,7 @@ func New(args Args) (*Loader, error) {
dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
- procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
+ procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
if err != nil {
return nil, fmt.Errorf("creating init process for root container: %v", err)
}
@@ -338,7 +347,7 @@ func New(args Args) (*Loader, error) {
if err != nil {
return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
}
- defer hostFilesystem.DecRef()
+ defer hostFilesystem.DecRef(k.SupervisorContext())
hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
@@ -352,31 +361,37 @@ func New(args Args) (*Loader, error) {
var stdioFDs []int
newfd := startingStdioFD
for _, fd := range args.StdioFDs {
- err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
+ // Check that newfd is unused to avoid clobbering over it.
+ if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+ if err != nil {
+ return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+ }
+ return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
+ }
+
+ err := unix.Dup3(fd, newfd, unix.O_CLOEXEC)
if err != nil {
return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
}
stdioFDs = append(stdioFDs, newfd)
- err = syscall.Close(fd)
- if err != nil {
- return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
- }
+ _ = unix.Close(fd)
newfd++
}
eid := execID{cid: args.ID}
l := &Loader{
- k: k,
- conf: args.Conf,
- console: args.Console,
- watchdog: dog,
- spec: args.Spec,
- goferFDs: args.GoferFDs,
- stdioFDs: stdioFDs,
- rootProcArgs: procArgs,
- sandboxID: args.ID,
- processes: map[execID]*execProcess{eid: {}},
- mountHints: mountHints,
+ k: k,
+ watchdog: dog,
+ sandboxID: args.ID,
+ processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
+ root: containerInfo{
+ conf: args.Conf,
+ stdioFDs: stdioFDs,
+ goferFDs: args.GoferFDs,
+ spec: args.Spec,
+ procArgs: procArgs,
+ },
}
// We don't care about child signals; some platforms can generate a
@@ -404,8 +419,8 @@ func New(args Args) (*Loader, error) {
return l, nil
}
-// newProcess creates a process that can be run with kernel.CreateProcess.
-func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
+// createProcessArgs creates args that can be used with kernel.CreateProcess.
+func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
// Create initial limits.
ls, err := createLimitSet(spec)
if err != nil {
@@ -449,6 +464,11 @@ func (l *Loader) Destroy() {
l.stopSignalForwarding()
}
l.watchdog.Stop()
+
+ for i, fd := range l.root.stdioFDs {
+ _ = unix.Close(fd)
+ l.root.stdioFDs[i] = -1
+ }
}
func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
@@ -479,13 +499,13 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
}
func (l *Loader) installSeccompFilters() error {
- if l.conf.DisableSeccomp {
+ if l.root.conf.DisableSeccomp {
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
} else {
opts := filter.Options{
Platform: l.k.Platform,
- HostNetwork: l.conf.Network == NetworkHost,
- ProfileEnable: l.conf.ProfileEnable,
+ HostNetwork: l.root.conf.Network == NetworkHost,
+ ProfileEnable: l.root.conf.ProfileEnable,
ControllerFD: l.ctrl.srv.FD(),
}
if err := filter.Install(opts); err != nil {
@@ -511,7 +531,7 @@ func (l *Loader) Run() error {
}
func (l *Loader) run() error {
- if l.conf.Network == NetworkHost {
+ if l.root.conf.Network == NetworkHost {
// Delay host network configuration to this point because network namespace
// is configured after the loader is created and before Run() is called.
log.Debugf("Configuring host network")
@@ -532,10 +552,8 @@ func (l *Loader) run() error {
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
- var ttyFile *host.TTYFileOperations
- var ttyFileVFS2 *hostvfs2.TTYFileDescription
if !l.restore {
- if l.conf.ProfileEnable {
+ if l.root.conf.ProfileEnable {
pprof.Initialize()
}
@@ -545,82 +563,29 @@ func (l *Loader) run() error {
return err
}
- // Create the FD map, which will set stdin, stdout, and stderr. If console
- // is true, then ioctl calls will be passed through to the host fd.
- ctx := l.rootProcArgs.NewContext(l.k)
- var err error
-
- // CreateProcess takes a reference on FDMap if successful. We won't need
- // ours either way.
- l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
- if err != nil {
- return fmt.Errorf("importing fds: %v", err)
- }
-
- // Setup the root container file system.
- l.startGoferMonitor(l.sandboxID, l.goferFDs)
-
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
- return err
- }
- if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
- return err
- }
-
- // Add the HOME enviroment variable if it is not already set.
- var envv []string
- if kernel.VFS2Enabled {
- envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
-
- } else {
- envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
- l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
- }
- if err != nil {
- return err
- }
- l.rootProcArgs.Envv = envv
-
// Create the root container init task. It will begin running
// when the kernel is started.
- if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
- return fmt.Errorf("creating init process: %v", err)
+ if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+ return err
}
-
- // CreateProcess takes a reference on FDTable if successful.
- l.rootProcArgs.FDTable.DecRef()
}
ep.tg = l.k.GlobalInit()
- if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
+ if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
ep.pidnsPath = ns.Path
}
- if l.console {
- // Set the foreground process group on the TTY to the global init process
- // group, since that is what we are about to start running.
- switch {
- case ttyFileVFS2 != nil:
- ep.ttyVFS2 = ttyFileVFS2
- ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- case ttyFile != nil:
- ep.tty = ttyFile
- ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
- }
- }
// Handle signals by forwarding them to the root container process
// (except for panic signal, which should cause a panic).
l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
// Panic signal should cause a panic.
- if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+ if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
panic("Signal-induced panic")
}
// Otherwise forward to root container.
deliveryMode := DeliverToProcess
- if l.console {
+ if l.root.spec.Process.Terminal {
// Since we are running with a console, we should forward the signal to
// the foreground process group so that job control signals like ^C can
// be handled properly.
@@ -637,11 +602,9 @@ func (l *Loader) run() error {
// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
// passed FDs, so only close for VFS1.
if !kernel.VFS2Enabled {
- for _, fd := range l.stdioFDs {
- err := syscall.Close(fd)
- if err != nil {
- return fmt.Errorf("close dup()ed stdioFDs: %v", err)
- }
+ for i, fd := range l.root.stdioFDs {
+ _ = unix.Close(fd)
+ l.root.stdioFDs[i] = -1
}
}
@@ -676,8 +639,8 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
l.mu.Lock()
defer l.mu.Unlock()
- eid := execID{cid: cid}
- if _, ok := l.processes[eid]; !ok {
+ ep := l.processes[execID{cid: cid}]
+ if ep == nil {
return fmt.Errorf("trying to start a deleted container %q", cid)
}
@@ -711,61 +674,112 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
if pidns == nil {
pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
}
- l.processes[eid].pidnsPath = ns.Path
+ ep.pidnsPath = ns.Path
} else {
pidns = l.k.RootPIDNamespace()
}
- procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
+
+ info := &containerInfo{
+ conf: conf,
+ spec: spec,
+ }
+ info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
- var stdioFDs []int
for _, f := range files[:3] {
- stdioFDs = append(stdioFDs, int(f.Fd()))
- }
-
- // Create the FD map, which will set stdin, stdout, and stderr.
- ctx := procArgs.NewContext(l.k)
- fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
- if err != nil {
- return fmt.Errorf("importing fds: %v", err)
+ info.stdioFDs = append(info.stdioFDs, int(f.Fd()))
}
- // CreateProcess takes a reference on fdTable if successful. We won't
- // need ours either way.
- procArgs.FDTable = fdTable
// Can't take ownership away from os.File. dup them to get a new FDs.
- var goferFDs []int
for _, f := range files[3:] {
- fd, err := syscall.Dup(int(f.Fd()))
+ fd, err := unix.Dup(int(f.Fd()))
if err != nil {
return fmt.Errorf("failed to dup file: %v", err)
}
- goferFDs = append(goferFDs, fd)
+ info.goferFDs = append(info.goferFDs, fd)
+ }
+
+ tg, err := l.createContainerProcess(false, cid, info, ep)
+ if err != nil {
+ return err
+ }
+
+ // Success!
+ l.k.StartProcess(tg)
+ ep.tg = tg
+ return nil
+}
+
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
+ console := false
+ if root {
+ // Only root container supports terminal for now.
+ console = info.spec.Process.Terminal
+ }
+
+ // Create the FD map, which will set stdin, stdout, and stderr.
+ ctx := info.procArgs.NewContext(l.k)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
+ if err != nil {
+ return nil, fmt.Errorf("importing fds: %v", err)
}
+ // CreateProcess takes a reference on fdTable if successful. We won't need
+ // ours either way.
+ info.procArgs.FDTable = fdTable
// Setup the child container file system.
- l.startGoferMonitor(cid, goferFDs)
+ l.startGoferMonitor(cid, info.goferFDs)
- mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
- if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
- return err
+ mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
+ if root {
+ if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
+ return nil, err
+ }
+ }
+ if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
+ return nil, err
}
- // Create and start the new process.
- tg, _, err := l.k.CreateProcess(procArgs)
+ // Add the HOME enviroment variable if it is not already set.
+ var envv []string
+ if kernel.VFS2Enabled {
+ envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
+
+ } else {
+ envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
+ info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
+ }
if err != nil {
- return fmt.Errorf("creating process: %v", err)
+ return nil, err
}
- l.k.StartProcess(tg)
+ info.procArgs.Envv = envv
+ // Create and start the new process.
+ tg, _, err := l.k.CreateProcess(info.procArgs)
+ if err != nil {
+ return nil, fmt.Errorf("creating process: %v", err)
+ }
// CreateProcess takes a reference on FDTable if successful.
- procArgs.FDTable.DecRef()
+ info.procArgs.FDTable.DecRef(ctx)
- l.processes[eid].tg = tg
- return nil
+ // Set the foreground process group on the TTY to the global init process
+ // group, since that is what we are about to start running.
+ if root {
+ switch {
+ case ttyFileVFS2 != nil:
+ ep.ttyVFS2 = ttyFileVFS2
+ ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+ case ttyFile != nil:
+ ep.tty = ttyFile
+ ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
+ }
+ }
+
+ return tg, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
@@ -885,22 +899,20 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
// Add the HOME environment variable if it is not already set.
if kernel.VFS2Enabled {
- defer args.MountNamespaceVFS2.DecRef()
-
root := args.MountNamespaceVFS2.Root()
- defer root.DecRef()
ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+ defer args.MountNamespaceVFS2.DecRef(ctx)
+ defer root.DecRef(ctx)
envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
if err != nil {
return 0, err
}
args.Envv = envv
} else {
- defer args.MountNamespace.DecRef()
-
root := args.MountNamespace.Root()
- defer root.DecRef()
ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+ defer args.MountNamespace.DecRef(ctx)
+ defer root.DecRef(ctx)
envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
if err != nil {
return 0, err
@@ -997,6 +1009,11 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
// Wait for container.
l.k.WaitExited()
+ // Cleanup
+ l.ctrl.stop()
+
+ refs.OnExit()
+
return l.k.GlobalInit().ExitStatus()
}
@@ -1044,7 +1061,7 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
// Enable SACK Recovery.
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
- return nil, fmt.Errorf("failed to enable SACK: %v", err)
+ return nil, fmt.Errorf("failed to enable SACK: %s", err)
}
// Set default TTLs as required by socket/netstack.
@@ -1053,11 +1070,9 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
// Enable Receive Buffer Auto-Tuning.
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
- return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
+ return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
}
- s.FillDefaultIPTables()
-
return &s, nil
}
@@ -1260,7 +1275,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
fdTable := k.NewFDTable()
ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
if err != nil {
- fdTable.DecRef()
+ fdTable.DecRef(ctx)
return nil, nil, nil, err
}
return fdTable, ttyFile, ttyFileVFS2, nil
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e448fd773..aa3fdf96c 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) {
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, p := range tc.expectedPaths {
maxTraversals := uint(0)
if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
} else {
- d.DecRef()
+ d.DecRef(ctx)
}
}
})
@@ -479,19 +479,19 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
defer l.Destroy()
defer loaderCleanup()
- mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
- if err := mntr.processHints(l.conf); err != nil {
+ mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints)
+ if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil {
t.Fatalf("failed process hints: %v", err)
}
ctx := l.k.SupervisorContext()
- mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs)
+ mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
if err != nil {
t.Fatalf("failed to setupVFS2: %v", err)
}
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
for _, p := range tc.expectedPaths {
target := &vfs.PathOperation{
Root: root,
@@ -499,10 +499,10 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
Path: fspath.Parse(p),
}
- if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
+ if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
} else {
- d.DecRef()
+ d.DecRef(ctx)
}
}
})
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0af30456e..4e1fa7665 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -123,6 +124,8 @@ type FDBasedLink struct {
Routes []Route
GSOMaxSize uint32
SoftwareGSOEnabled bool
+ TXChecksumOffload bool
+ RXChecksumOffload bool
LinkAddress net.HardwareAddr
QDisc QueueingDiscipline
@@ -236,7 +239,8 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
PacketDispatchMode: fdbased.RecvMMsg,
GSOMaxSize: link.GSOMaxSize,
SoftwareGSOEnabled: link.SoftwareGSOEnabled,
- RXChecksumOffload: true,
+ TXChecksumOffload: link.TXChecksumOffload,
+ RXChecksumOffload: link.RXChecksumOffload,
})
if err != nil {
return err
@@ -249,6 +253,9 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
linkEP = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000)
}
+ // Enable support for AF_PACKET sockets to receive outgoing packets.
+ linkEP = packetsocket.New(linkEP)
+
log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 147c901c4..08dce8b6c 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -22,25 +22,33 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
+ "gvisor.dev/gvisor/pkg/sentry/devices/tundev"
+ "gvisor.dev/gvisor/pkg/sentry/fs/user"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
- "gvisor.dev/gvisor/pkg/syserror"
-
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
)
-func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+func registerFilesystems(k *kernel.Kernel) error {
+ ctx := k.SupervisorContext()
+ creds := auth.NewRootCredentials(k.RootUserNamespace())
+ vfsObj := k.VFS()
+
vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserList: true,
// TODO(b/29356795): Users may mount this once the terminals are in a
@@ -54,6 +62,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserList: true,
})
+ vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
AllowUserList: true,
@@ -66,98 +78,75 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre
AllowUserMount: true,
AllowUserList: true,
})
+ vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
// Setup files in devtmpfs.
if err := memdev.Register(vfsObj); err != nil {
return fmt.Errorf("registering memdev: %w", err)
}
+ if err := ttydev.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering ttydev: %w", err)
+ }
+ tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
+ if tunSupported {
+ if err := tundev.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering tundev: %v", err)
+ }
+ }
+
+ if kernel.FUSEEnabled {
+ if err := fuse.Register(vfsObj); err != nil {
+ return fmt.Errorf("registering fusedev: %w", err)
+ }
+ }
+
a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
if err != nil {
return fmt.Errorf("creating devtmpfs accessor: %w", err)
}
- defer a.Release()
+ defer a.Release(ctx)
if err := a.UserspaceInit(ctx); err != nil {
return fmt.Errorf("initializing userspace: %w", err)
}
if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
- return fmt.Errorf("creating devtmpfs files: %w", err)
+ return fmt.Errorf("creating memdev devtmpfs files: %w", err)
+ }
+ if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
+ return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
+ }
+ if tunSupported {
+ if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+ return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+ }
}
+
+ if kernel.FUSEEnabled {
+ if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
+ return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
+ }
+ }
+
return nil
}
func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
- if err := mntr.k.VFS().Init(); err != nil {
- return fmt.Errorf("failed to initialize VFS: %w", err)
- }
mns, err := mntr.setupVFS2(ctx, conf, procArgs)
if err != nil {
return fmt.Errorf("failed to setupFS: %w", err)
}
procArgs.MountNamespaceVFS2 = mns
- return setExecutablePathVFS2(ctx, procArgs)
-}
-
-func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
- exe := procArgs.Argv[0]
-
- // Absolute paths can be used directly.
- if path.IsAbs(exe) {
- procArgs.Filename = exe
- return nil
- }
-
- // Paths with '/' in them should be joined to the working directory, or
- // to the root if working directory is not set.
- if strings.IndexByte(exe, '/') > 0 {
- if !path.IsAbs(procArgs.WorkingDirectory) {
- return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory)
- }
- procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
- return nil
- }
-
- // Paths with a '/' are relative to the CWD.
- if strings.IndexByte(exe, '/') > 0 {
- procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe)
- return nil
- }
-
- // Otherwise, We must lookup the name in the paths, starting from the
- // root directory.
- root := procArgs.MountNamespaceVFS2.Root()
- defer root.DecRef()
- paths := fs.GetPath(procArgs.Envv)
- creds := procArgs.Credentials
-
- for _, p := range paths {
- binPath := path.Join(p, exe)
- pop := &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(binPath),
- FollowFinalSymlink: true,
- }
- opts := &vfs.OpenOptions{
- FileExec: true,
- Flags: linux.O_RDONLY,
- }
- dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts)
- if err == syserror.ENOENT || err == syserror.EACCES {
- // Didn't find it here.
- continue
- }
- if err != nil {
- return err
- }
- dentry.DecRef()
-
- procArgs.Filename = binPath
- return nil
+ // Resolve the executable path from working dir and environment.
+ resolved, err := user.ResolveExecutablePath(ctx, procArgs)
+ if err != nil {
+ return err
}
-
- return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":"))
+ procArgs.Filename = resolved
+ return nil
}
func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
@@ -173,10 +162,6 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
rootCtx := procArgs.NewContext(c.k)
- if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil {
- return nil, fmt.Errorf("register filesystems: %w", err)
- }
-
mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
if err != nil {
return nil, fmt.Errorf("creating mount namespace: %w", err)
@@ -192,10 +177,19 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
fd := c.fds.remove()
- opts := strings.Join(p9MountOptions(fd, conf.FileAccess, true /* vfs2 */), ",")
+ opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+ if conf.OverlayfsStaleRead {
+ // We can't check for overlayfs here because sandbox is chroot'ed and gofer
+ // can only send mount options for specs.Mounts (specs.Root is missing
+ // Options field). So assume root is always on top of overlayfs.
+ opts = append(opts, "overlayfs_stale_read")
+ }
log.Infof("Mounting root over 9P, ioFD: %d", fd)
- mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+ mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
+ Data: strings.Join(opts, ","),
+ })
if err != nil {
return nil, fmt.Errorf("setting up mount namespace: %w", err)
}
@@ -211,13 +205,20 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
for i := range mounts {
submount := &mounts[i]
log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
- if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
- return err
+ if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
+ if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
+ }
+ } else {
+ if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil {
+ return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
+ }
}
}
- // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go.
-
+ if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
+ return fmt.Errorf(`mount submount "\tmp": %w`, err)
+ }
return nil
}
@@ -235,7 +236,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
fd := -1
// Only bind mounts use host FDs; see
// containerMounter.getMountNameAndOptionsVFS2.
- if m.Type == bind || isBindMount(m) {
+ if m.Type == bind {
fd = c.fds.remove()
}
mounts = append(mounts, mountAndFD{
@@ -255,22 +256,19 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
return mounts, nil
}
-// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1
-// version.
func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
root := mns.Root()
- defer root.DecRef()
+ defer root.DecRef(ctx)
target := &vfs.PathOperation{
Root: root,
Start: root,
Path: fspath.Parse(submount.Destination),
}
-
- fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
+ fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
if err != nil {
return fmt.Errorf("mountOptions failed: %w", err)
}
- if fsName == "" {
+ if len(fsName) == 0 {
// Filesystem is not supported (e.g. cgroup), just skip it.
return nil
}
@@ -278,17 +276,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
return err
}
-
- opts := &vfs.MountOptions{
- GetFilesystemOptions: vfs.GetFilesystemOptions{
- Data: strings.Join(options, ","),
- },
- InternalMount: true,
- }
-
- // All writes go to upper, be paranoid and make lower readonly.
- opts.ReadOnly = useOverlay
-
if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
}
@@ -298,41 +285,66 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
// used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, []string, bool, error) {
- var (
- fsName string
- opts []string
- useOverlay bool
- )
-
- if isBindMount(m.Mount) {
- m.Type = bind
- }
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+ fsName := m.Type
+ var data []string
+ // Find filesystem name and FS specific data field.
switch m.Type {
case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
- fsName = m.Type
+ // Nothing to do.
+
case nonefs:
fsName = sys.Name
- case tmpfs.Name:
- fsName = m.Type
+ case tmpfs.Name:
var err error
- opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
+ data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
if err != nil {
- return "", nil, false, err
+ return "", nil, err
}
case bind:
fsName = gofer.Name
- opts = p9MountOptions(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
- // If configured, add overlay to all writable mounts.
- useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+ if m.fd == 0 {
+ // Check that an FD was provided to fails fast. Technically FD=0 is valid,
+ // but unlikely to be correct in this context.
+ return "", nil, fmt.Errorf("9P mount requires a connection FD")
+ }
+ data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
default:
log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ return "", nil, nil
}
- return fsName, opts, useOverlay, nil
+
+ opts := &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ Data: strings.Join(data, ","),
+ },
+ InternalMount: true,
+ }
+
+ for _, o := range m.Options {
+ switch o {
+ case "rw":
+ opts.ReadOnly = false
+ case "ro":
+ opts.ReadOnly = true
+ case "noatime":
+ opts.Flags.NoATime = true
+ case "noexec":
+ opts.Flags.NoExec = true
+ default:
+ log.Warningf("ignoring unknown mount option %q", o)
+ }
+ }
+
+ if conf.Overlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ opts.ReadOnly = true
+ }
+ return fsName, opts, nil
}
func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
@@ -343,7 +355,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
}
_, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{})
if err == nil {
- // Mount point exists, nothing else to do.
+ log.Debugf("Mount point %q already exists", currentPath)
return nil
}
if err != syserror.ENOENT {
@@ -361,3 +373,147 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
}
return nil
}
+
+// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+// 1. /tmp is mounted explicitly: we should not override user's wish
+// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
+ for _, m := range c.mounts {
+ // m.Destination has been cleaned, so it's to use equality here.
+ if m.Destination == "/tmp" {
+ log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
+ return nil
+ }
+ }
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse("/tmp"),
+ }
+ // TODO(gvisor.dev/issue/2782): Use O_PATH when available.
+ fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
+ switch err {
+ case nil:
+ defer fd.DecRef(ctx)
+
+ err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name != "." && dirent.Name != ".." {
+ return syserror.ENOTEMPTY
+ }
+ return nil
+ }))
+ switch err {
+ case nil:
+ log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
+ case syserror.ENOTEMPTY:
+ // If more than "." and ".." is found, skip internal tmpfs to prevent
+ // hiding existing files.
+ log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
+ return nil
+ default:
+ return err
+ }
+ fallthrough
+
+ case syserror.ENOENT:
+ // No '/tmp' found (or fallthrough from above). It's safe to mount internal
+ // tmpfs.
+ tmpMount := specs.Mount{
+ Type: tmpfs.Name,
+ Destination: "/tmp",
+ // Sticky bit is added to prevent accidental deletion of files from
+ // another user. This is normally done for /tmp.
+ Options: []string{"mode=01777"},
+ }
+ return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
+
+ case syserror.ENOTDIR:
+ // Not a dir?! Let it be.
+ return nil
+
+ default:
+ return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
+ }
+}
+
+// processHintsVFS2 processes annotations that container hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error {
+ ctx := c.k.SupervisorContext()
+ for _, hint := range c.hints.mounts {
+ // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+ // common gofer to mount all shared volumes.
+ if hint.mount.Type != tmpfs.Name {
+ continue
+ }
+
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.vfsMount = mnt
+ }
+ return nil
+}
+
+// mountSharedMasterVFS2 mounts the master of a volume that is shared among
+// containers in a pod.
+func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ mntFD := &mountAndFD{Mount: hint.mount}
+ fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+ if err != nil {
+ return nil, err
+ }
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+ return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error {
+ if err := source.checkCompatible(mount); err != nil {
+ return err
+ }
+
+ _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+ if err != nil {
+ return err
+ }
+ newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
+ if err != nil {
+ return err
+ }
+ defer newMnt.DecRef(ctx)
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
+ return err
+ }
+
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(mount.Destination),
+ }
+ if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
+ return err
+ }
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return nil
+}
diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD
index d4c7bdfbb..37f4253ba 100644
--- a/runsc/cgroup/BUILD
+++ b/runsc/cgroup/BUILD
@@ -7,10 +7,10 @@ go_library(
srcs = ["cgroup.go"],
visibility = ["//:sandbox"],
deps = [
+ "//pkg/cleanup",
"//pkg/log",
- "//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
],
)
@@ -20,4 +20,8 @@ go_test(
srcs = ["cgroup_test.go"],
library = ":cgroup",
tags = ["local"],
+ deps = [
+ "//pkg/test/testutil",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+ ],
)
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index 19c8b0db6..8fbc3887a 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -31,8 +31,8 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/runsc/specutils"
)
const (
@@ -43,6 +43,7 @@ var controllers = map[string]config{
"blkio": config{ctrlr: &blockIO{}},
"cpu": config{ctrlr: &cpu{}},
"cpuset": config{ctrlr: &cpuSet{}},
+ "hugetlb": config{ctrlr: &hugeTLB{}, optional: true},
"memory": config{ctrlr: &memory{}},
"net_cls": config{ctrlr: &networkClass{}},
"net_prio": config{ctrlr: &networkPrio{}},
@@ -52,7 +53,6 @@ var controllers = map[string]config{
// irrelevant for a sandbox.
"devices": config{ctrlr: &noop{}},
"freezer": config{ctrlr: &noop{}},
- "hugetlb": config{ctrlr: &noop{}, optional: true},
"perf_event": config{ctrlr: &noop{}},
"rdma": config{ctrlr: &noop{}, optional: true},
"systemd": config{ctrlr: &noop{}},
@@ -92,7 +92,17 @@ func setOptionalValueUint16(path, name string, val *uint16) error {
func setValue(path, name, data string) error {
fullpath := filepath.Join(path, name)
- return ioutil.WriteFile(fullpath, []byte(data), 0700)
+
+ // Retry writes on EINTR; see:
+ // https://github.com/golang/go/issues/38033
+ for {
+ err := ioutil.WriteFile(fullpath, []byte(data), 0700)
+ if err == nil {
+ return nil
+ } else if !errors.Is(err, syscall.EINTR) {
+ return err
+ }
+ }
}
func getValue(path, name string) (string, error) {
@@ -125,15 +135,23 @@ func fillFromAncestor(path string) (string, error) {
return val, nil
}
- // File is not set, recurse to parent and then set here.
+ // File is not set, recurse to parent and then set here.
name := filepath.Base(path)
parent := filepath.Dir(filepath.Dir(path))
val, err = fillFromAncestor(filepath.Join(parent, name))
if err != nil {
return "", err
}
- if err := ioutil.WriteFile(path, []byte(val), 0700); err != nil {
- return "", err
+
+ // Retry writes on EINTR; see:
+ // https://github.com/golang/go/issues/38033
+ for {
+ err := ioutil.WriteFile(path, []byte(val), 0700)
+ if err == nil {
+ break
+ } else if !errors.Is(err, syscall.EINTR) {
+ return "", err
+ }
}
return val, nil
}
@@ -246,7 +264,7 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
// The Cleanup object cleans up partially created cgroups when an error occurs.
// Errors occuring during cleanup itself are ignored.
- clean := specutils.MakeCleanup(func() { _ = c.Uninstall() })
+ clean := cleanup.Make(func() { _ = c.Uninstall() })
defer clean.Clean()
for key, cfg := range controllers {
@@ -446,7 +464,13 @@ func (*cpu) set(spec *specs.LinuxResources, path string) error {
if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil {
return err
}
- return setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period)
+ if err := setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period); err != nil {
+ return err
+ }
+ if err := setOptionalValueUint(path, "cpu.rt_period_us", spec.CPU.RealtimePeriod); err != nil {
+ return err
+ }
+ return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime)
}
type cpuSet struct{}
@@ -487,13 +511,17 @@ func (*blockIO) set(spec *specs.LinuxResources, path string) error {
}
for _, dev := range spec.BlockIO.WeightDevice {
- val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Weight)
- if err := setValue(path, "blkio.weight_device", val); err != nil {
- return err
+ if dev.Weight != nil {
+ val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight)
+ if err := setValue(path, "blkio.weight_device", val); err != nil {
+ return err
+ }
}
- val = fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.LeafWeight)
- if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
- return err
+ if dev.LeafWeight != nil {
+ val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight)
+ if err := setValue(path, "blkio.leaf_weight_device", val); err != nil {
+ return err
+ }
}
}
if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil {
@@ -545,9 +573,22 @@ func (*networkPrio) set(spec *specs.LinuxResources, path string) error {
type pids struct{}
func (*pids) set(spec *specs.LinuxResources, path string) error {
- if spec.Pids == nil {
+ if spec.Pids == nil || spec.Pids.Limit <= 0 {
return nil
}
val := strconv.FormatInt(spec.Pids.Limit, 10)
return setValue(path, "pids.max", val)
}
+
+type hugeTLB struct{}
+
+func (*hugeTLB) set(spec *specs.LinuxResources, path string) error {
+ for _, limit := range spec.HugepageLimits {
+ name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize)
+ val := strconv.FormatUint(limit.Limit, 10)
+ if err := setValue(path, name, val); err != nil {
+ return err
+ }
+ }
+ return nil
+}
diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go
index 548c80e9a..4db5ee5c3 100644
--- a/runsc/cgroup/cgroup_test.go
+++ b/runsc/cgroup/cgroup_test.go
@@ -15,7 +15,14 @@
package cgroup
import (
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strings"
"testing"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/test/testutil"
)
func TestUninstallEnoent(t *testing.T) {
@@ -65,3 +72,578 @@ func TestCountCpuset(t *testing.T) {
})
}
}
+
+func uint16Ptr(v uint16) *uint16 {
+ return &v
+}
+
+func uint32Ptr(v uint32) *uint32 {
+ return &v
+}
+
+func int64Ptr(v int64) *int64 {
+ return &v
+}
+
+func uint64Ptr(v uint64) *uint64 {
+ return &v
+}
+
+func boolPtr(v bool) *bool {
+ return &v
+}
+
+func checkDir(t *testing.T, dir string, contents map[string]string) {
+ all, err := ioutil.ReadDir(dir)
+ if err != nil {
+ t.Fatalf("ReadDir(%q): %v", dir, err)
+ }
+ fileCount := 0
+ for _, file := range all {
+ if file.IsDir() {
+ // Only want to compare files.
+ continue
+ }
+ fileCount++
+
+ want, ok := contents[file.Name()]
+ if !ok {
+ t.Errorf("file not expected: %q", file.Name())
+ continue
+ }
+ gotBytes, err := ioutil.ReadFile(filepath.Join(dir, file.Name()))
+ if err != nil {
+ t.Fatal(err.Error())
+ }
+ got := strings.TrimSuffix(string(gotBytes), "\n")
+ if got != want {
+ t.Errorf("wrong file content, file: %q, want: %q, got: %q", file.Name(), want, got)
+ }
+ }
+ if fileCount != len(contents) {
+ t.Errorf("file is missing, want: %v, got: %v", contents, all)
+ }
+}
+
+func makeLinuxWeightDevice(major, minor int64, weight, leafWeight *uint16) specs.LinuxWeightDevice {
+ rv := specs.LinuxWeightDevice{
+ Weight: weight,
+ LeafWeight: leafWeight,
+ }
+ rv.Major = major
+ rv.Minor = minor
+ return rv
+}
+
+func makeLinuxThrottleDevice(major, minor int64, rate uint64) specs.LinuxThrottleDevice {
+ rv := specs.LinuxThrottleDevice{
+ Rate: rate,
+ }
+ rv.Major = major
+ rv.Minor = minor
+ return rv
+}
+
+func TestBlockIO(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxBlockIO
+ wants map[string]string
+ }{
+ {
+ name: "simple",
+ spec: &specs.LinuxBlockIO{
+ Weight: uint16Ptr(1),
+ LeafWeight: uint16Ptr(2),
+ },
+ wants: map[string]string{
+ "blkio.weight": "1",
+ "blkio.leaf_weight": "2",
+ },
+ },
+ {
+ name: "weight_device",
+ spec: &specs.LinuxBlockIO{
+ WeightDevice: []specs.LinuxWeightDevice{
+ makeLinuxWeightDevice(1, 2, uint16Ptr(3), uint16Ptr(4)),
+ },
+ },
+ wants: map[string]string{
+ "blkio.weight_device": "1:2 3",
+ "blkio.leaf_weight_device": "1:2 4",
+ },
+ },
+ {
+ name: "weight_device_nil_values",
+ spec: &specs.LinuxBlockIO{
+ WeightDevice: []specs.LinuxWeightDevice{
+ makeLinuxWeightDevice(1, 2, nil, nil),
+ },
+ },
+ },
+ {
+ name: "throttle",
+ spec: &specs.LinuxBlockIO{
+ ThrottleReadBpsDevice: []specs.LinuxThrottleDevice{
+ makeLinuxThrottleDevice(1, 2, 3),
+ },
+ ThrottleReadIOPSDevice: []specs.LinuxThrottleDevice{
+ makeLinuxThrottleDevice(4, 5, 6),
+ },
+ ThrottleWriteBpsDevice: []specs.LinuxThrottleDevice{
+ makeLinuxThrottleDevice(7, 8, 9),
+ },
+ ThrottleWriteIOPSDevice: []specs.LinuxThrottleDevice{
+ makeLinuxThrottleDevice(10, 11, 12),
+ },
+ },
+ wants: map[string]string{
+ "blkio.throttle.read_bps_device": "1:2 3",
+ "blkio.throttle.read_iops_device": "4:5 6",
+ "blkio.throttle.write_bps_device": "7:8 9",
+ "blkio.throttle.write_iops_device": "10:11 12",
+ },
+ },
+ {
+ name: "nil_values",
+ spec: &specs.LinuxBlockIO{},
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ BlockIO: tc.spec,
+ }
+ ctrlr := blockIO{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
+
+func TestCPU(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxCPU
+ wants map[string]string
+ }{
+ {
+ name: "all",
+ spec: &specs.LinuxCPU{
+ Shares: uint64Ptr(1),
+ Quota: int64Ptr(2),
+ Period: uint64Ptr(3),
+ RealtimeRuntime: int64Ptr(4),
+ RealtimePeriod: uint64Ptr(5),
+ },
+ wants: map[string]string{
+ "cpu.shares": "1",
+ "cpu.cfs_quota_us": "2",
+ "cpu.cfs_period_us": "3",
+ "cpu.rt_runtime_us": "4",
+ "cpu.rt_period_us": "5",
+ },
+ },
+ {
+ name: "nil_values",
+ spec: &specs.LinuxCPU{},
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ CPU: tc.spec,
+ }
+ ctrlr := cpu{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
+
+func TestCPUSet(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxCPU
+ wants map[string]string
+ }{
+ {
+ name: "all",
+ spec: &specs.LinuxCPU{
+ Cpus: "foo",
+ Mems: "bar",
+ },
+ wants: map[string]string{
+ "cpuset.cpus": "foo",
+ "cpuset.mems": "bar",
+ },
+ },
+ // Don't test nil values because they are copied from the parent.
+ // See TestCPUSetAncestor().
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ CPU: tc.spec,
+ }
+ ctrlr := cpuSet{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
+
+// TestCPUSetAncestor checks that, when not available, value is read from
+// parent directory.
+func TestCPUSetAncestor(t *testing.T) {
+ // Prepare master directory with cgroup files that will be propagated to
+ // children.
+ grandpa, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(grandpa)
+
+ if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.cpus"), []byte("parent-cpus"), 0666); err != nil {
+ t.Fatalf("ioutil.WriteFile(): %v", err)
+ }
+ if err := ioutil.WriteFile(filepath.Join(grandpa, "cpuset.mems"), []byte("parent-mems"), 0666); err != nil {
+ t.Fatalf("ioutil.WriteFile(): %v", err)
+ }
+
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxCPU
+ }{
+ {
+ name: "nil_values",
+ spec: &specs.LinuxCPU{},
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ // Create empty files in intermediate directory. They should be ignored
+ // when reading, and then populated from parent.
+ parent, err := ioutil.TempDir(grandpa, "parent")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(parent)
+ if _, err := os.Create(filepath.Join(parent, "cpuset.cpus")); err != nil {
+ t.Fatalf("os.Create(): %v", err)
+ }
+ if _, err := os.Create(filepath.Join(parent, "cpuset.mems")); err != nil {
+ t.Fatalf("os.Create(): %v", err)
+ }
+
+ // cgroup files mmust exist.
+ dir, err := ioutil.TempDir(parent, "child")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ if _, err := os.Create(filepath.Join(dir, "cpuset.cpus")); err != nil {
+ t.Fatalf("os.Create(): %v", err)
+ }
+ if _, err := os.Create(filepath.Join(dir, "cpuset.mems")); err != nil {
+ t.Fatalf("os.Create(): %v", err)
+ }
+
+ spec := &specs.LinuxResources{
+ CPU: tc.spec,
+ }
+ ctrlr := cpuSet{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ want := map[string]string{
+ "cpuset.cpus": "parent-cpus",
+ "cpuset.mems": "parent-mems",
+ }
+ // Both path and dir must have been populated from grandpa.
+ checkDir(t, parent, want)
+ checkDir(t, dir, want)
+ })
+ }
+}
+
+func TestHugeTlb(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec []specs.LinuxHugepageLimit
+ wants map[string]string
+ }{
+ {
+ name: "single",
+ spec: []specs.LinuxHugepageLimit{
+ {
+ Pagesize: "1G",
+ Limit: 123,
+ },
+ },
+ wants: map[string]string{
+ "hugetlb.1G.limit_in_bytes": "123",
+ },
+ },
+ {
+ name: "multiple",
+ spec: []specs.LinuxHugepageLimit{
+ {
+ Pagesize: "1G",
+ Limit: 123,
+ },
+ {
+ Pagesize: "2G",
+ Limit: 456,
+ },
+ {
+ Pagesize: "1P",
+ Limit: 789,
+ },
+ },
+ wants: map[string]string{
+ "hugetlb.1G.limit_in_bytes": "123",
+ "hugetlb.2G.limit_in_bytes": "456",
+ "hugetlb.1P.limit_in_bytes": "789",
+ },
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ HugepageLimits: tc.spec,
+ }
+ ctrlr := hugeTLB{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
+
+func TestMemory(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxMemory
+ wants map[string]string
+ }{
+ {
+ name: "all",
+ spec: &specs.LinuxMemory{
+ Limit: int64Ptr(1),
+ Reservation: int64Ptr(2),
+ Swap: int64Ptr(3),
+ Kernel: int64Ptr(4),
+ KernelTCP: int64Ptr(5),
+ Swappiness: uint64Ptr(6),
+ DisableOOMKiller: boolPtr(true),
+ },
+ wants: map[string]string{
+ "memory.limit_in_bytes": "1",
+ "memory.soft_limit_in_bytes": "2",
+ "memory.memsw.limit_in_bytes": "3",
+ "memory.kmem.limit_in_bytes": "4",
+ "memory.kmem.tcp.limit_in_bytes": "5",
+ "memory.swappiness": "6",
+ "memory.oom_control": "1",
+ },
+ },
+ {
+ // Disable OOM killer should only write when set to true.
+ name: "oomkiller",
+ spec: &specs.LinuxMemory{
+ DisableOOMKiller: boolPtr(false),
+ },
+ },
+ {
+ name: "nil_values",
+ spec: &specs.LinuxMemory{},
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ Memory: tc.spec,
+ }
+ ctrlr := memory{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
+
+func TestNetworkClass(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxNetwork
+ wants map[string]string
+ }{
+ {
+ name: "all",
+ spec: &specs.LinuxNetwork{
+ ClassID: uint32Ptr(1),
+ },
+ wants: map[string]string{
+ "net_cls.classid": "1",
+ },
+ },
+ {
+ name: "nil_values",
+ spec: &specs.LinuxNetwork{},
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ Network: tc.spec,
+ }
+ ctrlr := networkClass{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
+
+func TestNetworkPriority(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxNetwork
+ wants map[string]string
+ }{
+ {
+ name: "all",
+ spec: &specs.LinuxNetwork{
+ Priorities: []specs.LinuxInterfacePriority{
+ {
+ Name: "foo",
+ Priority: 1,
+ },
+ },
+ },
+ wants: map[string]string{
+ "net_prio.ifpriomap": "foo 1",
+ },
+ },
+ {
+ name: "nil_values",
+ spec: &specs.LinuxNetwork{},
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ Network: tc.spec,
+ }
+ ctrlr := networkPrio{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
+
+func TestPids(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ spec *specs.LinuxPids
+ wants map[string]string
+ }{
+ {
+ name: "all",
+ spec: &specs.LinuxPids{Limit: 1},
+ wants: map[string]string{
+ "pids.max": "1",
+ },
+ },
+ {
+ name: "nil_values",
+ spec: &specs.LinuxPids{},
+ },
+ {
+ name: "nil",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "cgroup")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ defer os.RemoveAll(dir)
+
+ spec := &specs.LinuxResources{
+ Pids: tc.spec,
+ }
+ ctrlr := pids{}
+ if err := ctrlr.set(spec, dir); err != nil {
+ t.Fatalf("ctrlr.set(): %v", err)
+ }
+ checkDir(t, dir, tc.wants)
+ })
+ }
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index af3538ef0..1b5178dd5 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -45,7 +45,7 @@ go_library(
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/platform",
- "//pkg/state",
+ "//pkg/state/pretty",
"//pkg/state/statefile",
"//pkg/sync",
"//pkg/unet",
@@ -58,7 +58,7 @@ go_library(
"//runsc/fsgofer/filter",
"//runsc/specutils",
"@com_github_google_subcommands//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
@@ -87,9 +87,9 @@ go_test(
"//runsc/boot",
"//runsc/container",
"//runsc/specutils",
- "@com_github_google_go-cmp//cmp:go_default_library",
- "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_google_go_cmp//cmp:go_default_library",
+ "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
],
)
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 4c2ac6ff0..f4f247721 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -54,10 +54,6 @@ type Boot struct {
// provided in that order.
stdioFDs intFlags
- // console is set to true if the sandbox should allow terminal ioctl(2)
- // syscalls.
- console bool
-
// applyCaps determines if capabilities defined in the spec should be applied
// to the process.
applyCaps bool
@@ -115,7 +111,6 @@ func (b *Boot) SetFlags(f *flag.FlagSet) {
f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec")
f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
- f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls")
f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
@@ -136,7 +131,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
// Ensure that if there is a panic, all goroutine stacks are printed.
- debug.SetTraceback("all")
+ debug.SetTraceback("system")
conf := args[0].(*boot.Config)
@@ -229,7 +224,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
Device: os.NewFile(uintptr(b.deviceFD), "platform device"),
GoferFDs: b.ioFDs.GetArray(),
StdioFDs: b.stdioFDs.GetArray(),
- Console: b.console,
NumCPU: b.cpuNum,
TotalMem: b.totalMem,
UserLogFD: b.userLogFD,
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index b5de2588b..742f8c344 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,20 +32,19 @@ import (
// Debug implements subcommands.Command for the "debug" command.
type Debug struct {
- pid int
- stacks bool
- signal int
- profileHeap string
- profileCPU string
- profileGoroutine string
- profileBlock string
- profileMutex string
- trace string
- strace string
- logLevel string
- logPackets string
- duration time.Duration
- ps bool
+ pid int
+ stacks bool
+ signal int
+ profileHeap string
+ profileCPU string
+ profileBlock string
+ profileMutex string
+ trace string
+ strace string
+ logLevel string
+ logPackets string
+ duration time.Duration
+ ps bool
}
// Name implements subcommands.Command.
@@ -69,7 +68,6 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
- f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
@@ -153,18 +151,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
log.Infof("Heap profile written to %q", d.profileHeap)
}
- if d.profileGoroutine != "" {
- f, err := os.Create(d.profileGoroutine)
- if err != nil {
- return Errorf(err.Error())
- }
- defer f.Close()
-
- if err := c.Sandbox.GoroutineProfile(f); err != nil {
- return Errorf(err.Error())
- }
- log.Infof("Goroutine profile written to %q", d.profileGoroutine)
- }
if d.profileBlock != "" {
f, err := os.Create(d.profileBlock)
if err != nil {
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 28f0d54b9..3966e2d21 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -168,7 +168,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Start with root mount, then add any other additional mount as needed.
ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
- ROMount: spec.Root.Readonly,
+ ROMount: spec.Root.Readonly || conf.Overlay,
PanicOnWrite: g.panicOnWrite,
})
if err != nil {
@@ -181,7 +181,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
for _, m := range spec.Mounts {
if specutils.Is9PMount(m) {
cfg := fsgofer.Config{
- ROMount: isReadonlyMount(m.Options),
+ ROMount: isReadonlyMount(m.Options) || conf.Overlay,
PanicOnWrite: g.panicOnWrite,
HostUDS: conf.FSGoferHostUDS,
}
@@ -306,7 +306,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
}
// Replace the current spec, with the clean spec with symlinks resolved.
- if err := setupMounts(spec.Mounts, root); err != nil {
+ if err := setupMounts(conf, spec.Mounts, root); err != nil {
Fatalf("error setting up FS: %v", err)
}
@@ -322,7 +322,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
}
// Check if root needs to be remounted as readonly.
- if spec.Root.Readonly {
+ if spec.Root.Readonly || conf.Overlay {
// If root is a mount point but not read-only, we can change mount options
// to make it read-only for extra safety.
log.Infof("Remounting root as readonly: %q", root)
@@ -346,7 +346,7 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error {
// setupMounts binds mount all mounts specified in the spec in their correct
// location inside root. It will resolve relative paths and symlinks. It also
// creates directories as needed.
-func setupMounts(mounts []specs.Mount, root string) error {
+func setupMounts(conf *boot.Config, mounts []specs.Mount, root string) error {
for _, m := range mounts {
if m.Type != "bind" || !specutils.IsSupportedDevMount(m) {
continue
@@ -358,6 +358,11 @@ func setupMounts(mounts []specs.Mount, root string) error {
}
flags := specutils.OptionsToFlags(m.Options) | syscall.MS_BIND
+ if conf.Overlay {
+ // Force mount read-only if writes are not going to be sent to it.
+ flags |= syscall.MS_RDONLY
+ }
+
log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
if err := specutils.Mount(m.Source, dst, m.Type, flags); err != nil {
return fmt.Errorf("mounting %v: %v", m, err)
diff --git a/runsc/cmd/spec.go b/runsc/cmd/spec.go
index 8e2b36e85..55194e641 100644
--- a/runsc/cmd/spec.go
+++ b/runsc/cmd/spec.go
@@ -16,118 +16,122 @@ package cmd
import (
"context"
- "io/ioutil"
+ "encoding/json"
+ "io"
"os"
"path/filepath"
"github.com/google/subcommands"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/runsc/flag"
)
-var specTemplate = []byte(`{
- "ociVersion": "1.0.0",
- "process": {
- "terminal": true,
- "user": {
- "uid": 0,
- "gid": 0
- },
- "args": [
- "sh"
- ],
- "env": [
- "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
- "TERM=xterm"
- ],
- "cwd": "/",
- "capabilities": {
- "bounding": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "effective": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "inheritable": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "permitted": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ],
- "ambient": [
- "CAP_AUDIT_WRITE",
- "CAP_KILL",
- "CAP_NET_BIND_SERVICE"
- ]
- },
- "rlimits": [
- {
- "type": "RLIMIT_NOFILE",
- "hard": 1024,
- "soft": 1024
- }
- ]
- },
- "root": {
- "path": "rootfs",
- "readonly": true
- },
- "hostname": "runsc",
- "mounts": [
- {
- "destination": "/proc",
- "type": "proc",
- "source": "proc"
+func writeSpec(w io.Writer, cwd string, netns string, args []string) error {
+ spec := &specs.Spec{
+ Version: "1.0.0",
+ Process: &specs.Process{
+ Terminal: true,
+ User: specs.User{
+ UID: 0,
+ GID: 0,
+ },
+ Args: args,
+ Env: []string{
+ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+ "TERM=xterm",
+ },
+ Cwd: cwd,
+ Capabilities: &specs.LinuxCapabilities{
+ Bounding: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Effective: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Inheritable: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ Permitted: []string{
+ "CAP_AUDIT_WRITE",
+ "CAP_KILL",
+ "CAP_NET_BIND_SERVICE",
+ },
+ // TODO(gvisor.dev/issue/3166): support ambient capabilities
+ },
+ Rlimits: []specs.POSIXRlimit{
+ {
+ Type: "RLIMIT_NOFILE",
+ Hard: 1024,
+ Soft: 1024,
+ },
+ },
},
- {
- "destination": "/dev",
- "type": "tmpfs",
- "source": "tmpfs",
- "options": []
+ Root: &specs.Root{
+ Path: "rootfs",
+ Readonly: true,
},
- {
- "destination": "/sys",
- "type": "sysfs",
- "source": "sysfs",
- "options": [
- "nosuid",
- "noexec",
- "nodev",
- "ro"
- ]
- }
- ],
- "linux": {
- "namespaces": [
+ Hostname: "runsc",
+ Mounts: []specs.Mount{
{
- "type": "pid"
+ Destination: "/proc",
+ Type: "proc",
+ Source: "proc",
},
{
- "type": "network"
+ Destination: "/dev",
+ Type: "tmpfs",
+ Source: "tmpfs",
},
{
- "type": "ipc"
+ Destination: "/sys",
+ Type: "sysfs",
+ Source: "sysfs",
+ Options: []string{
+ "nosuid",
+ "noexec",
+ "nodev",
+ "ro",
+ },
},
- {
- "type": "uts"
+ },
+ Linux: &specs.Linux{
+ Namespaces: []specs.LinuxNamespace{
+ {
+ Type: "pid",
+ },
+ {
+ Type: "network",
+ Path: netns,
+ },
+ {
+ Type: "ipc",
+ },
+ {
+ Type: "uts",
+ },
+ {
+ Type: "mount",
+ },
},
- {
- "type": "mount"
- }
- ]
+ },
}
-}`)
+
+ e := json.NewEncoder(w)
+ e.SetIndent("", " ")
+ return e.Encode(spec)
+}
// Spec implements subcommands.Command for the "spec" command.
type Spec struct {
bundle string
+ cwd string
+ netns string
}
// Name implements subcommands.Command.Name.
@@ -142,21 +146,26 @@ func (*Spec) Synopsis() string {
// Usage implements subcommands.Command.Usage.
func (*Spec) Usage() string {
- return `spec [options] - create a new OCI bundle specification file.
+ return `spec [options] [-- args...] - create a new OCI bundle specification file.
+
+The spec command creates a new specification file (config.json) for a new OCI
+bundle.
-The spec command creates a new specification file (config.json) for a new OCI bundle.
+The specification file is a starter file that runs the command specified by
+'args' in the container. If 'args' is not specified the default is to run the
+'sh' program.
-The specification file is a starter file that runs the "sh" command in the container. You
-should edit the file to suit your needs. You can find out more about the format of the
-specification file by visiting the OCI runtime spec repository:
+While a number of flags are provided to change values in the specification, you
+can examine the file and edit it to suit your needs after this command runs.
+You can find out more about the format of the specification file by visiting
+the OCI runtime spec repository:
https://github.com/opencontainers/runtime-spec/
EXAMPLE:
$ mkdir -p bundle/rootfs
$ cd bundle
- $ runsc spec
+ $ runsc spec -- /hello
$ docker export $(docker create hello-world) | tar -xf - -C rootfs
- $ sed -i 's;"sh";"/hello";' config.json
$ sudo runsc run hello
`
@@ -165,16 +174,31 @@ EXAMPLE:
// SetFlags implements subcommands.Command.SetFlags.
func (s *Spec) SetFlags(f *flag.FlagSet) {
f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle")
+ f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+
+ "this value MUST be an absolute path")
+ f.StringVar(&s.netns, "netns", "", "network namespace path")
}
// Execute implements subcommands.Command.Execute.
func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ // Grab the arguments.
+ containerArgs := f.Args()
+ if len(containerArgs) == 0 {
+ containerArgs = []string{"sh"}
+ }
+
confPath := filepath.Join(s.bundle, "config.json")
if _, err := os.Stat(confPath); !os.IsNotExist(err) {
Fatalf("file %q already exists", confPath)
}
- if err := ioutil.WriteFile(confPath, specTemplate, 0664); err != nil {
+ configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664)
+ if err != nil {
+ Fatalf("opening file %q: %v", confPath, err)
+ }
+
+ err = writeSpec(configFile, s.cwd, s.netns, containerArgs)
+ if err != nil {
Fatalf("writing to %q: %v", confPath, err)
}
diff --git a/runsc/cmd/statefile.go b/runsc/cmd/statefile.go
index e6f1907da..daed9e728 100644
--- a/runsc/cmd/statefile.go
+++ b/runsc/cmd/statefile.go
@@ -20,7 +20,7 @@ import (
"os"
"github.com/google/subcommands"
- "gvisor.dev/gvisor/pkg/state"
+ "gvisor.dev/gvisor/pkg/state/pretty"
"gvisor.dev/gvisor/pkg/state/statefile"
"gvisor.dev/gvisor/runsc/flag"
)
@@ -105,8 +105,14 @@ func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...interfac
if err != nil {
Fatalf("error parsing statefile: %v", err)
}
- if err := state.PrettyPrint(output, rc, s.html); err != nil {
- Fatalf("error printing state: %v", err)
+ if s.html {
+ if err := pretty.PrintHTML(output, rc); err != nil {
+ Fatalf("error printing state: %v", err)
+ }
+ } else {
+ if err := pretty.PrintText(output, rc); err != nil {
+ Fatalf("error printing state: %v", err)
+ }
}
return subcommands.ExitSuccess
}
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 46154df60..9a9ee7e2a 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -16,6 +16,7 @@ go_library(
],
deps = [
"//pkg/abi/linux",
+ "//pkg/cleanup",
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/sighandling",
@@ -26,7 +27,7 @@ go_library(
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_gofrs_flock//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
],
)
@@ -46,13 +47,14 @@ go_test(
"//test/cmd/test_app",
],
library = ":container",
- shard_count = 5,
+ shard_count = 10,
tags = [
"requires-kvm",
],
deps = [
"//pkg/abi/linux",
"//pkg/bits",
+ "//pkg/cleanup",
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/kernel",
@@ -66,7 +68,7 @@ go_test(
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_kr_pty//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 294dca5e7..995d4e267 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -119,9 +119,10 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
// Test that an pty FD is sent over the console socket if one is provided.
func TestConsoleSocket(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
spec := testutil.NewSpecWithArgs("true")
+ spec.Process.Terminal = true
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
t.Fatalf("error setting up container: %v", err)
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 8539f252d..7ad09bf23 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -31,6 +31,7 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/sighandling"
@@ -293,7 +294,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
}
// The Cleanup object cleans up partially created containers when an error
// occurs. Any errors occurring during cleanup itself are ignored.
- cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+ cu := cleanup.Make(func() { _ = c.Destroy() })
defer cu.Clean()
// Lock the container metadata file to prevent concurrent creations of
@@ -323,7 +324,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
}
}
if err := runInCgroup(cg, func() error {
- ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir)
+ ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached)
if err != nil {
return err
}
@@ -402,7 +403,7 @@ func (c *Container) Start(conf *boot.Config) error {
if err := c.Saver.lock(); err != nil {
return err
}
- unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+ unlock := cleanup.Make(func() { c.Saver.unlock() })
defer unlock.Clean()
if err := c.requireStatus("start", Created); err != nil {
@@ -426,7 +427,7 @@ func (c *Container) Start(conf *boot.Config) error {
// the start (and all their children processes).
if err := runInCgroup(c.Sandbox.Cgroup, func() error {
// Create the gofer process.
- ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir)
+ ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
if err != nil {
return err
}
@@ -506,7 +507,7 @@ func Run(conf *boot.Config, args Args) (syscall.WaitStatus, error) {
}
// Clean up partially created container if an error occurs.
// Any errors returned by Destroy() itself are ignored.
- cu := specutils.MakeCleanup(func() {
+ cu := cleanup.Make(func() {
c.Destroy()
})
defer cu.Clean()
@@ -860,7 +861,7 @@ func (c *Container) waitForStopped() error {
return backoff.Retry(op, b)
}
-func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) {
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
// Start with the general config flags.
args := conf.ToFlags()
@@ -954,6 +955,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
cmd.ExtraFiles = goferEnds
cmd.Args[0] = "runsc-gofer"
+ if attached {
+ // The gofer is attached to the lifetime of this process, so it
+ // should synchronously die when this process dies.
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Pdeathsig: syscall.SIGKILL,
+ }
+ }
+
// Enter new namespaces to isolate from the rest of the system. Don't unshare
// cgroup because gofer is added to a cgroup in the caller's namespace.
nss := []specs.LinuxNamespace{
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 1a6d50d0d..5e8247bc8 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -20,6 +20,7 @@ import (
"fmt"
"io"
"io/ioutil"
+ "math"
"os"
"path"
"path/filepath"
@@ -53,9 +54,8 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
err = fmt.Errorf("error getting process data from container: %v", err)
return &backoff.PermanentError{Err: err}
}
- if r, err := procListsEqual(got, want); !r {
- return fmt.Errorf("container got process list: %s, want: %s: error: %v",
- procListToString(got), procListToString(want), err)
+ if !procListsEqual(got, want) {
+ return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
}
return nil
}
@@ -92,36 +92,72 @@ func blockUntilWaitable(pid int) error {
return err
}
-// procListsEqual is used to check whether 2 Process lists are equal for all
-// implemented fields.
-func procListsEqual(got, want []*control.Process) (bool, error) {
- if len(got) != len(want) {
- return false, nil
- }
- for i := range got {
- pd1 := got[i]
- pd2 := want[i]
- // Zero out timing dependant fields.
- pd1.Time = ""
- pd1.STime = ""
- pd1.C = 0
- // Ignore TTY field too, since it's not relevant in the cases
- // where we use this method. Tests that care about the TTY
- // field should check for it themselves.
- pd1.TTY = ""
- pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
- if err != nil {
- return false, err
+// procListsEqual is used to check whether 2 Process lists are equal. Fields
+// set to -1 in wants are ignored. Timestamp and threads fields are always
+// ignored.
+func procListsEqual(gots, wants []*control.Process) bool {
+ if len(gots) != len(wants) {
+ return false
+ }
+ for i := range gots {
+ got := gots[i]
+ want := wants[i]
+
+ if want.UID != math.MaxUint32 && want.UID != got.UID {
+ return false
}
- pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
- if err != nil {
- return false, err
+ if want.PID != -1 && want.PID != got.PID {
+ return false
}
- if pd1Json != pd2Json {
- return false, nil
+ if want.PPID != -1 && want.PPID != got.PPID {
+ return false
}
+ if len(want.TTY) != 0 && want.TTY != got.TTY {
+ return false
+ }
+ if len(want.Cmd) != 0 && want.Cmd != got.Cmd {
+ return false
+ }
+ }
+ return true
+}
+
+type processBuilder struct {
+ process control.Process
+}
+
+func newProcessBuilder() *processBuilder {
+ return &processBuilder{
+ process: control.Process{
+ UID: math.MaxUint32,
+ PID: -1,
+ PPID: -1,
+ },
}
- return true, nil
+}
+
+func (p *processBuilder) Cmd(cmd string) *processBuilder {
+ p.process.Cmd = cmd
+ return p
+}
+
+func (p *processBuilder) PID(pid kernel.ThreadID) *processBuilder {
+ p.process.PID = pid
+ return p
+}
+
+func (p *processBuilder) PPID(ppid kernel.ThreadID) *processBuilder {
+ p.process.PPID = ppid
+ return p
+}
+
+func (p *processBuilder) UID(uid auth.KUID) *processBuilder {
+ p.process.UID = uid
+ return p
+}
+
+func (p *processBuilder) Process() *control.Process {
+ return &p.process
}
func procListToString(pl []*control.Process) string {
@@ -256,8 +292,6 @@ var (
func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
// Always load the default config.
cs := make(map[string]*boot.Config)
- cs["default"] = testutil.TestConfig(t)
-
for _, o := range opts {
switch o {
case overlay:
@@ -285,9 +319,16 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
vfs1 := configs(t, opts...)
- vfs2 := configs(t, opts...)
- for key, value := range vfs2 {
+ var optsVFS2 []configOption
+ for _, opt := range opts {
+ // TODO(gvisor.dev/issue/1487): Enable overlay tests.
+ if opt != overlay {
+ optsVFS2 = append(optsVFS2, opt)
+ }
+ }
+
+ for key, value := range configs(t, optsVFS2...) {
value.VFS2 = true
vfs1[key+"VFS2"] = value
}
@@ -318,14 +359,7 @@ func TestLifecycle(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
- {
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
- Threads: []kernel.ThreadID{1},
- },
+ newProcessBuilder().Cmd("sleep").Process(),
}
// Create the container.
args := Args{
@@ -603,10 +637,16 @@ func doAppExitStatus(t *testing.T, vfs2 bool) {
// TestExec verifies that a container can exec a new program.
func TestExec(t *testing.T) {
- for name, conf := range configs(t, overlay) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
- const uid = 343
- spec := testutil.NewSpecWithArgs("sleep", "100")
+ dir, err := ioutil.TempDir(testutil.TmpDir(), "exec-test")
+ if err != nil {
+ t.Fatalf("error creating temporary directory: %v", err)
+ }
+ // Note that some shells may exec the final command in a sequence as
+ // an optimization. We avoid this here by adding the exit 0.
+ cmd := fmt.Sprintf("ln -s /bin/true %q/symlink && sleep 100 && exit 0", dir)
+ spec := testutil.NewSpecWithArgs("sh", "-c", cmd)
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
if err != nil {
@@ -629,29 +669,127 @@ func TestExec(t *testing.T) {
t.Fatalf("error starting container: %v", err)
}
- // expectedPL lists the expected process state of the container.
+ // Wait until sleep is running to ensure the symlink was created.
expectedPL := []*control.Process{
+ newProcessBuilder().Cmd("sh").Process(),
+ newProcessBuilder().Cmd("sleep").Process(),
+ }
+ if err := waitForProcessList(cont, expectedPL); err != nil {
+ t.Fatalf("waitForProcessList: %v", err)
+ }
+
+ for _, tc := range []struct {
+ name string
+ args control.ExecArgs
+ }{
+ {
+ name: "complete",
+ args: control.ExecArgs{
+ Filename: "/bin/true",
+ Argv: []string{"/bin/true"},
+ },
+ },
+ {
+ name: "filename",
+ args: control.ExecArgs{
+ Filename: "/bin/true",
+ },
+ },
+ {
+ name: "argv",
+ args: control.ExecArgs{
+ Argv: []string{"/bin/true"},
+ },
+ },
+ {
+ name: "filename resolution",
+ args: control.ExecArgs{
+ Filename: "true",
+ Envv: []string{"PATH=/bin"},
+ },
+ },
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
- Threads: []kernel.ThreadID{1},
+ name: "argv resolution",
+ args: control.ExecArgs{
+ Argv: []string{"true"},
+ Envv: []string{"PATH=/bin"},
+ },
},
{
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
- Threads: []kernel.ThreadID{2},
+ name: "argv symlink",
+ args: control.ExecArgs{
+ Argv: []string{filepath.Join(dir, "symlink")},
+ },
},
+ {
+ name: "working dir",
+ args: control.ExecArgs{
+ Argv: []string{"/bin/sh", "-c", `if [[ "${PWD}" != "/tmp" ]]; then exit 1; fi`},
+ WorkingDirectory: "/tmp",
+ },
+ },
+ {
+ name: "user",
+ args: control.ExecArgs{
+ Argv: []string{"/bin/sh", "-c", `if [[ "$(id -u)" != "343" ]]; then exit 1; fi`},
+ KUID: 343,
+ },
+ },
+ {
+ name: "group",
+ args: control.ExecArgs{
+ Argv: []string{"/bin/sh", "-c", `if [[ "$(id -g)" != "343" ]]; then exit 1; fi`},
+ KGID: 343,
+ },
+ },
+ {
+ name: "env",
+ args: control.ExecArgs{
+ Argv: []string{"/bin/sh", "-c", `if [[ "${FOO}" != "123" ]]; then exit 1; fi`},
+ Envv: []string{"FOO=123"},
+ },
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ // t.Parallel()
+ if ws, err := cont.executeSync(&tc.args); err != nil {
+ t.Fatalf("executeAsync(%+v): %v", tc.args, err)
+ } else if ws != 0 {
+ t.Fatalf("executeAsync(%+v) failed with exit: %v", tc.args, ws)
+ }
+ })
}
+ })
+ }
+}
- // Verify that "sleep 100" is running.
- if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
- t.Error(err)
+// TestExecProcList verifies that a container can exec a new program and it
+// shows correcly in the process list.
+func TestExecProcList(t *testing.T) {
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ const uid = 343
+ spec := testutil.NewSpecWithArgs("sleep", "100")
+
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ // Create and start the container.
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer cont.Destroy()
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
}
execArgs := &control.ExecArgs{
@@ -661,9 +799,8 @@ func TestExec(t *testing.T) {
KUID: uid,
}
- // Verify that "sleep 100" and "sleep 5" are running
- // after exec. First, start running exec (whick
- // blocks).
+ // Verify that "sleep 100" and "sleep 5" are running after exec. First,
+ // start running exec (which blocks).
ch := make(chan error)
go func() {
exitStatus, err := cont.executeSync(execArgs)
@@ -676,6 +813,11 @@ func TestExec(t *testing.T) {
}
}()
+ // expectedPL lists the expected process state of the container.
+ expectedPL := []*control.Process{
+ newProcessBuilder().PID(1).PPID(0).Cmd("sleep").UID(0).Process(),
+ newProcessBuilder().PID(2).PPID(0).Cmd("sleep").UID(uid).Process(),
+ }
if err := waitForProcessList(cont, expectedPL); err != nil {
t.Fatalf("error waiting for processes: %v", err)
}
@@ -695,7 +837,7 @@ func TestExec(t *testing.T) {
// TestKillPid verifies that we can signal individual exec'd processes.
func TestKillPid(t *testing.T) {
- for name, conf := range configs(t, overlay) {
+ for name, conf := range configsWithVFS2(t, overlay) {
t.Run(name, func(t *testing.T) {
app, err := testutil.FindFile("test/cmd/test_app/test_app")
if err != nil {
@@ -1211,7 +1353,7 @@ func TestCapabilities(t *testing.T) {
uid := auth.KUID(os.Getuid() + 1)
gid := auth.KGID(os.Getgid() + 1)
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
spec := testutil.NewSpecWithArgs("sleep", "100")
rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
@@ -1237,24 +1379,9 @@ func TestCapabilities(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
- {
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
- Threads: []kernel.ThreadID{1},
- },
- {
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "exe",
- Threads: []kernel.ThreadID{2},
- },
+ newProcessBuilder().Cmd("sleep").Process(),
}
- if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
+ if err := waitForProcessList(cont, expectedPL); err != nil {
t.Fatalf("Failed to wait for sleep to start, err: %v", err)
}
@@ -1409,7 +1536,7 @@ func TestReadonlyRoot(t *testing.T) {
}
func TestUIDMap(t *testing.T) {
- for name, conf := range configs(t, noOverlay...) {
+ for name, conf := range configsWithVFS2(t, noOverlay...) {
t.Run(name, func(t *testing.T) {
testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
if err != nil {
@@ -1537,28 +1664,6 @@ func TestReadonlyMount(t *testing.T) {
}
}
-func TestBindMountByOption(t *testing.T) {
- for _, conf := range configs(t, overlay) {
- t.Logf("Running test with conf: %+v", conf)
-
- dir, err := ioutil.TempDir(testutil.TmpDir(), "bind-mount")
- spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file"))
- if err != nil {
- t.Fatalf("ioutil.TempDir() failed: %v", err)
- }
- spec.Mounts = append(spec.Mounts, specs.Mount{
- Destination: dir,
- Source: dir,
- Type: "none",
- Options: []string{"rw", "bind"},
- })
-
- if err := run(spec, conf); err != nil {
- t.Fatalf("error running sandbox: %v", err)
- }
- }
-}
-
// TestAbbreviatedIDs checks that runsc supports using abbreviated container
// IDs in place of full IDs.
func TestAbbreviatedIDs(t *testing.T) {
@@ -1908,7 +2013,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
}
func TestCreateWorkingDir(t *testing.T) {
- for name, conf := range configs(t, overlay) {
+ for name, conf := range configsWithVFS2(t, overlay) {
t.Run(name, func(t *testing.T) {
tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
if err != nil {
@@ -2031,7 +2136,7 @@ func TestMountPropagation(t *testing.T) {
}
func TestMountSymlink(t *testing.T) {
- for name, conf := range configs(t, overlay) {
+ for name, conf := range configsWithVFS2(t, overlay) {
t.Run(name, func(t *testing.T) {
dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
if err != nil {
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index f6861b1dd..e189648f4 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -27,6 +27,7 @@ import (
"time"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
@@ -64,29 +65,16 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
}
- var (
- containers []*Container
- cleanups []func()
- )
- cleanups = append(cleanups, func() {
- for _, c := range containers {
- c.Destroy()
- }
- })
- cleanupAll := func() {
- for _, c := range cleanups {
- c()
- }
- }
- localClean := specutils.MakeCleanup(cleanupAll)
- defer localClean.Clean()
+ cu := cleanup.Cleanup{}
+ defer cu.Clean()
+ var containers []*Container
for i, spec := range specs {
bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
if err != nil {
return nil, nil, fmt.Errorf("error setting up container: %v", err)
}
- cleanups = append(cleanups, cleanup)
+ cu.Add(cleanup)
args := Args{
ID: ids[i],
@@ -97,6 +85,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
if err != nil {
return nil, nil, fmt.Errorf("error creating container: %v", err)
}
+ cu.Add(func() { cont.Destroy() })
containers = append(containers, cont)
if err := cont.Start(conf); err != nil {
@@ -104,27 +93,27 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
}
}
- localClean.Release()
- return containers, cleanupAll, nil
+ return containers, cu.Release(), nil
}
type execDesc struct {
c *Container
cmd []string
want int
- desc string
+ name string
}
-func execMany(execs []execDesc) error {
+func execMany(t *testing.T, execs []execDesc) {
for _, exec := range execs {
- args := &control.ExecArgs{Argv: exec.cmd}
- if ws, err := exec.c.executeSync(args); err != nil {
- return fmt.Errorf("error executing %+v: %v", args, err)
- } else if ws.ExitStatus() != exec.want {
- return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want)
- }
+ t.Run(exec.name, func(t *testing.T) {
+ args := &control.ExecArgs{Argv: exec.cmd}
+ if ws, err := exec.c.executeSync(args); err != nil {
+ t.Errorf("error executing %+v: %v", args, err)
+ } else if ws.ExitStatus() != exec.want {
+ t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want)
+ }
+ })
}
- return nil
}
func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
@@ -141,7 +130,7 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
// TestMultiContainerSanity checks that it is possible to run 2 dead-simple
// containers in the same sandbox.
func TestMultiContainerSanity(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -161,13 +150,13 @@ func TestMultiContainerSanity(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
}
expectedPL = []*control.Process{
- {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+ newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -207,13 +196,13 @@ func TestMultiPIDNS(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
}
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -269,7 +258,7 @@ func TestMultiPIDNSPath(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).PPID(0).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -279,7 +268,7 @@ func TestMultiPIDNSPath(t *testing.T) {
}
expectedPL = []*control.Process{
- {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+ newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -312,7 +301,7 @@ func TestMultiContainerWait(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+ newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -357,7 +346,7 @@ func TestMultiContainerWait(t *testing.T) {
// After Wait returns, ensure that the root container is running and
// the child has finished.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -389,7 +378,7 @@ func TestExecWait(t *testing.T) {
// Check via ps that process is running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+ newProcessBuilder().Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -424,7 +413,7 @@ func TestExecWait(t *testing.T) {
// Wait for the exec'd process to exit.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -510,9 +499,8 @@ func TestMultiContainerSignal(t *testing.T) {
// Check via ps that container 1 process is running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
+ newProcessBuilder().Cmd("sleep").Process(),
}
-
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
}
@@ -524,7 +512,7 @@ func TestMultiContainerSignal(t *testing.T) {
// Make sure process 1 is still running.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -638,8 +626,10 @@ func TestMultiContainerDestroy(t *testing.T) {
if err != nil {
t.Fatalf("error getting process data from sandbox: %v", err)
}
- expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
- if r, err := procListsEqual(pss, expectedPL); !r {
+ expectedPL := []*control.Process{
+ newProcessBuilder().PID(1).Cmd("sleep").Process(),
+ }
+ if !procListsEqual(pss, expectedPL) {
t.Errorf("container got process list: %s, want: %s: error: %v",
procListToString(pss), procListToString(expectedPL), err)
}
@@ -676,7 +666,7 @@ func TestMultiContainerProcesses(t *testing.T) {
// Check root's container process list doesn't include other containers.
expectedPL0 := []*control.Process{
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[0], expectedPL0); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
@@ -684,8 +674,8 @@ func TestMultiContainerProcesses(t *testing.T) {
// Same for the other container.
expectedPL1 := []*control.Process{
- {PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
- {PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+ newProcessBuilder().PID(2).Cmd("sh").Process(),
+ newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process(),
}
if err := waitForProcessList(containers[1], expectedPL1); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
@@ -699,7 +689,7 @@ func TestMultiContainerProcesses(t *testing.T) {
if _, err := containers[1].Execute(args); err != nil {
t.Fatalf("error exec'ing: %v", err)
}
- expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
+ expectedPL1 = append(expectedPL1, newProcessBuilder().PID(4).Cmd("sleep").Process())
if err := waitForProcessList(containers[1], expectedPL1); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
}
@@ -1083,7 +1073,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
// Test that pod shared mounts are properly mounted in 2 containers and that
// changes from one container is reflected in the other.
func TestMultiContainerSharedMount(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -1121,84 +1111,82 @@ func TestMultiContainerSharedMount(t *testing.T) {
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
- desc: "directory is mounted in container0",
+ name: "directory is mounted in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
- desc: "directory is mounted in container1",
+ name: "directory is mounted in container1",
},
{
c: containers[0],
- cmd: []string{"/usr/bin/touch", file0},
- desc: "create file in container0",
+ cmd: []string{"/bin/touch", file0},
+ name: "create file in container0",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-f", file0},
- desc: "file appears in container0",
+ name: "file appears in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-f", file1},
- desc: "file appears in container1",
+ name: "file appears in container1",
},
{
c: containers[1],
cmd: []string{"/bin/rm", file1},
- desc: "file removed from container1",
+ name: "remove file from container1",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "!", "-f", file0},
- desc: "file removed from container0",
+ name: "file removed from container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "!", "-f", file1},
- desc: "file removed from container1",
+ name: "file removed from container1",
},
{
c: containers[1],
cmd: []string{"/bin/mkdir", file1},
- desc: "create directory in container1",
+ name: "create directory in container1",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-d", file0},
- desc: "dir appears in container0",
+ name: "dir appears in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-d", file1},
- desc: "dir appears in container1",
+ name: "dir appears in container1",
},
{
c: containers[0],
cmd: []string{"/bin/rmdir", file0},
- desc: "create directory in container0",
+ name: "remove directory from container0",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "!", "-d", file0},
- desc: "dir removed from container0",
+ name: "dir removed from container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "!", "-d", file1},
- desc: "dir removed from container1",
+ name: "dir removed from container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
})
}
}
// Test that pod mounts are mounted as readonly when requested.
func TestMultiContainerSharedMountReadonly(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -1236,35 +1224,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
- desc: "directory is mounted in container0",
+ name: "directory is mounted in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
- desc: "directory is mounted in container1",
+ name: "directory is mounted in container1",
},
{
c: containers[0],
- cmd: []string{"/usr/bin/touch", file0},
+ cmd: []string{"/bin/touch", file0},
want: 1,
- desc: "fails to write to container0",
+ name: "fails to write to container0",
},
{
c: containers[1],
- cmd: []string{"/usr/bin/touch", file1},
+ cmd: []string{"/bin/touch", file1},
want: 1,
- desc: "fails to write to container1",
+ name: "fails to write to container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
})
}
}
// Test that shared pod mounts continue to work after container is restarted.
func TestMultiContainerSharedMountRestart(t *testing.T) {
+ //TODO(gvisor.dev/issue/1487): This is failing with VFS2.
for name, conf := range configs(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
@@ -1302,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
execs := []execDesc{
{
c: containers[0],
- cmd: []string{"/usr/bin/touch", file0},
- desc: "create file in container0",
+ cmd: []string{"/bin/touch", file0},
+ name: "create file in container0",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-f", file0},
- desc: "file appears in container0",
+ name: "file appears in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-f", file1},
- desc: "file appears in container1",
+ name: "file appears in container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
containers[1].Destroy()
@@ -1345,32 +1330,30 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
{
c: containers[0],
cmd: []string{"/usr/bin/test", "-f", file0},
- desc: "file is still in container0",
+ name: "file is still in container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "-f", file1},
- desc: "file is still in container1",
+ name: "file is still in container1",
},
{
c: containers[1],
cmd: []string{"/bin/rm", file1},
- desc: "file removed from container1",
+ name: "file removed from container1",
},
{
c: containers[0],
cmd: []string{"/usr/bin/test", "!", "-f", file0},
- desc: "file removed from container0",
+ name: "file removed from container0",
},
{
c: containers[1],
cmd: []string{"/usr/bin/test", "!", "-f", file1},
- desc: "file removed from container1",
+ name: "file removed from container1",
},
}
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
- }
+ execMany(t, execs)
})
}
}
@@ -1378,53 +1361,53 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
// Test that unsupported pod mounts options are ignored when matching master and
// slave mounts.
func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
- rootDir, cleanup, err := testutil.SetupRootDir()
- if err != nil {
- t.Fatalf("error creating root dir: %v", err)
- }
- defer cleanup()
-
- conf := testutil.TestConfig(t)
- conf.RootDir = rootDir
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
- // Setup the containers.
- sleep := []string{"/bin/sleep", "100"}
- podSpec, ids := createSpecs(sleep, sleep)
- mnt0 := specs.Mount{
- Destination: "/mydir/test",
- Source: "/some/dir",
- Type: "tmpfs",
- Options: []string{"rw", "relatime"},
- }
- podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+ // Setup the containers.
+ sleep := []string{"/bin/sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: []string{"rw", "rbind", "relatime"},
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
- mnt1 := mnt0
- mnt1.Destination = "/mydir2/test2"
- mnt1.Options = []string{"rw", "nosuid"}
- podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ mnt1.Options = []string{"rw", "nosuid"}
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
- createSharedMount(mnt0, "test-mount", podSpec...)
+ createSharedMount(mnt0, "test-mount", podSpec...)
- containers, cleanup, err := startContainers(conf, podSpec, ids)
- if err != nil {
- t.Fatalf("error starting containers: %v", err)
- }
- defer cleanup()
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
- execs := []execDesc{
- {
- c: containers[0],
- cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
- desc: "directory is mounted in container0",
- },
- {
- c: containers[1],
- cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
- desc: "directory is mounted in container1",
- },
- }
- if err := execMany(execs); err != nil {
- t.Fatal(err.Error())
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
+ name: "directory is mounted in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
+ name: "directory is mounted in container1",
+ },
+ }
+ execMany(t, execs)
+ })
}
}
@@ -1517,7 +1500,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
// Ensure container is running
c := containers[2]
expectedPL := []*control.Process{
- {PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
+ newProcessBuilder().PID(3).Cmd("sleep").Process(),
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1545,7 +1528,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
continue // container[2] has been killed.
}
pl := []*control.Process{
- {PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
+ newProcessBuilder().PID(kernel.ThreadID(i + 1)).Cmd("sleep").Process(),
}
if err := waitForProcessList(c, pl); err != nil {
t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1565,7 +1548,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
// and RPC errors out.
impossiblePL := []*control.Process{
- {PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
+ newProcessBuilder().Cmd("non-existent-process").Process(),
}
if err := waitForProcessList(c, impossiblePL); err == nil {
t.Fatalf("Sandbox was not killed after gofer death")
@@ -1709,3 +1692,83 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
t.Fatalf("child container failed, waitStatus: %v", ws)
}
}
+
+// TestMultiContainerHomeEnvDir tests that the HOME environment variable is set
+// for root containers, sub-containers, and execed processes.
+func TestMultiContainerHomeEnvDir(t *testing.T) {
+ // TODO(gvisor.dev/issue/1487): VFSv2 configs failing.
+ // NOTE: Don't use overlay since we need changes to persist to the temp dir
+ // outside the sandbox.
+ for testName, conf := range configs(t, noOverlay...) {
+ t.Run(testName, func(t *testing.T) {
+
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
+
+ // Create temp files we can write the value of $HOME to.
+ homeDirs := map[string]*os.File{}
+ for _, name := range []string{"root", "sub", "exec"} {
+ homeFile, err := ioutil.TempFile(testutil.TmpDir(), name)
+ if err != nil {
+ t.Fatalf("creating temp file: %v", err)
+ }
+ homeDirs[name] = homeFile
+ }
+
+ // We will sleep in the root container in order to ensure that
+ // the root container doesn't terminate before sub containers can be
+ // created.
+ rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
+ subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
+ execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())}
+
+ // Setup the containers, a root container and sub container.
+ specConfig, ids := createSpecs(rootCmd, subCmd)
+ containers, cleanup, err := startContainers(conf, specConfig, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ // Exec into the root container synchronously.
+ args := &control.ExecArgs{Argv: execCmd}
+ if _, err := containers[0].executeSync(args); err != nil {
+ t.Errorf("error executing %+v: %v", args, err)
+ }
+
+ // Wait for the subcontainer to finish.
+ _, err = containers[1].Wait()
+ if err != nil {
+ t.Errorf("wait on child container: %v", err)
+ }
+
+ // Wait for the root container to run.
+ expectedPL := []*control.Process{
+ newProcessBuilder().Cmd("sh").Process(),
+ newProcessBuilder().Cmd("sleep").Process(),
+ }
+ if err := waitForProcessList(containers[0], expectedPL); err != nil {
+ t.Errorf("failed to wait for sleep to start: %v", err)
+ }
+
+ // Check the written files.
+ for name, tmpFile := range homeDirs {
+ dirBytes, err := ioutil.ReadAll(tmpFile)
+ if err != nil {
+ t.Fatalf("reading %s temp file: %v", name, err)
+ }
+ got := string(dirBytes)
+
+ want := "/"
+ if got != want {
+ t.Errorf("%s $HOME incorrect: got: %q, want: %q", name, got, want)
+ }
+ }
+
+ })
+ }
+}
diff --git a/runsc/debian/postinst.sh b/runsc/debian/postinst.sh
index dc7aeee87..d1e28e17b 100755
--- a/runsc/debian/postinst.sh
+++ b/runsc/debian/postinst.sh
@@ -18,7 +18,14 @@ if [ "$1" != configure ]; then
exit 0
fi
+# Update docker configuration.
if [ -f /etc/docker/daemon.json ]; then
runsc install
- systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
+ if systemctl status docker 2>/dev/null; then
+ systemctl restart docker || echo "unable to restart docker; you must do so manually." >&2
+ fi
fi
+
+# For containerd-based installers, we don't automatically update the
+# configuration. If it uses a v2 shim, then it will find the package binaries
+# automatically when provided the appropriate annotation.
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 64a406ae2..05e3637f7 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -13,12 +13,12 @@ go_library(
visibility = ["//runsc:__subpackages__"],
deps = [
"//pkg/abi/linux",
+ "//pkg/cleanup",
"//pkg/fd",
"//pkg/log",
"//pkg/p9",
"//pkg/sync",
"//pkg/syserr",
- "//runsc/specutils",
"@org_golang_x_sys//unix:go_default_library",
],
)
@@ -31,5 +31,6 @@ go_test(
deps = [
"//pkg/log",
"//pkg/p9",
+ "//pkg/test/testutil",
],
)
diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go
index 1dce36965..88814b83c 100644
--- a/runsc/fsgofer/filter/config.go
+++ b/runsc/fsgofer/filter/config.go
@@ -128,6 +128,7 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_MADVISE: {},
unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init().
syscall.SYS_MKDIRAT: {},
+ syscall.SYS_MKNODAT: {},
// Used by the Go runtime as a temporarily workaround for a Linux
// 5.2-5.4 bug.
//
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 1942f50d7..c6694c278 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -33,11 +33,11 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/runsc/specutils"
)
const (
@@ -48,36 +48,6 @@ const (
openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC
)
-type fileType int
-
-const (
- regular fileType = iota
- directory
- symlink
- socket
- unknown
-)
-
-// String implements fmt.Stringer.
-func (f fileType) String() string {
- switch f {
- case regular:
- return "regular"
- case directory:
- return "directory"
- case symlink:
- return "symlink"
- case socket:
- return "socket"
- }
- return "unknown"
-}
-
-// ControlSocketAddr generates an abstract unix socket name for the given id.
-func ControlSocketAddr(id string) string {
- return fmt.Sprintf("\x00runsc-gofer.%s", id)
-}
-
// Config sets configuration options for each attach point.
type Config struct {
// ROMount is set to true if this is a readonly mount.
@@ -132,19 +102,19 @@ func (a *attachPoint) Attach() (p9.File, error) {
return nil, fmt.Errorf("attach point already attached, prefix: %s", a.prefix)
}
- f, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
+ f, readable, err := openAnyFile(a.prefix, func(mode int) (*fd.FD, error) {
return fd.Open(a.prefix, openFlags|mode, 0)
})
if err != nil {
return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err)
}
- stat, err := stat(f.FD())
+ stat, err := fstat(f.FD())
if err != nil {
return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
}
- lf, err := newLocalFile(a, f, a.prefix, stat)
+ lf, err := newLocalFile(a, f, a.prefix, readable, stat)
if err != nil {
return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
}
@@ -175,8 +145,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
log.Warningf("first 8 bytes of host inode id %x will be truncated to construct virtual inode id", stat.Ino)
}
ino := uint64(dev)<<56 | maskedIno
- log.Debugf("host inode %x on device %x mapped to virtual inode %x", stat.Ino, stat.Dev, ino)
-
return p9.QID{
Type: p9.FileMode(stat.Mode).QIDType(),
Path: ino,
@@ -201,8 +169,6 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
// entire file up when it's opened in write mode, and would perform badly when
// multiple files are only being opened for read (esp. startup).
type localFile struct {
- p9.DefaultWalkGetAttr
-
// attachPoint is the attachPoint that serves this localFile.
attachPoint *attachPoint
@@ -214,12 +180,19 @@ type localFile struct {
// opened with.
file *fd.FD
+ // controlReadable tells whether 'file' was opened with read permissions
+ // during a walk.
+ controlReadable bool
+
// mode is the mode in which the file was opened. Set to invalidMode
// if localFile isn't opened.
mode p9.OpenFlags
- // ft is the fileType for this file.
- ft fileType
+ // fileType for this file. It is equivalent to:
+ // syscall.Stat_t.Mode & syscall.S_IFMT
+ fileType uint32
+
+ qid p9.QID
// readDirMu protects against concurrent Readdir calls.
readDirMu sync.Mutex
@@ -253,83 +226,88 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
return fd.New(d), nil
}
-func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, error) {
+func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) {
path := path.Join(parent.hostPath, name)
- f, err := openAnyFile(path, func(mode int) (*fd.FD, error) {
+ f, readable, err := openAnyFile(path, func(mode int) (*fd.FD, error) {
return fd.OpenAt(parent.file, name, openFlags|mode, 0)
})
- return f, path, err
+ return f, path, readable, err
}
// openAnyFile attempts to open the file in O_RDONLY and if it fails fallsback
// to O_PATH. 'path' is used for logging messages only. 'fn' is what does the
// actual file open and is customizable by the caller.
-func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, error) {
+func openAnyFile(path string, fn func(mode int) (*fd.FD, error)) (*fd.FD, bool, error) {
// Attempt to open file in the following mode in order:
// 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
// Use non-blocking to prevent getting stuck inside open(2) for
// FIFOs. This option has no effect on regular files.
// 2. PATH: for symlinks, sockets.
- modes := []int{syscall.O_RDONLY | syscall.O_NONBLOCK, unix.O_PATH}
+ options := []struct {
+ mode int
+ readable bool
+ }{
+ {
+ mode: syscall.O_RDONLY | syscall.O_NONBLOCK,
+ readable: true,
+ },
+ {
+ mode: unix.O_PATH,
+ readable: false,
+ },
+ }
var err error
- var file *fd.FD
- for i, mode := range modes {
- file, err = fn(mode)
+ for i, option := range options {
+ var file *fd.FD
+ file, err = fn(option.mode)
if err == nil {
- // openat succeeded, we're done.
- break
+ // Succeeded opening the file, we're done.
+ return file, option.readable, nil
}
switch e := extractErrno(err); e {
case syscall.ENOENT:
// File doesn't exist, no point in retrying.
- return nil, e
+ return nil, false, e
}
- // openat failed. Try again with next mode, preserving 'err' in case this
- // was the last attempt.
- log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|mode, path, err)
- }
- if err != nil {
- // All attempts to open file have failed, return the last error.
- log.Debugf("Failed to open file, path: %q, err: %v", path, err)
- return nil, extractErrno(err)
+ // File failed to open. Try again with next mode, preserving 'err' in case
+ // this was the last attempt.
+ log.Debugf("Attempt %d to open file failed, mode: %#x, path: %q, err: %v", i, openFlags|option.mode, path, err)
}
-
- return file, nil
+ // All attempts to open file have failed, return the last error.
+ log.Debugf("Failed to open file, path: %q, err: %v", path, err)
+ return nil, false, extractErrno(err)
}
-func getSupportedFileType(stat syscall.Stat_t, permitSocket bool) (fileType, error) {
- var ft fileType
+func checkSupportedFileType(stat syscall.Stat_t, permitSocket bool) error {
switch stat.Mode & syscall.S_IFMT {
- case syscall.S_IFREG:
- ft = regular
- case syscall.S_IFDIR:
- ft = directory
- case syscall.S_IFLNK:
- ft = symlink
+ case syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK:
+ return nil
+
case syscall.S_IFSOCK:
if !permitSocket {
- return unknown, syscall.EPERM
+ return syscall.EPERM
}
- ft = socket
+ return nil
+
default:
- return unknown, syscall.EPERM
+ return syscall.EPERM
}
- return ft, nil
}
-func newLocalFile(a *attachPoint, file *fd.FD, path string, stat syscall.Stat_t) (*localFile, error) {
- ft, err := getSupportedFileType(stat, a.conf.HostUDS)
- if err != nil {
+func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat syscall.Stat_t) (*localFile, error) {
+ if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil {
return nil, err
}
return &localFile{
- attachPoint: a,
- hostPath: path,
- file: file,
- mode: invalidMode,
- ft: ft,
+ attachPoint: a,
+ hostPath: path,
+ file: file,
+ mode: invalidMode,
+ fileType: stat.Mode & syscall.S_IFMT,
+ qid: a.makeQID(stat),
+ controlReadable: readable,
}, nil
}
@@ -348,13 +326,13 @@ func newFDMaybe(file *fd.FD) *fd.FD {
// fd is blocking; non-blocking is required.
if err := syscall.SetNonblock(dup.FD(), true); err != nil {
- dup.Close()
+ _ = dup.Close()
return nil
}
return dup
}
-func stat(fd int) (syscall.Stat_t, error) {
+func fstat(fd int) (syscall.Stat_t, error) {
var stat syscall.Stat_t
if err := syscall.Fstat(fd, &stat); err != nil {
return syscall.Stat_t{}, err
@@ -362,6 +340,14 @@ func stat(fd int) (syscall.Stat_t, error) {
return stat, nil
}
+func stat(path string) (syscall.Stat_t, error) {
+ var stat syscall.Stat_t
+ if err := syscall.Stat(path, &stat); err != nil {
+ return syscall.Stat_t{}, err
+ }
+ return stat, nil
+}
+
func fchown(fd int, uid p9.UID, gid p9.GID) error {
return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
}
@@ -374,7 +360,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
// Check if control file can be used or if a new open must be created.
var newFile *fd.FD
- if flags == p9.ReadOnly {
+ if flags == p9.ReadOnly && l.controlReadable {
log.Debugf("Open reusing control file, flags: %v, %q", flags, l.hostPath)
newFile = l.file
} else {
@@ -390,16 +376,8 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
}
}
- stat, err := stat(newFile.FD())
- if err != nil {
- if newFile != l.file {
- newFile.Close()
- }
- return nil, p9.QID{}, 0, extractErrno(err)
- }
-
var fd *fd.FD
- if stat.Mode&syscall.S_IFMT == syscall.S_IFREG {
+ if l.fileType == syscall.S_IFREG {
// Donate FD for regular files only.
fd = newFDMaybe(newFile)
}
@@ -412,7 +390,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
l.file = newFile
}
l.mode = flags & p9.OpenFlagsModeMask
- return fd, l.attachPoint.makeQID(stat), 0, nil
+ return fd, l.qid, 0, nil
}
// Create implements p9.File.
@@ -439,8 +417,8 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
if err != nil {
return nil, nil, p9.QID{}, 0, extractErrno(err)
}
- cu := specutils.MakeCleanup(func() {
- child.Close()
+ cu := cleanup.Make(func() {
+ _ = child.Close()
// Best effort attempt to remove the file in case of failure.
if err := syscall.Unlinkat(l.file.FD(), name); err != nil {
log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, name), err)
@@ -451,7 +429,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
if err := fchown(child.FD(), uid, gid); err != nil {
return nil, nil, p9.QID{}, 0, extractErrno(err)
}
- stat, err := stat(child.FD())
+ stat, err := fstat(child.FD())
if err != nil {
return nil, nil, p9.QID{}, 0, extractErrno(err)
}
@@ -461,10 +439,12 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid
hostPath: path.Join(l.hostPath, name),
file: child,
mode: mode,
+ fileType: syscall.S_IFREG,
+ qid: l.attachPoint.makeQID(stat),
}
cu.Release()
- return newFDMaybe(c.file), c, l.attachPoint.makeQID(stat), 0, nil
+ return newFDMaybe(c.file), c, c.qid, 0, nil
}
// Mkdir implements p9.File.
@@ -480,7 +460,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
if err := syscall.Mkdirat(l.file.FD(), name, uint32(perm.Permissions())); err != nil {
return p9.QID{}, extractErrno(err)
}
- cu := specutils.MakeCleanup(func() {
+ cu := cleanup.Make(func() {
// Best effort attempt to remove the dir in case of failure.
if err := unix.Unlinkat(l.file.FD(), name, unix.AT_REMOVEDIR); err != nil {
log.Warningf("error unlinking dir %q after failure: %v", path.Join(l.hostPath, name), err)
@@ -499,7 +479,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
if err := fchown(f.FD(), uid, gid); err != nil {
return p9.QID{}, extractErrno(err)
}
- stat, err := stat(f.FD())
+ stat, err := fstat(f.FD())
if err != nil {
return p9.QID{}, extractErrno(err)
}
@@ -510,55 +490,74 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
// Walk implements p9.File.
func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) {
+ qids, file, _, err := l.walk(names)
+ return qids, file, err
+}
+
+// WalkGetAttr implements p9.File.
+func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask, p9.Attr, error) {
+ qids, file, stat, err := l.walk(names)
+ if err != nil {
+ return nil, nil, p9.AttrMask{}, p9.Attr{}, err
+ }
+ mask, attr := l.fillAttr(stat)
+ return qids, file, mask, attr, nil
+}
+
+func (l *localFile) walk(names []string) ([]p9.QID, p9.File, syscall.Stat_t, error) {
// Duplicate current file if 'names' is empty.
if len(names) == 0 {
- newFile, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
+ newFile, readable, err := openAnyFile(l.hostPath, func(mode int) (*fd.FD, error) {
return reopenProcFd(l.file, openFlags|mode)
})
if err != nil {
- return nil, nil, extractErrno(err)
+ return nil, nil, syscall.Stat_t{}, extractErrno(err)
}
- stat, err := stat(newFile.FD())
+ stat, err := fstat(newFile.FD())
if err != nil {
- newFile.Close()
- return nil, nil, extractErrno(err)
+ _ = newFile.Close()
+ return nil, nil, syscall.Stat_t{}, extractErrno(err)
}
c := &localFile{
- attachPoint: l.attachPoint,
- hostPath: l.hostPath,
- file: newFile,
- mode: invalidMode,
+ attachPoint: l.attachPoint,
+ hostPath: l.hostPath,
+ file: newFile,
+ mode: invalidMode,
+ fileType: l.fileType,
+ qid: l.attachPoint.makeQID(stat),
+ controlReadable: readable,
}
- return []p9.QID{l.attachPoint.makeQID(stat)}, c, nil
+ return []p9.QID{c.qid}, c, stat, nil
}
var qids []p9.QID
+ var lastStat syscall.Stat_t
last := l
for _, name := range names {
- f, path, err := openAnyFileFromParent(last, name)
+ f, path, readable, err := openAnyFileFromParent(last, name)
if last != l {
- last.Close()
+ _ = last.Close()
}
if err != nil {
- return nil, nil, extractErrno(err)
+ return nil, nil, syscall.Stat_t{}, extractErrno(err)
}
- stat, err := stat(f.FD())
+ lastStat, err = fstat(f.FD())
if err != nil {
- f.Close()
- return nil, nil, extractErrno(err)
+ _ = f.Close()
+ return nil, nil, syscall.Stat_t{}, extractErrno(err)
}
- c, err := newLocalFile(last.attachPoint, f, path, stat)
+ c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat)
if err != nil {
- f.Close()
- return nil, nil, extractErrno(err)
+ _ = f.Close()
+ return nil, nil, syscall.Stat_t{}, extractErrno(err)
}
- qids = append(qids, l.attachPoint.makeQID(stat))
+ qids = append(qids, c.qid)
last = c
}
- return qids, last, nil
+ return qids, last, lastStat, nil
}
// StatFS implements p9.File.
@@ -594,11 +593,15 @@ func (l *localFile) FSync() error {
// GetAttr implements p9.File.
func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
- stat, err := stat(l.file.FD())
+ stat, err := fstat(l.file.FD())
if err != nil {
return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
}
+ mask, attr := l.fillAttr(stat)
+ return l.qid, mask, attr, nil
+}
+func (l *localFile) fillAttr(stat syscall.Stat_t) (p9.AttrMask, p9.Attr) {
attr := p9.Attr{
Mode: p9.FileMode(stat.Mode),
UID: p9.UID(stat.Uid),
@@ -627,8 +630,7 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
MTime: true,
CTime: true,
}
-
- return l.attachPoint.makeQID(stat), valid, attr, nil
+ return valid, attr
}
// SetAttr implements p9.File. Due to mismatch in file API, options
@@ -669,7 +671,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
// Check if it's possible to use cached file, or if another one needs to be
// opened for write.
f := l.file
- if l.ft == regular && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
+ if l.fileType == syscall.S_IFREG && l.mode != p9.WriteOnly && l.mode != p9.ReadWrite {
var err error
f, err = reopenProcFd(l.file, openFlags|os.O_WRONLY)
if err != nil {
@@ -725,7 +727,7 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
}
}
- if l.ft == symlink {
+ if l.fileType == syscall.S_IFLNK {
// utimensat operates different that other syscalls. To operate on a
// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
// name.
@@ -864,7 +866,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
if err := unix.Symlinkat(target, l.file.FD(), newName); err != nil {
return p9.QID{}, extractErrno(err)
}
- cu := specutils.MakeCleanup(func() {
+ cu := cleanup.Make(func() {
// Best effort attempt to remove the symlink in case of failure.
if err := syscall.Unlinkat(l.file.FD(), newName); err != nil {
log.Warningf("error unlinking file %q after failure: %v", path.Join(l.hostPath, newName), err)
@@ -882,7 +884,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
if err := fchown(f.FD(), uid, gid); err != nil {
return p9.QID{}, extractErrno(err)
}
- stat, err := stat(f.FD())
+ stat, err := fstat(f.FD())
if err != nil {
return p9.QID{}, extractErrno(err)
}
@@ -909,13 +911,39 @@ func (l *localFile) Link(target p9.File, newName string) error {
}
// Mknod implements p9.File.
-//
-// Not implemented.
-func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) {
+ conf := l.attachPoint.conf
+ if conf.ROMount {
+ if conf.PanicOnWrite {
+ panic("attempt to write to RO mount")
+ }
+ return p9.QID{}, syscall.EROFS
+ }
+
+ hostPath := path.Join(l.hostPath, name)
+
+ // Return EEXIST if the file already exists.
+ if _, err := stat(hostPath); err == nil {
+ return p9.QID{}, syscall.EEXIST
+ }
+
// From mknod(2) man page:
// "EPERM: [...] if the filesystem containing pathname does not support
// the type of node requested."
- return p9.QID{}, syscall.EPERM
+ if mode.FileType() != p9.ModeRegular {
+ return p9.QID{}, syscall.EPERM
+ }
+
+ // Allow Mknod to create regular files.
+ if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil {
+ return p9.QID{}, err
+ }
+
+ stat, err := stat(hostPath)
+ if err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
+ return l.attachPoint.makeQID(stat), nil
}
// UnlinkAt implements p9.File.
@@ -951,9 +979,12 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
skip := uint64(0)
- // Check if the file is at the correct position already. If not, seek to the
- // beginning and read the entire directory again.
- if l.lastDirentOffset != offset {
+ // Check if the file is at the correct position already. If not, seek to
+ // the beginning and read the entire directory again. We always seek if
+ // offset is 0, since this is side-effectual (equivalent to rewinddir(3),
+ // which causes the directory stream to resynchronize with the directory's
+ // current contents).
+ if l.lastDirentOffset != offset || offset == 0 {
if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
return nil, extractErrno(err)
}
@@ -1081,13 +1112,13 @@ func (l *localFile) Connect(flags p9.ConnectFlags) (*fd.FD, error) {
}
if err := syscall.SetNonblock(f, true); err != nil {
- syscall.Close(f)
+ _ = syscall.Close(f)
return nil, err
}
sa := syscall.SockaddrUnix{Name: l.hostPath}
if err := syscall.Connect(f, &sa); err != nil {
- syscall.Close(f)
+ _ = syscall.Close(f)
return nil, err
}
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 05af7e397..94f167417 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -26,6 +26,19 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
+
+var (
+ allTypes = []uint32{syscall.S_IFREG, syscall.S_IFDIR, syscall.S_IFLNK}
+
+ // allConfs is set in init().
+ allConfs []Config
+
+ rwConfs = []Config{{ROMount: false}}
+ roConfs = []Config{{ROMount: true}}
)
func init() {
@@ -39,6 +52,13 @@ func init() {
}
}
+func configTestName(config *Config) string {
+ if config.ROMount {
+ return "ROMount"
+ }
+ return "RWMount"
+}
+
func assertPanic(t *testing.T, f func()) {
defer func() {
if r := recover(); r == nil {
@@ -88,71 +108,76 @@ func testReadWrite(f p9.File, flags p9.OpenFlags, content []byte) error {
return nil
}
-var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
-
-var (
- allTypes = []fileType{regular, directory, symlink}
-
- // allConfs is set in init() above.
- allConfs []Config
-
- rwConfs = []Config{{ROMount: false}}
- roConfs = []Config{{ROMount: true}}
-)
-
type state struct {
- root *localFile
- file *localFile
- conf Config
- ft fileType
+ root *localFile
+ file *localFile
+ conf Config
+ fileType uint32
}
func (s state) String() string {
- return fmt.Sprintf("type(%v)", s.ft)
+ return fmt.Sprintf("type(%v)", s.fileType)
+}
+
+func typeName(fileType uint32) string {
+ switch fileType {
+ case syscall.S_IFREG:
+ return "file"
+ case syscall.S_IFDIR:
+ return "directory"
+ case syscall.S_IFLNK:
+ return "symlink"
+ default:
+ panic(fmt.Sprintf("invalid file type for test: %d", fileType))
+ }
}
func runAll(t *testing.T, test func(*testing.T, state)) {
runCustom(t, allTypes, allConfs, test)
}
-func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) {
+func runCustom(t *testing.T, types []uint32, confs []Config, test func(*testing.T, state)) {
for _, c := range confs {
- t.Logf("Config: %+v", c)
-
for _, ft := range types {
- t.Logf("File type: %v", ft)
+ name := fmt.Sprintf("%s/%s", configTestName(&c), typeName(ft))
+ t.Run(name, func(t *testing.T) {
+ path, name, err := setup(ft)
+ if err != nil {
+ t.Fatalf("%v", err)
+ }
+ defer os.RemoveAll(path)
- path, name, err := setup(ft)
- if err != nil {
- t.Fatalf("%v", err)
- }
- defer os.RemoveAll(path)
+ a, err := NewAttachPoint(path, c)
+ if err != nil {
+ t.Fatalf("NewAttachPoint failed: %v", err)
+ }
+ root, err := a.Attach()
+ if err != nil {
+ t.Fatalf("Attach failed, err: %v", err)
+ }
- a, err := NewAttachPoint(path, c)
- if err != nil {
- t.Fatalf("NewAttachPoint failed: %v", err)
- }
- root, err := a.Attach()
- if err != nil {
- t.Fatalf("Attach failed, err: %v", err)
- }
+ _, file, err := root.Walk([]string{name})
+ if err != nil {
+ root.Close()
+ t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err)
+ }
- _, file, err := root.Walk([]string{name})
- if err != nil {
+ st := state{
+ root: root.(*localFile),
+ file: file.(*localFile),
+ conf: c,
+ fileType: ft,
+ }
+ test(t, st)
+ file.Close()
root.Close()
- t.Fatalf("root.Walk({%q}) failed, err: %v", "symlink", err)
- }
-
- st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft}
- test(t, st)
- file.Close()
- root.Close()
+ })
}
}
}
-func setup(ft fileType) (string, string, error) {
- path, err := ioutil.TempDir("", "root-")
+func setup(fileType uint32) (string, string, error) {
+ path, err := ioutil.TempDir(testutil.TmpDir(), "root-")
if err != nil {
return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err)
}
@@ -169,26 +194,26 @@ func setup(ft fileType) (string, string, error) {
defer root.Close()
var name string
- switch ft {
- case regular:
+ switch fileType {
+ case syscall.S_IFREG:
name = "file"
_, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
if err != nil {
return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err)
}
defer f.Close()
- case directory:
+ case syscall.S_IFDIR:
name = "dir"
if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err)
}
- case symlink:
+ case syscall.S_IFLNK:
name = "symlink"
if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err)
}
default:
- panic(fmt.Sprintf("unknown file type %v", ft))
+ panic(fmt.Sprintf("unknown file type %v", fileType))
}
return path, name, nil
}
@@ -202,7 +227,7 @@ func createFile(dir *localFile, name string) (*localFile, error) {
}
func TestReadWrite(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
child, err := createFile(s.file, "test")
if err != nil {
t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -232,7 +257,7 @@ func TestReadWrite(t *testing.T) {
}
func TestCreate(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
for i, flags := range allOpenFlags {
_, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
if err != nil {
@@ -249,7 +274,7 @@ func TestCreate(t *testing.T) {
// TestReadWriteDup tests that a file opened in any mode can be dup'ed and
// reopened in any other mode.
func TestReadWriteDup(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
child, err := createFile(s.file, "test")
if err != nil {
t.Fatalf("%v: createFile() failed, err: %v", s, err)
@@ -291,7 +316,7 @@ func TestReadWriteDup(t *testing.T) {
}
func TestUnopened(t *testing.T) {
- runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{syscall.S_IFREG}, allConfs, func(t *testing.T, s state) {
b := []byte("foobar")
if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF {
t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err)
@@ -308,6 +333,32 @@ func TestUnopened(t *testing.T) {
})
}
+// TestOpenOPath is a regression test to ensure that a file that cannot be open
+// for read is allowed to be open. This was happening because the control file
+// was open with O_PATH, but Open() was not checking for it and allowing the
+// control file to be reused.
+func TestOpenOPath(t *testing.T) {
+ runCustom(t, []uint32{syscall.S_IFREG}, rwConfs, func(t *testing.T, s state) {
+ // Fist remove all permissions on the file.
+ if err := s.file.SetAttr(p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(0)}); err != nil {
+ t.Fatalf("SetAttr(): %v", err)
+ }
+ // Then walk to the file again to open a new control file.
+ filename := filepath.Base(s.file.hostPath)
+ _, newFile, err := s.root.Walk([]string{filename})
+ if err != nil {
+ t.Fatalf("root.Walk(%q): %v", filename, err)
+ }
+
+ if newFile.(*localFile).controlReadable {
+ t.Fatalf("control file didn't open with O_PATH: %+v", newFile)
+ }
+ if _, _, _, err := newFile.Open(p9.ReadOnly); err != syscall.EACCES {
+ t.Fatalf("Open() should have failed, got: %v, wanted: EACCES", err)
+ }
+ })
+}
+
func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) {
if err := l.SetAttr(valid, attr); err != nil {
return p9.Attr{}, err
@@ -324,7 +375,7 @@ func TestSetAttrPerm(t *testing.T) {
valid := p9.SetAttrMask{Permissions: true}
attr := p9.SetAttr{Permissions: 0777}
got, err := SetGetAttr(s.file, valid, attr)
- if s.ft == symlink {
+ if s.fileType == syscall.S_IFLNK {
if err == nil {
t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
}
@@ -345,7 +396,7 @@ func TestSetAttrSize(t *testing.T) {
valid := p9.SetAttrMask{Size: true}
attr := p9.SetAttr{Size: size}
got, err := SetGetAttr(s.file, valid, attr)
- if s.ft == symlink || s.ft == directory {
+ if s.fileType == syscall.S_IFLNK || s.fileType == syscall.S_IFDIR {
if err == nil {
t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions)
}
@@ -427,7 +478,7 @@ func TestLink(t *testing.T) {
}
err = dir.Link(s.file, linkFile)
- if s.ft == directory {
+ if s.fileType == syscall.S_IFDIR {
if err != syscall.EPERM {
t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err)
}
@@ -485,7 +536,7 @@ func TestROMountPanics(t *testing.T) {
}
func TestWalkNotFound(t *testing.T) {
- runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{syscall.S_IFDIR}, allConfs, func(t *testing.T, s state) {
if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT {
t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err)
}
@@ -506,7 +557,7 @@ func TestWalkDup(t *testing.T) {
}
func TestReaddir(t *testing.T) {
- runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) {
+ runCustom(t, []uint32{syscall.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
name := "dir"
if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err)
diff --git a/runsc/main.go b/runsc/main.go
index 920ed84a5..69cb505fa 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -71,7 +71,9 @@ var (
platformName = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm.")
network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
hardwareGSO = flag.Bool("gso", true, "enable hardware segmentation offload if it is supported by a network device.")
- softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware ofload can't be enabled.")
+ softwareGSO = flag.Bool("software-gso", true, "enable software segmentation offload when hardware offload can't be enabled.")
+ txChecksumOffload = flag.Bool("tx-checksum-offload", false, "enable TX checksum offload.")
+ rxChecksumOffload = flag.Bool("rx-checksum-offload", true, "enable RX checksum offload.")
qDisc = flag.String("qdisc", "fifo", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.")
fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
@@ -86,6 +88,7 @@ var (
referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)")
vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
+ fuseEnabled = flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
// Test flags, not to be used outside tests, ever.
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -223,6 +226,8 @@ func main() {
Network: netType,
HardwareGSO: *hardwareGSO,
SoftwareGSO: *softwareGSO,
+ TXChecksumOffload: *txChecksumOffload,
+ RXChecksumOffload: *rxChecksumOffload,
LogPackets: *logPackets,
Platform: platformType,
Strace: *strace,
@@ -238,6 +243,7 @@ func main() {
OverlayfsStaleRead: *overlayfsStaleRead,
CPUNumFromQuota: *cpuNumFromQuota,
VFS2: *vfs2Enabled,
+ FUSE: *fuseEnabled,
QDisc: queueingDiscipline,
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
TestOnlyTestNameEnv: *testOnlyTestNameEnv,
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index c95d50294..2b9d4549d 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -13,6 +13,7 @@ go_library(
"//runsc:__subpackages__",
],
deps = [
+ "//pkg/cleanup",
"//pkg/control/client",
"//pkg/control/server",
"//pkg/log",
@@ -28,7 +29,7 @@ go_library(
"//runsc/console",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
"@com_github_vishvananda_netlink//:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 209bfdb20..817a923ad 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -62,7 +62,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
- if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.NumNetworkChannels, conf.QDisc); err != nil {
+ if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.HardwareGSO, conf.SoftwareGSO, conf.TXChecksumOffload, conf.RXChecksumOffload, conf.NumNetworkChannels, conf.QDisc); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
case boot.NetworkHost:
@@ -115,7 +115,7 @@ func isRootNS() (bool, error) {
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareGSO bool, softwareGSO bool, txChecksumOffload bool, rxChecksumOffload bool, numNetworkChannels int, qDisc boot.QueueingDiscipline) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
@@ -134,7 +134,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
return err
}
if isRoot {
-
return fmt.Errorf("cannot run with network enabled in root network namespace")
}
@@ -197,11 +196,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
}
link := boot.FDBasedLink{
- Name: iface.Name,
- MTU: iface.MTU,
- Routes: routes,
- NumChannels: numNetworkChannels,
- QDisc: qDisc,
+ Name: iface.Name,
+ MTU: iface.MTU,
+ Routes: routes,
+ TXChecksumOffload: txChecksumOffload,
+ RXChecksumOffload: rxChecksumOffload,
+ NumChannels: numNetworkChannels,
+ QDisc: qDisc,
}
// Get the link for the interface.
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index e4ec16e2f..36bb0c9c9 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -30,6 +30,7 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/syndtr/gocapability/capability"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/control/client"
"gvisor.dev/gvisor/pkg/control/server"
"gvisor.dev/gvisor/pkg/log"
@@ -119,7 +120,7 @@ func New(conf *boot.Config, args *Args) (*Sandbox, error) {
s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
// The Cleanup object cleans up partially created sandboxes when an error
// occurs. Any errors occurring during cleanup itself are ignored.
- c := specutils.MakeCleanup(func() {
+ c := cleanup.Make(func() {
err := s.destroy()
log.Warningf("error destroying sandbox: %v", err)
})
@@ -477,9 +478,7 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
// If the console control socket file is provided, then create a new
// pty master/slave pair and set the TTY on the sandbox process.
- if args.ConsoleSocket != "" {
- cmd.Args = append(cmd.Args, "--console=true")
-
+ if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
// console.NewWithSocket will send the master on the given
// socket, and return the slave.
tty, err := console.NewWithSocket(args.ConsoleSocket)
@@ -1013,26 +1012,6 @@ func (s *Sandbox) StopCPUProfile() error {
return nil
}
-// GoroutineProfile writes a goroutine profile to the given file.
-func (s *Sandbox) GoroutineProfile(f *os.File) error {
- log.Debugf("Goroutine profile %q", s.ID)
- conn, err := s.sandboxConnect()
- if err != nil {
- return err
- }
- defer conn.Close()
-
- opts := control.ProfileOpts{
- FilePayload: urpc.FilePayload{
- Files: []*os.File{f},
- },
- }
- if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil {
- return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err)
- }
- return nil
-}
-
// BlockProfile writes a block profile to the given file.
func (s *Sandbox) BlockProfile(f *os.File) error {
log.Debugf("Block profile %q", s.ID)
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 4ccd77f63..43851a22f 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -17,7 +17,8 @@ go_library(
"//pkg/log",
"//pkg/sentry/kernel/auth",
"@com_github_cenkalti_backoff//:go_default_library",
- "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+ "@com_github_mohae_deepcopy//:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@com_github_syndtr_gocapability//capability:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
@@ -28,5 +29,5 @@ go_test(
size = "small",
srcs = ["specutils_test.go"],
library = ":specutils",
- deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"],
+ deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"],
)
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index 60bb7b7ee..23001d67c 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -18,6 +18,7 @@ import (
"fmt"
"os"
"os/exec"
+ "os/signal"
"path/filepath"
"runtime"
"syscall"
@@ -261,7 +262,18 @@ func MaybeRunAsRoot() error {
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
- if err := cmd.Run(); err != nil {
+ if err := cmd.Start(); err != nil {
+ return fmt.Errorf("re-executing self: %w", err)
+ }
+ ch := make(chan os.Signal, 1)
+ signal.Notify(ch)
+ go func() {
+ for {
+ // Forward all signals to child process.
+ cmd.Process.Signal(<-ch)
+ }
+ }()
+ if err := cmd.Wait(); err != nil {
if exit, ok := err.(*exec.ExitError); ok {
if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
os.Exit(ws.ExitStatus())
@@ -269,7 +281,7 @@ func MaybeRunAsRoot() error {
log.Warningf("No wait status provided, exiting with -1: %v", err)
os.Exit(-1)
}
- return fmt.Errorf("re-executing self: %v", err)
+ return err
}
// Child completed with success.
os.Exit(0)
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 202518b58..5015c3a84 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -29,6 +29,7 @@ import (
"time"
"github.com/cenkalti/backoff"
+ "github.com/mohae/deepcopy"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/bits"
@@ -44,20 +45,31 @@ var ExePath = "/proc/self/exe"
var Version = specs.Version
// LogSpec logs the spec in a human-friendly way.
-func LogSpec(spec *specs.Spec) {
- log.Debugf("Spec: %+v", spec)
- log.Debugf("Spec.Hooks: %+v", spec.Hooks)
- log.Debugf("Spec.Linux: %+v", spec.Linux)
- if spec.Linux != nil && spec.Linux.Resources != nil {
- res := spec.Linux.Resources
- log.Debugf("Spec.Linux.Resources.Memory: %+v", res.Memory)
- log.Debugf("Spec.Linux.Resources.CPU: %+v", res.CPU)
- log.Debugf("Spec.Linux.Resources.BlockIO: %+v", res.BlockIO)
- log.Debugf("Spec.Linux.Resources.Network: %+v", res.Network)
- }
- log.Debugf("Spec.Process: %+v", spec.Process)
- log.Debugf("Spec.Root: %+v", spec.Root)
- log.Debugf("Spec.Mounts: %+v", spec.Mounts)
+func LogSpec(orig *specs.Spec) {
+ if !log.IsLogging(log.Debug) {
+ return
+ }
+
+ // Strip down parts of the spec that are not interesting.
+ spec := deepcopy.Copy(orig).(*specs.Spec)
+ if spec.Process != nil {
+ spec.Process.Capabilities = nil
+ }
+ if spec.Linux != nil {
+ spec.Linux.Seccomp = nil
+ spec.Linux.MaskedPaths = nil
+ spec.Linux.ReadonlyPaths = nil
+ if spec.Linux.Resources != nil {
+ spec.Linux.Resources.Devices = nil
+ }
+ }
+
+ out, err := json.MarshalIndent(spec, "", " ")
+ if err != nil {
+ log.Debugf("Failed to marshal spec: %v", err)
+ return
+ }
+ log.Debugf("Spec:\n%s", out)
}
// ValidateSpec validates that the spec is compatible with runsc.
@@ -311,19 +323,7 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.
// Is9PMount returns true if the given mount can be mounted as an external gofer.
func Is9PMount(m specs.Mount) bool {
- var isBind bool
- switch m.Type {
- case "bind":
- isBind = true
- default:
- for _, opt := range m.Options {
- if opt == "bind" || opt == "rbind" {
- isBind = true
- break
- }
- }
- }
- return isBind && m.Source != "" && IsSupportedDevMount(m)
+ return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m)
}
// IsSupportedDevMount returns true if the mount is a supported /dev mount.
@@ -456,36 +456,6 @@ func ContainsStr(strs []string, str string) bool {
return false
}
-// Cleanup allows defers to be aborted when cleanup needs to happen
-// conditionally. Usage:
-// c := MakeCleanup(func() { f.Close() })
-// defer c.Clean() // any failure before release is called will close the file.
-// ...
-// c.Release() // on success, aborts closing the file and return it.
-// return f
-type Cleanup struct {
- clean func()
-}
-
-// MakeCleanup creates a new Cleanup object.
-func MakeCleanup(f func()) Cleanup {
- return Cleanup{clean: f}
-}
-
-// Clean calls the cleanup function.
-func (c *Cleanup) Clean() {
- if c.clean != nil {
- c.clean()
- c.clean = nil
- }
-}
-
-// Release releases the cleanup from its duties, i.e. cleanup function is not
-// called after this point.
-func (c *Cleanup) Release() {
- c.clean = nil
-}
-
// RetryEintr retries the function until an error different than EINTR is
// returned.
func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {