diff options
-rw-r--r-- | pkg/abi/linux/linux_abi_autogen_unsafe.go | 8 | ||||
-rw-r--r-- | runsc/boot/fs.go | 18 | ||||
-rw-r--r-- | runsc/boot/vfs.go | 130 | ||||
-rw-r--r-- | runsc/specutils/namespace.go | 16 |
4 files changed, 128 insertions, 44 deletions
diff --git a/pkg/abi/linux/linux_abi_autogen_unsafe.go b/pkg/abi/linux/linux_abi_autogen_unsafe.go index 07bf06264..26d27a419 100644 --- a/pkg/abi/linux/linux_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_abi_autogen_unsafe.go @@ -124,12 +124,12 @@ func (s *Statx) UnmarshalBytes(src []byte) { // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Statx) Packed() bool { - return s.Mtime.Packed() && s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() + return s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Statx) MarshalUnsafe(dst []byte) { - if s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { + if s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() { safecopy.CopyIn(dst, unsafe.Pointer(s)) } else { s.MarshalBytes(dst) @@ -148,7 +148,7 @@ func (s *Statx) UnmarshalUnsafe(src []byte) { // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (s *Statx) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { - if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { + if !s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() && s.Btime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes. buf := task.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. @@ -204,7 +204,7 @@ func (s *Statx) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { // WriteTo implements io.WriterTo.WriteTo. func (s *Statx) WriteTo(w io.Writer) (int64, error) { - if !s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() && s.Atime.Packed() { + if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 52f8344ca..b98a1eb50 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -63,7 +63,7 @@ const ( ) // tmpfs has some extra supported options that we must pass through. -var tmpfsAllowedOptions = []string{"mode", "uid", "gid"} +var tmpfsAllowedData = []string{"mode", "uid", "gid"} func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. @@ -154,8 +154,8 @@ func compileMounts(spec *specs.Spec) []specs.Mount { return mounts } -// p9MountOptions creates a slice of options for a p9 mount. -func p9MountOptions(fd int, fa FileAccessType, vfs2 bool) []string { +// p9MountData creates a slice of p9 mount data. +func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), @@ -235,7 +235,7 @@ func isSupportedMountFlag(fstype, opt string) bool { return true } if fstype == tmpfsvfs2.Name { - ok, err := parseMountOption(opt, tmpfsAllowedOptions...) + ok, err := parseMountOption(opt, tmpfsAllowedData...) return ok && err == nil } return false @@ -716,7 +716,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (* fd := c.fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) p9FS := mustFindFilesystem("9p") - opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */) + opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) if conf.OverlayfsStaleRead { // We can't check for overlayfs here because sandbox is chroot'ed and gofer @@ -770,7 +770,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( fsName = m.Type var err error - opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { return "", nil, false, err } @@ -778,7 +778,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) ( case bind: fd := c.fds.remove() fsName = gofervfs2.Name - opts = p9MountOptions(fd, c.getMountAccessType(m), conf.VFS2) + opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly @@ -931,7 +931,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn // Add root mount. fd := c.fds.remove() - opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */) + opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) mf := fs.MountSourceFlags{} if c.root.Readonly || conf.Overlay { @@ -1019,7 +1019,7 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M Destination: "/tmp", // Sticky bit is added to prevent accidental deletion of files from // another user. This is normally done for /tmp. - Options: []string{"mode=1777"}, + Options: []string{"mode=01777"}, } return c.mountSubmount(ctx, conf, mns, root, tmpMount) diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index f48e6b0f1..6c84f0794 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -136,7 +136,7 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - opts := strings.Join(p9MountOptions(fd, conf.FileAccess, true /* vfs2 */), ",") + opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",") log.Infof("Mounting root over 9P, ioFD: %d", fd) mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts}) @@ -160,8 +160,9 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, } } - // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go. - + if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil { + return fmt.Errorf(`mount submount "\tmp": %w`, err) + } return nil } @@ -199,8 +200,6 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { return mounts, nil } -// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 -// version. func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { root := mns.Root() defer root.DecRef() @@ -209,12 +208,11 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, Start: root, Path: fspath.Parse(submount.Destination), } - - fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount) + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) if err != nil { return fmt.Errorf("mountOptions failed: %w", err) } - if fsName == "" { + if len(fsName) == 0 { // Filesystem is not supported (e.g. cgroup), just skip it. return nil } @@ -222,17 +220,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { return err } - - opts := &vfs.MountOptions{ - GetFilesystemOptions: vfs.GetFilesystemOptions{ - Data: strings.Join(options, ","), - }, - InternalMount: true, - } - - // All writes go to upper, be paranoid and make lower readonly. - opts.ReadOnly = useOverlay - if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } @@ -242,13 +229,13 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, []string, bool, error) { +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { var ( - fsName string - opts []string - useOverlay bool + fsName string + data []string ) + // Find filesystem name and FS specific data field. switch m.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: fsName = m.Type @@ -258,21 +245,46 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF fsName = m.Type var err error - opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { - return "", nil, false, err + return "", nil, err } case bind: fsName = gofer.Name - opts = p9MountOptions(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) - // If configured, add overlay to all writable mounts. - useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) default: log.Warningf("ignoring unknown filesystem type %q", m.Type) } - return fsName, opts, useOverlay, nil + + opts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + }, + InternalMount: true, + } + + for _, o := range m.Options { + switch o { + case "rw": + opts.ReadOnly = false + case "ro": + opts.ReadOnly = true + case "noatime": + // TODO(gvisor.dev/issue/1193): Implement MS_NOATIME. + case "noexec": + opts.Flags.NoExec = true + default: + log.Warningf("ignoring unknown mount option %q", o) + } + } + + if conf.Overlay { + // All writes go to upper, be paranoid and make lower readonly. + opts.ReadOnly = true + } + return fsName, opts, nil } func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { @@ -301,3 +313,63 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s } return nil } + +// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. +// Technically we don't have to mount tmpfs at /tmp, as we could just rely on +// the host /tmp, but this is a nice optimization, and fixes some apps that call +// mknod in /tmp. It's unsafe to mount tmpfs if: +// 1. /tmp is mounted explicitly: we should not override user's wish +// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp +// +// Note that when there are submounts inside of '/tmp', directories for the +// mount points must be present, making '/tmp' not empty anymore. +func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { + for _, m := range c.mounts { + // m.Destination has been cleaned, so it's to use equality here. + if m.Destination == "/tmp" { + log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m) + return nil + } + } + + root := mns.Root() + defer root.DecRef() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse("/tmp"), + } + // TODO(gvisor.dev/issue/2782): Use O_PATH when available. + statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{}) + switch err { + case nil: + // Found '/tmp' in filesystem, check if it's empty. + if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory { + // Not a dir?! Leave it be. + return nil + } + if statx.Nlink > 2 { + // If more than "." and ".." is found, skip internal tmpfs to prevent + // hiding existing files. + log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) + return nil + } + log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) + fallthrough + + case syserror.ENOENT: + // No '/tmp' found (or fallthrough from above). It's safe to mount internal + // tmpfs. + tmpMount := specs.Mount{ + Type: tmpfs.Name, + Destination: "/tmp", + // Sticky bit is added to prevent accidental deletion of files from + // another user. This is normally done for /tmp. + Options: []string{"mode=01777"}, + } + return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + + default: + return fmt.Errorf(`stating "/tmp" inside container: %w`, err) + } +} diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go index 60bb7b7ee..23001d67c 100644 --- a/runsc/specutils/namespace.go +++ b/runsc/specutils/namespace.go @@ -18,6 +18,7 @@ import ( "fmt" "os" "os/exec" + "os/signal" "path/filepath" "runtime" "syscall" @@ -261,7 +262,18 @@ func MaybeRunAsRoot() error { cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { + if err := cmd.Start(); err != nil { + return fmt.Errorf("re-executing self: %w", err) + } + ch := make(chan os.Signal, 1) + signal.Notify(ch) + go func() { + for { + // Forward all signals to child process. + cmd.Process.Signal(<-ch) + } + }() + if err := cmd.Wait(); err != nil { if exit, ok := err.(*exec.ExitError); ok { if ws, ok := exit.Sys().(syscall.WaitStatus); ok { os.Exit(ws.ExitStatus()) @@ -269,7 +281,7 @@ func MaybeRunAsRoot() error { log.Warningf("No wait status provided, exiting with -1: %v", err) os.Exit(-1) } - return fmt.Errorf("re-executing self: %v", err) + return err } // Child completed with success. os.Exit(0) |