diff options
-rw-r--r-- | go.mod | 3 | ||||
-rw-r--r-- | pkg/sentry/fs/file.go | 24 | ||||
-rw-r--r-- | pkg/sentry/fs/gofer/socket.go | 5 | ||||
-rwxr-xr-x | pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go | 3 | ||||
-rwxr-xr-x | pkg/sentry/platform/ring0/defs_impl.go | 2 | ||||
-rw-r--r-- | pkg/sentry/socket/control/control.go | 12 | ||||
-rw-r--r-- | pkg/sentry/socket/unix/transport/unix.go | 4 | ||||
-rw-r--r-- | pkg/sentry/socket/unix/unix.go | 6 | ||||
-rwxr-xr-x | pkg/sentry/time/seqatomic_parameters.go | 3 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcp/accept.go | 5 | ||||
-rw-r--r-- | runsc/boot/controller.go | 16 | ||||
-rw-r--r-- | runsc/boot/fds.go | 3 | ||||
-rw-r--r-- | runsc/boot/fs.go | 727 | ||||
-rw-r--r-- | runsc/boot/loader.go | 73 | ||||
-rw-r--r-- | runsc/cmd/exec.go | 36 | ||||
-rw-r--r-- | runsc/cmd/wait.go | 4 | ||||
-rw-r--r-- | runsc/container/container.go | 8 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 7 |
18 files changed, 482 insertions, 459 deletions
@@ -1,8 +1,9 @@ module gvisor.googlesource.com/gvisor + go 1.12 require ( - github.com/cenkalti/backoff v2.1.1 + github.com/cenkalti/backoff v2.2.0 github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079 github.com/golang/mock v1.3.1 github.com/golang/protobuf v1.3.1 diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8c1307235..f64954457 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -545,12 +545,28 @@ type lockedWriter struct { // Write implements io.Writer.Write. func (w *lockedWriter) Write(buf []byte) (int, error) { - n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), w.File.offset) - return int(n), err + return w.WriteAt(buf, w.File.offset) } // WriteAt implements io.Writer.WriteAt. func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { - n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), offset) - return int(n), err + var ( + written int + err error + ) + // The io.Writer contract requires that Write writes all available + // bytes and does not return short writes. This causes errors with + // io.Copy, since our own Write interface does not have this same + // contract. Enforce that here. + for written < len(buf) { + var n int64 + n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written)) + if n > 0 { + written += int(n) + } + if err != nil { + break + } + } + return written, err } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index cbd5b9a84..7376fd76f 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -139,3 +139,8 @@ func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *syserr func (e *endpoint) Release() { e.inode.DecRef() } + +// Passcred implements transport.BoundEndpoint.Passcred. +func (e *endpoint) Passcred() bool { + return false +} diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go index 5b4ad3062..532b33b9e 100755 --- a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go +++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo.go @@ -2,11 +2,10 @@ package kernel import ( "fmt" + "gvisor.googlesource.com/gvisor/third_party/gvsync" "reflect" "strings" "unsafe" - - "gvisor.googlesource.com/gvisor/third_party/gvsync" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go index 71f8ea781..582553bc7 100755 --- a/pkg/sentry/platform/ring0/defs_impl.go +++ b/pkg/sentry/platform/ring0/defs_impl.go @@ -1,10 +1,10 @@ package ring0 import ( - "gvisor.googlesource.com/gvisor/pkg/cpuid" "syscall" "fmt" + "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "io" diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index c0238691d..434d7ca2e 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -406,12 +406,20 @@ func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { return nil } if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { - tcred := t.Credentials() - return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} + return MakeCreds(t) } return nil } +// MakeCreds creates default SCMCredentials. +func MakeCreds(t *kernel.Task) SCMCredentials { + if t == nil { + return nil + } + tcred := t.Credentials() + return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} +} + // New creates default control messages if needed. func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { return transport.ControlMessages{ diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index b734b4c20..37d82bb6b 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -237,6 +237,10 @@ type BoundEndpoint interface { // endpoint. UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error) + // Passcred returns whether or not the SO_PASSCRED socket option is + // enabled on this end. + Passcred() bool + // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 1414be0c6..388cc0d8b 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -385,6 +385,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } defer ep.Release() w.To = ep + + if ep.Passcred() && w.Control.Credentials == nil { + w.Control.Credentials = control.MakeCreds(t) + } } n, err := src.CopyInTo(t, &w) @@ -516,7 +520,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait { var from interface{} var fromLen uint32 - if r.From != nil { + if r.From != nil && len([]byte(r.From.Addr)) != 0 { from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) } diff --git a/pkg/sentry/time/seqatomic_parameters.go b/pkg/sentry/time/seqatomic_parameters.go index 54152a39a..f142c681a 100755 --- a/pkg/sentry/time/seqatomic_parameters.go +++ b/pkg/sentry/time/seqatomic_parameters.go @@ -2,11 +2,10 @@ package time import ( "fmt" + "gvisor.googlesource.com/gvisor/third_party/gvsync" "reflect" "strings" "unsafe" - - "gvisor.googlesource.com/gvisor/third_party/gvsync" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index d4b860975..31e365ae5 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -19,7 +19,6 @@ import ( "encoding/binary" "hash" "io" - "log" "sync" "time" @@ -307,7 +306,6 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header func (e *endpoint) incSynRcvdCount() bool { e.mu.Lock() - log.Printf("l: %d, c: %d, e.synRcvdCount: %d", len(e.acceptedChan), cap(e.acceptedChan), e.synRcvdCount) if l, c := len(e.acceptedChan), cap(e.acceptedChan); l == c && e.synRcvdCount >= c { e.mu.Unlock() return false @@ -333,17 +331,14 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { // Drop the SYN if the listen endpoint's accept queue is // overflowing. if e.incSynRcvdCount() { - log.Printf("processing syn packet") s.incRef() go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier. return } - log.Printf("dropping syn packet") e.stack.Stats().TCP.ListenOverflowSynDrop.Increment() e.stack.Stats().DroppedPackets.Increment() return } else { - // TODO(bhaskerh): Increment syncookie sent stat. cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) // Send SYN with window scaling because we currently // dont't encode this information in the cookie. diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 72ab9ef86..a277145b1 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -237,7 +237,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") } - err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) + err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) return err @@ -340,8 +340,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k // Set up the restore environment. - fds := &fdDispenser{fds: cm.l.goferFDs} - renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) + mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k) + renv, err := mntr.createRestoreEnvironment(cm.l.conf) if err != nil { return fmt.Errorf("creating RestoreEnvironment: %v", err) } @@ -369,11 +369,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { k.Timekeeper().SetClocks(time.NewCalibratedClocks()) // Since we have a new kernel we also must make a new watchdog. - watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) + dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) // Change the loader fields to reflect the changes made when restoring. cm.l.k = k - cm.l.watchdog = watchdog + cm.l.watchdog = dog cm.l.rootProcArgs = kernel.CreateProcessArgs{} cm.l.restore = true @@ -420,16 +420,12 @@ type WaitPIDArgs struct { // CID is the container ID. CID string - - // ClearStatus determines whether the exit status of the process should - // be cleared when WaitPID returns. - ClearStatus bool } // WaitPID waits for the process with PID 'pid' in the sandbox. func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { log.Debugf("containerManager.Wait") - return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus) + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) } // SignalDeliveryMode enumerates different signal delivery modes. diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 4e428b49c..0811e10f4 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -28,11 +28,12 @@ import ( // createFDMap creates an FD map that contains stdin, stdout, and stderr. If // console is true, then ioctl calls will be passed through to the host FD. // Upon success, createFDMap dups then closes stdioFDs. -func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { +func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) { if len(stdioFDs) != 3 { return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } + k := kernel.KernelFromContext(ctx) fdm := k.NewFDMap() defer fdm.DecRef() mounter := fs.FileOwnerFromContext(ctx) diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 4b1557b9a..939f2419c 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -29,9 +29,6 @@ import ( _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" - "gvisor.googlesource.com/gvisor/pkg/sentry/limits" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -40,6 +37,8 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/specutils" ) @@ -65,67 +64,24 @@ const ( nonefs = "none" ) -type fdDispenser struct { - fds []int -} - -func (f *fdDispenser) remove() int { - if f.empty() { - panic("fdDispenser out of fds") - } - rv := f.fds[0] - f.fds = f.fds[1:] - return rv -} - -func (f *fdDispenser) empty() bool { - return len(f.fds) == 0 -} +func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { + // Upper layer uses the same flags as lower, but it must be read-write. + upperFlags := lowerFlags + upperFlags.ReadOnly = false -func adjustDirentCache(k *kernel.Kernel) error { - var hl syscall.Rlimit - if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { - return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) - } - if int64(hl.Cur) != syscall.RLIM_INFINITY { - newSize := hl.Cur / 2 - if newSize < gofer.DefaultDirentCacheSize { - log.Infof("Setting gofer dirent cache size to %d", newSize) - gofer.DefaultDirentCacheSize = newSize - k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) - } + tmpFS := mustFindFilesystem("tmpfs") + if !fs.IsDir(lower.StableAttr) { + // Create overlay on top of mount file, e.g. /etc/hostname. + msrc := fs.NewCachingMountSource(tmpFS, upperFlags) + return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags) } - return nil -} - -// setupRootContainerFS creates a mount namespace containing the root filesystem -// and all mounts. 'rootCtx' is used to walk directories to find mount points. -// 'setMountNS' is called after namespace is created. It must set the mount NS -// to 'rootCtx'. -func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error { - mounts := compileMounts(spec) - - // Create a tmpfs mount where we create and mount a root filesystem for - // each child container. - mounts = append(mounts, specs.Mount{ - Type: tmpfs, - Destination: ChildContainersDir, - }) - fds := &fdDispenser{fds: goferFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) - if err != nil { - return fmt.Errorf("creating root mount: %v", err) - } - mns, err := fs.NewMountNamespace(userCtx, rootInode) + // Create overlay on top of mount dir. + upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil) if err != nil { - return fmt.Errorf("creating root mount namespace: %v", err) + return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } - setMountNS(mns) - - root := mns.Root() - defer root.DecRef() - return mountSubmounts(rootCtx, conf, mns, root, mounts, fds) + return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) } // compileMounts returns the supported mounts from the mount spec, adding any @@ -184,186 +140,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount { return mounts } -// createRootMount creates the root filesystem. -func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { - // First construct the filesystem from the spec.Root. - mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay} - - var ( - rootInode *fs.Inode - err error - ) - - fd := fds.remove() - log.Infof("Mounting root over 9P, ioFD: %d", fd) - p9FS := mustFindFilesystem("9p") - opts := p9MountOptions(fd, conf.FileAccess) - rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) - if err != nil { - return nil, fmt.Errorf("creating root mount point: %v", err) - } - - // We need to overlay the root on top of a ramfs with stub directories - // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always - // mounted even if they are not in the spec. - submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") - rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) - if err != nil { - return nil, fmt.Errorf("adding submount overlay: %v", err) - } - - if conf.Overlay && !spec.Root.Readonly { - log.Debugf("Adding overlay on top of root mount") - // Overlay a tmpfs filesystem on top of the root. - rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) - if err != nil { - return nil, err - } - } - - log.Infof("Mounted %q to %q type root", spec.Root.Path, "/") - return rootInode, nil -} - -func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { - // Upper layer uses the same flags as lower, but it must be read-write. - lowerFlags.ReadOnly = false - - tmpFS := mustFindFilesystem("tmpfs") - if !fs.IsDir(lower.StableAttr) { - // Create overlay on top of mount file, e.g. /etc/hostname. - msrc := fs.NewCachingMountSource(tmpFS, lowerFlags) - return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags) - } - - // Create overlay on top of mount dir. - upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil) - if err != nil { - return nil, fmt.Errorf("creating tmpfs overlay: %v", err) - } - return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) -} - -// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values -// used for mounts. -func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) { - var ( - fsName string - opts []string - useOverlay bool - err error - ) - - switch m.Type { - case devpts, devtmpfs, proc, sysfs: - fsName = m.Type - case nonefs: - fsName = sysfs - case tmpfs: - fsName = m.Type - - // tmpfs has some extra supported options that we must pass through. - opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - - case bind: - fd := fds.remove() - fsName = "9p" - // Non-root bind mounts are always shared. - opts = p9MountOptions(fd, FileAccessShared) - // If configured, add overlay to all writable mounts. - useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly - - default: - // TODO(nlacasse): Support all the mount types and make this a - // fatal error. Most applications will "just work" without - // them, so this is a warning for now. - // we do not support. - log.Warningf("ignoring unknown filesystem type %q", m.Type) - } - return fsName, opts, useOverlay, err -} - -func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error { - for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil { - return fmt.Errorf("mount submount %q: %v", m.Destination, err) - } - } - - if err := mountTmp(ctx, conf, mns, root, mounts); err != nil { - return fmt.Errorf("mount submount %q: %v", "tmp", err) - } - - if !fds.empty() { - return fmt.Errorf("not all mount points were consumed, remaining: %v", fds) - } - return nil -} - -// mountSubmount mounts volumes inside the container's root. Because mounts may -// be readonly, a lower ramfs overlay is added to create the mount point dir. -// Another overlay is added with tmpfs on top if Config.Overlay is true. -// 'm.Destination' must be an absolute path with '..' and symlinks resolved. -func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { - // Map mount type to filesystem name, and parse out the options that we are - // capable of dealing with. - fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) - - // Return the error or nil that corresponds to the default case in getMountNameAndOptions. - if err != nil { - return err - } - if fsName == "" { - return nil - } - - // All filesystem names should have been mapped to something we know. - filesystem := mustFindFilesystem(fsName) - - mf := mountFlags(m.Options) - if useOverlay { - // All writes go to upper, be paranoid and make lower readonly. - mf.ReadOnly = true - } - - inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) - if err != nil { - return fmt.Errorf("creating mount with source %q: %v", m.Source, err) - } - - // If there are submounts, we need to overlay the mount on top of a - // ramfs with stub directories for submount paths. - submounts := subtargets(m.Destination, mounts) - if len(submounts) > 0 { - log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) - if err != nil { - return fmt.Errorf("adding submount overlay: %v", err) - } - } - - if useOverlay { - log.Debugf("Adding overlay on top of mount %q", m.Destination) - inode, err = addOverlay(ctx, conf, inode, m.Type, mf) - if err != nil { - return err - } - } - - maxTraversals := uint(0) - dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) - if err != nil { - return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) - } - defer dirent.DecRef() - if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("mount %q error: %v", m.Destination, err) - } - - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) - return nil -} - // p9MountOptions creates a slice of options for a p9 mount. func p9MountOptions(fd int, fa FileAccessType) []string { opts := []string{ @@ -416,82 +192,6 @@ func mountDevice(m specs.Mount) string { return "none" } -// addRestoreMount adds a mount to the MountSources map used for restoring a -// checkpointed container. -func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { - fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) - - // Return the error or nil that corresponds to the default case in getMountNameAndOptions. - if err != nil { - return err - } - // TODO(nlacasse): Fix this when we support all the mount types and - // make this a fatal error. - if fsName == "" { - return nil - } - - newMount := fs.MountArgs{ - Dev: mountDevice(m), - Flags: mountFlags(m.Options), - DataString: strings.Join(opts, ","), - } - if useOverlay { - newMount.Flags.ReadOnly = true - } - renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) - log.Infof("Added mount at %q: %+v", fsName, newMount) - return nil -} - -// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts -// to the environment. -func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) { - renv := &fs.RestoreEnvironment{ - MountSources: make(map[string][]fs.MountArgs), - } - - // Add root mount. - fd := fds.remove() - opts := p9MountOptions(fd, conf.FileAccess) - - mf := fs.MountSourceFlags{} - if spec.Root.Readonly || conf.Overlay { - mf.ReadOnly = true - } - - rootMount := fs.MountArgs{ - Dev: rootDevice, - Flags: mf, - DataString: strings.Join(opts, ","), - } - renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) - - // Add submounts. - var tmpMounted bool - for _, m := range compileMounts(spec) { - if err := addRestoreMount(conf, renv, m, fds); err != nil { - return nil, err - } - if filepath.Clean(m.Destination) == "/tmp" { - tmpMounted = true - } - } - - // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). - if !tmpMounted { - tmpMount := specs.Mount{ - Type: tmpfs, - Destination: "/tmp", - } - if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil { - return nil, err - } - } - - return renv, nil -} - func mountFlags(opts []string) fs.MountSourceFlags { mf := fs.MountSourceFlags{} for _, o := range opts { @@ -546,22 +246,83 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -// setupContainerFS is used to set up the file system and amend the procArgs accordingly. -// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs. -func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { - ctx := procArgs.NewContext(k) - - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. - fdm, err := createFDMap(ctx, k, ls, console, stdioFDs) +// setExecutablePath sets the procArgs.Filename by searching the PATH for an +// executable matching the procArgs.Argv[0]. +func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { + paths := fs.GetPath(procArgs.Envv) + exe := procArgs.Argv[0] + f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) if err != nil { - return fmt.Errorf("importing fds: %v", err) + return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err) + } + procArgs.Filename = f + return nil +} + +func adjustDirentCache(k *kernel.Kernel) error { + var hl syscall.Rlimit + if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { + return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) + } + if int64(hl.Cur) != syscall.RLIM_INFINITY { + newSize := hl.Cur / 2 + if newSize < gofer.DefaultDirentCacheSize { + log.Infof("Setting gofer dirent cache size to %d", newSize) + gofer.DefaultDirentCacheSize = newSize + k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) + } } + return nil +} - // CreateProcess takes a reference on FDMap if successful. We - // won't need ours either way. - procArgs.FDMap = fdm +type fdDispenser struct { + fds []int +} + +func (f *fdDispenser) remove() int { + if f.empty() { + panic("fdDispenser out of fds") + } + rv := f.fds[0] + f.fds = f.fds[1:] + return rv +} +func (f *fdDispenser) empty() bool { + return len(f.fds) == 0 +} + +type containerMounter struct { + // cid is the container ID. May be set to empty for the root container. + cid string + + root *specs.Root + + // mounts is the set of submounts for the container. It's a copy from the spec + // that may be freely modified without affecting the original spec. + mounts []specs.Mount + + // fds is the list of FDs to be dispensed for mounts that require it. + fds fdDispenser + + k *kernel.Kernel +} + +func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel) *containerMounter { + return &containerMounter{ + cid: cid, + root: spec.Root, + mounts: compileMounts(spec), + fds: fdDispenser{fds: goferFDs}, + k: k, + } +} + +// setupFS is used to set up the file system for containers and amend +// the procArgs accordingly. This is the main entry point for this rest of +// functions in this file. procArgs are passed by reference and the FDMap field +// is modified. It dups stdioFDs. +func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error { // Use root user to configure mounts. The current user might not have // permission to do so. rootProcArgs := kernel.CreateProcessArgs{ @@ -570,16 +331,19 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf Umask: 0022, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, } - rootCtx := rootProcArgs.NewContext(k) + rootCtx := rootProcArgs.NewContext(c.k) // If this is the root container, we also need to setup the root mount // namespace. - mns := k.RootMountNamespace() + mns := c.k.RootMountNamespace() if mns == nil { // Setup the root container. - return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) { - k.SetRootMountNamespace(mns) - }) + if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) { + c.k.SetRootMountNamespace(mns) + }); err != nil { + return err + } + return c.checkDispenser() } // Setup a child container. @@ -593,18 +357,17 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf if err != nil { return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err) } - if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("create directory %q: %v", cid, err) + if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("create directory %q: %v", c.cid, err) } - containerRoot, err := contDir.Walk(ctx, globalRoot, cid) + containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid) if err != nil { - return fmt.Errorf("walk to %q failed: %v", cid, err) + return fmt.Errorf("walk to %q failed: %v", c.cid, err) } defer containerRoot.DecRef() // Create the container's root filesystem mount. - fds := &fdDispenser{fds: goferFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + rootInode, err := c.createRootMount(rootCtx, conf) if err != nil { return fmt.Errorf("creating filesystem for container: %v", err) } @@ -614,39 +377,32 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf return fmt.Errorf("mount container root: %v", err) } - // We have to re-walk to the dirent to find the mounted - // directory. The old dirent is invalid at this point. - containerRoot, err = contDir.Walk(ctx, globalRoot, cid) + // We have to re-walk to the dirent to find the mounted directory. The old + // dirent is invalid at this point. + containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid) if err != nil { - return fmt.Errorf("find container mount point %q: %v", cid, err) + return fmt.Errorf("find container mount point %q: %v", c.cid, err) } cu := specutils.MakeCleanup(func() { containerRoot.DecRef() }) defer cu.Clean() - log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid)) + log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid)) // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it. procArgs.Root = containerRoot // Mount all submounts. - mounts := compileMounts(spec) - if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil { + if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil { return err } cu.Release() - return nil + return c.checkDispenser() } -// setExecutablePath sets the procArgs.Filename by searching the PATH for an -// executable matching the procArgs.Argv[0]. -func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { - paths := fs.GetPath(procArgs.Envv) - exe := procArgs.Argv[0] - f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) - if err != nil { - return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err) +func (c *containerMounter) checkDispenser() error { + if !c.fds.empty() { + return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds) } - procArgs.Filename = f return nil } @@ -715,6 +471,261 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error return nil } +// setupRootContainer creates a mount namespace containing the root filesystem +// and all mounts. 'rootCtx' is used to walk directories to find mount points. +// 'setMountNS' is called after namespace is created. It must set the mount NS +// to 'rootCtx'. +func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error { + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. + c.mounts = append(c.mounts, specs.Mount{ + Type: tmpfs, + Destination: ChildContainersDir, + }) + + rootInode, err := c.createRootMount(rootCtx, conf) + if err != nil { + return fmt.Errorf("creating root mount: %v", err) + } + mns, err := fs.NewMountNamespace(userCtx, rootInode) + if err != nil { + return fmt.Errorf("creating root mount namespace: %v", err) + } + setMountNS(mns) + + root := mns.Root() + defer root.DecRef() + return c.mountSubmounts(rootCtx, conf, mns, root) +} + +// createRootMount creates the root filesystem. +func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) { + // First construct the filesystem from the spec.Root. + mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} + + var ( + rootInode *fs.Inode + err error + ) + + fd := c.fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + p9FS := mustFindFilesystem("9p") + opts := p9MountOptions(fd, conf.FileAccess) + rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) + if err != nil { + return nil, fmt.Errorf("creating root mount point: %v", err) + } + + // We need to overlay the root on top of a ramfs with stub directories + // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always + // mounted even if they are not in the spec. + submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + if err != nil { + return nil, fmt.Errorf("adding submount overlay: %v", err) + } + + if conf.Overlay && !c.root.Readonly { + log.Debugf("Adding overlay on top of root mount") + // Overlay a tmpfs filesystem on top of the root. + rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + if err != nil { + return nil, err + } + } + + log.Infof("Mounted %q to %q type root", c.root.Path, "/") + return rootInode, nil +} + +// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + err error + ) + + switch m.Type { + case devpts, devtmpfs, proc, sysfs: + fsName = m.Type + case nonefs: + fsName = sysfs + case tmpfs: + fsName = m.Type + + // tmpfs has some extra supported options that we must pass through. + opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + + case bind: + fd := c.fds.remove() + fsName = "9p" + // Non-root bind mounts are always shared. + opts = p9MountOptions(fd, FileAccessShared) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + // TODO(nlacasse): Support all the mount types and make this a fatal error. + // Most applications will "just work" without them, so this is a warning + // for now. + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, err +} + +func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { + for _, m := range c.mounts { + if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { + return fmt.Errorf("mount submount %q: %v", m.Destination, err) + } + } + + if err := c.mountTmp(ctx, conf, mns, root); err != nil { + return fmt.Errorf("mount submount %q: %v", "tmp", err) + } + return nil +} + +// mountSubmount mounts volumes inside the container's root. Because mounts may +// be readonly, a lower ramfs overlay is added to create the mount point dir. +// Another overlay is added with tmpfs on top if Config.Overlay is true. +// 'm.Destination' must be an absolute path with '..' and symlinks resolved. +func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(m.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + + inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) + if err != nil { + return fmt.Errorf("creating mount with source %q: %v", m.Source, err) + } + + // If there are submounts, we need to overlay the mount on top of a ramfs + // with stub directories for submount paths. + submounts := subtargets(m.Destination, c.mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("adding submount overlay: %v", err) + } + } + + if useOverlay { + log.Debugf("Adding overlay on top of mount %q", m.Destination) + inode, err = addOverlay(ctx, conf, inode, m.Type, mf) + if err != nil { + return err + } + } + + maxTraversals := uint(0) + dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) + if err != nil { + return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) + } + defer dirent.DecRef() + if err := mns.Mount(ctx, dirent, inode); err != nil { + return fmt.Errorf("mount %q error: %v", m.Destination, err) + } + + log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + return nil +} + +// addRestoreMount adds a mount to the MountSources map used for restoring a +// checkpointed container. +func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error { + fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) + if err != nil { + return err + } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + newMount := fs.MountArgs{ + Dev: mountDevice(m), + Flags: mountFlags(m.Options), + DataString: strings.Join(opts, ","), + } + if useOverlay { + newMount.Flags.ReadOnly = true + } + renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) + log.Infof("Added mount at %q: %+v", fsName, newMount) + return nil +} + +// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts +// to the environment. +func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) { + renv := &fs.RestoreEnvironment{ + MountSources: make(map[string][]fs.MountArgs), + } + + // Add root mount. + fd := c.fds.remove() + opts := p9MountOptions(fd, conf.FileAccess) + + mf := fs.MountSourceFlags{} + if c.root.Readonly || conf.Overlay { + mf.ReadOnly = true + } + + rootMount := fs.MountArgs{ + Dev: rootDevice, + Flags: mf, + DataString: strings.Join(opts, ","), + } + renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount) + + // Add submounts. + var tmpMounted bool + for _, m := range c.mounts { + if err := c.addRestoreMount(conf, renv, m); err != nil { + return nil, err + } + if filepath.Clean(m.Destination) == "/tmp" { + tmpMounted = true + } + } + + // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). + if !tmpMounted { + tmpMount := specs.Mount{ + Type: tmpfs, + Destination: "/tmp", + } + if err := c.addRestoreMount(conf, renv, tmpMount); err != nil { + return nil, err + } + } + + return renv, nil +} + // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. // Technically we don't have to mount tmpfs at /tmp, as we could just rely on // the host /tmp, but this is a nice optimization, and fixes some apps that call @@ -724,8 +735,8 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. -func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error { - for _, m := range mounts { +func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error { + for _, m := range c.mounts { if filepath.Clean(m.Destination) == "/tmp" { log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) return nil @@ -766,7 +777,7 @@ func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *f // another user. This is normally done for /tmp. Options: []string{"mode=1777"}, } - return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts) + return c.mountSubmount(ctx, conf, mns, root, tmpMount) default: return err diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 6ac6b94dd..a997776f8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -288,7 +288,7 @@ func New(args Args) (*Loader, error) { } // Create a watchdog. - watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) + dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction) procArgs, err := newProcess(args.ID, args.Spec, creds, k) if err != nil { @@ -304,7 +304,7 @@ func New(args Args) (*Loader, error) { k: k, conf: args.Conf, console: args.Console, - watchdog: watchdog, + watchdog: dog, spec: args.Spec, goferFDs: args.GoferFDs, stdioFDs: args.StdioFDs, @@ -432,7 +432,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } -// Run runs the root container.. +// Run runs the root container. func (l *Loader) Run() error { err := l.run() l.ctrl.manager.startResultChan <- err @@ -486,17 +486,21 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { - if err := setupContainerFS( - &l.rootProcArgs, - l.spec, - l.conf, - l.stdioFDs, - l.goferFDs, - l.console, - l.rootProcArgs.Credentials, - l.rootProcArgs.Limits, - l.k, - "" /* CID, which isn't needed for the root container */); err != nil { + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. + ctx := l.rootProcArgs.NewContext(l.k) + fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on FDMap if successful. We won't need + // ours either way. + l.rootProcArgs.FDMap = fdm + + // cid for root container can be empty. Only subcontainers need it to set + // the mount location. + mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k) + if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil { return err } @@ -552,7 +556,7 @@ func (l *Loader) createContainer(cid string) error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. Caller owns 'files' and may close them after // this method returns. -func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { +func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -596,6 +600,16 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config stdioFDs = append(stdioFDs, int(f.Fd())) } + // Create the FD map, which will set stdin, stdout, and stderr. + ctx := procArgs.NewContext(l.k) + fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs) + if err != nil { + return fmt.Errorf("importing fds: %v", err) + } + // CreateProcess takes a reference on FDMap if successful. We won't need ours + // either way. + procArgs.FDMap = fdm + // Can't take ownership away from os.File. dup them to get a new FDs. var goferFDs []int for _, f := range files[3:] { @@ -606,22 +620,12 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config goferFDs = append(goferFDs, fd) } - if err := setupContainerFS( - &procArgs, - spec, - conf, - stdioFDs, - goferFDs, - false, - creds, - procArgs.Limits, - k, - cid); err != nil { + mntr := newContainerMounter(spec, cid, goferFDs, l.k) + if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil { return fmt.Errorf("configuring container FS: %v", err) } - ctx := procArgs.NewContext(l.k) - mns := k.RootMountNamespace() + mns := l.k.RootMountNamespace() if err := setExecutablePath(ctx, mns, &procArgs); err != nil { return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) } @@ -724,7 +728,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { return nil } -func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error { +func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { if tgid <= 0 { return fmt.Errorf("PID (%d) must be positive", tgid) } @@ -736,13 +740,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai ws := l.wait(execTG) *waitStatus = ws - // Remove tg from the cache if caller requested it. - if clearStatus { - l.mu.Lock() - delete(l.processes, eid) - log.Debugf("updated processes (removal): %v", l.processes) - l.mu.Unlock() - } + l.mu.Lock() + delete(l.processes, eid) + log.Debugf("updated processes (removal): %v", l.processes) + l.mu.Unlock() return nil } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 52fd7ac4b..8cd070e61 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -40,8 +40,6 @@ import ( "gvisor.googlesource.com/gvisor/runsc/specutils" ) -const privateClearStatusFlag = "private-clear-status" - // Exec implements subcommands.Command for the "exec" command. type Exec struct { cwd string @@ -51,7 +49,6 @@ type Exec struct { extraKGIDs stringSlice caps stringSlice detach bool - clearStatus bool processPath string pidFile string internalPidFile string @@ -103,10 +100,6 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) { f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") - - // This flag clears the status of the exec'd process upon completion. It is - // only used when we fork due to --detach being set on the parent. - f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use") } // Execute implements subcommands.Command.Execute. It starts a process in an @@ -156,7 +149,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Start the new process and get it pid. pid, err := c.Execute(e) if err != nil { - Fatalf("getting processes for container: %v", err) + Fatalf("executing processes for container: %v", err) } if e.StdioIsPty { @@ -184,7 +177,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } // Wait for the process to exit. - ws, err := c.WaitPID(pid, ex.clearStatus) + ws, err := c.WaitPID(pid) if err != nil { Fatalf("waiting on pid %d: %v", pid, err) } @@ -193,8 +186,12 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus { - binPath := specutils.ExePath var args []string + for _, a := range os.Args[1:] { + if !strings.Contains(a, "detach") { + args = append(args, a) + } + } // The command needs to write a pid file so that execAndWait can tell // when it has started. If no pid-file was provided, we should use a @@ -210,19 +207,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat args = append(args, "--pid-file="+pidFile) } - // Add the rest of the args, excluding the "detach" flag. - for _, a := range os.Args[1:] { - if strings.Contains(a, "detach") { - // Replace with the "private-clear-status" flag, which tells - // the new process it's a detached child and shouldn't - // clear the exit status of the sentry process. - args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag)) - } else { - args = append(args, a) - } - } - - cmd := exec.Command(binPath, args...) + cmd := exec.Command(specutils.ExePath, args...) cmd.Args[0] = "runsc-exec" // Exec stdio defaults to current process stdio. @@ -233,8 +218,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat // If the console control socket file is provided, then create a new // pty master/slave pair and set the TTY on the sandbox process. if ex.consoleSocket != "" { - // Create a new TTY pair and send the master on the provided - // socket. + // Create a new TTY pair and send the master on the provided socket. tty, err := console.NewWithSocket(ex.consoleSocket) if err != nil { Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err) @@ -256,7 +240,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat Fatalf("failure to start child exec process, err: %v", err) } - log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args) + log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args) // Wait for PID file to ensure that child process has started. Otherwise, // '--process' file is deleted as soon as this process returns and the child diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go index a55a682f3..58fd01974 100644 --- a/runsc/cmd/wait.go +++ b/runsc/cmd/wait.go @@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) waitStatus = ws // Wait on a PID in the root PID namespace. case wt.rootPID != unsetPID: - ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */) + ws, err := c.WaitRootPID(int32(wt.rootPID)) if err != nil { Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) } waitStatus = ws // Wait on a PID in the container's PID namespace. case wt.pid != unsetPID: - ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */) + ws, err := c.WaitPID(int32(wt.pid)) if err != nil { Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 513085836..04b611b56 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -530,22 +530,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. -func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { +func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } - return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus) + return c.Sandbox.WaitPID(c.Sandbox.ID, pid) } // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. -func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) { +func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on PID %d in container %q", pid, c.ID) if !c.isSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } - return c.Sandbox.WaitPID(c.ID, pid, clearStatus) + return c.Sandbox.WaitPID(c.ID, pid) } // SignalContainer sends the signal to the container. If all is true and signal diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 47a66afb2..032190636 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -649,7 +649,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { // WaitPID waits for process 'pid' in the container's sandbox and returns its // WaitStatus. -func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) { +func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws syscall.WaitStatus conn, err := s.sandboxConnect() @@ -659,9 +659,8 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait defer conn.Close() args := &boot.WaitPIDArgs{ - PID: pid, - CID: cid, - ClearStatus: clearStatus, + PID: pid, + CID: cid, } if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err) |