Diffstat (limited to 'runsc')
-rw-r--r--  runsc/boot/controller.go   | 51
-rw-r--r--  runsc/boot/fs.go           | 12
-rw-r--r--  runsc/boot/loader.go       | 53
-rw-r--r--  runsc/boot/loader_test.go  |  8
-rw-r--r--  runsc/boot/vfs.go          | 39
-rw-r--r--  runsc/cmd/debug.go         | 40
-rw-r--r--  runsc/fsgofer/fsgofer.go   |  9
-rw-r--r--  runsc/sandbox/sandbox.go   | 20
8 files changed, 117 insertions, 115 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 3e5e4c22f..626a3816e 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -101,14 +101,13 @@ const (
 
 // Profiling related commands (see pprof.go for more details).
 const (
-	StartCPUProfile  = "Profile.StartCPUProfile"
-	StopCPUProfile   = "Profile.StopCPUProfile"
-	HeapProfile      = "Profile.HeapProfile"
-	GoroutineProfile = "Profile.GoroutineProfile"
-	BlockProfile     = "Profile.BlockProfile"
-	MutexProfile     = "Profile.MutexProfile"
-	StartTrace       = "Profile.StartTrace"
-	StopTrace        = "Profile.StopTrace"
+	StartCPUProfile = "Profile.StartCPUProfile"
+	StopCPUProfile  = "Profile.StopCPUProfile"
+	HeapProfile     = "Profile.HeapProfile"
+	BlockProfile    = "Profile.BlockProfile"
+	MutexProfile    = "Profile.MutexProfile"
+	StartTrace      = "Profile.StartTrace"
+	StopTrace       = "Profile.StopTrace"
 )
 
 // Logging related commands (see logging.go for more details).
@@ -129,42 +128,52 @@ type controller struct {
 
 	// manager holds the containerManager methods.
 	manager *containerManager
+
+	// pprof holds the profile instance if enabled. It may be nil.
+	pprof *control.Profile
 }
 
 // newController creates a new controller. The caller must call
 // controller.srv.StartServing() to start the controller.
 func newController(fd int, l *Loader) (*controller, error) {
-	srv, err := server.CreateFromFD(fd)
+	ctrl := &controller{}
+	var err error
+	ctrl.srv, err = server.CreateFromFD(fd)
 	if err != nil {
 		return nil, err
 	}
 
-	manager := &containerManager{
+	ctrl.manager = &containerManager{
 		startChan:       make(chan struct{}),
 		startResultChan: make(chan error),
 		l:               l,
 	}
-	srv.Register(manager)
+	ctrl.srv.Register(ctrl.manager)
 
 	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
-		srv.Register(net)
+		ctrl.srv.Register(net)
 	}
 
-	srv.Register(&debug{})
-	srv.Register(&control.Logging{})
+	ctrl.srv.Register(&debug{})
+	ctrl.srv.Register(&control.Logging{})
+
 	if l.root.conf.ProfileEnable {
-		srv.Register(&control.Profile{
-			Kernel: l.k,
-		})
+		ctrl.pprof = &control.Profile{Kernel: l.k}
+		ctrl.srv.Register(ctrl.pprof)
 	}
 
-	return &controller{
-		srv:     srv,
-		manager: manager,
-	}, nil
+	return ctrl, nil
+}
+
+func (c *controller) stop() {
+	if c.pprof != nil {
+		// These are noop if there is nothing being profiled.
+		_ = c.pprof.StopCPUProfile(nil, nil)
+		_ = c.pprof.StopTrace(nil, nil)
+	}
 }
 
 // containerManager manages sandbox containers.
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 59639ba19..9dd5b0184 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -640,7 +640,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi
 
 func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	for _, m := range c.mounts {
 		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
@@ -868,7 +868,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
 	}
-	defer dirent.DecRef()
+	defer dirent.DecRef(ctx)
 	if err := mns.Mount(ctx, dirent, inode); err != nil {
 		return fmt.Errorf("mount %q error: %v", m.Destination, err)
 	}
@@ -889,12 +889,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun
 	if err != nil {
 		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
 	}
-	defer target.DecRef()
+	defer target.DecRef(ctx)
 
 	// Take a ref on the inode that is about to be (re)-mounted.
 	source.root.IncRef()
 	if err := mns.Mount(ctx, target, source.root); err != nil {
-		source.root.DecRef()
+		source.root.DecRef(ctx)
 		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
 	}
 
@@ -997,12 +997,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
 	switch err {
 	case nil:
 		// Found '/tmp' in filesystem, check if it's empty.
-		defer tmp.DecRef()
+		defer tmp.DecRef(ctx)
 		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
 		if err != nil {
 			return err
 		}
-		defer f.DecRef()
+		defer f.DecRef(ctx)
 		serializer := &fs.CollectEntriesSerializer{}
 		if err := f.Readdir(ctx, serializer); err != nil {
 			return err
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 9cd9c5909..533b9c5e7 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,12 +16,12 @@
 package boot
 
 import (
+	"errors"
 	"fmt"
 	mrand "math/rand"
 	"os"
 	"runtime"
 	"sync/atomic"
-	"syscall"
 	gtime "time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -187,7 +187,7 @@ type Args struct {
 }
 
 // make sure stdioFDs are always the same on initial start and on restore
-const startingStdioFD = 64
+const startingStdioFD = 256
 
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
@@ -346,7 +346,7 @@ func New(args Args) (*Loader, error) {
 		if err != nil {
 			return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
 		}
-		defer hostFilesystem.DecRef()
+		defer hostFilesystem.DecRef(k.SupervisorContext())
 		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
 		if err != nil {
 			return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
 		}
@@ -360,15 +360,20 @@ func New(args Args) (*Loader, error) {
 	var stdioFDs []int
 	newfd := startingStdioFD
 	for _, fd := range args.StdioFDs {
-		err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
+		// Check that newfd is unused to avoid clobbering over it.
+		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
+			if err != nil {
+				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
+			}
+			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
+		}
+
+		err := unix.Dup3(fd, newfd, unix.O_CLOEXEC)
 		if err != nil {
 			return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
 		}
 		stdioFDs = append(stdioFDs, newfd)
-		err = syscall.Close(fd)
-		if err != nil {
-			return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
-		}
+		_ = unix.Close(fd)
 		newfd++
 	}
 
@@ -458,6 +463,11 @@ func (l *Loader) Destroy() {
 		l.stopSignalForwarding()
 	}
 	l.watchdog.Stop()
+
+	for i, fd := range l.root.stdioFDs {
+		_ = unix.Close(fd)
+		l.root.stdioFDs[i] = -1
+	}
 }
 
 func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
@@ -591,11 +601,9 @@ func (l *Loader) run() error {
 	// during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
 	// passed FDs, so only close for VFS1.
 	if !kernel.VFS2Enabled {
-		for _, fd := range l.root.stdioFDs {
-			err := syscall.Close(fd)
-			if err != nil {
-				return fmt.Errorf("close dup()ed stdioFDs: %v", err)
-			}
+		for i, fd := range l.root.stdioFDs {
+			_ = unix.Close(fd)
+			l.root.stdioFDs[i] = -1
 		}
 	}
 
@@ -686,7 +694,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 
 	// Can't take ownership away from os.File. dup them to get a new FDs.
 	for _, f := range files[3:] {
-		fd, err := syscall.Dup(int(f.Fd()))
+		fd, err := unix.Dup(int(f.Fd()))
 		if err != nil {
 			return fmt.Errorf("failed to dup file: %v", err)
 		}
@@ -755,7 +763,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 		return nil, fmt.Errorf("creating process: %v", err)
 	}
 	// CreateProcess takes a reference on FDTable if successful.
-	info.procArgs.FDTable.DecRef()
+	info.procArgs.FDTable.DecRef(ctx)
 
 	// Set the foreground process group on the TTY to the global init process
 	// group, since that is what we are about to start running.
@@ -890,22 +898,20 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 
 	// Add the HOME environment variable if it is not already set.
 	if kernel.VFS2Enabled {
-		defer args.MountNamespaceVFS2.DecRef()
-		root := args.MountNamespaceVFS2.Root()
-		defer root.DecRef()
 		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespaceVFS2.DecRef(ctx)
+		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
 		}
 		args.Envv = envv
 	} else {
-		defer args.MountNamespace.DecRef()
-		root := args.MountNamespace.Root()
-		defer root.DecRef()
 		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+		defer args.MountNamespace.DecRef(ctx)
+		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
@@ -1002,6 +1008,9 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
 	// Wait for container.
 	l.k.WaitExited()
 
+	// Cleanup
+	l.ctrl.stop()
+
 	return l.k.GlobalInit().ExitStatus()
 }
 
@@ -1263,7 +1272,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F
 	fdTable := k.NewFDTable()
 	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
 	if err != nil {
-		fdTable.DecRef()
+		fdTable.DecRef(ctx)
 		return nil, nil, nil, err
 	}
 	return fdTable, ttyFile, ttyFileVFS2, nil
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 8e6fe57e1..aa3fdf96c 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 
 			root := mns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				maxTraversals := uint(0)
 				if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil {
 					t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
 				} else {
 					d.DecRef()
-					d.DecRef()
+					d.DecRef(ctx)
 				}
 			}
 		})
@@ -491,7 +491,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			}
 
 			root := mns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				target := &vfs.PathOperation{
 					Root:  root,
@@ -502,7 +502,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil {
 				t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err)
 			} else {
-				d.DecRef()
+				d.DecRef(ctx)
 			}
 		}
 	})
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 9a1ed8e9e..e7d6035bb 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -89,6 +90,12 @@ func registerFilesystems(k *kernel.Kernel) error {
 	if err := ttydev.Register(vfsObj); err != nil {
 		return fmt.Errorf("registering ttydev: %w", err)
 	}
+	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
+	if tunSupported {
+		if err := tundev.Register(vfsObj); err != nil {
+			return fmt.Errorf("registering tundev: %v", err)
+		}
+	}
 
 	if kernel.FUSEEnabled {
 		if err := fuse.Register(vfsObj); err != nil {
@@ -96,14 +103,11 @@ func registerFilesystems(k *kernel.Kernel) error {
 		}
 	}
 
-	if err := tundev.Register(vfsObj); err != nil {
-		return fmt.Errorf("registering tundev: %v", err)
-	}
 	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
 	if err != nil {
 		return fmt.Errorf("creating devtmpfs accessor: %w", err)
 	}
-	defer a.Release()
+	defer a.Release(ctx)
 
 	if err := a.UserspaceInit(ctx); err != nil {
 		return fmt.Errorf("initializing userspace: %w", err)
@@ -114,8 +118,10 @@ func registerFilesystems(k *kernel.Kernel) error {
 	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
 		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
 	}
-	if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
-		return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+	if tunSupported {
+		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
+			return fmt.Errorf("creating tundev devtmpfs files: %v", err)
+		}
 	}
 
 	if kernel.FUSEEnabled {
@@ -171,10 +177,19 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
 
 func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
 	fd := c.fds.remove()
-	opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
+	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+
+	if conf.OverlayfsStaleRead {
+		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
+		// can only send mount options for specs.Mounts (specs.Root is missing
+		// Options field). So assume root is always on top of overlayfs.
+		opts = append(opts, "overlayfs_stale_read")
+	}
 
 	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
+	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
+		Data: strings.Join(opts, ","),
+	})
 	if err != nil {
 		return nil, fmt.Errorf("setting up mount namespace: %w", err)
 	}
@@ -243,7 +258,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
 
 func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root:  root,
 		Start: root,
@@ -378,7 +393,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds
 	}
 
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 	pop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
@@ -472,10 +487,10 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Co
 	if err != nil {
 		return err
 	}
-	defer newMnt.DecRef()
+	defer newMnt.DecRef(ctx)
 
 	root := mns.Root()
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 	if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil {
 		return err
 	}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index b5de2588b..742f8c344 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,20 +32,19 @@ import (
 
 // Debug implements subcommands.Command for the "debug" command.
 type Debug struct {
-	pid              int
-	stacks           bool
-	signal           int
-	profileHeap      string
-	profileCPU       string
-	profileGoroutine string
-	profileBlock     string
-	profileMutex     string
-	trace            string
-	strace           string
-	logLevel         string
-	logPackets       string
-	duration         time.Duration
-	ps               bool
+	pid          int
+	stacks       bool
+	signal       int
+	profileHeap  string
+	profileCPU   string
+	profileBlock string
+	profileMutex string
+	trace        string
+	strace       string
+	logLevel     string
+	logPackets   string
+	duration     time.Duration
+	ps           bool
 }
 
 // Name implements subcommands.Command.
@@ -69,7 +68,6 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
 	f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
 	f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
 	f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
-	f.StringVar(&d.profileGoroutine, "profile-goroutine", "", "writes goroutine profile to the given file.")
 	f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
 	f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
 	f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
@@ -153,18 +151,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 		log.Infof("Heap profile written to %q", d.profileHeap)
 	}
-	if d.profileGoroutine != "" {
-		f, err := os.Create(d.profileGoroutine)
-		if err != nil {
-			return Errorf(err.Error())
-		}
-		defer f.Close()
-
-		if err := c.Sandbox.GoroutineProfile(f); err != nil {
-			return Errorf(err.Error())
-		}
-		log.Infof("Goroutine profile written to %q", d.profileGoroutine)
-	}
 	if d.profileBlock != "" {
 		f, err := os.Create(d.profileBlock)
 		if err != nil {
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index ebefeacf2..c6694c278 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -979,9 +979,12 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
 
 	skip := uint64(0)
 
-	// Check if the file is at the correct position already. If not, seek to the
-	// beginning and read the entire directory again.
-	if l.lastDirentOffset != offset {
+	// Check if the file is at the correct position already. If not, seek to
+	// the beginning and read the entire directory again. We always seek if
+	// offset is 0, since this is side-effectual (equivalent to rewinddir(3),
+	// which causes the directory stream to resynchronize with the directory's
+	// current contents).
+	if l.lastDirentOffset != offset || offset == 0 {
 		if _, err := syscall.Seek(l.file.FD(), 0, 0); err != nil {
 			return nil, extractErrno(err)
 		}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 2afcc27af..36bb0c9c9 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -1012,26 +1012,6 @@ func (s *Sandbox) StopCPUProfile() error {
 	return nil
 }
 
-// GoroutineProfile writes a goroutine profile to the given file.
-func (s *Sandbox) GoroutineProfile(f *os.File) error {
-	log.Debugf("Goroutine profile %q", s.ID)
-	conn, err := s.sandboxConnect()
-	if err != nil {
-		return err
-	}
-	defer conn.Close()
-
-	opts := control.ProfileOpts{
-		FilePayload: urpc.FilePayload{
-			Files: []*os.File{f},
-		},
-	}
-	if err := conn.Call(boot.GoroutineProfile, &opts, nil); err != nil {
-		return fmt.Errorf("getting sandbox %q goroutine profile: %v", s.ID, err)
-	}
-	return nil
-}
-
 // BlockProfile writes a block profile to the given file.
 func (s *Sandbox) BlockProfile(f *os.File) error {
 	log.Debugf("Block profile %q", s.ID)