Diffstat (limited to 'runsc')
-rw-r--r--  runsc/boot/filter/config.go              |  6
-rw-r--r--  runsc/boot/loader.go                     | 44
-rw-r--r--  runsc/boot/loader_test.go                | 25
-rw-r--r--  runsc/boot/vfs.go                        |  5
-rw-r--r--  runsc/container/container_test.go        | 16
-rw-r--r--  runsc/container/multi_container_test.go  | 13
-rw-r--r--  runsc/flag/flag.go                       |  1
-rw-r--r--  runsc/sandbox/network.go                 | 13
-rw-r--r--  runsc/sandbox/sandbox.go                 | 39
9 files changed, 102 insertions, 60 deletions
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 6ac19668f..a7c4ebb0c 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -162,6 +162,12 @@ var allowedSyscalls = seccomp.SyscallRules{
 	},
 	syscall.SYS_LSEEK:   {},
 	syscall.SYS_MADVISE: {},
+	unix.SYS_MEMBARRIER: []seccomp.Rule{
+		{
+			seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL),
+			seccomp.EqualTo(0),
+		},
+	},
 	syscall.SYS_MINCORE: {},
 	// Used by the Go runtime as a temporarily workaround for a Linux
 	// 5.2-5.4 bug.
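The added rule admits membarrier(2) only for MEMBARRIER_CMD_GLOBAL with a zero second argument. A minimal sketch of the call shape that passes this filter, written against golang.org/x/sys/unix rather than the Sentry's own wrappers (everything outside the diff is illustrative):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Mirror the seccomp rule above: command MEMBARRIER_CMD_GLOBAL, flags 0.
	_, _, errno := unix.Syscall(unix.SYS_MEMBARRIER, uintptr(unix.MEMBARRIER_CMD_GLOBAL), 0, 0)
	if errno != 0 {
		fmt.Println("membarrier failed:", errno)
		return
	}
	fmt.Println("membarrier(MEMBARRIER_CMD_GLOBAL, 0) succeeded")
}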
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 2e652ddad..8ad000497 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -282,6 +282,7 @@ func New(args Args) (*Loader, error) {
 		args.NumCPU = runtime.NumCPU()
 	}
 	log.Infof("CPUs: %d", args.NumCPU)
+	runtime.GOMAXPROCS(args.NumCPU)
 
 	if args.TotalMem > 0 {
 		// Adjust the total memory returned by the Sentry so that applications that
@@ -471,9 +472,13 @@ func (l *Loader) Destroy() {
 	}
 	l.watchdog.Stop()
 
+	// Release all kernel resources. This is only safe after we can no longer
+	// save/restore.
+	l.k.Release()
+
 	// In the success case, stdioFDs and goferFDs will only contain
 	// released/closed FDs that ownership has been passed over to host FDs and
-	// gofer sessions. Close them here in case on failure.
+	// gofer sessions. Close them here in case of failure.
 	for _, fd := range l.root.stdioFDs {
 		_ = fd.Close()
 	}
@@ -797,7 +802,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
 // disconnect occurs in any of the gofer FDs.
 func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	go func() {
@@ -818,18 +823,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
 		}
 
-		// Check if the gofer has stopped as part of normal container destruction.
-		// This is done just to avoid sending an annoying error message to the log.
-		// Note that there is a small race window in between mu.Unlock() and the
-		// lock being reacquired in destroyContainer(), but it's harmless to call
-		// destroyContainer() multiple times.
 		l.mu.Lock()
-		_, ok := l.processes[execID{cid: cid}]
-		l.mu.Unlock()
-		if ok {
-			log.Infof("Gofer socket disconnected, destroying container %q", cid)
-			if err := l.destroyContainer(cid); err != nil {
-				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+		defer l.mu.Unlock()
+
+		// The gofer could have been stopped due to a normal container shutdown.
+		// Check if the container has not stopped yet.
+		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+			log.Infof("Gofer socket disconnected, killing container %q", cid)
+			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+				log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
 			}
 		}
 	}()
@@ -898,17 +900,24 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
-	// Get the container MountNamespace from the Task.
+	// Get the container MountNamespace from the Task. Try to acquire ref may fail
+	// in case it raced with task exit.
 	if kernel.VFS2Enabled {
-		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		// task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
-		args.MountNamespaceVFS2.IncRef()
+		if !args.MountNamespaceVFS2.TryIncRef() {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	} else {
+		var reffed bool
 		tg.Leader().WithMuLocked(func(t *kernel.Task) {
 			// task.MountNamespace() does not take a ref, so we must do so ourselves.
 			args.MountNamespace = t.MountNamespace()
-			args.MountNamespace.IncRef()
+			reffed = args.MountNamespace.TryIncRef()
 		})
+		if !reffed {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	}
 
 	// Add the HOME environment variable if it is not already set.
@@ -916,7 +925,6 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		root := args.MountNamespaceVFS2.Root()
 		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
 		defer args.MountNamespaceVFS2.DecRef(ctx)
-		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
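The executeAsync() hunk replaces IncRef with TryIncRef so an exec that races with container exit fails with an error instead of grabbing a mount namespace whose last reference is being dropped. A rough sketch of that pattern with a made-up refcount type (not gVisor's refs package):

package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

// refCounted is a stand-in for a reference-counted object such as a mount
// namespace. refs starts at 1 for the owner; 0 means already destroyed.
type refCounted struct {
	refs int64
}

// IncRef assumes the caller already holds a reference and so cannot race with
// the final release.
func (r *refCounted) IncRef() { atomic.AddInt64(&r.refs, 1) }

// TryIncRef only succeeds while the count is still positive, so a racing
// teardown is detected instead of resurrecting a dead object.
func (r *refCounted) TryIncRef() bool {
	for {
		n := atomic.LoadInt64(&r.refs)
		if n <= 0 {
			return false
		}
		if atomic.CompareAndSwapInt64(&r.refs, n, n+1) {
			return true
		}
	}
}

func (r *refCounted) DecRef() { atomic.AddInt64(&r.refs, -1) }

// borrowMountNamespace mirrors the new exec path: report "container has
// stopped" rather than IncRef'ing an object that is being torn down.
func borrowMountNamespace(r *refCounted) error {
	if !r.TryIncRef() {
		return errors.New("container has stopped")
	}
	defer r.DecRef()
	// ... use the borrowed reference here ...
	return nil
}

func main() {
	mns := &refCounted{refs: 1}
	fmt.Println(borrowMountNamespace(mns)) // <nil>
	mns.DecRef()                           // the owner drops the last reference
	fmt.Println(borrowMountNamespace(mns)) // container has stopped
}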
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index bf9ec5d38..e376f944b 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -264,7 +264,7 @@ type CreateMountTestcase struct {
 	expectedPaths []string
 }
 
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
 	testCases := []*CreateMountTestcase{
 		&CreateMountTestcase{
 			// Only proc.
@@ -409,32 +409,26 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 					Destination: "/proc",
 					Type:        "tmpfs",
 				},
-				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
-				// MkDirAt in VFS2 (and remove the reduntant append).
-				// {
-				// 	Destination: "/sys/bar",
-				// 	Type:        "tmpfs",
-				// },
-				//
+				{
+					Destination: "/sys/bar",
+					Type:        "tmpfs",
+				},
+				{
 					Destination: "/tmp/baz",
 					Type:        "tmpfs",
 				},
 			},
 		},
-		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+		expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
 	}
-
-	if !vfs2 {
-		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
-		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
-	}
 	return append(testCases, vfsCase)
 }
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespace(t *testing.T) {
-	for _, tc := range createMountTestcases(false /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -471,7 +465,7 @@ func TestCreateMountNamespace(t *testing.T) {
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespaceVFS2(t *testing.T) {
-	for _, tc := range createMountTestcases(true /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			spec := testSpec()
 			spec.Mounts = tc.spec.Mounts
@@ -497,6 +491,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			}
 
 			root := mns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				target := &vfs.PathOperation{
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index e36664938..82e459f46 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -170,6 +170,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create
 	rootProcArgs.MountNamespaceVFS2 = mns
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(rootCtx)
 	if root.Mount().ReadOnly() {
 		// Switch to ReadWrite while we setup submounts.
@@ -377,6 +378,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
 	}
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root: root,
@@ -474,6 +476,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config
 	}
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	pop := vfs.PathOperation{
 		Root: root,
@@ -597,6 +600,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
 	defer newMnt.DecRef(ctx)
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root: root,
@@ -617,6 +621,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
 func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root: root,
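The repeated root.IncRef() additions follow from mns.Root() handing back a borrowed reference in this change set: a call site that defers root.DecRef() must first take a reference it actually owns. A toy illustration of that convention (hypothetical types, not the runsc VFS2 API):

package main

import "fmt"

type dentry struct{ refs int }

func (d *dentry) IncRef() { d.refs++ }
func (d *dentry) DecRef() { d.refs-- }

type mountNamespace struct{ root dentry }

// Root returns the root dentry without taking a reference on the caller's
// behalf, which is the convention the vfs.go hunks above assume.
func (m *mountNamespace) Root() *dentry { return &m.root }

// useRoot mirrors the shape of the updated call sites.
func useRoot(mns *mountNamespace) {
	root := mns.Root()
	root.IncRef()       // the added line: own a reference before deferring its release
	defer root.DecRef() // releases our reference, not the namespace's own
	fmt.Println("refs while borrowed:", root.refs)
}

func main() {
	mns := &mountNamespace{root: dentry{refs: 1}}
	useRoot(mns)                              // refs while borrowed: 2
	fmt.Println("refs after:", mns.root.refs) // refs after: 1
}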
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 548c68087..1f8e277cc 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -316,6 +316,7 @@ func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
 	return cs
 }
 
+// TODO(gvisor.dev/issue/1624): Merge with configs when VFS2 is the default.
 func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
 	all := configs(t, opts...)
 	for key, value := range configs(t, opts...) {
@@ -894,13 +895,15 @@ func TestKillPid(t *testing.T) {
 	}
 }
 
-// TestCheckpointRestore creates a container that continuously writes successive integers
-// to a file. To test checkpoint and restore functionality, the container is
-// checkpointed and the last number printed to the file is recorded. Then, it is restored in two
-// new containers and the first number printed from these containers is checked. Both should
-// be the next consecutive number after the last number from the checkpointed container.
+// TestCheckpointRestore creates a container that continuously writes successive
+// integers to a file. To test checkpoint and restore functionality, the
+// container is checkpointed and the last number printed to the file is
+// recorded. Then, it is restored in two new containers and the first number
+// printed from these containers is checked. Both should be the next consecutive
+// number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -1062,6 +1065,7 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1199,7 +1203,7 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for name, conf := range configs(t, noOverlay...) {
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
 			if err != nil {
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 952215ec1..850e80290 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -480,7 +480,7 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1691,12 +1691,11 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 }
 
 // TestMultiContainerHomeEnvDir tests that the HOME environment variable is set
-// for root containers, sub-containers, and execed processes.
+// for root containers, sub-containers, and exec'ed processes.
 func TestMultiContainerHomeEnvDir(t *testing.T) {
-	// TODO(gvisor.dev/issue/1487): VFSv2 configs failing.
 	// NOTE: Don't use overlay since we need changes to persist to the temp dir
 	// outside the sandbox.
-	for testName, conf := range configs(t, noOverlay...) {
+	for testName, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(testName, func(t *testing.T) {
 
 			rootDir, cleanup, err := testutil.SetupRootDir()
@@ -1718,9 +1717,9 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 
 			// We will sleep in the root container in order to ensure that the root
 			//container doesn't terminate before sub containers can be created.
-			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
-			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
-			execCmd := fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())
+			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s; sleep 1000`, homeDirs["root"].Name())}
+			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["sub"].Name())}
+			execCmd := fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["exec"].Name())
 
 			// Setup the containers, a root container and sub container.
 			specConfig, ids := createSpecs(rootCmd, subCmd)
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index ba1ff833f..775325c06 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package flag wraps flag primitives.
 package flag
 
 import (
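The rootCmd/subCmd/execCmd change in multi_container_test.go above only swaps escaped double quotes for raw string literals; the resulting command strings are identical. A quick check, assuming nothing beyond the standard library:

package main

import "fmt"

func main() {
	// Escaped quotes and a raw string literal produce the same shell command.
	escaped := fmt.Sprintf("printf \"$HOME\" > %s", "/tmp/out")
	raw := fmt.Sprintf(`printf "$HOME" > %s`, "/tmp/out")
	fmt.Println(escaped == raw) // true
}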
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0b9f39466..8f66dd1f8 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -309,11 +309,20 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 	const bufSize = 4 << 20 // 4MB.
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
+
+		if sz < bufSize {
+			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+		if sz < bufSize {
+			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	return &socketEntry{deviceFile, gsoMaxSize}, nil
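The createSocket() change stops treating a failed SO_RCVBUFFORCE/SO_SNDBUFFORCE as fatal: the *FORCE variants need CAP_NET_ADMIN, so the code falls back to the capped SO_RCVBUF/SO_SNDBUF and only warns if the kernel would not grant the requested size. A standalone sketch of that fallback (helper name and logging are illustrative, not runsc code):

package main

import (
	"log"
	"syscall"
)

// setRcvBuf tries the privileged SO_RCVBUFFORCE first and falls back to the
// capped SO_RCVBUF, warning instead of failing when the kernel grants less.
func setRcvBuf(fd int, want int) {
	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, want); err != nil {
		// Unprivileged path: the kernel silently clamps SO_RCVBUF to rmem_max.
		_ = syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, want)
		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
		if sz < want {
			log.Printf("could not raise rcv buffer to %d (force failed: %v); kernel granted %d", want, err, sz)
		}
	}
}

func main() {
	fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0)
	if err != nil {
		log.Fatal(err)
	}
	defer syscall.Close(fd)
	setRcvBuf(fd, 4<<20) // same 4MB target as the diff
}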
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index a8f4f64a5..c4309feb3 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -72,11 +72,14 @@ type Sandbox struct {
 	// will have it as a child process.
 	child bool
 
-	// status is an exit status of a sandbox process.
-	status syscall.WaitStatus
-
 	// statusMu protects status.
 	statusMu sync.Mutex
+
+	// status is the exit status of a sandbox process. It's only set if the
+	// child==true and the sandbox was waited on. This field allows for multiple
+	// threads to wait on sandbox and get the exit code, since Linux will return
+	// WaitStatus to one of the waiters only.
+	status syscall.WaitStatus
 }
 
 // Args is used to configure a new sandbox.
@@ -746,35 +749,47 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
 // Wait waits for the containerized process to exit, and returns its WaitStatus.
 func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
-	var ws syscall.WaitStatus
 
 	if conn, err := s.sandboxConnect(); err != nil {
-		// The sandbox may have exited while before we had a chance to
-		// wait on it.
+		// The sandbox may have exited while before we had a chance to wait on it.
+		// There is nothing we can do for subcontainers. For the init container, we
+		// can try to get the sandbox exit code.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
 		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	} else {
 		defer conn.Close()
+
+		// Try the Wait RPC to the sandbox.
+		var ws syscall.WaitStatus
 		err = conn.Call(boot.ContainerWait, &cid, &ws)
 		if err == nil {
 			// It worked!
 			return ws, nil
 		}
+
+		// See comment above.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
+
 		// The sandbox may have exited after we connected, but before
 		// or during the Wait RPC.
 		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	}
 
-	// The sandbox may have already exited, or exited while handling the
-	// Wait RPC. The best we can do is ask Linux what the sandbox exit
-	// status was, since in most cases that will be the same as the
-	// container exit status.
+	// The sandbox may have already exited, or exited while handling the Wait RPC.
+	// The best we can do is ask Linux what the sandbox exit status was, since in
+	// most cases that will be the same as the container exit status.
 	if err := s.waitForStopped(); err != nil {
-		return ws, err
+		return syscall.WaitStatus(0), err
 	}
 	if !s.child {
-		return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+		return syscall.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
 	}
+
+	s.statusMu.Lock()
+	defer s.statusMu.Unlock()
 	return s.status, nil
 }
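The reordered status/statusMu fields document why the exit status is cached: wait4(2) reports the status to exactly one waiter, so the first successful wait stores it and later callers read the cached copy under the mutex. A simplified sketch of that idea (types and names are hypothetical, not the Sandbox implementation):

package main

import (
	"fmt"
	"sync"
	"syscall"
)

type proc struct {
	pid      int
	statusMu sync.Mutex
	waited   bool
	status   syscall.WaitStatus
}

// wait reaps the child at most once and serves the cached status to every
// later caller, since the kernel reports it to a single waiter only.
func (p *proc) wait() (syscall.WaitStatus, error) {
	p.statusMu.Lock()
	defer p.statusMu.Unlock()
	if !p.waited {
		if _, err := syscall.Wait4(p.pid, &p.status, 0, nil); err != nil {
			return 0, err
		}
		p.waited = true
	}
	return p.status, nil
}

func main() {
	// With a real sandbox child PID every caller would observe the same cached
	// status; with no child here, Wait4 simply reports ECHILD.
	p := &proc{pid: -1}
	ws, err := p.wait()
	fmt.Println(ws, err)
}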