summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
Diffstat (limited to 'runsc')
-rw-r--r--runsc/boot/filter/config.go6
-rw-r--r--runsc/boot/loader.go44
-rw-r--r--runsc/boot/loader_test.go25
-rw-r--r--runsc/boot/vfs.go5
-rw-r--r--runsc/container/container_test.go16
-rw-r--r--runsc/container/multi_container_test.go13
-rw-r--r--runsc/flag/flag.go1
-rw-r--r--runsc/sandbox/network.go13
-rw-r--r--runsc/sandbox/sandbox.go39
9 files changed, 102 insertions, 60 deletions
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 6ac19668f..a7c4ebb0c 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -162,6 +162,12 @@ var allowedSyscalls = seccomp.SyscallRules{
},
syscall.SYS_LSEEK: {},
syscall.SYS_MADVISE: {},
+ unix.SYS_MEMBARRIER: []seccomp.Rule{
+ {
+ seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL),
+ seccomp.EqualTo(0),
+ },
+ },
syscall.SYS_MINCORE: {},
// Used by the Go runtime as a temporarily workaround for a Linux
// 5.2-5.4 bug.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 2e652ddad..8ad000497 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -282,6 +282,7 @@ func New(args Args) (*Loader, error) {
args.NumCPU = runtime.NumCPU()
}
log.Infof("CPUs: %d", args.NumCPU)
+ runtime.GOMAXPROCS(args.NumCPU)
if args.TotalMem > 0 {
// Adjust the total memory returned by the Sentry so that applications that
@@ -471,9 +472,13 @@ func (l *Loader) Destroy() {
}
l.watchdog.Stop()
+ // Release all kernel resources. This is only safe after we can no longer
+ // save/restore.
+ l.k.Release()
+
// In the success case, stdioFDs and goferFDs will only contain
// released/closed FDs that ownership has been passed over to host FDs and
- // gofer sessions. Close them here in case on failure.
+ // gofer sessions. Close them here in case of failure.
for _, fd := range l.root.stdioFDs {
_ = fd.Close()
}
@@ -797,7 +802,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
// disconnect occurs in any of the gofer FDs.
func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
go func() {
@@ -818,18 +823,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
}
- // Check if the gofer has stopped as part of normal container destruction.
- // This is done just to avoid sending an annoying error message to the log.
- // Note that there is a small race window in between mu.Unlock() and the
- // lock being reacquired in destroyContainer(), but it's harmless to call
- // destroyContainer() multiple times.
l.mu.Lock()
- _, ok := l.processes[execID{cid: cid}]
- l.mu.Unlock()
- if ok {
- log.Infof("Gofer socket disconnected, destroying container %q", cid)
- if err := l.destroyContainer(cid); err != nil {
- log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+ defer l.mu.Unlock()
+
+ // The gofer could have been stopped due to a normal container shutdown.
+ // Check if the container has not stopped yet.
+ if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+ log.Infof("Gofer socket disconnected, killing container %q", cid)
+ if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+ log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
}
}
}()
@@ -898,17 +900,24 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
return 0, fmt.Errorf("container %q not started", args.ContainerID)
}
- // Get the container MountNamespace from the Task.
+ // Get the container MountNamespace from the Task. Acquiring a reference may
+ // fail if this races with task exit.
if kernel.VFS2Enabled {
- // task.MountNamespace() does not take a ref, so we must do so ourselves.
+ // task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
- args.MountNamespaceVFS2.IncRef()
+ if !args.MountNamespaceVFS2.TryIncRef() {
+ return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+ }
} else {
+ var reffed bool
tg.Leader().WithMuLocked(func(t *kernel.Task) {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
args.MountNamespace = t.MountNamespace()
- args.MountNamespace.IncRef()
+ reffed = args.MountNamespace.TryIncRef()
})
+ if !reffed {
+ return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+ }
}
// Add the HOME environment variable if it is not already set.
@@ -916,7 +925,6 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
root := args.MountNamespaceVFS2.Root()
ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
defer args.MountNamespaceVFS2.DecRef(ctx)
- defer root.DecRef(ctx)
envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
if err != nil {
return 0, err
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index bf9ec5d38..e376f944b 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -264,7 +264,7 @@ type CreateMountTestcase struct {
expectedPaths []string
}
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
testCases := []*CreateMountTestcase{
&CreateMountTestcase{
// Only proc.
@@ -409,32 +409,26 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
Destination: "/proc",
Type: "tmpfs",
},
- // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
- // MkDirAt in VFS2 (and remove the reduntant append).
- // {
- // Destination: "/sys/bar",
- // Type: "tmpfs",
- // },
- //
+ {
+ Destination: "/sys/bar",
+ Type: "tmpfs",
+ },
+
{
Destination: "/tmp/baz",
Type: "tmpfs",
},
},
},
- expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+ expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
}
- if !vfs2 {
- vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
- vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
- }
return append(testCases, vfsCase)
}
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespace(t *testing.T) {
- for _, tc := range createMountTestcases(false /* vfs2 */) {
+ for _, tc := range createMountTestcases() {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
ctx := contexttest.Context(t)
@@ -471,7 +465,7 @@ func TestCreateMountNamespace(t *testing.T) {
// Test that MountNamespace can be created with various specs.
func TestCreateMountNamespaceVFS2(t *testing.T) {
- for _, tc := range createMountTestcases(true /* vfs2 */) {
+ for _, tc := range createMountTestcases() {
t.Run(tc.name, func(t *testing.T) {
spec := testSpec()
spec.Mounts = tc.spec.Mounts
@@ -497,6 +491,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
}
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
for _, p := range tc.expectedPaths {
target := &vfs.PathOperation{
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index e36664938..82e459f46 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -170,6 +170,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create
rootProcArgs.MountNamespaceVFS2 = mns
root := mns.Root()
+ root.IncRef()
defer root.DecRef(rootCtx)
if root.Mount().ReadOnly() {
// Switch to ReadWrite while we setup submounts.
@@ -377,6 +378,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
}
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
target := &vfs.PathOperation{
Root: root,
@@ -474,6 +476,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config
}
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
pop := vfs.PathOperation{
Root: root,
@@ -597,6 +600,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
defer newMnt.DecRef(ctx)
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
target := &vfs.PathOperation{
Root: root,
@@ -617,6 +621,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
root := mns.Root()
+ root.IncRef()
defer root.DecRef(ctx)
target := &vfs.PathOperation{
Root: root,
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 548c68087..1f8e277cc 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -316,6 +316,7 @@ func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
return cs
}
+// TODO(gvisor.dev/issue/1624): Merge with configs when VFS2 is the default.
func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
all := configs(t, opts...)
for key, value := range configs(t, opts...) {
@@ -894,13 +895,15 @@ func TestKillPid(t *testing.T) {
}
}
-// TestCheckpointRestore creates a container that continuously writes successive integers
-// to a file. To test checkpoint and restore functionality, the container is
-// checkpointed and the last number printed to the file is recorded. Then, it is restored in two
-// new containers and the first number printed from these containers is checked. Both should
-// be the next consecutive number after the last number from the checkpointed container.
+// TestCheckpointRestore creates a container that continuously writes successive
+// integers to a file. To test checkpoint and restore functionality, the
+// container is checkpointed and the last number printed to the file is
+// recorded. Then, it is restored in two new containers and the first number
+// printed from these containers is checked. Both should be the next consecutive
+// number after the last number from the checkpointed container.
func TestCheckpointRestore(t *testing.T) {
// Skip overlay because test requires writing to host file.
+ // TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
for name, conf := range configs(t, noOverlay...) {
t.Run(name, func(t *testing.T) {
dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -1062,6 +1065,7 @@ func TestCheckpointRestore(t *testing.T) {
// with filesystem Unix Domain Socket use.
func TestUnixDomainSockets(t *testing.T) {
// Skip overlay because test requires writing to host file.
+ // TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
for name, conf := range configs(t, noOverlay...) {
t.Run(name, func(t *testing.T) {
// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1199,7 +1203,7 @@ func TestUnixDomainSockets(t *testing.T) {
// recreated. Then it resumes the container, verify that the file gets created
// again.
func TestPauseResume(t *testing.T) {
- for name, conf := range configs(t, noOverlay...) {
+ for name, conf := range configsWithVFS2(t, noOverlay...) {
t.Run(name, func(t *testing.T) {
tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
if err != nil {
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 952215ec1..850e80290 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -480,7 +480,7 @@ func TestMultiContainerMount(t *testing.T) {
// TestMultiContainerSignal checks that it is possible to signal individual
// containers without killing the entire sandbox.
func TestMultiContainerSignal(t *testing.T) {
- for name, conf := range configs(t, all...) {
+ for name, conf := range configsWithVFS2(t, all...) {
t.Run(name, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
if err != nil {
@@ -1691,12 +1691,11 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
}
// TestMultiContainerHomeEnvDir tests that the HOME environment variable is set
-// for root containers, sub-containers, and execed processes.
+// for root containers, sub-containers, and exec'ed processes.
func TestMultiContainerHomeEnvDir(t *testing.T) {
- // TODO(gvisor.dev/issue/1487): VFSv2 configs failing.
// NOTE: Don't use overlay since we need changes to persist to the temp dir
// outside the sandbox.
- for testName, conf := range configs(t, noOverlay...) {
+ for testName, conf := range configsWithVFS2(t, noOverlay...) {
t.Run(testName, func(t *testing.T) {
rootDir, cleanup, err := testutil.SetupRootDir()
@@ -1718,9 +1717,9 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
// We will sleep in the root container in order to ensure that the root
//container doesn't terminate before sub containers can be created.
- rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
- subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
- execCmd := fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())
+ rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s; sleep 1000`, homeDirs["root"].Name())}
+ subCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["sub"].Name())}
+ execCmd := fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["exec"].Name())
// Setup the containers, a root container and sub container.
specConfig, ids := createSpecs(rootCmd, subCmd)
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index ba1ff833f..775325c06 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// Package flag wraps flag primitives.
package flag
import (
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0b9f39466..8f66dd1f8 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -309,11 +309,20 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
const bufSize = 4 << 20 // 4MB.
if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
- return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
+ syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize)
+ sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
+
+ if sz < bufSize {
+ log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
+ }
}
if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
- return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+ syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize)
+ sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+ if sz < bufSize {
+ log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err)
+ }
}
return &socketEntry{deviceFile, gsoMaxSize}, nil
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index a8f4f64a5..c4309feb3 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -72,11 +72,14 @@ type Sandbox struct {
// will have it as a child process.
child bool
- // status is an exit status of a sandbox process.
- status syscall.WaitStatus
-
// statusMu protects status.
statusMu sync.Mutex
+
+ // status is the exit status of the sandbox process. It is only set if
+ // child==true and the sandbox has been waited on. This field lets multiple
+ // threads wait on the sandbox and all get the exit code, since Linux returns
+ // the WaitStatus to only one of the waiters.
+ status syscall.WaitStatus
}
// Args is used to configure a new sandbox.
@@ -746,35 +749,47 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
// Wait waits for the containerized process to exit, and returns its WaitStatus.
func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
- var ws syscall.WaitStatus
if conn, err := s.sandboxConnect(); err != nil {
- // The sandbox may have exited while before we had a chance to
- // wait on it.
+ // The sandbox may have exited before we had a chance to wait on it.
+ // There is nothing we can do for subcontainers. For the init container, we
+ // can try to get the sandbox exit code.
+ if !s.IsRootContainer(cid) {
+ return syscall.WaitStatus(0), err
+ }
log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
} else {
defer conn.Close()
+
// Try the Wait RPC to the sandbox.
+ var ws syscall.WaitStatus
err = conn.Call(boot.ContainerWait, &cid, &ws)
if err == nil {
// It worked!
return ws, nil
}
+ // See comment above.
+ if !s.IsRootContainer(cid) {
+ return syscall.WaitStatus(0), err
+ }
+
// The sandbox may have exited after we connected, but before
// or during the Wait RPC.
log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
}
- // The sandbox may have already exited, or exited while handling the
- // Wait RPC. The best we can do is ask Linux what the sandbox exit
- // status was, since in most cases that will be the same as the
- // container exit status.
+ // The sandbox may have already exited, or exited while handling the Wait RPC.
+ // The best we can do is ask Linux what the sandbox exit status was, since in
+ // most cases that will be the same as the container exit status.
if err := s.waitForStopped(); err != nil {
- return ws, err
+ return syscall.WaitStatus(0), err
}
if !s.child {
- return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+ return syscall.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
}
+
+ s.statusMu.Lock()
+ defer s.statusMu.Unlock()
return s.status, nil
}