summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/p9/client.go4
-rw-r--r--pkg/p9/client_file.go25
-rw-r--r--pkg/sentry/fsimpl/fuse/dev.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go2
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go61
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go85
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go7
-rw-r--r--pkg/sentry/platform/ring0/aarch64.go1
-rw-r--r--pkg/sentry/platform/ring0/entry_arm64.s74
-rw-r--r--pkg/sentry/platform/ring0/kernel_arm64.go4
-rw-r--r--pkg/sentry/platform/ring0/offsets_arm64.go1
-rw-r--r--pkg/sentry/socket/netstack/netstack.go13
-rw-r--r--pkg/tcpip/adapters/gonet/BUILD1
-rw-r--r--pkg/tcpip/adapters/gonet/gonet_test.go3
-rw-r--r--pkg/tcpip/link/sniffer/sniffer.go58
-rw-r--r--pkg/tcpip/transport/tcp/connect.go13
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go47
-rw-r--r--pkg/tcpip/transport/tcp/endpoint_state.go10
-rw-r--r--pkg/tcpip/transport/tcp/tcp_test.go14
-rw-r--r--pkg/tcpip/transport/udp/BUILD1
-rw-r--r--pkg/tcpip/transport/udp/endpoint.go29
-rw-r--r--pkg/tcpip/transport/udp/udp_test.go50
-rw-r--r--pkg/test/testutil/testutil.go29
-rw-r--r--runsc/boot/controller.go72
-rw-r--r--runsc/boot/fs.go18
-rw-r--r--runsc/boot/loader.go80
-rw-r--r--runsc/cmd/exec.go2
-rw-r--r--runsc/console/console.go4
-rw-r--r--runsc/container/BUILD1
-rw-r--r--runsc/container/console_test.go205
-rw-r--r--runsc/container/container.go36
-rw-r--r--runsc/sandbox/sandbox.go30
-rw-r--r--test/root/crictl_test.go2
-rw-r--r--test/syscalls/linux/socket_inet_loopback.cc37
-rw-r--r--test/syscalls/linux/tcp_socket.cc160
-rw-r--r--test/syscalls/linux/udp_socket.cc36
36 files changed, 924 insertions, 293 deletions
diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index 71e944c30..eadea390a 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -570,6 +570,8 @@ func (c *Client) Version() uint32 {
func (c *Client) Close() {
// unet.Socket.Shutdown() has no effect if unet.Socket.Close() has already
// been called (by c.watch()).
- c.socket.Shutdown()
+ if err := c.socket.Shutdown(); err != nil {
+ log.Warningf("Socket.Shutdown() failed (FD: %d): %v", c.socket.FD(), err)
+ }
c.closedWg.Wait()
}
diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go
index 28fe081d6..8b46a2987 100644
--- a/pkg/p9/client_file.go
+++ b/pkg/p9/client_file.go
@@ -478,28 +478,23 @@ func (r *ReadWriterFile) ReadAt(p []byte, offset int64) (int, error) {
}
// Write implements part of the io.ReadWriter interface.
+//
+// Note that this may return a short write with a nil error. This violates the
+// contract of io.Writer, but is more consistent with gVisor's pattern of
+// returning errors that correspond to Linux errnos. Since short writes without
+// error are common in Linux, returning a nil error is appropriate.
func (r *ReadWriterFile) Write(p []byte) (int, error) {
n, err := r.File.WriteAt(p, r.Offset)
r.Offset += uint64(n)
- if err != nil {
- return n, err
- }
- if n < len(p) {
- return n, io.ErrShortWrite
- }
- return n, nil
+ return n, err
}
// WriteAt implements the io.WriteAt interface.
+//
+// Note that this may return a short write with a nil error. This violates the
+// contract of io.WriterAt. See comment on Write for justification.
func (r *ReadWriterFile) WriteAt(p []byte, offset int64) (int, error) {
- n, err := r.File.WriteAt(p, uint64(offset))
- if err != nil {
- return n, err
- }
- if n < len(p) {
- return n, io.ErrShortWrite
- }
- return n, nil
+ return r.File.WriteAt(p, uint64(offset))
}
// Rename implements File.Rename.
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index 89c3ef079..1bbe6fdb7 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -363,7 +363,7 @@ func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
var ready waiter.EventMask
- if fd.fs.umounted {
+ if fd.fs == nil || fd.fs.umounted {
ready |= waiter.EventErr
return ready & mask
}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index e39cd305b..61138a7a4 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -381,6 +381,8 @@ afterTrailingSymlink:
creds := rp.Credentials()
child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
parentDir.insertChildLocked(child, name)
+ child.IncRef()
+ defer child.DecRef(ctx)
unlock()
fd, err := child.open(ctx, rp, &opts, true)
if err != nil {
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index add5dd48e..59fcff498 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -107,8 +107,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
// Dentries which may have a reference count of zero, and which therefore
// should be dropped once traversal is complete, are appended to ds.
//
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
if !d.isDir() {
return nil, syserror.ENOTDIR
@@ -158,15 +160,19 @@ afterSymlink:
return child, nil
}
-// verifyChild verifies the hash of child against the already verified hash of
-// the parent to ensure the child is expected. verifyChild triggers a sentry
-// panic if unexpected modifications to the file system are detected. In
+// verifyChildLocked verifies the hash of child against the already verified
+// hash of the parent to ensure the child is expected. verifyChild triggers a
+// sentry panic if unexpected modifications to the file system are detected. In
// noCrashOnVerificationFailure mode it returns a syserror instead.
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+//
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+//
// TODO(b/166474175): Investigate all possible errors returned in this
// function, and make sure we differentiate all errors that indicate unexpected
// modifications to the file system from the ones that are not harmful.
-func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
vfsObj := fs.vfsfs.VirtualFilesystem()
// Get the path to the child dentry. This is only used to provide path
@@ -268,7 +274,8 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// contain the hash of the children in the parent Merkle tree when
// Verify returns with success.
var buf bytes.Buffer
- if _, err := merkletree.Verify(&merkletree.VerifyParams{
+ parent.hashMu.RLock()
+ _, err = merkletree.Verify(&merkletree.VerifyParams{
Out: &buf,
File: &fdReader,
Tree: &fdReader,
@@ -284,21 +291,27 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())),
Expected: parent.hash,
DataAndTreeInSameFile: true,
- }); err != nil && err != io.EOF {
+ })
+ parent.hashMu.RUnlock()
+ if err != nil && err != io.EOF {
return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
}
// Cache child hash when it's verified the first time.
+ child.hashMu.Lock()
if len(child.hash) == 0 {
child.hash = buf.Bytes()
}
+ child.hashMu.Unlock()
return child, nil
}
-// verifyStatAndChildren verifies the stat and children names against the
+// verifyStatAndChildrenLocked verifies the stat and children names against the
// verified hash. The mode/uid/gid and childrenNames of the file is cached
// after verified.
-func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat linux.Statx) error {
+//
+// Preconditions: d.dirMu must be locked.
+func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry, stat linux.Statx) error {
vfsObj := fs.vfsfs.VirtualFilesystem()
// Get the path to the child dentry. This is only used to provide path
@@ -390,6 +403,7 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat
}
var buf bytes.Buffer
+ d.hashMu.RLock()
params := &merkletree.VerifyParams{
Out: &buf,
Tree: &fdReader,
@@ -407,6 +421,7 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat
Expected: d.hash,
DataAndTreeInSameFile: false,
}
+ d.hashMu.RUnlock()
if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
params.DataAndTreeInSameFile = true
}
@@ -421,7 +436,9 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat
return nil
}
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
if child, ok := parent.children[name]; ok {
// If verity is enabled on child, we should check again whether
@@ -470,7 +487,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
// be cached before enabled.
if fs.allowRuntimeEnable {
if parent.verityEnabled() {
- if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+ if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil {
return nil, err
}
}
@@ -486,7 +503,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
if err != nil {
return nil, err
}
- if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil {
+ if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil {
return nil, err
}
}
@@ -506,7 +523,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
return child, nil
}
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
vfsObj := fs.vfsfs.VirtualFilesystem()
@@ -597,13 +616,13 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
// allowRuntimeEnable mode and the parent directory hasn't been enabled
// yet.
if parent.verityEnabled() {
- if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+ if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil {
child.destroyLocked(ctx)
return nil, err
}
}
if child.verityEnabled() {
- if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil {
+ if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil {
child.destroyLocked(ctx)
return nil, err
}
@@ -617,7 +636,9 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
// rp.Start().Impl().(*dentry)). It does not check that the returned directory
// is searchable by the provider of rp.
//
-// Preconditions: fs.renameMu must be locked. !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
for !rp.Final() {
d.dirMu.Lock()
@@ -958,11 +979,13 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if err != nil {
return linux.Statx{}, err
}
+ d.dirMu.Lock()
if d.verityEnabled() {
- if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil {
+ if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
return linux.Statx{}, err
}
}
+ d.dirMu.Unlock()
return stat, nil
}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 87dabe038..46346e54d 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -19,6 +19,18 @@
// The verity file system is read-only, except for one case: when
// allowRuntimeEnable is true, additional Merkle files can be generated using
// the FS_IOC_ENABLE_VERITY ioctl.
+//
+// Lock order:
+//
+// filesystem.renameMu
+// dentry.dirMu
+// fileDescription.mu
+// filesystem.verityMu
+// dentry.hashMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
package verity
import (
@@ -372,12 +384,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err))
}
- if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil {
+ if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
return nil, nil, err
}
}
+ d.hashMu.Lock()
copy(d.hash, iopts.RootHash)
+ d.hashMu.Unlock()
d.vfsd.Init(d)
fs.rootDentry = d
@@ -402,7 +416,8 @@ type dentry struct {
fs *filesystem
// mode, uid, gid and size are the file mode, owner, group, and size of
- // the file in the underlying file system.
+ // the file in the underlying file system. They are set when a dentry
+ // is initialized, and never modified.
mode uint32
uid uint32
gid uint32
@@ -425,18 +440,22 @@ type dentry struct {
// childrenNames stores the name of all children of the dentry. This is
// used by verity to check whether a child is expected. This is only
- // populated by enableVerity.
+ // populated by enableVerity. childrenNames is also protected by dirMu.
childrenNames map[string]struct{}
- // lowerVD is the VirtualDentry in the underlying file system.
+ // lowerVD is the VirtualDentry in the underlying file system. It is
+ // never modified after initialized.
lowerVD vfs.VirtualDentry
// lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree
- // in the underlying file system.
+ // in the underlying file system. It is never modified after
+ // initialized.
lowerMerkleVD vfs.VirtualDentry
- // hash is the calculated hash for the current file or directory.
- hash []byte
+ // hash is the calculated hash for the current file or directory. hash
+ // is protected by hashMu.
+ hashMu sync.RWMutex `state:"nosave"`
+ hash []byte
}
// newDentry creates a new dentry representing the given verity file. The
@@ -519,7 +538,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
// destroyLocked destroys the dentry.
//
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
func (d *dentry) destroyLocked(ctx context.Context) {
switch atomic.LoadInt64(&d.refs) {
case 0:
@@ -599,6 +620,8 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
// mode, it returns true if the target has been enabled with
// ioctl(FS_IOC_ENABLE_VERITY).
func (d *dentry) verityEnabled() bool {
+ d.hashMu.RLock()
+ defer d.hashMu.RUnlock()
return !d.fs.allowRuntimeEnable || len(d.hash) != 0
}
@@ -678,11 +701,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
if err != nil {
return linux.Statx{}, err
}
+ fd.d.dirMu.Lock()
if fd.d.verityEnabled() {
- if err := fd.d.fs.verifyStatAndChildren(ctx, fd.d, stat); err != nil {
+ if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil {
return linux.Statx{}, err
}
}
+ fd.d.dirMu.Unlock()
return stat, nil
}
@@ -718,13 +743,15 @@ func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32)
return offset, nil
}
-// generateMerkle generates a Merkle tree file for fd. If fd points to a file
-// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash
-// of the generated Merkle tree and the data size is returned. If fd points to
-// a regular file, the data is the content of the file. If fd points to a
-// directory, the data is all hahes of its children, written to the Merkle tree
-// file.
-func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+// generateMerkleLocked generates a Merkle tree file for fd. If fd points to a
+// file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The
+// hash of the generated Merkle tree and the data size is returned. If fd
+// points to a regular file, the data is the content of the file. If fd points
+// to a directory, the data is all hahes of its children, written to the Merkle
+// tree file.
+//
+// Preconditions: fd.d.fs.verityMu must be locked.
+func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) {
fdReader := vfs.FileReadWriteSeeker{
FD: fd.lowerFD,
Ctx: ctx,
@@ -793,11 +820,14 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64,
return hash, uint64(params.Size), err
}
-// recordChildren writes the names of fd's children into the corresponding
-// Merkle tree file, and saves the offset/size of the map into xattrs.
+// recordChildrenLocked writes the names of fd's children into the
+// corresponding Merkle tree file, and saves the offset/size of the map into
+// xattrs.
//
-// Preconditions: fd.d.isDir() == true
-func (fd *fileDescription) recordChildren(ctx context.Context) error {
+// Preconditions:
+// * fd.d.fs.verityMu must be locked.
+// * fd.d.isDir() == true.
+func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error {
// Record the children names in the Merkle tree file.
childrenNames, err := json.Marshal(fd.d.childrenNames)
if err != nil {
@@ -847,7 +877,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
}
- hash, dataSize, err := fd.generateMerkle(ctx)
+ hash, dataSize, err := fd.generateMerkleLocked(ctx)
if err != nil {
return 0, err
}
@@ -888,11 +918,13 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
}
if fd.d.isDir() {
- if err := fd.recordChildren(ctx); err != nil {
+ if err := fd.recordChildrenLocked(ctx); err != nil {
return 0, err
}
}
- fd.d.hash = append(fd.d.hash, hash...)
+ fd.d.hashMu.Lock()
+ fd.d.hash = hash
+ fd.d.hashMu.Unlock()
return 0, nil
}
@@ -904,6 +936,9 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm
}
var metadata linux.DigestMetadata
+ fd.d.hashMu.RLock()
+ defer fd.d.hashMu.RUnlock()
+
// If allowRuntimeEnable is true, an empty fd.d.hash indicates that
// verity is not enabled for the file. If allowRuntimeEnable is false,
// this is an integrity violation because all files should have verity
@@ -940,11 +975,13 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm
func (fd *fileDescription) verityFlags(ctx context.Context, flags usermem.Addr) (uintptr, error) {
f := int32(0)
+ fd.d.hashMu.RLock()
// All enabled files should store a hash. This flag is not settable via
// FS_IOC_SETFLAGS.
if len(fd.d.hash) != 0 {
f |= linux.FS_VERITY_FL
}
+ fd.d.hashMu.RUnlock()
t := kernel.TaskFromContext(ctx)
if t == nil {
@@ -1023,6 +1060,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
Ctx: ctx,
}
+ fd.d.hashMu.RLock()
n, err := merkletree.Verify(&merkletree.VerifyParams{
Out: dst.Writer(ctx),
File: &dataReader,
@@ -1040,6 +1078,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
Expected: fd.d.hash,
DataAndTreeInSameFile: false,
})
+ fd.d.hashMu.RUnlock()
if err != nil {
return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index fd92c3873..3f5be276b 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -263,13 +263,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
return usermem.NoAccess, platform.ErrContextInterrupt
case ring0.El0SyncUndef:
return c.fault(int32(syscall.SIGILL), info)
- case ring0.El1SyncUndef:
- *info = arch.SignalInfo{
- Signo: int32(syscall.SIGILL),
- Code: 1, // ILL_ILLOPC (illegal opcode).
- }
- info.SetAddr(switchOpts.Registers.Pc) // Include address.
- return usermem.AccessType{}, platform.ErrContextSignal
default:
panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
}
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 327d48465..c51df2811 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -90,6 +90,7 @@ const (
El0SyncIa
El0SyncFpsimdAcc
El0SyncSveAcc
+ El0SyncFpsimdExc
El0SyncSys
El0SyncSpPc
El0SyncUndef
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index f77bc72af..5f4b2105a 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -382,12 +382,12 @@ TEXT ·DisableVFP(SB),NOSPLIT,$0
MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \
LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack.
-// EXCEPTION_WITH_ERROR is a common exception handler function.
-#define EXCEPTION_WITH_ERROR(user, vector) \
+// EXCEPTION_EL0 is a common el0 exception handler function.
+#define EXCEPTION_EL0(vector) \
WORD $0xd538d092; \ //MRS TPIDR_EL1, R18
WORD $0xd538601a; \ //MRS FAR_EL1, R26
MOVD R26, CPU_FAULT_ADDR(RSV_REG); \
- MOVD $user, R3; \
+ MOVD $1, R3; \
MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user.
MOVD $vector, R3; \
MOVD R3, CPU_VECTOR_CODE(RSV_REG); \
@@ -395,6 +395,12 @@ TEXT ·DisableVFP(SB),NOSPLIT,$0
MOVD R3, CPU_ERROR_CODE(RSV_REG); \
B ·kernelExitToEl1(SB);
+// EXCEPTION_EL1 is a common el1 exception handler function.
+#define EXCEPTION_EL1(vector) \
+ MOVD $vector, R3; \
+ MOVD R3, 8(RSP); \
+ B ·HaltEl1ExceptionAndResume(SB);
+
// storeAppASID writes the application's asid value.
TEXT ·storeAppASID(SB),NOSPLIT,$0-8
MOVD asid+0(FP), R1
@@ -442,6 +448,16 @@ TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0
CALL ·kernelSyscall(SB) // Call the trampoline.
B ·kernelExitToEl1(SB) // Resume.
+// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume.
+TEXT ·HaltEl1ExceptionAndResume(SB),NOSPLIT,$0-8
+ WORD $0xd538d092 // MRS TPIDR_EL1, R18
+ MOVD CPU_SELF(RSV_REG), R3 // Load vCPU.
+ MOVD R3, 8(RSP) // First argument (vCPU).
+ MOVD vector+0(FP), R3
+ MOVD R3, 16(RSP) // Second argument (vector).
+ CALL ·kernelException(SB) // Call the trampoline.
+ B ·kernelExitToEl1(SB) // Resume.
+
// Shutdown stops the guest.
TEXT ·Shutdown(SB),NOSPLIT,$0
// PSCI EVENT.
@@ -604,39 +620,22 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
B el1_invalid
el1_da:
+ EXCEPTION_EL1(El1SyncDa)
el1_ia:
- WORD $0xd538d092 //MRS TPIDR_EL1, R18
- WORD $0xd538601a //MRS FAR_EL1, R26
-
- MOVD R26, CPU_FAULT_ADDR(RSV_REG)
-
- MOVD $0, CPU_ERROR_TYPE(RSV_REG)
-
- MOVD $PageFault, R3
- MOVD R3, CPU_VECTOR_CODE(RSV_REG)
-
- B ·HaltAndResume(SB)
-
+ EXCEPTION_EL1(El1SyncIa)
el1_sp_pc:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL1(El1SyncSpPc)
el1_undef:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL1(El1SyncUndef)
el1_svc:
- MOVD $0, CPU_ERROR_CODE(RSV_REG)
- MOVD $0, CPU_ERROR_TYPE(RSV_REG)
B ·HaltEl1SvcAndResume(SB)
-
el1_dbg:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL1(El1SyncDbg)
el1_fpsimd_acc:
VFP_ENABLE
B ·kernelExitToEl1(SB) // Resume.
-
el1_invalid:
- B ·Shutdown(SB)
+ EXCEPTION_EL1(El1SyncInv)
// El1_irq is the handler for El1_irq.
TEXT ·El1_irq(SB),NOSPLIT,$0
@@ -692,28 +691,21 @@ el0_svc:
el0_da:
el0_ia:
- EXCEPTION_WITH_ERROR(1, PageFault)
-
+ EXCEPTION_EL0(PageFault)
el0_fpsimd_acc:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL0(El0SyncFpsimdAcc)
el0_sve_acc:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL0(El0SyncSveAcc)
el0_fpsimd_exc:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL0(El0SyncFpsimdExc)
el0_sp_pc:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL0(El0SyncSpPc)
el0_undef:
- EXCEPTION_WITH_ERROR(1, El0SyncUndef)
-
+ EXCEPTION_EL0(El0SyncUndef)
el0_dbg:
- B ·Shutdown(SB)
-
+ EXCEPTION_EL0(El0SyncDbg)
el0_invalid:
- B ·Shutdown(SB)
+ EXCEPTION_EL0(El0SyncInv)
TEXT ·El0_irq(SB),NOSPLIT,$0
B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index ead598b24..90a7b8392 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -24,6 +24,10 @@ func HaltAndResume()
//go:nosplit
func HaltEl1SvcAndResume()
+// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume.
+//go:nosplit
+func HaltEl1ExceptionAndResume()
+
// init initializes architecture-specific state.
func (k *Kernel) init(maxCPUs int) {
}
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index 53bc3353c..b5652deb9 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -70,6 +70,7 @@ func Emit(w io.Writer) {
fmt.Fprintf(w, "#define El0SyncIa 0x%02x\n", El0SyncIa)
fmt.Fprintf(w, "#define El0SyncFpsimdAcc 0x%02x\n", El0SyncFpsimdAcc)
fmt.Fprintf(w, "#define El0SyncSveAcc 0x%02x\n", El0SyncSveAcc)
+ fmt.Fprintf(w, "#define El0SyncFpsimdExc 0x%02x\n", El0SyncFpsimdExc)
fmt.Fprintf(w, "#define El0SyncSys 0x%02x\n", El0SyncSys)
fmt.Fprintf(w, "#define El0SyncSpPc 0x%02x\n", El0SyncSpPc)
fmt.Fprintf(w, "#define El0SyncUndef 0x%02x\n", El0SyncUndef)
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 7d0ae15ca..5afe77858 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2686,7 +2686,7 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ
// Always do at least one fetchReadView, even if the number of bytes to
// read is 0.
err = s.fetchReadView()
- if err != nil {
+ if err != nil || len(s.readView) == 0 {
break
}
if dst.NumBytes() == 0 {
@@ -2709,15 +2709,20 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ
}
copied += n
s.readView.TrimFront(n)
- if len(s.readView) == 0 {
- atomic.StoreUint32(&s.readViewHasData, 0)
- }
dst = dst.DropFirst(n)
if e != nil {
err = syserr.FromError(e)
break
}
+ // If we are done reading requested data then stop.
+ if dst.NumBytes() == 0 {
+ break
+ }
+ }
+
+ if len(s.readView) == 0 {
+ atomic.StoreUint32(&s.readViewHasData, 0)
}
// If we managed to copy something, we must deliver it.
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index a984f1712..d69b1e081 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -26,6 +26,7 @@ go_test(
"//pkg/tcpip",
"//pkg/tcpip/header",
"//pkg/tcpip/link/loopback",
+ "//pkg/tcpip/link/sniffer",
"//pkg/tcpip/network/ipv4",
"//pkg/tcpip/network/ipv6",
"//pkg/tcpip/stack",
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
index b196324c7..4b6bf4bba 100644
--- a/pkg/tcpip/adapters/gonet/gonet_test.go
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -28,6 +28,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -65,7 +66,7 @@ func newLoopbackStack() (*stack.Stack, *tcpip.Error) {
TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol},
})
- if err := s.CreateNIC(NICID, loopback.New()); err != nil {
+ if err := s.CreateNIC(NICID, sniffer.New(loopback.New())); err != nil {
return nil, err
}
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index b3e8c4b92..178e658df 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -53,16 +53,35 @@ type endpoint struct {
nested.Endpoint
writer io.Writer
maxPCAPLen uint32
+ logPrefix string
}
var _ stack.GSOEndpoint = (*endpoint)(nil)
var _ stack.LinkEndpoint = (*endpoint)(nil)
var _ stack.NetworkDispatcher = (*endpoint)(nil)
+type direction int
+
+const (
+ directionSend = iota
+ directionRecv
+)
+
// New creates a new sniffer link-layer endpoint. It wraps around another
// endpoint and logs packets and they traverse the endpoint.
func New(lower stack.LinkEndpoint) stack.LinkEndpoint {
- sniffer := &endpoint{}
+ return NewWithPrefix(lower, "")
+}
+
+// NewWithPrefix creates a new sniffer link-layer endpoint. It wraps around
+// another endpoint and logs packets prefixed with logPrefix as they traverse
+// the endpoint.
+//
+// logPrefix is prepended to the log line without any separators.
+// E.g. logPrefix = "NIC:en0/" will produce log lines like
+// "NIC:en0/send udp [...]".
+func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoint {
+ sniffer := &endpoint{logPrefix: logPrefix}
sniffer.Endpoint.Init(lower, sniffer)
return sniffer
}
@@ -120,7 +139,7 @@ func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (
// called by the link-layer endpoint being wrapped when a packet arrives, and
// logs the packet before forwarding to the actual dispatcher.
func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
- e.dumpPacket("recv", nil, protocol, pkt)
+ e.dumpPacket(directionRecv, nil, protocol, pkt)
e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt)
}
@@ -129,10 +148,10 @@ func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protoc
e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt)
}
-func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+func (e *endpoint) dumpPacket(dir direction, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
writer := e.writer
if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
- logPacket(prefix, protocol, pkt, gso)
+ logPacket(e.logPrefix, dir, protocol, pkt, gso)
}
if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
totalLength := pkt.Size()
@@ -169,7 +188,7 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw
// higher-level protocols to write packets; it just logs the packet and
// forwards the request to the lower endpoint.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
- e.dumpPacket("send", gso, protocol, pkt)
+ e.dumpPacket(directionSend, gso, protocol, pkt)
return e.Endpoint.WritePacket(r, gso, protocol, pkt)
}
@@ -178,20 +197,20 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne
// forwards the request to the lower endpoint.
func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
- e.dumpPacket("send", gso, protocol, pkt)
+ e.dumpPacket(directionSend, gso, protocol, pkt)
}
return e.Endpoint.WritePackets(r, gso, pkts, protocol)
}
// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
- e.dumpPacket("send", nil, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{
+ e.dumpPacket(directionSend, nil, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{
Data: vv,
}))
return e.Endpoint.WriteRawPacket(vv)
}
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
+func logPacket(prefix string, dir direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
// Figure out the network layer info.
var transProto uint8
src := tcpip.Address("unknown")
@@ -201,6 +220,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
var fragmentOffset uint16
var moreFragments bool
+ var directionPrefix string
+ switch dir {
+ case directionSend:
+ directionPrefix = "send"
+ case directionRecv:
+ directionPrefix = "recv"
+ default:
+ panic(fmt.Sprintf("unrecognized direction: %d", dir))
+ }
+
// Clone the packet buffer to not modify the original.
//
// We don't clone the original packet buffer so that the new packet buffer
@@ -248,15 +277,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
arp := header.ARP(pkt.NetworkHeader().View())
log.Infof(
- "%s arp %s (%s) -> %s (%s) valid:%t",
+ "%s%s arp %s (%s) -> %s (%s) valid:%t",
prefix,
+ directionPrefix,
tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
arp.IsValid(),
)
return
default:
- log.Infof("%s unknown network protocol", prefix)
+ log.Infof("%s%s unknown network protocol", prefix, directionPrefix)
return
}
@@ -300,7 +330,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
icmpType = "info reply"
}
}
- log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+ log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code())
return
case header.ICMPv6ProtocolNumber:
@@ -335,7 +365,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
case header.ICMPv6RedirectMsg:
icmpType = "redirect message"
}
- log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+ log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code())
return
case header.UDPProtocolNumber:
@@ -391,7 +421,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
}
default:
- log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto)
+ log.Infof("%s%s %s -> %s unknown transport protocol: %d", prefix, directionPrefix, src, dst, transProto)
return
}
@@ -399,5 +429,5 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
details += fmt.Sprintf(" gso: %+v", gso)
}
- log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
+ log.Infof("%s%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, directionPrefix, transName, src, srcPort, dst, dstPort, size, id, details)
}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index ac6d879a7..6661e8915 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -496,7 +496,7 @@ func (h *handshake) resolveRoute() *tcpip.Error {
h.ep.mu.Lock()
}
if n&notifyError != 0 {
- return h.ep.LastError()
+ return h.ep.lastErrorLocked()
}
}
@@ -575,7 +575,6 @@ func (h *handshake) complete() *tcpip.Error {
return err
}
defer timer.stop()
-
for h.state != handshakeCompleted {
// Unlock before blocking, and reacquire again afterwards (h.ep.mu is held
// throughout handshake processing).
@@ -631,9 +630,8 @@ func (h *handshake) complete() *tcpip.Error {
h.ep.mu.Lock()
}
if n&notifyError != 0 {
- return h.ep.LastError()
+ return h.ep.lastErrorLocked()
}
-
case wakerForNewSegment:
if err := h.processSegments(); err != nil {
return err
@@ -1002,7 +1000,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
// Only send a reset if the connection is being aborted for a reason
// other than receiving a reset.
e.setEndpointState(StateError)
- e.HardError = err
+ e.hardError = err
if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
// The exact sequence number to be used for the RST is the same as the
// one used by Linux. We need to handle the case of window being shrunk
@@ -1141,7 +1139,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
// delete the TCB, and return.
case StateCloseWait:
e.transitionToStateCloseLocked()
- e.HardError = tcpip.ErrAborted
+ e.hardError = tcpip.ErrAborted
e.notifyProtocolGoroutine(notifyTickleWorker)
return false, nil
default:
@@ -1353,7 +1351,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
epilogue := func() {
// e.mu is expected to be hold upon entering this section.
-
if e.snd != nil {
e.snd.resendTimer.cleanup()
}
@@ -1383,7 +1380,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
e.lastErrorMu.Unlock()
e.setEndpointState(StateError)
- e.HardError = err
+ e.hardError = err
e.workerCleanup = true
// Lock released below.
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4f4f4c65e..a2161e49d 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -315,11 +315,6 @@ func (*Stats) IsEndpointStats() {}
// +stateify savable
type EndpointInfo struct {
stack.TransportEndpointInfo
-
- // HardError is meaningful only when state is stateError. It stores the
- // error to be returned when read/write syscalls are called and the
- // endpoint is in this state. HardError is protected by endpoint mu.
- HardError *tcpip.Error `state:".(string)"`
}
// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
@@ -386,6 +381,11 @@ type endpoint struct {
waiterQueue *waiter.Queue `state:"wait"`
uniqueID uint64
+ // hardError is meaningful only when state is stateError. It stores the
+ // error to be returned when read/write syscalls are called and the
+ // endpoint is in this state. hardError is protected by endpoint mu.
+ hardError *tcpip.Error `state:".(string)"`
+
// lastError represents the last error that the endpoint reported;
// access to it is protected by the following mutex.
lastErrorMu sync.Mutex `state:"nosave"`
@@ -1283,7 +1283,15 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
e.owner = owner
}
-func (e *endpoint) LastError() *tcpip.Error {
+// Preconditions: e.mu must be held to call this function.
+func (e *endpoint) hardErrorLocked() *tcpip.Error {
+ err := e.hardError
+ e.hardError = nil
+ return err
+}
+
+// Preconditions: e.mu must be held to call this function.
+func (e *endpoint) lastErrorLocked() *tcpip.Error {
e.lastErrorMu.Lock()
defer e.lastErrorMu.Unlock()
err := e.lastError
@@ -1291,6 +1299,15 @@ func (e *endpoint) LastError() *tcpip.Error {
return err
}
+func (e *endpoint) LastError() *tcpip.Error {
+ e.LockUser()
+ defer e.UnlockUser()
+ if err := e.hardErrorLocked(); err != nil {
+ return err
+ }
+ return e.lastErrorLocked()
+}
+
// Read reads data from the endpoint.
func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
e.LockUser()
@@ -1312,9 +1329,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
bufUsed := e.rcvBufUsed
if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
e.rcvListMu.Unlock()
- he := e.HardError
if s == StateError {
- return buffer.View{}, tcpip.ControlMessages{}, he
+ return buffer.View{}, tcpip.ControlMessages{}, e.hardErrorLocked()
}
e.stats.ReadErrors.NotConnected.Increment()
return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
@@ -1370,9 +1386,13 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
// indicating the reason why it's not writable.
// Caller must hold e.mu and e.sndBufMu
func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
+ // The endpoint cannot be written to if it's not connected.
switch s := e.EndpointState(); {
case s == StateError:
- return 0, e.HardError
+ if err := e.hardErrorLocked(); err != nil {
+ return 0, err
+ }
+ return 0, tcpip.ErrClosedForSend
case !s.connecting() && !s.connected():
return 0, tcpip.ErrClosedForSend
case s.connecting():
@@ -1486,7 +1506,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
// but has some pending unread data.
if s := e.EndpointState(); !s.connected() && s != StateClose {
if s == StateError {
- return 0, tcpip.ControlMessages{}, e.HardError
+ return 0, tcpip.ControlMessages{}, e.hardErrorLocked()
}
e.stats.ReadErrors.InvalidEndpointState.Increment()
return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
@@ -2243,7 +2263,10 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
return tcpip.ErrAlreadyConnecting
case StateError:
- return e.HardError
+ if err := e.hardErrorLocked(); err != nil {
+ return err
+ }
+ return tcpip.ErrConnectionAborted
default:
return tcpip.ErrInvalidEndpointState
@@ -2417,7 +2440,7 @@ func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error {
e.lastErrorMu.Unlock()
e.setEndpointState(StateError)
- e.HardError = err
+ e.hardError = err
// Call cleanupLocked to free up any reservations.
e.cleanupLocked()
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index bb901c0f8..ba67176b5 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -321,21 +321,21 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) {
}
// saveHardError is invoked by stateify.
-func (e *EndpointInfo) saveHardError() string {
- if e.HardError == nil {
+func (e *endpoint) saveHardError() string {
+ if e.hardError == nil {
return ""
}
- return e.HardError.String()
+ return e.hardError.String()
}
// loadHardError is invoked by stateify.
-func (e *EndpointInfo) loadHardError(s string) {
+func (e *endpoint) loadHardError(s string) {
if s == "" {
return
}
- e.HardError = tcpip.StringToError(s)
+ e.hardError = tcpip.StringToError(s)
}
// saveMeasureTime is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 9f0fb41e3..c366a4cbc 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -75,9 +75,6 @@ func TestGiveUpConnect(t *testing.T) {
// Wait for ep to become writable.
<-notifyCh
- if err := ep.LastError(); err != tcpip.ErrAborted {
- t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted)
- }
// Call Connect again to retreive the handshake failure status
// and stats updates.
@@ -3198,6 +3195,11 @@ loop:
case tcpip.ErrWouldBlock:
select {
case <-ch:
+ // Expect the state to be StateError and subsequent Reads to fail with HardError.
+ if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset {
+ t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
+ }
+ break loop
case <-time.After(1 * time.Second):
t.Fatalf("Timed out waiting for reset to arrive")
}
@@ -3207,14 +3209,10 @@ loop:
t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
}
}
- // Expect the state to be StateError and subsequent Reads to fail with HardError.
- if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset {
- t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
- }
+
if tcp.EndpointState(c.EP.State()) != tcp.StateError {
t.Fatalf("got EP state is not StateError")
}
-
if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 {
t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got)
}
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index c78549424..7ebae63d8 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -56,6 +56,7 @@ go_test(
"//pkg/tcpip/network/ipv4",
"//pkg/tcpip/network/ipv6",
"//pkg/tcpip/stack",
+ "//pkg/tcpip/transport/icmp",
"//pkg/waiter",
],
)
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 57976d4e3..835dcc54e 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -429,7 +429,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
to := opts.To
e.mu.RLock()
- defer e.mu.RUnlock()
+ lockReleased := false
+ defer func() {
+ if lockReleased {
+ return
+ }
+ e.mu.RUnlock()
+ }()
// If we've shutdown with SHUT_WR we are in an invalid state for sending.
if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
@@ -475,7 +481,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
if e.state != StateConnected {
err = tcpip.ErrInvalidEndpointState
}
- return
+ return ch, err
}
} else {
// Reject destination address if it goes through a different
@@ -541,7 +547,24 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
useDefaultTTL = false
}
- if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil {
+ localPort := e.ID.LocalPort
+ sendTOS := e.sendTOS
+ owner := e.owner
+ noChecksum := e.noChecksum
+ lockReleased = true
+ e.mu.RUnlock()
+
+ // Do not hold lock when sending as loopback is synchronous and if the UDP
+ // datagram ends up generating an ICMP response then it can result in a
+ // deadlock where the ICMP response handling ends up acquiring this endpoint's
+ // mutex using e.mu.RLock() in endpoint.HandleControlPacket which can cause a
+ // deadlock if another caller is trying to acquire e.mu in exclusive mode w/
+ // e.mu.Lock(). Since e.mu.Lock() prevents any new read locks to ensure the
+ // lock can be eventually acquired.
+ //
+ // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read
+ // locking is prohibited.
+ if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil {
return 0, nil, err
}
return int64(len(v)), nil, nil
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 764ad0857..492e277a8 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -32,6 +32,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -54,6 +55,7 @@ const (
stackPort = 1234
testAddr = "\x0a\x00\x00\x02"
testPort = 4096
+ invalidPort = 8192
multicastAddr = "\xe8\x2b\xd3\xea"
multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
broadcastAddr = header.IPv4Broadcast
@@ -295,7 +297,8 @@ func newDualTestContext(t *testing.T, mtu uint32) *testContext {
t.Helper()
return newDualTestContextWithOptions(t, mtu, stack.Options{
NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
- TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+ TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, icmp.NewProtocol6, icmp.NewProtocol4},
+ HandleLocal: true,
})
}
@@ -972,7 +975,7 @@ func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) {
// provided.
func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
c.t.Helper()
- return testWriteInternal(c, flow, true, checkers...)
+ return testWriteAndVerifyInternal(c, flow, true, checkers...)
}
// testWriteWithoutDestination sends a packet of the given test flow from the
@@ -981,10 +984,10 @@ func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker
// checker functions provided.
func testWriteWithoutDestination(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
c.t.Helper()
- return testWriteInternal(c, flow, false, checkers...)
+ return testWriteAndVerifyInternal(c, flow, false, checkers...)
}
-func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 {
+func testWriteNoVerify(c *testContext, flow testFlow, setDest bool) buffer.View {
c.t.Helper()
// Take a snapshot of the stats to validate them at the end of the test.
epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
@@ -1006,6 +1009,12 @@ func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...
c.t.Fatalf("Bad number of bytes written: got %v, want %v", n, len(payload))
}
c.checkEndpointWriteStats(1, epstats, err)
+ return payload
+}
+
+func testWriteAndVerifyInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 {
+ c.t.Helper()
+ payload := testWriteNoVerify(c, flow, setDest)
// Received the packet and check the payload.
b := c.getPacketAndVerify(flow, checkers...)
var udp header.UDP
@@ -1150,6 +1159,39 @@ func TestV4WriteOnConnected(t *testing.T) {
testWriteWithoutDestination(c, unicastV4)
}
+func TestWriteOnConnectedInvalidPort(t *testing.T) {
+ protocols := map[string]tcpip.NetworkProtocolNumber{
+ "ipv4": ipv4.ProtocolNumber,
+ "ipv6": ipv6.ProtocolNumber,
+ }
+ for name, pn := range protocols {
+ t.Run(name, func(t *testing.T) {
+ c := newDualTestContext(t, defaultMTU)
+ defer c.cleanup()
+
+ c.createEndpoint(pn)
+ if err := c.ep.Connect(tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}); err != nil {
+ c.t.Fatalf("Connect failed: %s", err)
+ }
+ writeOpts := tcpip.WriteOptions{
+ To: &tcpip.FullAddress{Addr: stackAddr, Port: invalidPort},
+ }
+ payload := buffer.View(newPayload())
+ n, _, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts)
+ if err != nil {
+ c.t.Fatalf("c.ep.Write(...) = %+s, want nil", err)
+ }
+ if got, want := n, int64(len(payload)); got != want {
+ c.t.Fatalf("c.ep.Write(...) wrote %d bytes, want %d bytes", got, want)
+ }
+
+ if err := c.ep.LastError(); err != tcpip.ErrConnectionRefused {
+ c.t.Fatalf("expected c.ep.LastError() == ErrConnectionRefused, got: %+v", err)
+ }
+ })
+ }
+}
+
// TestWriteOnBoundToV4Multicast checks that we can send packets out of a socket
// that is bound to a V4 multicast address.
func TestWriteOnBoundToV4Multicast(t *testing.T) {
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index 49ab87c58..976331230 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -36,7 +36,6 @@ import (
"path/filepath"
"strconv"
"strings"
- "sync/atomic"
"syscall"
"testing"
"time"
@@ -417,33 +416,35 @@ func StartReaper() func() {
// WaitUntilRead reads from the given reader until the wanted string is found
// or until timeout.
-func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error {
+func WaitUntilRead(r io.Reader, want string, timeout time.Duration) error {
sc := bufio.NewScanner(r)
- if split != nil {
- sc.Split(split)
- }
// done must be accessed atomically. A value greater than 0 indicates
// that the read loop can exit.
- var done uint32
- doneCh := make(chan struct{})
+ doneCh := make(chan bool)
+ defer close(doneCh)
go func() {
for sc.Scan() {
t := sc.Text()
if strings.Contains(t, want) {
- atomic.StoreUint32(&done, 1)
- close(doneCh)
- break
+ doneCh <- true
+ return
}
- if atomic.LoadUint32(&done) > 0 {
- break
+ select {
+ case <-doneCh:
+ return
+ default:
}
}
+ doneCh <- false
}()
+
select {
case <-time.After(timeout):
- atomic.StoreUint32(&done, 1)
return fmt.Errorf("timeout waiting to read %q", want)
- case <-doneCh:
+ case res := <-doneCh:
+ if !res {
+ return fmt.Errorf("reader closed while waiting to read %q", want)
+ }
return nil
}
}
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index fdf13c8e1..865126ac5 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -211,10 +211,31 @@ func (cm *containerManager) Processes(cid *string, out *[]*control.Process) erro
return control.Processes(cm.l.k, *cid, out)
}
+// CreateArgs contains arguments to the Create method.
+type CreateArgs struct {
+ // CID is the ID of the container to start.
+ CID string
+
+ // FilePayload may contain a TTY file for the terminal, if enabled.
+ urpc.FilePayload
+}
+
// Create creates a container within a sandbox.
-func (cm *containerManager) Create(cid *string, _ *struct{}) error {
- log.Debugf("containerManager.Create, cid: %s", *cid)
- return cm.l.createContainer(*cid)
+func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error {
+ log.Debugf("containerManager.Create: %s", args.CID)
+
+ if len(args.Files) > 1 {
+ return fmt.Errorf("start arguments must have at most 1 files for TTY")
+ }
+ var tty *fd.FD
+ if len(args.Files) == 1 {
+ var err error
+ tty, err = fd.NewFromFile(args.Files[0])
+ if err != nil {
+ return fmt.Errorf("error dup'ing TTY file: %w", err)
+ }
+ }
+ return cm.l.createContainer(args.CID, tty)
}
// StartArgs contains arguments to the Start method.
@@ -229,9 +250,8 @@ type StartArgs struct {
CID string
// FilePayload contains, in order:
- // * stdin, stdout, and stderr.
- // * the file descriptor over which the sandbox will
- // request files from its root filesystem.
+ // * stdin, stdout, and stderr (optional: if terminal is disabled).
+ // * file descriptors to connect to gofer to serve the root filesystem.
urpc.FilePayload
}
@@ -251,23 +271,45 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
if args.CID == "" {
return errors.New("start argument missing container ID")
}
- if len(args.FilePayload.Files) < 4 {
- return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
+ if len(args.Files) < 1 {
+ return fmt.Errorf("start arguments must contain at least one file for the container root gofer")
}
// All validation passed, logs the spec for debugging.
specutils.LogSpec(args.Spec)
- fds, err := fd.NewFromFiles(args.FilePayload.Files)
+ goferFiles := args.Files
+ var stdios []*fd.FD
+ if !args.Spec.Process.Terminal {
+ // When not using a terminal, stdios come as the first 3 files in the
+ // payload.
+ if l := len(args.Files); l < 4 {
+ return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l)
+ }
+ var err error
+ stdios, err = fd.NewFromFiles(goferFiles[:3])
+ if err != nil {
+ return fmt.Errorf("error dup'ing stdio files: %w", err)
+ }
+ goferFiles = goferFiles[3:]
+ }
+ defer func() {
+ for _, fd := range stdios {
+ _ = fd.Close()
+ }
+ }()
+
+ goferFDs, err := fd.NewFromFiles(goferFiles)
if err != nil {
- return err
+ return fmt.Errorf("error dup'ing gofer files: %w", err)
}
defer func() {
- for _, fd := range fds {
+ for _, fd := range goferFDs {
_ = fd.Close()
}
}()
- if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
+
+ if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil {
log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err)
return err
}
@@ -330,18 +372,18 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
log.Debugf("containerManager.Restore")
var specFile, deviceFile *os.File
- switch numFiles := len(o.FilePayload.Files); numFiles {
+ switch numFiles := len(o.Files); numFiles {
case 2:
// The device file is donated to the platform.
// Can't take ownership away from os.File. dup them to get a new FD.
- fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd()))
+ fd, err := syscall.Dup(int(o.Files[1].Fd()))
if err != nil {
return fmt.Errorf("failed to dup file: %v", err)
}
deviceFile = os.NewFile(uintptr(fd), "platform device")
fallthrough
case 1:
- specFile = o.FilePayload.Files[0]
+ specFile = o.Files[0]
case 0:
return fmt.Errorf("at least one file must be passed to Restore")
default:
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 6b6ae98d7..2b0d2cd51 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -22,15 +22,6 @@ import (
"strings"
"syscall"
- // Include filesystem types that OCI spec might mount.
- _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
-
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -48,9 +39,18 @@ import (
tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
+
+ // Include filesystem types that OCI spec might mount.
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
)
const (
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index ebdd518d0..86bdc6ae3 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -157,6 +157,11 @@ type execProcess struct {
// pidnsPath is the pid namespace path in spec
pidnsPath string
+
+ // hostTTY is present when creating a sub-container with terminal enabled.
+ // TTY file is passed during container create and must be saved until
+ // container start.
+ hostTTY *fd.FD
}
func init() {
@@ -588,7 +593,9 @@ func (l *Loader) run() error {
// Create the root container init task. It will begin running
// when the kernel is started.
- if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+ var err error
+ _, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root)
+ if err != nil {
return err
}
@@ -627,7 +634,7 @@ func (l *Loader) run() error {
}
// createContainer creates a new container inside the sandbox.
-func (l *Loader) createContainer(cid string) error {
+func (l *Loader) createContainer(cid string, tty *fd.FD) error {
l.mu.Lock()
defer l.mu.Unlock()
@@ -635,14 +642,14 @@ func (l *Loader) createContainer(cid string) error {
if _, ok := l.processes[eid]; ok {
return fmt.Errorf("container %q already exists", cid)
}
- l.processes[eid] = &execProcess{}
+ l.processes[eid] = &execProcess{hostTTY: tty}
return nil
}
// startContainer starts a child container. It returns the thread group ID of
// the newly created process. Used FDs are either closed or released. It's safe
// for the caller to close any remaining files upon return.
-func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -695,36 +702,41 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
info := &containerInfo{
conf: conf,
spec: spec,
- stdioFDs: files[:3],
- goferFDs: files[3:],
+ goferFDs: goferFDs,
}
info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
- tg, err := l.createContainerProcess(false, cid, info, ep)
+
+ // Use stdios or TTY depending on the spec configuration.
+ if spec.Process.Terminal {
+ if len(stdioFDs) > 0 {
+ return fmt.Errorf("using TTY, stdios not expected: %v", stdioFDs)
+ }
+ if ep.hostTTY == nil {
+ return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
+ }
+ info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
+ ep.hostTTY = nil
+ } else {
+ info.stdioFDs = stdioFDs
+ }
+
+ ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info)
if err != nil {
return err
}
-
- // Success!
- l.k.StartProcess(tg)
- ep.tg = tg
+ l.k.StartProcess(ep.tg)
return nil
}
-func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
- console := false
- if root {
- // Only root container supports terminal for now.
- console = info.spec.Process.Terminal
- }
-
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
// Create the FD map, which will set stdin, stdout, and stderr.
ctx := info.procArgs.NewContext(l.k)
- fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs)
if err != nil {
- return nil, fmt.Errorf("importing fds: %v", err)
+ return nil, nil, nil, fmt.Errorf("importing fds: %v", err)
}
// CreateProcess takes a reference on fdTable if successful. We won't need
// ours either way.
@@ -736,11 +748,11 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
if root {
if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
- return nil, err
+ return nil, nil, nil, err
}
}
if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
- return nil, err
+ return nil, nil, nil, err
}
// Add the HOME environment variable if it is not already set.
@@ -754,29 +766,25 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
}
if err != nil {
- return nil, err
+ return nil, nil, nil, err
}
info.procArgs.Envv = envv
// Create and start the new process.
tg, _, err := l.k.CreateProcess(info.procArgs)
if err != nil {
- return nil, fmt.Errorf("creating process: %v", err)
+ return nil, nil, nil, fmt.Errorf("creating process: %v", err)
}
// CreateProcess takes a reference on FDTable if successful.
info.procArgs.FDTable.DecRef(ctx)
// Set the foreground process group on the TTY to the global init process
// group, since that is what we are about to start running.
- if root {
- switch {
- case ttyFileVFS2 != nil:
- ep.ttyVFS2 = ttyFileVFS2
- ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
- case ttyFile != nil:
- ep.tty = ttyFile
- ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
- }
+ switch {
+ case ttyFileVFS2 != nil:
+ ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+ case ttyFile != nil:
+ ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
}
// Install seccomp filters with the new task if there are any.
@@ -784,7 +792,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
if err != nil {
- return nil, fmt.Errorf("building seccomp program: %v", err)
+ return nil, nil, nil, fmt.Errorf("building seccomp program: %v", err)
}
if log.IsLogging(log.Debug) {
@@ -795,7 +803,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
task := tg.Leader()
// NOTE: It seems Flags are ignored by runc so we ignore them too.
if err := task.AppendSyscallFilter(program, true); err != nil {
- return nil, fmt.Errorf("appending seccomp filters: %v", err)
+ return nil, nil, nil, fmt.Errorf("appending seccomp filters: %v", err)
}
}
} else {
@@ -804,7 +812,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
}
}
- return tg, nil
+ return tg, ttyFile, ttyFileVFS2, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 86c02a22a..eafd6285c 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -150,7 +150,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
- // Start the new process and get it pid.
+ // Start the new process and get its pid.
pid, err := c.Execute(e)
if err != nil {
return Errorf("executing processes for container: %v", err)
diff --git a/runsc/console/console.go b/runsc/console/console.go
index dbb88e117..b36028792 100644
--- a/runsc/console/console.go
+++ b/runsc/console/console.go
@@ -24,8 +24,8 @@ import (
"golang.org/x/sys/unix"
)
-// NewWithSocket creates pty master/replica pair, sends the master FD over the given
-// socket, and returns the replica.
+// NewWithSocket creates pty master/replica pair, sends the master FD over the
+// given socket, and returns the replica.
func NewWithSocket(socketPath string) (*os.File, error) {
// Create a new pty master and replica.
ptyMaster, ptyReplica, err := pty.Open()
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index c33755482..1900ecceb 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -24,6 +24,7 @@ go_library(
"//runsc/boot",
"//runsc/cgroup",
"//runsc/config",
+ "//runsc/console",
"//runsc/sandbox",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 4228399b8..1b0fdebd6 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -18,6 +18,7 @@ import (
"bytes"
"fmt"
"io"
+ "math/rand"
"os"
"path/filepath"
"syscall"
@@ -27,7 +28,6 @@ import (
"github.com/kr/pty"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/sentry/control"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/test/testutil"
"gvisor.dev/gvisor/pkg/unet"
@@ -38,19 +38,22 @@ import (
// path is under 108 charactors (the unix socket path length limit),
// relativizing the path if necessary.
func socketPath(bundleDir string) (string, error) {
- path := filepath.Join(bundleDir, "socket")
+ num := rand.Intn(10000)
+ path := filepath.Join(bundleDir, fmt.Sprintf("socket-%4d", num))
+ const maxPathLen = 108
+ if len(path) <= maxPathLen {
+ return path, nil
+ }
+
+ // Path is too large, try to make it smaller.
cwd, err := os.Getwd()
if err != nil {
return "", fmt.Errorf("error getting cwd: %v", err)
}
- relPath, err := filepath.Rel(cwd, path)
+ path, err = filepath.Rel(cwd, path)
if err != nil {
return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err)
}
- if len(path) > len(relPath) {
- path = relPath
- }
- const maxPathLen = 108
if len(path) > maxPathLen {
return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path)
}
@@ -159,6 +162,82 @@ func TestConsoleSocket(t *testing.T) {
}
}
+// Test that an pty FD is sent over the console socket if one is provided.
+func TestMultiContainerConsoleSocket(t *testing.T) {
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ tru := []string{"true"}
+ testSpecs, ids := createSpecs(sleep, tru)
+ testSpecs[1].Process.Terminal = true
+
+ bundleDir, cleanup, err := testutil.SetupBundleDir(testSpecs[0])
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: ids[0],
+ Spec: testSpecs[0],
+ BundleDir: bundleDir,
+ }
+ rootCont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer rootCont.Destroy()
+ if err := rootCont.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ bundleDir, cleanup, err = testutil.SetupBundleDir(testSpecs[0])
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ sock, err := socketPath(bundleDir)
+ if err != nil {
+ t.Fatalf("error getting socket path: %v", err)
+ }
+ srv, cleanup := createConsoleSocket(t, sock)
+ defer cleanup()
+
+ // Create the container and pass the socket name.
+ args = Args{
+ ID: ids[1],
+ Spec: testSpecs[1],
+ BundleDir: bundleDir,
+ ConsoleSocket: sock,
+ }
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer cont.Destroy()
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Make sure we get a console PTY.
+ ptyMaster, err := receiveConsolePTY(srv)
+ if err != nil {
+ t.Fatalf("error receiving console FD: %v", err)
+ }
+ ptyMaster.Close()
+ })
+ }
+}
+
// Test that job control signals work on a console created with "exec -ti".
func TestJobControlSignalExec(t *testing.T) {
spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
@@ -221,9 +300,9 @@ func TestJobControlSignalExec(t *testing.T) {
// Make sure all the processes are running.
expectedPL := []*control.Process{
// Root container process.
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().Cmd("sleep").Process(),
// Bash from exec process.
- {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
+ newProcessBuilder().PID(2).Cmd("bash").Process(),
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
@@ -233,7 +312,7 @@ func TestJobControlSignalExec(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for it to start. Sleep's PPID is bash's PID.
- expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
+ expectedPL = append(expectedPL, newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process())
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
}
@@ -254,7 +333,7 @@ func TestJobControlSignalExec(t *testing.T) {
// Sleep is dead, but it may take more time for bash to notice and
// change the foreground process back to itself. We know it is done
// when bash writes "Terminated" to the pty.
- if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil {
+ if err := testutil.WaitUntilRead(ptyMaster, "Terminated", 5*time.Second); err != nil {
t.Fatalf("bash did not take over pty: %v", err)
}
@@ -359,7 +438,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
// Wait for bash to start.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).Cmd("bash").Process(),
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatalf("error waiting for processes: %v", err)
@@ -369,7 +448,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for sleep to start.
- expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
+ expectedPL = append(expectedPL, newProcessBuilder().PID(2).PPID(1).Cmd("sleep").Process())
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatalf("error waiting for processes: %v", err)
}
@@ -393,7 +472,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
// Sleep is dead, but it may take more time for bash to notice and
// change the foreground process back to itself. We know it is done
// when bash writes "Terminated" to the pty.
- if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil {
+ if err := testutil.WaitUntilRead(ptyBuf, "Terminated", 5*time.Second); err != nil {
t.Fatalf("bash did not take over pty: %v", err)
}
@@ -414,6 +493,104 @@ func TestJobControlSignalRootContainer(t *testing.T) {
}
}
+// Test that terminal works with root and sub-containers.
+func TestMultiContainerTerminal(t *testing.T) {
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
+
+ // Don't let bash execute from profile or rc files, otherwise our PID
+ // counts get messed up.
+ bash := []string{"/bin/bash", "--noprofile", "--norc"}
+ testSpecs, ids := createSpecs(bash, bash)
+
+ type termContainer struct {
+ container *Container
+ master *os.File
+ }
+ var containers []termContainer
+ for i, spec := range testSpecs {
+ bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ spec.Process.Terminal = true
+ sock, err := socketPath(bundleDir)
+ if err != nil {
+ t.Fatalf("error getting socket path: %v", err)
+ }
+ srv, cleanup := createConsoleSocket(t, sock)
+ defer cleanup()
+
+ // Create the container and pass the socket name.
+ args := Args{
+ ID: ids[i],
+ Spec: spec,
+ BundleDir: bundleDir,
+ ConsoleSocket: sock,
+ }
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer cont.Destroy()
+
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Make sure we get a console PTY.
+ ptyMaster, err := receiveConsolePTY(srv)
+ if err != nil {
+ t.Fatalf("error receiving console FD: %v", err)
+ }
+ defer ptyMaster.Close()
+
+ containers = append(containers, termContainer{
+ container: cont,
+ master: ptyMaster,
+ })
+ }
+
+ for _, tc := range containers {
+ // Bash output as well as sandbox output will be written to the PTY
+ // file. Writes after a certain point will block unless we drain the
+ // PTY, so we must continually copy from it.
+ //
+ // We log the output to stderr for debugabilitly, and also to a buffer,
+ // since we wait on particular output from bash below. We use a custom
+ // blockingBuffer which is thread-safe and also blocks on Read calls,
+ // which makes this a suitable Reader for WaitUntilRead.
+ ptyBuf := newBlockingBuffer()
+ tee := io.TeeReader(tc.master, ptyBuf)
+ go io.Copy(os.Stderr, tee)
+
+ // Wait for bash to start.
+ expectedPL := []*control.Process{
+ newProcessBuilder().Cmd("bash").Process(),
+ }
+ if err := waitForProcessList(tc.container, expectedPL); err != nil {
+ t.Fatalf("error waiting for processes: %v", err)
+ }
+
+ // Execute echo command and check that it was executed correctly. Use
+ // a variable to ensure it's not matching against command echo.
+ tc.master.Write([]byte("echo foo-${PWD}-123\n"))
+ if err := testutil.WaitUntilRead(ptyBuf, "foo-/-123", 5*time.Second); err != nil {
+ t.Fatalf("echo didn't execute: %v", err)
+ }
+ }
+ })
+ }
+}
+
// blockingBuffer is a thread-safe buffer that blocks when reading if the
// buffer is empty. It implements io.ReadWriter.
type blockingBuffer struct {
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 4aa139c88..f3d990cfc 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -38,6 +38,7 @@ import (
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cgroup"
"gvisor.dev/gvisor/runsc/config"
+ "gvisor.dev/gvisor/runsc/console"
"gvisor.dev/gvisor/runsc/sandbox"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -397,7 +398,22 @@ func New(conf *config.Config, args Args) (*Container, error) {
return nil, err
}
c.Sandbox = sb.Sandbox
- if err := c.Sandbox.CreateContainer(c.ID); err != nil {
+
+ // If the console control socket file is provided, then create a new
+ // pty master/slave pair and send the TTY to the sandbox process.
+ var tty *os.File
+ if c.ConsoleSocket != "" {
+ // Create a new TTY pair and send the master on the provided socket.
+ var err error
+ tty, err = console.NewWithSocket(c.ConsoleSocket)
+ if err != nil {
+ return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err)
+ }
+ // tty file is transferred to the sandbox, then it can be closed here.
+ defer tty.Close()
+ }
+
+ if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil {
return nil, err
}
}
@@ -451,11 +467,16 @@ func (c *Container) Start(conf *config.Config) error {
// the start (and all their children processes).
if err := runInCgroup(c.Sandbox.Cgroup, func() error {
// Create the gofer process.
- ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
+ goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
if err != nil {
return err
}
- defer mountsFile.Close()
+ defer func() {
+ _ = mountsFile.Close()
+ for _, f := range goferFiles {
+ _ = f.Close()
+ }
+ }()
cleanMounts, err := specutils.ReadMounts(mountsFile)
if err != nil {
@@ -463,7 +484,14 @@ func (c *Container) Start(conf *config.Config) error {
}
c.Spec.Mounts = cleanMounts
- return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles)
+ // Setup stdios if the container is not using terminal. Otherwise TTY was
+ // already setup in create.
+ var stdios []*os.File
+ if !c.Spec.Process.Terminal {
+ stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr}
+ }
+
+ return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles)
}); err != nil {
return err
}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 4a4110477..c84ebcd8a 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -173,7 +173,7 @@ func New(conf *config.Config, args *Args) (*Sandbox, error) {
}
// CreateContainer creates a non-root container inside the sandbox.
-func (s *Sandbox) CreateContainer(cid string) error {
+func (s *Sandbox) CreateContainer(cid string, tty *os.File) error {
log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
sandboxConn, err := s.sandboxConnect()
if err != nil {
@@ -181,7 +181,16 @@ func (s *Sandbox) CreateContainer(cid string) error {
}
defer sandboxConn.Close()
- if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil {
+ var files []*os.File
+ if tty != nil {
+ files = []*os.File{tty}
+ }
+
+ args := boot.CreateArgs{
+ CID: cid,
+ FilePayload: urpc.FilePayload{Files: files},
+ }
+ if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil {
return fmt.Errorf("creating non-root container %q: %v", cid, err)
}
return nil
@@ -211,11 +220,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
}
// StartContainer starts running a non-root container inside the sandbox.
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error {
- for _, f := range goferFiles {
- defer f.Close()
- }
-
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error {
log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
sandboxConn, err := s.sandboxConnect()
if err != nil {
@@ -223,15 +228,18 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid stri
}
defer sandboxConn.Close()
- // The payload must container stdin/stdout/stderr followed by gofer
- // files.
- files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...)
+ // The payload must contain stdin/stdout/stderr (which may be empty if using
+ // TTY) followed by gofer files.
+ payload := urpc.FilePayload{}
+ payload.Files = append(payload.Files, stdios...)
+ payload.Files = append(payload.Files, goferFiles...)
+
// Start running the container.
args := boot.StartArgs{
Spec: spec,
Conf: conf,
CID: cid,
- FilePayload: urpc.FilePayload{Files: files},
+ FilePayload: payload,
}
if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil {
return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err)
diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go
index 11ac5cb52..735dff107 100644
--- a/test/root/crictl_test.go
+++ b/test/root/crictl_test.go
@@ -480,7 +480,7 @@ func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) {
}
// Wait for containerd to boot.
- if err := testutil.WaitUntilRead(startupR, "Start streaming server", nil, 10*time.Second); err != nil {
+ if err := testutil.WaitUntilRead(startupR, "Start streaming server", 10*time.Second); err != nil {
t.Fatalf("failed to start containerd: %v", err)
}
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index e19a83413..27e9816eb 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -1185,19 +1185,44 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) {
listen_fd.get(), reinterpret_cast<sockaddr*>(&accept_addr), &addrlen));
ASSERT_EQ(addrlen, listener.addr_len);
- // TODO(gvisor.dev/issue/3812): Remove after SO_ERROR is fixed.
- if (IsRunningOnGvisor()) {
- char buf[10];
- ASSERT_THAT(ReadFd(accept_fd.get(), buf, sizeof(buf)),
- SyscallFailsWithErrno(ECONNRESET));
- } else {
+ // Wait for accept_fd to process the RST.
+ const int kTimeout = 10000;
+ struct pollfd pfd = {
+ .fd = accept_fd.get(),
+ .events = POLLIN,
+ };
+ ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+ ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+
+ {
int err;
socklen_t optlen = sizeof(err);
ASSERT_THAT(
getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
SyscallSucceeds());
+ // This should return ECONNRESET as the socket just received a RST packet
+ // from the peer.
+ ASSERT_EQ(optlen, sizeof(err));
ASSERT_EQ(err, ECONNRESET);
+ }
+ {
+ int err;
+ socklen_t optlen = sizeof(err);
+ ASSERT_THAT(
+ getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
+ SyscallSucceeds());
+ // This should return no error as the previous getsockopt call would have
+ // cleared the socket error.
ASSERT_EQ(optlen, sizeof(err));
+ ASSERT_EQ(err, 0);
+ }
+ {
+ sockaddr_storage peer_addr;
+ socklen_t addrlen = sizeof(peer_addr);
+ // The socket is not connected anymore and should return ENOTCONN.
+ ASSERT_THAT(getpeername(accept_fd.get(),
+ reinterpret_cast<sockaddr*>(&peer_addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
}
}
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index bc2c8278c..714848b8e 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -964,37 +964,156 @@ TEST_P(TcpSocketTest, PollAfterShutdown) {
SyscallSucceedsWithValue(1));
}
-TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
+TEST_P(SimpleTcpSocketTest, NonBlockingConnectRetry) {
+ const FileDescriptor listener =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
// Initialize address to the loopback one.
sockaddr_storage addr =
ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
socklen_t addrlen = sizeof(addr);
- const FileDescriptor s =
+ // Bind to some port but don't listen yet.
+ ASSERT_THAT(
+ bind(listener.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ // Get the address we're bound to, then connect to it. We need to do this
+ // because we're allowing the stack to pick a port for us.
+ ASSERT_THAT(getsockname(listener.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ FileDescriptor connector =
ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
- // Set the FD to O_NONBLOCK.
- int opts;
- ASSERT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds());
- opts |= O_NONBLOCK;
- ASSERT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds());
+ // Verify that connect fails.
+ ASSERT_THAT(
+ RetryEINTR(connect)(connector.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(ECONNREFUSED));
- ASSERT_THAT(RetryEINTR(connect)(
+ // Now start listening
+ ASSERT_THAT(listen(listener.get(), SOMAXCONN), SyscallSucceeds());
+
+ // TODO(gvisor.dev/issue/3828): Issuing connect() again on a socket that
+ // failed first connect should succeed.
+ if (IsRunningOnGvisor()) {
+ ASSERT_THAT(
+ RetryEINTR(connect)(connector.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(ECONNABORTED));
+ return;
+ }
+
+ // Verify that connect now succeeds.
+ ASSERT_THAT(
+ RetryEINTR(connect)(connector.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ const FileDescriptor accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listener.get(), nullptr, nullptr));
+}
+
+// nonBlockingConnectNoListener returns a socket on which a connect that is
+// expected to fail has been issued.
+PosixErrorOr<FileDescriptor> nonBlockingConnectNoListener(const int family,
+ sockaddr_storage addr,
+ socklen_t addrlen) {
+ // We will first create a socket and bind to ensure we bind a port but will
+ // not call listen on this socket.
+ // Then we will create a new socket that will connect to the port bound by
+ // the first socket and that shoud fail.
+ constexpr int sock_type = SOCK_STREAM | SOCK_NONBLOCK;
+ int b_sock;
+ RETURN_ERROR_IF_SYSCALL_FAIL(b_sock = socket(family, sock_type, IPPROTO_TCP));
+ FileDescriptor b(b_sock);
+ EXPECT_THAT(bind(b.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ // Get the address bound by the listening socket.
+ EXPECT_THAT(
+ getsockname(b.get(), reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ // Now create another socket and issue a connect on this one. This connect
+ // should fail as there is no listener.
+ int c_sock;
+ RETURN_ERROR_IF_SYSCALL_FAIL(c_sock = socket(family, sock_type, IPPROTO_TCP));
+ FileDescriptor s(c_sock);
+
+ // Now connect to the bound address and this should fail as nothing
+ // is listening on the bound address.
+ EXPECT_THAT(RetryEINTR(connect)(
s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
SyscallFailsWithErrno(EINPROGRESS));
- // Now polling on the FD with a timeout should return 0 corresponding to no
- // FDs ready.
- struct pollfd poll_fd = {s.get(), POLLOUT, 0};
- EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
- SyscallSucceedsWithValue(1));
+ // Wait for the connect to fail.
+ struct pollfd poll_fd = {s.get(), POLLERR, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 1000), SyscallSucceedsWithValue(1));
+ return std::move(s);
+}
+
+TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ const FileDescriptor s =
+ nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie();
int err;
socklen_t optlen = sizeof(err);
ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
SyscallSucceeds());
-
+ ASSERT_THAT(optlen, sizeof(err));
EXPECT_EQ(err, ECONNREFUSED);
+
+ unsigned char c;
+ ASSERT_THAT(read(s.get(), &c, sizeof(c)), SyscallSucceedsWithValue(0));
+ int opts;
+ EXPECT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+ opts &= ~O_NONBLOCK;
+ EXPECT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds());
+ // Try connecting again.
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(ECONNABORTED));
+}
+
+TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListenerRead) {
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ const FileDescriptor s =
+ nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie();
+
+ unsigned char c;
+ ASSERT_THAT(read(s.get(), &c, 1), SyscallFailsWithErrno(ECONNREFUSED));
+ ASSERT_THAT(read(s.get(), &c, 1), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(ECONNABORTED));
+}
+
+TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListenerPeek) {
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ const FileDescriptor s =
+ nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie();
+
+ unsigned char c;
+ ASSERT_THAT(recv(s.get(), &c, 1, MSG_PEEK),
+ SyscallFailsWithErrno(ECONNREFUSED));
+ ASSERT_THAT(recv(s.get(), &c, 1, MSG_PEEK), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(ECONNABORTED));
}
TEST_P(SimpleTcpSocketTest, SelfConnectSendRecv_NoRandomSave) {
@@ -1235,6 +1354,19 @@ TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) {
// Attempt #2, with the new socket and reused addr our connect should fail in
// the same way as before, not with an EADDRINUSE.
+ //
+ // TODO(gvisor.dev/issue/3828): 2nd connect on a socket which failed connect
+ // first time should succeed.
+ // gVisor never issues the second connect and returns ECONNABORTED instead.
+ // Linux actually sends a SYN again and gets a RST and correctly returns
+ // ECONNREFUSED.
+ if (IsRunningOnGvisor()) {
+ ASSERT_THAT(connect(client_s.get(),
+ reinterpret_cast<const struct sockaddr*>(&bound_addr),
+ bound_addrlen),
+ SyscallFailsWithErrno(ECONNABORTED));
+ return;
+ }
ASSERT_THAT(connect(client_s.get(),
reinterpret_cast<const struct sockaddr*>(&bound_addr),
bound_addrlen),
diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc
index d65275fd3..34255bfb8 100644
--- a/test/syscalls/linux/udp_socket.cc
+++ b/test/syscalls/linux/udp_socket.cc
@@ -374,6 +374,42 @@ TEST_P(UdpSocketTest, BindInUse) {
SyscallFailsWithErrno(EADDRINUSE));
}
+TEST_P(UdpSocketTest, ConnectWriteToInvalidPort) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Discover a free unused port by creating a new UDP socket, binding it
+ // recording the just bound port and closing it. This is not guaranteed as it
+ // can still race with other port UDP sockets trying to bind a port at the
+ // same time.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ socklen_t addrlen = sizeof(addr_storage);
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+ ASSERT_THAT(bind(s.get(), addr, addrlen), SyscallSucceeds());
+ ASSERT_THAT(getsockname(s.get(), addr, &addrlen), SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_NE(*Port(&addr_storage), 0);
+ ASSERT_THAT(close(s.release()), SyscallSucceeds());
+
+ // Now connect to the port that we just released. This should generate an
+ // ECONNREFUSED error.
+ ASSERT_THAT(connect(sock_.get(), addr, addrlen_), SyscallSucceeds());
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+ // Send from sock_ to an unbound port.
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, addr, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Now verify that we got an ICMP error back of ECONNREFUSED.
+ int err;
+ socklen_t optlen = sizeof(err);
+ ASSERT_THAT(getsockopt(sock_.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
+ SyscallSucceeds());
+ ASSERT_EQ(err, ECONNREFUSED);
+ ASSERT_EQ(optlen, sizeof(err));
+}
+
TEST_P(UdpSocketTest, ReceiveAfterConnect) {
ASSERT_NO_ERRNO(BindLoopback());
ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());