diff options
36 files changed, 924 insertions, 293 deletions
diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 71e944c30..eadea390a 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -570,6 +570,8 @@ func (c *Client) Version() uint32 { func (c *Client) Close() { // unet.Socket.Shutdown() has no effect if unet.Socket.Close() has already // been called (by c.watch()). - c.socket.Shutdown() + if err := c.socket.Shutdown(); err != nil { + log.Warningf("Socket.Shutdown() failed (FD: %d): %v", c.socket.FD(), err) + } c.closedWg.Wait() } diff --git a/pkg/p9/client_file.go b/pkg/p9/client_file.go index 28fe081d6..8b46a2987 100644 --- a/pkg/p9/client_file.go +++ b/pkg/p9/client_file.go @@ -478,28 +478,23 @@ func (r *ReadWriterFile) ReadAt(p []byte, offset int64) (int, error) { } // Write implements part of the io.ReadWriter interface. +// +// Note that this may return a short write with a nil error. This violates the +// contract of io.Writer, but is more consistent with gVisor's pattern of +// returning errors that correspond to Linux errnos. Since short writes without +// error are common in Linux, returning a nil error is appropriate. func (r *ReadWriterFile) Write(p []byte) (int, error) { n, err := r.File.WriteAt(p, r.Offset) r.Offset += uint64(n) - if err != nil { - return n, err - } - if n < len(p) { - return n, io.ErrShortWrite - } - return n, nil + return n, err } // WriteAt implements the io.WriteAt interface. +// +// Note that this may return a short write with a nil error. This violates the +// contract of io.WriterAt. See comment on Write for justification. func (r *ReadWriterFile) WriteAt(p []byte, offset int64) (int, error) { - n, err := r.File.WriteAt(p, uint64(offset)) - if err != nil { - return n, err - } - if n < len(p) { - return n, io.ErrShortWrite - } - return n, nil + return r.File.WriteAt(p, uint64(offset)) } // Rename implements File.Rename. diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 89c3ef079..1bbe6fdb7 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -363,7 +363,7 @@ func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask - if fd.fs.umounted { + if fd.fs == nil || fd.fs.umounted { ready |= waiter.EventErr return ready & mask } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e39cd305b..61138a7a4 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -381,6 +381,8 @@ afterTrailingSymlink: creds := rp.Credentials() child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)) parentDir.insertChildLocked(child, name) + child.IncRef() + defer child.DecRef(ctx) unlock() fd, err := child.open(ctx, rp, &opts, true) if err != nil { diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index add5dd48e..59fcff498 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -107,8 +107,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de // Dentries which may have a reference count of zero, and which therefore // should be dropped once traversal is complete, are appended to ds. // -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR @@ -158,15 +160,19 @@ afterSymlink: return child, nil } -// verifyChild verifies the hash of child against the already verified hash of -// the parent to ensure the child is expected. verifyChild triggers a sentry -// panic if unexpected modifications to the file system are detected. In +// verifyChildLocked verifies the hash of child against the already verified +// hash of the parent to ensure the child is expected. verifyChild triggers a +// sentry panic if unexpected modifications to the file system are detected. In // noCrashOnVerificationFailure mode it returns a syserror instead. -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// // TODO(b/166474175): Investigate all possible errors returned in this // function, and make sure we differentiate all errors that indicate unexpected // modifications to the file system from the ones that are not harmful. -func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { +func (fs *filesystem) verifyChildLocked(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { vfsObj := fs.vfsfs.VirtualFilesystem() // Get the path to the child dentry. This is only used to provide path @@ -268,7 +274,8 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // contain the hash of the children in the parent Merkle tree when // Verify returns with success. var buf bytes.Buffer - if _, err := merkletree.Verify(&merkletree.VerifyParams{ + parent.hashMu.RLock() + _, err = merkletree.Verify(&merkletree.VerifyParams{ Out: &buf, File: &fdReader, Tree: &fdReader, @@ -284,21 +291,27 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())), Expected: parent.hash, DataAndTreeInSameFile: true, - }); err != nil && err != io.EOF { + }) + parent.hashMu.RUnlock() + if err != nil && err != io.EOF { return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err)) } // Cache child hash when it's verified the first time. + child.hashMu.Lock() if len(child.hash) == 0 { child.hash = buf.Bytes() } + child.hashMu.Unlock() return child, nil } -// verifyStatAndChildren verifies the stat and children names against the +// verifyStatAndChildrenLocked verifies the stat and children names against the // verified hash. The mode/uid/gid and childrenNames of the file is cached // after verified. -func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat linux.Statx) error { +// +// Preconditions: d.dirMu must be locked. +func (fs *filesystem) verifyStatAndChildrenLocked(ctx context.Context, d *dentry, stat linux.Statx) error { vfsObj := fs.vfsfs.VirtualFilesystem() // Get the path to the child dentry. This is only used to provide path @@ -390,6 +403,7 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat } var buf bytes.Buffer + d.hashMu.RLock() params := &merkletree.VerifyParams{ Out: &buf, Tree: &fdReader, @@ -407,6 +421,7 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat Expected: d.hash, DataAndTreeInSameFile: false, } + d.hashMu.RUnlock() if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR { params.DataAndTreeInSameFile = true } @@ -421,7 +436,9 @@ func (fs *filesystem) verifyStatAndChildren(ctx context.Context, d *dentry, stat return nil } -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if child, ok := parent.children[name]; ok { // If verity is enabled on child, we should check again whether @@ -470,7 +487,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s // be cached before enabled. if fs.allowRuntimeEnable { if parent.verityEnabled() { - if _, err := fs.verifyChild(ctx, parent, child); err != nil { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { return nil, err } } @@ -486,7 +503,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s if err != nil { return nil, err } - if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { return nil, err } } @@ -506,7 +523,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s return child, nil } -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { vfsObj := fs.vfsfs.VirtualFilesystem() @@ -597,13 +616,13 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // allowRuntimeEnable mode and the parent directory hasn't been enabled // yet. if parent.verityEnabled() { - if _, err := fs.verifyChild(ctx, parent, child); err != nil { + if _, err := fs.verifyChildLocked(ctx, parent, child); err != nil { child.destroyLocked(ctx) return nil, err } } if child.verityEnabled() { - if err := fs.verifyStatAndChildren(ctx, child, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, child, stat); err != nil { child.destroyLocked(ctx) return nil, err } @@ -617,7 +636,9 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -958,11 +979,13 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } + d.dirMu.Lock() if d.verityEnabled() { - if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return linux.Statx{}, err } } + d.dirMu.Unlock() return stat, nil } diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 87dabe038..46346e54d 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -19,6 +19,18 @@ // The verity file system is read-only, except for one case: when // allowRuntimeEnable is true, additional Merkle files can be generated using // the FS_IOC_ENABLE_VERITY ioctl. +// +// Lock order: +// +// filesystem.renameMu +// dentry.dirMu +// fileDescription.mu +// filesystem.verityMu +// dentry.hashMu +// +// Locking dentry.dirMu in multiple dentries requires that parent dentries are +// locked before child dentries, and that filesystem.renameMu is locked to +// stabilize this relationship. package verity import ( @@ -372,12 +384,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err)) } - if err := fs.verifyStatAndChildren(ctx, d, stat); err != nil { + if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil { return nil, nil, err } } + d.hashMu.Lock() copy(d.hash, iopts.RootHash) + d.hashMu.Unlock() d.vfsd.Init(d) fs.rootDentry = d @@ -402,7 +416,8 @@ type dentry struct { fs *filesystem // mode, uid, gid and size are the file mode, owner, group, and size of - // the file in the underlying file system. + // the file in the underlying file system. They are set when a dentry + // is initialized, and never modified. mode uint32 uid uint32 gid uint32 @@ -425,18 +440,22 @@ type dentry struct { // childrenNames stores the name of all children of the dentry. This is // used by verity to check whether a child is expected. This is only - // populated by enableVerity. + // populated by enableVerity. childrenNames is also protected by dirMu. childrenNames map[string]struct{} - // lowerVD is the VirtualDentry in the underlying file system. + // lowerVD is the VirtualDentry in the underlying file system. It is + // never modified after initialized. lowerVD vfs.VirtualDentry // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree - // in the underlying file system. + // in the underlying file system. It is never modified after + // initialized. lowerMerkleVD vfs.VirtualDentry - // hash is the calculated hash for the current file or directory. - hash []byte + // hash is the calculated hash for the current file or directory. hash + // is protected by hashMu. + hashMu sync.RWMutex `state:"nosave"` + hash []byte } // newDentry creates a new dentry representing the given verity file. The @@ -519,7 +538,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) { // destroyLocked destroys the dentry. // -// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. +// Preconditions: +// * d.fs.renameMu must be locked for writing. +// * d.refs == 0. func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: @@ -599,6 +620,8 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) // mode, it returns true if the target has been enabled with // ioctl(FS_IOC_ENABLE_VERITY). func (d *dentry) verityEnabled() bool { + d.hashMu.RLock() + defer d.hashMu.RUnlock() return !d.fs.allowRuntimeEnable || len(d.hash) != 0 } @@ -678,11 +701,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu if err != nil { return linux.Statx{}, err } + fd.d.dirMu.Lock() if fd.d.verityEnabled() { - if err := fd.d.fs.verifyStatAndChildren(ctx, fd.d, stat); err != nil { + if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil { return linux.Statx{}, err } } + fd.d.dirMu.Unlock() return stat, nil } @@ -718,13 +743,15 @@ func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) return offset, nil } -// generateMerkle generates a Merkle tree file for fd. If fd points to a file -// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash -// of the generated Merkle tree and the data size is returned. If fd points to -// a regular file, the data is the content of the file. If fd points to a -// directory, the data is all hahes of its children, written to the Merkle tree -// file. -func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) { +// generateMerkleLocked generates a Merkle tree file for fd. If fd points to a +// file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The +// hash of the generated Merkle tree and the data size is returned. If fd +// points to a regular file, the data is the content of the file. If fd points +// to a directory, the data is all hahes of its children, written to the Merkle +// tree file. +// +// Preconditions: fd.d.fs.verityMu must be locked. +func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) { fdReader := vfs.FileReadWriteSeeker{ FD: fd.lowerFD, Ctx: ctx, @@ -793,11 +820,14 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, return hash, uint64(params.Size), err } -// recordChildren writes the names of fd's children into the corresponding -// Merkle tree file, and saves the offset/size of the map into xattrs. +// recordChildrenLocked writes the names of fd's children into the +// corresponding Merkle tree file, and saves the offset/size of the map into +// xattrs. // -// Preconditions: fd.d.isDir() == true -func (fd *fileDescription) recordChildren(ctx context.Context) error { +// Preconditions: +// * fd.d.fs.verityMu must be locked. +// * fd.d.isDir() == true. +func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error { // Record the children names in the Merkle tree file. childrenNames, err := json.Marshal(fd.d.childrenNames) if err != nil { @@ -847,7 +877,7 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") } - hash, dataSize, err := fd.generateMerkle(ctx) + hash, dataSize, err := fd.generateMerkleLocked(ctx) if err != nil { return 0, err } @@ -888,11 +918,13 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { } if fd.d.isDir() { - if err := fd.recordChildren(ctx); err != nil { + if err := fd.recordChildrenLocked(ctx); err != nil { return 0, err } } - fd.d.hash = append(fd.d.hash, hash...) + fd.d.hashMu.Lock() + fd.d.hash = hash + fd.d.hashMu.Unlock() return 0, nil } @@ -904,6 +936,9 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm } var metadata linux.DigestMetadata + fd.d.hashMu.RLock() + defer fd.d.hashMu.RUnlock() + // If allowRuntimeEnable is true, an empty fd.d.hash indicates that // verity is not enabled for the file. If allowRuntimeEnable is false, // this is an integrity violation because all files should have verity @@ -940,11 +975,13 @@ func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest userm func (fd *fileDescription) verityFlags(ctx context.Context, flags usermem.Addr) (uintptr, error) { f := int32(0) + fd.d.hashMu.RLock() // All enabled files should store a hash. This flag is not settable via // FS_IOC_SETFLAGS. if len(fd.d.hash) != 0 { f |= linux.FS_VERITY_FL } + fd.d.hashMu.RUnlock() t := kernel.TaskFromContext(ctx) if t == nil { @@ -1023,6 +1060,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Ctx: ctx, } + fd.d.hashMu.RLock() n, err := merkletree.Verify(&merkletree.VerifyParams{ Out: dst.Writer(ctx), File: &dataReader, @@ -1040,6 +1078,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of Expected: fd.d.hash, DataAndTreeInSameFile: false, }) + fd.d.hashMu.RUnlock() if err != nil { return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err)) } diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index fd92c3873..3f5be276b 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -263,13 +263,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return usermem.NoAccess, platform.ErrContextInterrupt case ring0.El0SyncUndef: return c.fault(int32(syscall.SIGILL), info) - case ring0.El1SyncUndef: - *info = arch.SignalInfo{ - Signo: int32(syscall.SIGILL), - Code: 1, // ILL_ILLOPC (illegal opcode). - } - info.SetAddr(switchOpts.Registers.Pc) // Include address. - return usermem.AccessType{}, platform.ErrContextSignal default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go index 327d48465..c51df2811 100644 --- a/pkg/sentry/platform/ring0/aarch64.go +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -90,6 +90,7 @@ const ( El0SyncIa El0SyncFpsimdAcc El0SyncSveAcc + El0SyncFpsimdExc El0SyncSys El0SyncSpPc El0SyncUndef diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index f77bc72af..5f4b2105a 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -382,12 +382,12 @@ TEXT ·DisableVFP(SB),NOSPLIT,$0 MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack. -// EXCEPTION_WITH_ERROR is a common exception handler function. -#define EXCEPTION_WITH_ERROR(user, vector) \ +// EXCEPTION_EL0 is a common el0 exception handler function. +#define EXCEPTION_EL0(vector) \ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 WORD $0xd538601a; \ //MRS FAR_EL1, R26 MOVD R26, CPU_FAULT_ADDR(RSV_REG); \ - MOVD $user, R3; \ + MOVD $1, R3; \ MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user. MOVD $vector, R3; \ MOVD R3, CPU_VECTOR_CODE(RSV_REG); \ @@ -395,6 +395,12 @@ TEXT ·DisableVFP(SB),NOSPLIT,$0 MOVD R3, CPU_ERROR_CODE(RSV_REG); \ B ·kernelExitToEl1(SB); +// EXCEPTION_EL1 is a common el1 exception handler function. +#define EXCEPTION_EL1(vector) \ + MOVD $vector, R3; \ + MOVD R3, 8(RSP); \ + B ·HaltEl1ExceptionAndResume(SB); + // storeAppASID writes the application's asid value. TEXT ·storeAppASID(SB),NOSPLIT,$0-8 MOVD asid+0(FP), R1 @@ -442,6 +448,16 @@ TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0 CALL ·kernelSyscall(SB) // Call the trampoline. B ·kernelExitToEl1(SB) // Resume. +// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. +TEXT ·HaltEl1ExceptionAndResume(SB),NOSPLIT,$0-8 + WORD $0xd538d092 // MRS TPIDR_EL1, R18 + MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. + MOVD R3, 8(RSP) // First argument (vCPU). + MOVD vector+0(FP), R3 + MOVD R3, 16(RSP) // Second argument (vector). + CALL ·kernelException(SB) // Call the trampoline. + B ·kernelExitToEl1(SB) // Resume. + // Shutdown stops the guest. TEXT ·Shutdown(SB),NOSPLIT,$0 // PSCI EVENT. @@ -604,39 +620,22 @@ TEXT ·El1_sync(SB),NOSPLIT,$0 B el1_invalid el1_da: + EXCEPTION_EL1(El1SyncDa) el1_ia: - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - WORD $0xd538601a //MRS FAR_EL1, R26 - - MOVD R26, CPU_FAULT_ADDR(RSV_REG) - - MOVD $0, CPU_ERROR_TYPE(RSV_REG) - - MOVD $PageFault, R3 - MOVD R3, CPU_VECTOR_CODE(RSV_REG) - - B ·HaltAndResume(SB) - + EXCEPTION_EL1(El1SyncIa) el1_sp_pc: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncSpPc) el1_undef: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncUndef) el1_svc: - MOVD $0, CPU_ERROR_CODE(RSV_REG) - MOVD $0, CPU_ERROR_TYPE(RSV_REG) B ·HaltEl1SvcAndResume(SB) - el1_dbg: - B ·Shutdown(SB) - + EXCEPTION_EL1(El1SyncDbg) el1_fpsimd_acc: VFP_ENABLE B ·kernelExitToEl1(SB) // Resume. - el1_invalid: - B ·Shutdown(SB) + EXCEPTION_EL1(El1SyncInv) // El1_irq is the handler for El1_irq. TEXT ·El1_irq(SB),NOSPLIT,$0 @@ -692,28 +691,21 @@ el0_svc: el0_da: el0_ia: - EXCEPTION_WITH_ERROR(1, PageFault) - + EXCEPTION_EL0(PageFault) el0_fpsimd_acc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncFpsimdAcc) el0_sve_acc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncSveAcc) el0_fpsimd_exc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncFpsimdExc) el0_sp_pc: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncSpPc) el0_undef: - EXCEPTION_WITH_ERROR(1, El0SyncUndef) - + EXCEPTION_EL0(El0SyncUndef) el0_dbg: - B ·Shutdown(SB) - + EXCEPTION_EL0(El0SyncDbg) el0_invalid: - B ·Shutdown(SB) + EXCEPTION_EL0(El0SyncInv) TEXT ·El0_irq(SB),NOSPLIT,$0 B ·Shutdown(SB) diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index ead598b24..90a7b8392 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -24,6 +24,10 @@ func HaltAndResume() //go:nosplit func HaltEl1SvcAndResume() +// HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. +//go:nosplit +func HaltEl1ExceptionAndResume() + // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { } diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index 53bc3353c..b5652deb9 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -70,6 +70,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define El0SyncIa 0x%02x\n", El0SyncIa) fmt.Fprintf(w, "#define El0SyncFpsimdAcc 0x%02x\n", El0SyncFpsimdAcc) fmt.Fprintf(w, "#define El0SyncSveAcc 0x%02x\n", El0SyncSveAcc) + fmt.Fprintf(w, "#define El0SyncFpsimdExc 0x%02x\n", El0SyncFpsimdExc) fmt.Fprintf(w, "#define El0SyncSys 0x%02x\n", El0SyncSys) fmt.Fprintf(w, "#define El0SyncSpPc 0x%02x\n", El0SyncSpPc) fmt.Fprintf(w, "#define El0SyncUndef 0x%02x\n", El0SyncUndef) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 7d0ae15ca..5afe77858 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -2686,7 +2686,7 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ // Always do at least one fetchReadView, even if the number of bytes to // read is 0. err = s.fetchReadView() - if err != nil { + if err != nil || len(s.readView) == 0 { break } if dst.NumBytes() == 0 { @@ -2709,15 +2709,20 @@ func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequ } copied += n s.readView.TrimFront(n) - if len(s.readView) == 0 { - atomic.StoreUint32(&s.readViewHasData, 0) - } dst = dst.DropFirst(n) if e != nil { err = syserr.FromError(e) break } + // If we are done reading requested data then stop. + if dst.NumBytes() == 0 { + break + } + } + + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) } // If we managed to copy something, we must deliver it. diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index a984f1712..d69b1e081 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -26,6 +26,7 @@ go_test( "//pkg/tcpip", "//pkg/tcpip/header", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/sniffer", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go index b196324c7..4b6bf4bba 100644 --- a/pkg/tcpip/adapters/gonet/gonet_test.go +++ b/pkg/tcpip/adapters/gonet/gonet_test.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -65,7 +66,7 @@ func newLoopbackStack() (*stack.Stack, *tcpip.Error) { TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol}, }) - if err := s.CreateNIC(NICID, loopback.New()); err != nil { + if err := s.CreateNIC(NICID, sniffer.New(loopback.New())); err != nil { return nil, err } diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index b3e8c4b92..178e658df 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -53,16 +53,35 @@ type endpoint struct { nested.Endpoint writer io.Writer maxPCAPLen uint32 + logPrefix string } var _ stack.GSOEndpoint = (*endpoint)(nil) var _ stack.LinkEndpoint = (*endpoint)(nil) var _ stack.NetworkDispatcher = (*endpoint)(nil) +type direction int + +const ( + directionSend = iota + directionRecv +) + // New creates a new sniffer link-layer endpoint. It wraps around another // endpoint and logs packets and they traverse the endpoint. func New(lower stack.LinkEndpoint) stack.LinkEndpoint { - sniffer := &endpoint{} + return NewWithPrefix(lower, "") +} + +// NewWithPrefix creates a new sniffer link-layer endpoint. It wraps around +// another endpoint and logs packets prefixed with logPrefix as they traverse +// the endpoint. +// +// logPrefix is prepended to the log line without any separators. +// E.g. logPrefix = "NIC:en0/" will produce log lines like +// "NIC:en0/send udp [...]". +func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoint { + sniffer := &endpoint{logPrefix: logPrefix} sniffer.Endpoint.Init(lower, sniffer) return sniffer } @@ -120,7 +139,7 @@ func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) ( // called by the link-layer endpoint being wrapped when a packet arrives, and // logs the packet before forwarding to the actual dispatcher. func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { - e.dumpPacket("recv", nil, protocol, pkt) + e.dumpPacket(directionRecv, nil, protocol, pkt) e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt) } @@ -129,10 +148,10 @@ func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protoc e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt) } -func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +func (e *endpoint) dumpPacket(dir direction, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { writer := e.writer if writer == nil && atomic.LoadUint32(&LogPackets) == 1 { - logPacket(prefix, protocol, pkt, gso) + logPacket(e.logPrefix, dir, protocol, pkt, gso) } if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 { totalLength := pkt.Size() @@ -169,7 +188,7 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { - e.dumpPacket("send", gso, protocol, pkt) + e.dumpPacket(directionSend, gso, protocol, pkt) return e.Endpoint.WritePacket(r, gso, protocol, pkt) } @@ -178,20 +197,20 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne // forwards the request to the lower endpoint. func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.dumpPacket("send", gso, protocol, pkt) + e.dumpPacket(directionSend, gso, protocol, pkt) } return e.Endpoint.WritePackets(r, gso, pkts, protocol) } // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - e.dumpPacket("send", nil, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{ + e.dumpPacket(directionSend, nil, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: vv, })) return e.Endpoint.WriteRawPacket(vv) } -func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) { +func logPacket(prefix string, dir direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) { // Figure out the network layer info. var transProto uint8 src := tcpip.Address("unknown") @@ -201,6 +220,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P var fragmentOffset uint16 var moreFragments bool + var directionPrefix string + switch dir { + case directionSend: + directionPrefix = "send" + case directionRecv: + directionPrefix = "recv" + default: + panic(fmt.Sprintf("unrecognized direction: %d", dir)) + } + // Clone the packet buffer to not modify the original. // // We don't clone the original packet buffer so that the new packet buffer @@ -248,15 +277,16 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P arp := header.ARP(pkt.NetworkHeader().View()) log.Infof( - "%s arp %s (%s) -> %s (%s) valid:%t", + "%s%s arp %s (%s) -> %s (%s) valid:%t", prefix, + directionPrefix, tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()), tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()), arp.IsValid(), ) return default: - log.Infof("%s unknown network protocol", prefix) + log.Infof("%s%s unknown network protocol", prefix, directionPrefix) return } @@ -300,7 +330,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P icmpType = "info reply" } } - log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) + log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.ICMPv6ProtocolNumber: @@ -335,7 +365,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.ICMPv6RedirectMsg: icmpType = "redirect message" } - log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) + log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.UDPProtocolNumber: @@ -391,7 +421,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P } default: - log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto) + log.Infof("%s%s %s -> %s unknown transport protocol: %d", prefix, directionPrefix, src, dst, transProto) return } @@ -399,5 +429,5 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P details += fmt.Sprintf(" gso: %+v", gso) } - log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details) + log.Infof("%s%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, directionPrefix, transName, src, srcPort, dst, dstPort, size, id, details) } diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index ac6d879a7..6661e8915 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -496,7 +496,7 @@ func (h *handshake) resolveRoute() *tcpip.Error { h.ep.mu.Lock() } if n¬ifyError != 0 { - return h.ep.LastError() + return h.ep.lastErrorLocked() } } @@ -575,7 +575,6 @@ func (h *handshake) complete() *tcpip.Error { return err } defer timer.stop() - for h.state != handshakeCompleted { // Unlock before blocking, and reacquire again afterwards (h.ep.mu is held // throughout handshake processing). @@ -631,9 +630,8 @@ func (h *handshake) complete() *tcpip.Error { h.ep.mu.Lock() } if n¬ifyError != 0 { - return h.ep.LastError() + return h.ep.lastErrorLocked() } - case wakerForNewSegment: if err := h.processSegments(); err != nil { return err @@ -1002,7 +1000,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) { // Only send a reset if the connection is being aborted for a reason // other than receiving a reset. e.setEndpointState(StateError) - e.HardError = err + e.hardError = err if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout { // The exact sequence number to be used for the RST is the same as the // one used by Linux. We need to handle the case of window being shrunk @@ -1141,7 +1139,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { // delete the TCB, and return. case StateCloseWait: e.transitionToStateCloseLocked() - e.HardError = tcpip.ErrAborted + e.hardError = tcpip.ErrAborted e.notifyProtocolGoroutine(notifyTickleWorker) return false, nil default: @@ -1353,7 +1351,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ epilogue := func() { // e.mu is expected to be hold upon entering this section. - if e.snd != nil { e.snd.resendTimer.cleanup() } @@ -1383,7 +1380,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.lastErrorMu.Unlock() e.setEndpointState(StateError) - e.HardError = err + e.hardError = err e.workerCleanup = true // Lock released below. diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 4f4f4c65e..a2161e49d 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -315,11 +315,6 @@ func (*Stats) IsEndpointStats() {} // +stateify savable type EndpointInfo struct { stack.TransportEndpointInfo - - // HardError is meaningful only when state is stateError. It stores the - // error to be returned when read/write syscalls are called and the - // endpoint is in this state. HardError is protected by endpoint mu. - HardError *tcpip.Error `state:".(string)"` } // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo @@ -386,6 +381,11 @@ type endpoint struct { waiterQueue *waiter.Queue `state:"wait"` uniqueID uint64 + // hardError is meaningful only when state is stateError. It stores the + // error to be returned when read/write syscalls are called and the + // endpoint is in this state. hardError is protected by endpoint mu. + hardError *tcpip.Error `state:".(string)"` + // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. lastErrorMu sync.Mutex `state:"nosave"` @@ -1283,7 +1283,15 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } -func (e *endpoint) LastError() *tcpip.Error { +// Preconditions: e.mu must be held to call this function. +func (e *endpoint) hardErrorLocked() *tcpip.Error { + err := e.hardError + e.hardError = nil + return err +} + +// Preconditions: e.mu must be held to call this function. +func (e *endpoint) lastErrorLocked() *tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError @@ -1291,6 +1299,15 @@ func (e *endpoint) LastError() *tcpip.Error { return err } +func (e *endpoint) LastError() *tcpip.Error { + e.LockUser() + defer e.UnlockUser() + if err := e.hardErrorLocked(); err != nil { + return err + } + return e.lastErrorLocked() +} + // Read reads data from the endpoint. func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.LockUser() @@ -1312,9 +1329,8 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, bufUsed := e.rcvBufUsed if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { e.rcvListMu.Unlock() - he := e.HardError if s == StateError { - return buffer.View{}, tcpip.ControlMessages{}, he + return buffer.View{}, tcpip.ControlMessages{}, e.hardErrorLocked() } e.stats.ReadErrors.NotConnected.Increment() return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected @@ -1370,9 +1386,13 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { // indicating the reason why it's not writable. // Caller must hold e.mu and e.sndBufMu func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { + // The endpoint cannot be written to if it's not connected. switch s := e.EndpointState(); { case s == StateError: - return 0, e.HardError + if err := e.hardErrorLocked(); err != nil { + return 0, err + } + return 0, tcpip.ErrClosedForSend case !s.connecting() && !s.connected(): return 0, tcpip.ErrClosedForSend case s.connecting(): @@ -1486,7 +1506,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro // but has some pending unread data. if s := e.EndpointState(); !s.connected() && s != StateClose { if s == StateError { - return 0, tcpip.ControlMessages{}, e.HardError + return 0, tcpip.ControlMessages{}, e.hardErrorLocked() } e.stats.ReadErrors.InvalidEndpointState.Increment() return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState @@ -2243,7 +2263,10 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc return tcpip.ErrAlreadyConnecting case StateError: - return e.HardError + if err := e.hardErrorLocked(); err != nil { + return err + } + return tcpip.ErrConnectionAborted default: return tcpip.ErrInvalidEndpointState @@ -2417,7 +2440,7 @@ func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error { e.lastErrorMu.Unlock() e.setEndpointState(StateError) - e.HardError = err + e.hardError = err // Call cleanupLocked to free up any reservations. e.cleanupLocked() diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index bb901c0f8..ba67176b5 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -321,21 +321,21 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) { } // saveHardError is invoked by stateify. -func (e *EndpointInfo) saveHardError() string { - if e.HardError == nil { +func (e *endpoint) saveHardError() string { + if e.hardError == nil { return "" } - return e.HardError.String() + return e.hardError.String() } // loadHardError is invoked by stateify. -func (e *EndpointInfo) loadHardError(s string) { +func (e *endpoint) loadHardError(s string) { if s == "" { return } - e.HardError = tcpip.StringToError(s) + e.hardError = tcpip.StringToError(s) } // saveMeasureTime is invoked by stateify. diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 9f0fb41e3..c366a4cbc 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -75,9 +75,6 @@ func TestGiveUpConnect(t *testing.T) { // Wait for ep to become writable. <-notifyCh - if err := ep.LastError(); err != tcpip.ErrAborted { - t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted) - } // Call Connect again to retreive the handshake failure status // and stats updates. @@ -3198,6 +3195,11 @@ loop: case tcpip.ErrWouldBlock: select { case <-ch: + // Expect the state to be StateError and subsequent Reads to fail with HardError. + if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset { + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) + } + break loop case <-time.After(1 * time.Second): t.Fatalf("Timed out waiting for reset to arrive") } @@ -3207,14 +3209,10 @@ loop: t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) } } - // Expect the state to be StateError and subsequent Reads to fail with HardError. - if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset { - t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) - } + if tcp.EndpointState(c.EP.State()) != tcp.StateError { t.Fatalf("got EP state is not StateError") } - if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 { t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got) } diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index c78549424..7ebae63d8 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -56,6 +56,7 @@ go_test( "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", "//pkg/waiter", ], ) diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 57976d4e3..835dcc54e 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -429,7 +429,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c to := opts.To e.mu.RLock() - defer e.mu.RUnlock() + lockReleased := false + defer func() { + if lockReleased { + return + } + e.mu.RUnlock() + }() // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { @@ -475,7 +481,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c if e.state != StateConnected { err = tcpip.ErrInvalidEndpointState } - return + return ch, err } } else { // Reject destination address if it goes through a different @@ -541,7 +547,24 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c useDefaultTTL = false } - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil { + localPort := e.ID.LocalPort + sendTOS := e.sendTOS + owner := e.owner + noChecksum := e.noChecksum + lockReleased = true + e.mu.RUnlock() + + // Do not hold lock when sending as loopback is synchronous and if the UDP + // datagram ends up generating an ICMP response then it can result in a + // deadlock where the ICMP response handling ends up acquiring this endpoint's + // mutex using e.mu.RLock() in endpoint.HandleControlPacket which can cause a + // deadlock if another caller is trying to acquire e.mu in exclusive mode w/ + // e.mu.Lock(). Since e.mu.Lock() prevents any new read locks to ensure the + // lock can be eventually acquired. + // + // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read + // locking is prohibited. + if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil { return 0, nil, err } return int64(len(v)), nil, nil diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 764ad0857..492e277a8 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) @@ -54,6 +55,7 @@ const ( stackPort = 1234 testAddr = "\x0a\x00\x00\x02" testPort = 4096 + invalidPort = 8192 multicastAddr = "\xe8\x2b\xd3\xea" multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" broadcastAddr = header.IPv4Broadcast @@ -295,7 +297,8 @@ func newDualTestContext(t *testing.T, mtu uint32) *testContext { t.Helper() return newDualTestContextWithOptions(t, mtu, stack.Options{ NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol}, - TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol}, + TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, icmp.NewProtocol6, icmp.NewProtocol4}, + HandleLocal: true, }) } @@ -972,7 +975,7 @@ func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) { // provided. func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 { c.t.Helper() - return testWriteInternal(c, flow, true, checkers...) + return testWriteAndVerifyInternal(c, flow, true, checkers...) } // testWriteWithoutDestination sends a packet of the given test flow from the @@ -981,10 +984,10 @@ func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker // checker functions provided. func testWriteWithoutDestination(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 { c.t.Helper() - return testWriteInternal(c, flow, false, checkers...) + return testWriteAndVerifyInternal(c, flow, false, checkers...) } -func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 { +func testWriteNoVerify(c *testContext, flow testFlow, setDest bool) buffer.View { c.t.Helper() // Take a snapshot of the stats to validate them at the end of the test. epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone() @@ -1006,6 +1009,12 @@ func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ... c.t.Fatalf("Bad number of bytes written: got %v, want %v", n, len(payload)) } c.checkEndpointWriteStats(1, epstats, err) + return payload +} + +func testWriteAndVerifyInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 { + c.t.Helper() + payload := testWriteNoVerify(c, flow, setDest) // Received the packet and check the payload. b := c.getPacketAndVerify(flow, checkers...) var udp header.UDP @@ -1150,6 +1159,39 @@ func TestV4WriteOnConnected(t *testing.T) { testWriteWithoutDestination(c, unicastV4) } +func TestWriteOnConnectedInvalidPort(t *testing.T) { + protocols := map[string]tcpip.NetworkProtocolNumber{ + "ipv4": ipv4.ProtocolNumber, + "ipv6": ipv6.ProtocolNumber, + } + for name, pn := range protocols { + t.Run(name, func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpoint(pn) + if err := c.ep.Connect(tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}); err != nil { + c.t.Fatalf("Connect failed: %s", err) + } + writeOpts := tcpip.WriteOptions{ + To: &tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}, + } + payload := buffer.View(newPayload()) + n, _, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts) + if err != nil { + c.t.Fatalf("c.ep.Write(...) = %+s, want nil", err) + } + if got, want := n, int64(len(payload)); got != want { + c.t.Fatalf("c.ep.Write(...) wrote %d bytes, want %d bytes", got, want) + } + + if err := c.ep.LastError(); err != tcpip.ErrConnectionRefused { + c.t.Fatalf("expected c.ep.LastError() == ErrConnectionRefused, got: %+v", err) + } + }) + } +} + // TestWriteOnBoundToV4Multicast checks that we can send packets out of a socket // that is bound to a V4 multicast address. func TestWriteOnBoundToV4Multicast(t *testing.T) { diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go index 49ab87c58..976331230 100644 --- a/pkg/test/testutil/testutil.go +++ b/pkg/test/testutil/testutil.go @@ -36,7 +36,6 @@ import ( "path/filepath" "strconv" "strings" - "sync/atomic" "syscall" "testing" "time" @@ -417,33 +416,35 @@ func StartReaper() func() { // WaitUntilRead reads from the given reader until the wanted string is found // or until timeout. -func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error { +func WaitUntilRead(r io.Reader, want string, timeout time.Duration) error { sc := bufio.NewScanner(r) - if split != nil { - sc.Split(split) - } // done must be accessed atomically. A value greater than 0 indicates // that the read loop can exit. - var done uint32 - doneCh := make(chan struct{}) + doneCh := make(chan bool) + defer close(doneCh) go func() { for sc.Scan() { t := sc.Text() if strings.Contains(t, want) { - atomic.StoreUint32(&done, 1) - close(doneCh) - break + doneCh <- true + return } - if atomic.LoadUint32(&done) > 0 { - break + select { + case <-doneCh: + return + default: } } + doneCh <- false }() + select { case <-time.After(timeout): - atomic.StoreUint32(&done, 1) return fmt.Errorf("timeout waiting to read %q", want) - case <-doneCh: + case res := <-doneCh: + if !res { + return fmt.Errorf("reader closed while waiting to read %q", want) + } return nil } } diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fdf13c8e1..865126ac5 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -211,10 +211,31 @@ func (cm *containerManager) Processes(cid *string, out *[]*control.Process) erro return control.Processes(cm.l.k, *cid, out) } +// CreateArgs contains arguments to the Create method. +type CreateArgs struct { + // CID is the ID of the container to start. + CID string + + // FilePayload may contain a TTY file for the terminal, if enabled. + urpc.FilePayload +} + // Create creates a container within a sandbox. -func (cm *containerManager) Create(cid *string, _ *struct{}) error { - log.Debugf("containerManager.Create, cid: %s", *cid) - return cm.l.createContainer(*cid) +func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error { + log.Debugf("containerManager.Create: %s", args.CID) + + if len(args.Files) > 1 { + return fmt.Errorf("start arguments must have at most 1 files for TTY") + } + var tty *fd.FD + if len(args.Files) == 1 { + var err error + tty, err = fd.NewFromFile(args.Files[0]) + if err != nil { + return fmt.Errorf("error dup'ing TTY file: %w", err) + } + } + return cm.l.createContainer(args.CID, tty) } // StartArgs contains arguments to the Start method. @@ -229,9 +250,8 @@ type StartArgs struct { CID string // FilePayload contains, in order: - // * stdin, stdout, and stderr. - // * the file descriptor over which the sandbox will - // request files from its root filesystem. + // * stdin, stdout, and stderr (optional: if terminal is disabled). + // * file descriptors to connect to gofer to serve the root filesystem. urpc.FilePayload } @@ -251,23 +271,45 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } - if len(args.FilePayload.Files) < 4 { - return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + if len(args.Files) < 1 { + return fmt.Errorf("start arguments must contain at least one file for the container root gofer") } // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - fds, err := fd.NewFromFiles(args.FilePayload.Files) + goferFiles := args.Files + var stdios []*fd.FD + if !args.Spec.Process.Terminal { + // When not using a terminal, stdios come as the first 3 files in the + // payload. + if l := len(args.Files); l < 4 { + return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l) + } + var err error + stdios, err = fd.NewFromFiles(goferFiles[:3]) + if err != nil { + return fmt.Errorf("error dup'ing stdio files: %w", err) + } + goferFiles = goferFiles[3:] + } + defer func() { + for _, fd := range stdios { + _ = fd.Close() + } + }() + + goferFDs, err := fd.NewFromFiles(goferFiles) if err != nil { - return err + return fmt.Errorf("error dup'ing gofer files: %w", err) } defer func() { - for _, fd := range fds { + for _, fd := range goferFDs { _ = fd.Close() } }() - if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { + + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil { log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err) return err } @@ -330,18 +372,18 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") var specFile, deviceFile *os.File - switch numFiles := len(o.FilePayload.Files); numFiles { + switch numFiles := len(o.Files); numFiles { case 2: // The device file is donated to the platform. // Can't take ownership away from os.File. dup them to get a new FD. - fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) + fd, err := syscall.Dup(int(o.Files[1].Fd())) if err != nil { return fmt.Errorf("failed to dup file: %v", err) } deviceFile = os.NewFile(uintptr(fd), "platform device") fallthrough case 1: - specFile = o.FilePayload.Files[0] + specFile = o.Files[0] case 0: return fmt.Errorf("at least one file must be passed to Restore") default: diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 6b6ae98d7..2b0d2cd51 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -22,15 +22,6 @@ import ( "strings" "syscall" - // Include filesystem types that OCI spec might mount. - _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" - _ "gvisor.dev/gvisor/pkg/sentry/fs/host" - _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" - _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" - "gvisor.dev/gvisor/pkg/sentry/vfs" - specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -48,9 +39,18 @@ import ( tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" + _ "gvisor.dev/gvisor/pkg/sentry/fs/host" + _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" + _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" ) const ( diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index ebdd518d0..86bdc6ae3 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -157,6 +157,11 @@ type execProcess struct { // pidnsPath is the pid namespace path in spec pidnsPath string + + // hostTTY is present when creating a sub-container with terminal enabled. + // TTY file is passed during container create and must be saved until + // container start. + hostTTY *fd.FD } func init() { @@ -588,7 +593,9 @@ func (l *Loader) run() error { // Create the root container init task. It will begin running // when the kernel is started. - if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { + var err error + _, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root) + if err != nil { return err } @@ -627,7 +634,7 @@ func (l *Loader) run() error { } // createContainer creates a new container inside the sandbox. -func (l *Loader) createContainer(cid string) error { +func (l *Loader) createContainer(cid string, tty *fd.FD) error { l.mu.Lock() defer l.mu.Unlock() @@ -635,14 +642,14 @@ func (l *Loader) createContainer(cid string) error { if _, ok := l.processes[eid]; ok { return fmt.Errorf("container %q already exists", cid) } - l.processes[eid] = &execProcess{} + l.processes[eid] = &execProcess{hostTTY: tty} return nil } // startContainer starts a child container. It returns the thread group ID of // the newly created process. Used FDs are either closed or released. It's safe // for the caller to close any remaining files upon return. -func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -695,36 +702,41 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin info := &containerInfo{ conf: conf, spec: spec, - stdioFDs: files[:3], - goferFDs: files[3:], + goferFDs: goferFDs, } info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } - tg, err := l.createContainerProcess(false, cid, info, ep) + + // Use stdios or TTY depending on the spec configuration. + if spec.Process.Terminal { + if len(stdioFDs) > 0 { + return fmt.Errorf("using TTY, stdios not expected: %v", stdioFDs) + } + if ep.hostTTY == nil { + return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?") + } + info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} + ep.hostTTY = nil + } else { + info.stdioFDs = stdioFDs + } + + ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info) if err != nil { return err } - - // Success! - l.k.StartProcess(tg) - ep.tg = tg + l.k.StartProcess(ep.tg) return nil } -func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) { - console := false - if root { - // Only root container supports terminal for now. - console = info.spec.Process.Terminal - } - +func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Create the FD map, which will set stdin, stdout, and stderr. ctx := info.procArgs.NewContext(l.k) - fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs) + fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs) if err != nil { - return nil, fmt.Errorf("importing fds: %v", err) + return nil, nil, nil, fmt.Errorf("importing fds: %v", err) } // CreateProcess takes a reference on fdTable if successful. We won't need // ours either way. @@ -736,11 +748,11 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) if root { if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { - return nil, err + return nil, nil, nil, err } } if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil { - return nil, err + return nil, nil, nil, err } // Add the HOME environment variable if it is not already set. @@ -754,29 +766,25 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn info.procArgs.Credentials.RealKUID, info.procArgs.Envv) } if err != nil { - return nil, err + return nil, nil, nil, err } info.procArgs.Envv = envv // Create and start the new process. tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { - return nil, fmt.Errorf("creating process: %v", err) + return nil, nil, nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. - if root { - switch { - case ttyFileVFS2 != nil: - ep.ttyVFS2 = ttyFileVFS2 - ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) - case ttyFile != nil: - ep.tty = ttyFile - ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) - } + switch { + case ttyFileVFS2 != nil: + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFile != nil: + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) } // Install seccomp filters with the new task if there are any. @@ -784,7 +792,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) if err != nil { - return nil, fmt.Errorf("building seccomp program: %v", err) + return nil, nil, nil, fmt.Errorf("building seccomp program: %v", err) } if log.IsLogging(log.Debug) { @@ -795,7 +803,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn task := tg.Leader() // NOTE: It seems Flags are ignored by runc so we ignore them too. if err := task.AppendSyscallFilter(program, true); err != nil { - return nil, fmt.Errorf("appending seccomp filters: %v", err) + return nil, nil, nil, fmt.Errorf("appending seccomp filters: %v", err) } } } else { @@ -804,7 +812,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } } - return tg, nil + return tg, ttyFile, ttyFileVFS2, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 86c02a22a..eafd6285c 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -150,7 +150,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus { - // Start the new process and get it pid. + // Start the new process and get its pid. pid, err := c.Execute(e) if err != nil { return Errorf("executing processes for container: %v", err) diff --git a/runsc/console/console.go b/runsc/console/console.go index dbb88e117..b36028792 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -24,8 +24,8 @@ import ( "golang.org/x/sys/unix" ) -// NewWithSocket creates pty master/replica pair, sends the master FD over the given -// socket, and returns the replica. +// NewWithSocket creates pty master/replica pair, sends the master FD over the +// given socket, and returns the replica. func NewWithSocket(socketPath string) (*os.File, error) { // Create a new pty master and replica. ptyMaster, ptyReplica, err := pty.Open() diff --git a/runsc/container/BUILD b/runsc/container/BUILD index c33755482..1900ecceb 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -24,6 +24,7 @@ go_library( "//runsc/boot", "//runsc/cgroup", "//runsc/config", + "//runsc/console", "//runsc/sandbox", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 4228399b8..1b0fdebd6 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "io" + "math/rand" "os" "path/filepath" "syscall" @@ -27,7 +28,6 @@ import ( "github.com/kr/pty" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/control" - "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/pkg/unet" @@ -38,19 +38,22 @@ import ( // path is under 108 charactors (the unix socket path length limit), // relativizing the path if necessary. func socketPath(bundleDir string) (string, error) { - path := filepath.Join(bundleDir, "socket") + num := rand.Intn(10000) + path := filepath.Join(bundleDir, fmt.Sprintf("socket-%4d", num)) + const maxPathLen = 108 + if len(path) <= maxPathLen { + return path, nil + } + + // Path is too large, try to make it smaller. cwd, err := os.Getwd() if err != nil { return "", fmt.Errorf("error getting cwd: %v", err) } - relPath, err := filepath.Rel(cwd, path) + path, err = filepath.Rel(cwd, path) if err != nil { return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err) } - if len(path) > len(relPath) { - path = relPath - } - const maxPathLen = 108 if len(path) > maxPathLen { return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path) } @@ -159,6 +162,82 @@ func TestConsoleSocket(t *testing.T) { } } +// Test that an pty FD is sent over the console socket if one is provided. +func TestMultiContainerConsoleSocket(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + tru := []string{"true"} + testSpecs, ids := createSpecs(sleep, tru) + testSpecs[1].Process.Terminal = true + + bundleDir, cleanup, err := testutil.SetupBundleDir(testSpecs[0]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: ids[0], + Spec: testSpecs[0], + BundleDir: bundleDir, + } + rootCont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer rootCont.Destroy() + if err := rootCont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + bundleDir, cleanup, err = testutil.SetupBundleDir(testSpecs[0]) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args = Args{ + ID: ids[1], + Spec: testSpecs[1], + BundleDir: bundleDir, + ConsoleSocket: sock, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + ptyMaster.Close() + }) + } +} + // Test that job control signals work on a console created with "exec -ti". func TestJobControlSignalExec(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") @@ -221,9 +300,9 @@ func TestJobControlSignalExec(t *testing.T) { // Make sure all the processes are running. expectedPL := []*control.Process{ // Root container process. - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().Cmd("sleep").Process(), // Bash from exec process. - {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}}, + newProcessBuilder().PID(2).Cmd("bash").Process(), } if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) @@ -233,7 +312,7 @@ func TestJobControlSignalExec(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for it to start. Sleep's PPID is bash's PID. - expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}}) + expectedPL = append(expectedPL, newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process()) if err := waitForProcessList(c, expectedPL); err != nil { t.Error(err) } @@ -254,7 +333,7 @@ func TestJobControlSignalExec(t *testing.T) { // Sleep is dead, but it may take more time for bash to notice and // change the foreground process back to itself. We know it is done // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil { + if err := testutil.WaitUntilRead(ptyMaster, "Terminated", 5*time.Second); err != nil { t.Fatalf("bash did not take over pty: %v", err) } @@ -359,7 +438,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Wait for bash to start. expectedPL := []*control.Process{ - {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}}, + newProcessBuilder().PID(1).Cmd("bash").Process(), } if err := waitForProcessList(c, expectedPL); err != nil { t.Fatalf("error waiting for processes: %v", err) @@ -369,7 +448,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { ptyMaster.Write([]byte("sleep 100\n")) // Wait for sleep to start. - expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}}) + expectedPL = append(expectedPL, newProcessBuilder().PID(2).PPID(1).Cmd("sleep").Process()) if err := waitForProcessList(c, expectedPL); err != nil { t.Fatalf("error waiting for processes: %v", err) } @@ -393,7 +472,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Sleep is dead, but it may take more time for bash to notice and // change the foreground process back to itself. We know it is done // when bash writes "Terminated" to the pty. - if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil { + if err := testutil.WaitUntilRead(ptyBuf, "Terminated", 5*time.Second); err != nil { t.Fatalf("bash did not take over pty: %v", err) } @@ -414,6 +493,104 @@ func TestJobControlSignalRootContainer(t *testing.T) { } } +// Test that terminal works with root and sub-containers. +func TestMultiContainerTerminal(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Don't let bash execute from profile or rc files, otherwise our PID + // counts get messed up. + bash := []string{"/bin/bash", "--noprofile", "--norc"} + testSpecs, ids := createSpecs(bash, bash) + + type termContainer struct { + container *Container + master *os.File + } + var containers []termContainer + for i, spec := range testSpecs { + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + spec.Process.Terminal = true + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args := Args{ + ID: ids[i], + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: sock, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + defer ptyMaster.Close() + + containers = append(containers, termContainer{ + container: cont, + master: ptyMaster, + }) + } + + for _, tc := range containers { + // Bash output as well as sandbox output will be written to the PTY + // file. Writes after a certain point will block unless we drain the + // PTY, so we must continually copy from it. + // + // We log the output to stderr for debugabilitly, and also to a buffer, + // since we wait on particular output from bash below. We use a custom + // blockingBuffer which is thread-safe and also blocks on Read calls, + // which makes this a suitable Reader for WaitUntilRead. + ptyBuf := newBlockingBuffer() + tee := io.TeeReader(tc.master, ptyBuf) + go io.Copy(os.Stderr, tee) + + // Wait for bash to start. + expectedPL := []*control.Process{ + newProcessBuilder().Cmd("bash").Process(), + } + if err := waitForProcessList(tc.container, expectedPL); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } + + // Execute echo command and check that it was executed correctly. Use + // a variable to ensure it's not matching against command echo. + tc.master.Write([]byte("echo foo-${PWD}-123\n")) + if err := testutil.WaitUntilRead(ptyBuf, "foo-/-123", 5*time.Second); err != nil { + t.Fatalf("echo didn't execute: %v", err) + } + } + }) + } +} + // blockingBuffer is a thread-safe buffer that blocks when reading if the // buffer is empty. It implements io.ReadWriter. type blockingBuffer struct { diff --git a/runsc/container/container.go b/runsc/container/container.go index 4aa139c88..f3d990cfc 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -38,6 +38,7 @@ import ( "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/sandbox" "gvisor.dev/gvisor/runsc/specutils" ) @@ -397,7 +398,22 @@ func New(conf *config.Config, args Args) (*Container, error) { return nil, err } c.Sandbox = sb.Sandbox - if err := c.Sandbox.CreateContainer(c.ID); err != nil { + + // If the console control socket file is provided, then create a new + // pty master/slave pair and send the TTY to the sandbox process. + var tty *os.File + if c.ConsoleSocket != "" { + // Create a new TTY pair and send the master on the provided socket. + var err error + tty, err = console.NewWithSocket(c.ConsoleSocket) + if err != nil { + return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err) + } + // tty file is transferred to the sandbox, then it can be closed here. + defer tty.Close() + } + + if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil { return nil, err } } @@ -451,11 +467,16 @@ func (c *Container) Start(conf *config.Config) error { // the start (and all their children processes). if err := runInCgroup(c.Sandbox.Cgroup, func() error { // Create the gofer process. - ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) + goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) if err != nil { return err } - defer mountsFile.Close() + defer func() { + _ = mountsFile.Close() + for _, f := range goferFiles { + _ = f.Close() + } + }() cleanMounts, err := specutils.ReadMounts(mountsFile) if err != nil { @@ -463,7 +484,14 @@ func (c *Container) Start(conf *config.Config) error { } c.Spec.Mounts = cleanMounts - return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles) + // Setup stdios if the container is not using terminal. Otherwise TTY was + // already setup in create. + var stdios []*os.File + if !c.Spec.Process.Terminal { + stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} + } + + return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles) }); err != nil { return err } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 4a4110477..c84ebcd8a 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -173,7 +173,7 @@ func New(conf *config.Config, args *Args) (*Sandbox, error) { } // CreateContainer creates a non-root container inside the sandbox. -func (s *Sandbox) CreateContainer(cid string) error { +func (s *Sandbox) CreateContainer(cid string, tty *os.File) error { log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { @@ -181,7 +181,16 @@ func (s *Sandbox) CreateContainer(cid string) error { } defer sandboxConn.Close() - if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil { + var files []*os.File + if tty != nil { + files = []*os.File{tty} + } + + args := boot.CreateArgs{ + CID: cid, + FilePayload: urpc.FilePayload{Files: files}, + } + if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil { return fmt.Errorf("creating non-root container %q: %v", cid, err) } return nil @@ -211,11 +220,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error { } // StartContainer starts running a non-root container inside the sandbox. -func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error { - for _, f := range goferFiles { - defer f.Close() - } - +func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error { log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid) sandboxConn, err := s.sandboxConnect() if err != nil { @@ -223,15 +228,18 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid stri } defer sandboxConn.Close() - // The payload must container stdin/stdout/stderr followed by gofer - // files. - files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...) + // The payload must contain stdin/stdout/stderr (which may be empty if using + // TTY) followed by gofer files. + payload := urpc.FilePayload{} + payload.Files = append(payload.Files, stdios...) + payload.Files = append(payload.Files, goferFiles...) + // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, - FilePayload: urpc.FilePayload{Files: files}, + FilePayload: payload, } if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil { return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err) diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go index 11ac5cb52..735dff107 100644 --- a/test/root/crictl_test.go +++ b/test/root/crictl_test.go @@ -480,7 +480,7 @@ func setup(t *testing.T, version string) (*criutil.Crictl, func(), error) { } // Wait for containerd to boot. - if err := testutil.WaitUntilRead(startupR, "Start streaming server", nil, 10*time.Second); err != nil { + if err := testutil.WaitUntilRead(startupR, "Start streaming server", 10*time.Second); err != nil { t.Fatalf("failed to start containerd: %v", err) } diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index e19a83413..27e9816eb 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -1185,19 +1185,44 @@ TEST_P(SocketInetLoopbackTest, TCPAcceptAfterReset) { listen_fd.get(), reinterpret_cast<sockaddr*>(&accept_addr), &addrlen)); ASSERT_EQ(addrlen, listener.addr_len); - // TODO(gvisor.dev/issue/3812): Remove after SO_ERROR is fixed. - if (IsRunningOnGvisor()) { - char buf[10]; - ASSERT_THAT(ReadFd(accept_fd.get(), buf, sizeof(buf)), - SyscallFailsWithErrno(ECONNRESET)); - } else { + // Wait for accept_fd to process the RST. + const int kTimeout = 10000; + struct pollfd pfd = { + .fd = accept_fd.get(), + .events = POLLIN, + }; + ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1)); + ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR); + + { int err; socklen_t optlen = sizeof(err); ASSERT_THAT( getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), SyscallSucceeds()); + // This should return ECONNRESET as the socket just received a RST packet + // from the peer. + ASSERT_EQ(optlen, sizeof(err)); ASSERT_EQ(err, ECONNRESET); + } + { + int err; + socklen_t optlen = sizeof(err); + ASSERT_THAT( + getsockopt(accept_fd.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), + SyscallSucceeds()); + // This should return no error as the previous getsockopt call would have + // cleared the socket error. ASSERT_EQ(optlen, sizeof(err)); + ASSERT_EQ(err, 0); + } + { + sockaddr_storage peer_addr; + socklen_t addrlen = sizeof(peer_addr); + // The socket is not connected anymore and should return ENOTCONN. + ASSERT_THAT(getpeername(accept_fd.get(), + reinterpret_cast<sockaddr*>(&peer_addr), &addrlen), + SyscallFailsWithErrno(ENOTCONN)); } } diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc index bc2c8278c..714848b8e 100644 --- a/test/syscalls/linux/tcp_socket.cc +++ b/test/syscalls/linux/tcp_socket.cc @@ -964,37 +964,156 @@ TEST_P(TcpSocketTest, PollAfterShutdown) { SyscallSucceedsWithValue(1)); } -TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) { +TEST_P(SimpleTcpSocketTest, NonBlockingConnectRetry) { + const FileDescriptor listener = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); + // Initialize address to the loopback one. sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); socklen_t addrlen = sizeof(addr); - const FileDescriptor s = + // Bind to some port but don't listen yet. + ASSERT_THAT( + bind(listener.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + // Get the address we're bound to, then connect to it. We need to do this + // because we're allowing the stack to pick a port for us. + ASSERT_THAT(getsockname(listener.get(), + reinterpret_cast<struct sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + FileDescriptor connector = ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP)); - // Set the FD to O_NONBLOCK. - int opts; - ASSERT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds()); - opts |= O_NONBLOCK; - ASSERT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds()); + // Verify that connect fails. + ASSERT_THAT( + RetryEINTR(connect)(connector.get(), + reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNREFUSED)); - ASSERT_THAT(RetryEINTR(connect)( + // Now start listening + ASSERT_THAT(listen(listener.get(), SOMAXCONN), SyscallSucceeds()); + + // TODO(gvisor.dev/issue/3828): Issuing connect() again on a socket that + // failed first connect should succeed. + if (IsRunningOnGvisor()) { + ASSERT_THAT( + RetryEINTR(connect)(connector.get(), + reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); + return; + } + + // Verify that connect now succeeds. + ASSERT_THAT( + RetryEINTR(connect)(connector.get(), + reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + // Accept the connection. + const FileDescriptor accepted = + ASSERT_NO_ERRNO_AND_VALUE(Accept(listener.get(), nullptr, nullptr)); +} + +// nonBlockingConnectNoListener returns a socket on which a connect that is +// expected to fail has been issued. +PosixErrorOr<FileDescriptor> nonBlockingConnectNoListener(const int family, + sockaddr_storage addr, + socklen_t addrlen) { + // We will first create a socket and bind to ensure we bind a port but will + // not call listen on this socket. + // Then we will create a new socket that will connect to the port bound by + // the first socket and that shoud fail. + constexpr int sock_type = SOCK_STREAM | SOCK_NONBLOCK; + int b_sock; + RETURN_ERROR_IF_SYSCALL_FAIL(b_sock = socket(family, sock_type, IPPROTO_TCP)); + FileDescriptor b(b_sock); + EXPECT_THAT(bind(b.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallSucceeds()); + + // Get the address bound by the listening socket. + EXPECT_THAT( + getsockname(b.get(), reinterpret_cast<struct sockaddr*>(&addr), &addrlen), + SyscallSucceeds()); + + // Now create another socket and issue a connect on this one. This connect + // should fail as there is no listener. + int c_sock; + RETURN_ERROR_IF_SYSCALL_FAIL(c_sock = socket(family, sock_type, IPPROTO_TCP)); + FileDescriptor s(c_sock); + + // Now connect to the bound address and this should fail as nothing + // is listening on the bound address. + EXPECT_THAT(RetryEINTR(connect)( s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), SyscallFailsWithErrno(EINPROGRESS)); - // Now polling on the FD with a timeout should return 0 corresponding to no - // FDs ready. - struct pollfd poll_fd = {s.get(), POLLOUT, 0}; - EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000), - SyscallSucceedsWithValue(1)); + // Wait for the connect to fail. + struct pollfd poll_fd = {s.get(), POLLERR, 0}; + EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 1000), SyscallSucceedsWithValue(1)); + return std::move(s); +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) { + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie(); int err; socklen_t optlen = sizeof(err); ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), SyscallSucceeds()); - + ASSERT_THAT(optlen, sizeof(err)); EXPECT_EQ(err, ECONNREFUSED); + + unsigned char c; + ASSERT_THAT(read(s.get(), &c, sizeof(c)), SyscallSucceedsWithValue(0)); + int opts; + EXPECT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds()); + opts &= ~O_NONBLOCK; + EXPECT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds()); + // Try connecting again. + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListenerRead) { + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie(); + + unsigned char c; + ASSERT_THAT(read(s.get(), &c, 1), SyscallFailsWithErrno(ECONNREFUSED)); + ASSERT_THAT(read(s.get(), &c, 1), SyscallSucceedsWithValue(0)); + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); +} + +TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListenerPeek) { + sockaddr_storage addr = + ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam())); + socklen_t addrlen = sizeof(addr); + + const FileDescriptor s = + nonBlockingConnectNoListener(GetParam(), addr, addrlen).ValueOrDie(); + + unsigned char c; + ASSERT_THAT(recv(s.get(), &c, 1, MSG_PEEK), + SyscallFailsWithErrno(ECONNREFUSED)); + ASSERT_THAT(recv(s.get(), &c, 1, MSG_PEEK), SyscallSucceedsWithValue(0)); + ASSERT_THAT(RetryEINTR(connect)( + s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen), + SyscallFailsWithErrno(ECONNABORTED)); } TEST_P(SimpleTcpSocketTest, SelfConnectSendRecv_NoRandomSave) { @@ -1235,6 +1354,19 @@ TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) { // Attempt #2, with the new socket and reused addr our connect should fail in // the same way as before, not with an EADDRINUSE. + // + // TODO(gvisor.dev/issue/3828): 2nd connect on a socket which failed connect + // first time should succeed. + // gVisor never issues the second connect and returns ECONNABORTED instead. + // Linux actually sends a SYN again and gets a RST and correctly returns + // ECONNREFUSED. + if (IsRunningOnGvisor()) { + ASSERT_THAT(connect(client_s.get(), + reinterpret_cast<const struct sockaddr*>(&bound_addr), + bound_addrlen), + SyscallFailsWithErrno(ECONNABORTED)); + return; + } ASSERT_THAT(connect(client_s.get(), reinterpret_cast<const struct sockaddr*>(&bound_addr), bound_addrlen), diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc index d65275fd3..34255bfb8 100644 --- a/test/syscalls/linux/udp_socket.cc +++ b/test/syscalls/linux/udp_socket.cc @@ -374,6 +374,42 @@ TEST_P(UdpSocketTest, BindInUse) { SyscallFailsWithErrno(EADDRINUSE)); } +TEST_P(UdpSocketTest, ConnectWriteToInvalidPort) { + ASSERT_NO_ERRNO(BindLoopback()); + + // Discover a free unused port by creating a new UDP socket, binding it + // recording the just bound port and closing it. This is not guaranteed as it + // can still race with other port UDP sockets trying to bind a port at the + // same time. + struct sockaddr_storage addr_storage = InetLoopbackAddr(); + socklen_t addrlen = sizeof(addr_storage); + struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage); + FileDescriptor s = + ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP)); + ASSERT_THAT(bind(s.get(), addr, addrlen), SyscallSucceeds()); + ASSERT_THAT(getsockname(s.get(), addr, &addrlen), SyscallSucceeds()); + EXPECT_EQ(addrlen, addrlen_); + EXPECT_NE(*Port(&addr_storage), 0); + ASSERT_THAT(close(s.release()), SyscallSucceeds()); + + // Now connect to the port that we just released. This should generate an + // ECONNREFUSED error. + ASSERT_THAT(connect(sock_.get(), addr, addrlen_), SyscallSucceeds()); + char buf[512]; + RandomizeBuffer(buf, sizeof(buf)); + // Send from sock_ to an unbound port. + ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, addr, addrlen_), + SyscallSucceedsWithValue(sizeof(buf))); + + // Now verify that we got an ICMP error back of ECONNREFUSED. + int err; + socklen_t optlen = sizeof(err); + ASSERT_THAT(getsockopt(sock_.get(), SOL_SOCKET, SO_ERROR, &err, &optlen), + SyscallSucceeds()); + ASSERT_EQ(err, ECONNREFUSED); + ASSERT_EQ(optlen, sizeof(err)); +} + TEST_P(UdpSocketTest, ReceiveAfterConnect) { ASSERT_NO_ERRNO(BindLoopback()); ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds()); |