Diffstat (limited to 'pkg')
31 files changed, 525 insertions, 228 deletions
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 055ac1d7c..e11ca2d62 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -191,8 +191,9 @@ var DirentType = abi.ValueSet{ // Values for preadv2/pwritev2. const ( - // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is - // accepted as a valid flag argument for preadv2/pwritev2. + // NOTE(b/120162627): gVisor does not implement the RWF_HIPRI feature, but + // the flag is accepted as a valid flag argument for preadv2/pwritev2 and + // silently ignored. RWF_HIPRI = 0x00000001 RWF_DSYNC = 0x00000002 RWF_SYNC = 0x00000004 diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 28d851ff5..122c457d2 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -1091,6 +1091,19 @@ type AllocateMode struct { Unshare bool } +// ToAllocateMode returns an AllocateMode from a fallocate(2) mode. +func ToAllocateMode(mode uint64) AllocateMode { + return AllocateMode{ + KeepSize: mode&unix.FALLOC_FL_KEEP_SIZE != 0, + PunchHole: mode&unix.FALLOC_FL_PUNCH_HOLE != 0, + NoHideStale: mode&unix.FALLOC_FL_NO_HIDE_STALE != 0, + CollapseRange: mode&unix.FALLOC_FL_COLLAPSE_RANGE != 0, + ZeroRange: mode&unix.FALLOC_FL_ZERO_RANGE != 0, + InsertRange: mode&unix.FALLOC_FL_INSERT_RANGE != 0, + Unshare: mode&unix.FALLOC_FL_UNSHARE_RANGE != 0, + } +} + // ToLinux converts to a value compatible with fallocate(2)'s mode. func (a *AllocateMode) ToLinux() uint32 { rv := uint32(0) diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 51082359d..7bcc99b29 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -869,11 +869,22 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } + + trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG + if trunc { + // Lock metadataMu *while* we open a regular file with O_TRUNC because + // open(2) will change the file size on server. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + } + + var vfd *vfs.FileDescription + var err error mnt := rp.Mount() switch d.fileType() { case linux.S_IFREG: if !d.fs.opts.regularFilesUseSpecialFileFD { - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil { return nil, err } fd := ®ularFileFD{} @@ -883,7 +894,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf }); err != nil { return nil, err } - return &fd.vfsfd, nil + vfd = &fd.vfsfd } case linux.S_IFDIR: // Can't open directories with O_CREAT. @@ -923,7 +934,25 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) } } - return d.openSpecialFileLocked(ctx, mnt, opts) + + if vfd == nil { + if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil { + return nil, err + } + } + + if trunc { + // If no errors occured so far then update file size in memory. This + // step is required even if !d.cachedMetadataAuthoritative() because + // d.mappings has to be updated. + // d.metadataMu has already been acquired if trunc == true. 
+ d.updateFileSizeLocked(0) + + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + } + return vfd, err } func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 71c8d3ae1..8e74e60a5 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -794,9 +794,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) } if mask.Size { - d.dataMu.Lock() - atomic.StoreUint64(&d.size, attr.Size) - d.dataMu.Unlock() + d.updateFileSizeLocked(attr.Size) } d.metadataMu.Unlock() } @@ -902,6 +900,12 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } d.metadataMu.Lock() defer d.metadataMu.Unlock() + if stat.Mask&linux.STATX_SIZE != 0 { + // The size needs to be changed even when + // !d.cachedMetadataAuthoritative() because d.mappings has to be + // updated. + d.updateFileSizeLocked(stat.Size) + } if !d.isSynthetic() { if stat.Mask != 0 { if err := d.file.setAttr(ctx, p9.SetAttrMask{ @@ -963,40 +967,42 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin stat.Mask |= linux.STATX_MTIME } atomic.StoreInt64(&d.ctime, now) - if stat.Mask&linux.STATX_SIZE != 0 { + return nil +} + +// Preconditions: d.metadataMu must be locked. +func (d *dentry) updateFileSizeLocked(newSize uint64) { + d.dataMu.Lock() + oldSize := d.size + d.size = newSize + // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + // below. This allows concurrent calls to Read/Translate/etc. These + // functions synchronize with truncation by refusing to use cache + // contents beyond the new d.size. (We are still holding d.metadataMu, + // so we can't race with Write or another truncate.) + d.dataMu.Unlock() + if d.size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) + if oldpgend != newpgend { + d.mapsMu.Lock() + d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + d.mapsMu.Unlock() + } + // We are now guaranteed that there are no translations of + // truncated pages, and can remove them from the cache. Since + // truncated pages have been removed from the remote file, they + // should be dropped without being written back. d.dataMu.Lock() - oldSize := d.size - d.size = stat.Size - // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings - // below. This allows concurrent calls to Read/Translate/etc. These - // functions synchronize with truncation by refusing to use cache - // contents beyond the new d.size. (We are still holding d.metadataMu, - // so we can't race with Write or another truncate.) + d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) d.dataMu.Unlock() - if d.size < oldSize { - oldpgend, _ := usermem.PageRoundUp(oldSize) - newpgend, _ := usermem.PageRoundUp(d.size) - if oldpgend != newpgend { - d.mapsMu.Lock() - d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ - // Compare Linux's mm/truncate.c:truncate_setsize() => - // truncate_pagecache() => - // mm/memory.c:unmap_mapping_range(evencows=1). 
- InvalidatePrivate: true, - }) - d.mapsMu.Unlock() - } - // We are now guaranteed that there are no translations of - // truncated pages, and can remove them from the cache. Since - // truncated pages have been removed from the remote file, they - // should be dropped without being written back. - d.dataMu.Lock() - d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) - d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) - d.dataMu.Unlock() - } } - return nil } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 404a452c4..3d2d3530a 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -67,13 +68,43 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { return d.handle.file.flush(ctx) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + size := offset + length + + // Allocating a smaller size is a noop. + if size <= d.size { + return nil + } + + d.handleMu.Lock() + defer d.handleMu.Unlock() + + err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + if err != nil { + return err + } + d.size = size + if !d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } @@ -126,7 +157,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a016cbae1..3c4e7e2e4 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -130,7 +130,9 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } @@ -176,7 +178,9 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. 
if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } @@ -235,8 +239,5 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *specialFileFD) Sync(ctx context.Context) error { - if !fd.vfsfd.IsWritable() { - return nil - } - return fd.handle.sync(ctx) + return fd.dentry().syncSharedHandle(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 1d5aa82dc..0eef4e16e 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -36,7 +36,7 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { } } -// Preconditions: fs.interop != InteropModeShared. +// Preconditions: d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtime(mnt *vfs.Mount) { if mnt.Flags.NoATime { return @@ -51,8 +51,8 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) { mnt.EndWrite() } -// Preconditions: fs.interop != InteropModeShared. The caller has successfully -// called vfs.Mount.CheckBeginWrite(). +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() @@ -60,8 +60,8 @@ func (d *dentry) touchCtime() { d.metadataMu.Unlock() } -// Preconditions: fs.interop != InteropModeShared. The caller has successfully -// called vfs.Mount.CheckBeginWrite(). +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCMtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() @@ -70,6 +70,8 @@ func (d *dentry) touchCMtime() { d.metadataMu.Unlock() } +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// locked d.metadataMu. func (d *dentry) touchCMtimeLocked() { now := d.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&d.mtime, now) diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index d5e73ddae..1cd2982cb 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -543,6 +543,16 @@ func (f *fileDescription) Release() { // noop } +// Allocate implements vfs.FileDescriptionImpl. +func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + if !f.inode.seekable { + return syserror.ESPIPE + } + + // TODO(gvisor.dev/issue/2923): Implement Allocate for non-pipe hostfds. + return syserror.EOPNOTSUPP +} + // PRead implements FileDescriptionImpl. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode @@ -578,8 +588,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts } func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // Check that flags are supported. + // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 
- if flags != 0 { + if flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 38f1fbfba..fd16bd92d 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -47,11 +47,6 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor return ep, nil } -// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. -// -// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). -const maxSendBufferSize = 8 << 20 - // ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and // transport.Receiver. It is backed by a host fd that was imported at sentry // startup. This fd is shared with a hostfs inode, which retains ownership of @@ -114,10 +109,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error { if err != nil { return syserr.FromError(err) } - if sndbuf > maxSendBufferSize { - log.Warningf("Socket send buffer too large: %d", sndbuf) - return syserr.ErrInvalidEndpointState - } c.stype = linux.SockType(stype) c.sndbuf = int64(sndbuf) diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 5f7853a2a..ca8b8c63b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -236,6 +236,11 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio return inode.SetStat(ctx, fd.filesystem(), creds, opts) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 286c23f01..9af43b859 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -231,8 +231,9 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Linux will return envp up to and including the first NULL character, // so find it. - if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 { - buf.Truncate(end) + envStart := int(ar.Length()) + if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { + buf.Truncate(envStart + nullIdx) } } @@ -300,7 +301,7 @@ type idMapData struct { var _ dynamicInode = (*idMapData)(nil) -// Generate implements vfs.DynamicBytesSource.Generate. +// Generate implements vfs.WritableDynamicBytesSource.Generate. func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { var entries []auth.IDMapEntry if d.gids { @@ -314,6 +315,7 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } +// Write implements vfs.WritableDynamicBytesSource.Write. 
func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { // "In addition, the number of bytes written to the file must be less than // the system page size, and the write must be performed at the start of diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index fe02f7ee9..01ce30a4d 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -138,7 +138,7 @@ type cpuFile struct { // Generate implements vfs.DynamicBytesSource.Generate. func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "0-%d", c.maxCores-1) + fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) return nil } diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 4b3602d47..242d5fd12 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -51,7 +51,7 @@ func TestReadCPUFile(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) maxCPUCores := k.ApplicationCores() - expected := fmt.Sprintf("0-%d", maxCPUCores-1) + expected := fmt.Sprintf("0-%d\n", maxCPUCores-1) for _, fname := range []string{"online", "possible", "present"} { pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname)) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 71ac7b8e6..ed40f6b52 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -407,10 +407,10 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case *regularFile: var fd regularFileFD fd.LockFD.Init(&d.inode.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } - if opts.Flags&linux.O_TRUNC != 0 { + if !afterCreate && opts.Flags&linux.O_TRUNC != 0 { if _, err := impl.truncate(0); err != nil { return nil, err } @@ -423,7 +423,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open } var fd directoryFD fd.LockFD.Init(&d.inode.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } return &fd.vfsfd, nil diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index b805aadd0..1cdb46e6f 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -274,14 +274,32 @@ func (fd *regularFileFD) Release() { // noop } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + f := fd.inode().impl.(*regularFile) + + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + oldSize := f.size + size := offset + length + if oldSize >= size { + return nil + } + _, err := f.truncateLocked(size) + return err +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. - if opts.Flags&^linux.RWF_HIPRI != 0 { + // Check that flags are supported. 
RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, syserror.EOPNOTSUPP } @@ -311,8 +329,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. - if opts.Flags&^linux.RWF_HIPRI != 0 { + // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, syserror.EOPNOTSUPP } diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 323f1dfa5..153d2cd9b 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -176,3 +176,13 @@ func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kern a.recipientTG = nil a.recipientPG = recipient } + +// ClearOwner unsets the current signal recipient. +func (a *FileAsync) ClearOwner() { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = nil + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = nil +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index a4519363f..45d4c5fc1 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -200,6 +200,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { } } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ESPIPE +} + // EventRegister implements waiter.Waitable.EventRegister. func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.pipe.EventRegister(e, mask) diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index ad5f64799..8f192c62f 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -96,7 +96,12 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal return ioctl(ctx, s.fd, uio, args) } -// PRead implements vfs.FileDescriptionImpl. +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + +// PRead implements vfs.FileDescriptionImpl.PRead. 
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, syserror.ESPIPE } diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 7e4c6a56e..517394ba9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -156,11 +156,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return uintptr(n), nil, nil case linux.F_GETOWN: - a := file.AsyncHandler() - if a == nil { + owner, hasOwner := getAsyncOwner(t, file) + if !hasOwner { return 0, nil, nil } - owner := getAsyncOwner(t, a.(*fasync.FileAsync)) if owner.Type == linux.F_OWNER_PGRP { return uintptr(-owner.PID), nil, nil } @@ -176,26 +175,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall ownerType = linux.F_OWNER_PGRP who = -who } - a := file.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) - return 0, nil, setAsyncOwner(t, a, ownerType, who) + return 0, nil, setAsyncOwner(t, file, ownerType, who) case linux.F_GETOWN_EX: - a := file.AsyncHandler() - if a == nil { + owner, hasOwner := getAsyncOwner(t, file) + if !hasOwner { return 0, nil, nil } - addr := args[2].Pointer() - owner := getAsyncOwner(t, a.(*fasync.FileAsync)) - _, err := t.CopyOut(addr, &owner) + _, err := t.CopyOut(args[2].Pointer(), &owner) return 0, nil, err case linux.F_SETOWN_EX: - addr := args[2].Pointer() var owner linux.FOwnerEx - n, err := t.CopyIn(addr, &owner) + n, err := t.CopyIn(args[2].Pointer(), &owner) if err != nil { return 0, nil, err } - a := file.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) - return uintptr(n), nil, setAsyncOwner(t, a, owner.Type, owner.PID) + return uintptr(n), nil, setAsyncOwner(t, file, owner.Type, owner.PID) case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { @@ -219,30 +213,48 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } -func getAsyncOwner(t *kernel.Task, a *fasync.FileAsync) linux.FOwnerEx { - ot, otg, opg := a.Owner() +func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { + a := fd.AsyncHandler() + if a == nil { + return linux.FOwnerEx{}, false + } + + ot, otg, opg := a.(*fasync.FileAsync).Owner() switch { case ot != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_TID, PID: int32(t.PIDNamespace().IDOfTask(ot)), - } + }, true case otg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PID, PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), - } + }, true case opg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PGRP, PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), - } + }, true default: - return linux.FOwnerEx{} + return linux.FOwnerEx{}, true } } -func setAsyncOwner(t *kernel.Task, a *fasync.FileAsync, ownerType, pid int32) error { +func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error { + switch ownerType { + case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: + // Acceptable type. 
+ default: + return syserror.EINVAL + } + + a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) + if pid == 0 { + a.ClearOwner() + return nil + } + switch ownerType { case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 5dac77e4d..b12b5967b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -239,6 +240,55 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd }) } +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Uint64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFileVFS2(fd) + + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if !file.IsWritable() { + return 0, nil, syserror.EBADF + } + + if mode != 0 { + return 0, nil, syserror.ENOTSUP + } + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + size := offset + length + + if size < 0 { + return 0, nil, syserror.EFBIG + } + + limit := limits.FromContext(t).Get(limits.FileSize).Cur + + if uint64(size) >= limit { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length)) + + // File length modified, generate notification. + // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported. + // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) +} + // Rmdir implements Linux syscall rmdir(2). func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 0399c0db4..fd6ab94b2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -57,6 +57,49 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall flags &^= linux.O_NONBLOCK } return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) + + case linux.FIOASYNC: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_ASYNC + } else { + flags &^= linux.O_ASYNC + } + file.SetStatusFlags(t, t.Credentials(), flags) + return 0, nil, nil + + case linux.FIOGETOWN, linux.SIOCGPGRP: + var who int32 + owner, hasOwner := getAsyncOwner(t, file) + if hasOwner { + if owner.Type == linux.F_OWNER_PGRP { + who = -owner.PID + } else { + who = owner.PID + } + } + _, err := t.CopyOut(args[2].Pointer(), &who) + return 0, nil, err + + case linux.FIOSETOWN, linux.SIOCSPGRP: + var who int32 + if _, err := t.CopyIn(args[2].Pointer(), &who); err != nil { + return 0, nil, err + } + ownerType := int32(linux.F_OWNER_PID) + if who < 0 { + // Check for overflow before flipping the sign. 
+ if who-1 > who { + return 0, nil, syserror.EINVAL + } + ownerType = linux.F_OWNER_PGRP + who = -who + } + return 0, nil, setAsyncOwner(t, file, ownerType, who) } ret, err := file.Ioctl(t, t.MemoryManager(), args) diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 365250b0b..0d0ebf46a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -65,10 +65,8 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel nbytes := args[2].Int64() flags := args[3].Uint() - if offset < 0 { - return 0, nil, syserror.EINVAL - } - if nbytes < 0 { + // Check for negative values and overflow. + if offset < 0 || offset+nbytes < 0 { return 0, nil, syserror.EINVAL } if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { @@ -81,7 +79,37 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel } defer file.DecRef() - // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of - // [offset, offset+nbytes). - return 0, nil, file.Sync(t) + // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support + // is a full-file sync, i.e. fsync(2). As a result, there are severe + // limitations on how much we support sync_file_range: + // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even + // if the file size is changed. We do. + // - We always sync the entire file instead of [offset, offset+nbytes). + // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For + // correctness, we would have to perform a write-out every time WAIT_BEFORE + // was used, but this would be much more expensive than expected if there + // were no write-out operations in progress. + // - Whenever WAIT_AFTER is used, we sync the file. + // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will + // be synced anyway. If this flag is used without WAIT_AFTER, then it is + // safe (and less expensive) to do nothing, because the syscall will not + // wait for the write-out to complete--we only need to make sure that the + // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes. + // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O + // errors or ENOSPC conditions and will return those to the caller, after + // clearing the EIO and ENOSPC flags in the address_space." We don't do + // this. 
+ + if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && + flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { + if err := file.Sync(t); err != nil { + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + } + return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 9e60c4a1c..8f497ecc7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -138,7 +138,7 @@ func Override() { s.Table[282] = syscalls.Supported("signalfd", Signalfd) s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) s.Table[284] = syscalls.Supported("eventfd", Eventfd) - delete(s.Table, 285) // fallocate + s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil) s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[288] = syscalls.Supported("accept4", Accept4) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index cd1db14ac..0c42574db 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -91,8 +91,7 @@ type FileDescription struct { // FileDescriptionOptions contains options to FileDescription.Init(). type FileDescriptionOptions struct { - // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is - // usually only the case if O_DIRECT would actually have an effect. + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. AllowDirectIO bool // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE. @@ -355,6 +354,10 @@ type FileDescriptionImpl interface { // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) + // Allocate grows file represented by FileDescription to offset + length bytes. + // Only mode == 0 is supported currently. + Allocate(ctx context.Context, mode, offset, length uint64) error + // waiter.Waitable methods may be used to poll for I/O events. waiter.Waitable diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 3fec0d6d6..6b8b4ad49 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -56,6 +56,12 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err return linux.Statfs{}, syserror.ENOSYS } +// Allocate implements FileDescriptionImpl.Allocate analogously to +// fallocate called on regular file, directory or FIFO in Linux. +func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + // Readiness implements waiter.Waitable.Readiness analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { @@ -158,6 +164,11 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) // implementations of non-directory I/O methods that return EISDIR. type DirectoryFileDescriptionDefaultImpl struct{} +// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. +func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + // PRead implements FileDescriptionImpl.PRead. 
func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.EISDIR diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 509034531..c2e21ac5f 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -148,6 +148,11 @@ func (i *Inotify) Release() { } } +// Allocate implements FileDescription.Allocate. +func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { + panic("Allocate should not be called on read-only inotify fds") +} + // EventRegister implements waiter.Waitable. func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { i.queue.EventRegister(e, mask) diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index 9d4dce826..81b740115 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -509,9 +509,7 @@ func (h *handshake) execute() *tcpip.Error { // Initialize the resend timer. resendWaker := sleep.Waker{} timeOut := time.Duration(time.Second) - rt := time.AfterFunc(timeOut, func() { - resendWaker.Assert() - }) + rt := time.AfterFunc(timeOut, resendWaker.Assert) defer rt.Stop() // Set up the wakers. @@ -1050,8 +1048,8 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { panic("current endpoint not removed from demuxer, enqueing segments to itself") } - if ep.(*endpoint).enqueueSegment(s) { - ep.(*endpoint).newSegmentWaker.Assert() + if ep := ep.(*endpoint); ep.enqueueSegment(s) { + ep.newSegmentWaker.Assert() } } @@ -1120,7 +1118,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error { checkRequeue := true for i := 0; i < maxSegmentsPerWake; i++ { - if e.EndpointState() == StateClose || e.EndpointState() == StateError { + if e.EndpointState().closed() { return nil } s := e.segmentQueue.dequeue() @@ -1440,9 +1438,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ if e.EndpointState() == StateFinWait2 && e.closed { // The socket has been closed and we are in FIN_WAIT2 // so start the FIN_WAIT2 timer. - closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() { - closeWaker.Assert() - }) + closeTimer = time.AfterFunc(e.tcpLingerTimeout, closeWaker.Assert) e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) } } @@ -1460,7 +1456,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ return err } } - if e.EndpointState() != StateClose && e.EndpointState() != StateError { + if !e.EndpointState().closed() { // Only block the worker if the endpoint // is not in closed state or error state. 
close(e.drainDone) @@ -1526,7 +1522,12 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ } loop: - for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError { + for { + switch e.EndpointState() { + case StateTimeWait, StateClose, StateError: + break loop + } + e.mu.Unlock() v, _ := s.Fetch(true) e.mu.Lock() diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go index 047704c80..98aecab9e 100644 --- a/pkg/tcpip/transport/tcp/dispatcher.go +++ b/pkg/tcpip/transport/tcp/dispatcher.go @@ -15,6 +15,8 @@ package tcp import ( + "encoding/binary" + "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" @@ -66,89 +68,68 @@ func (q *epQueue) empty() bool { // processor is responsible for processing packets queued to a tcp endpoint. type processor struct { epQ epQueue + sleeper sleep.Sleeper newEndpointWaker sleep.Waker closeWaker sleep.Waker - id int - wg sync.WaitGroup -} - -func newProcessor(id int) *processor { - p := &processor{ - id: id, - } - p.wg.Add(1) - go p.handleSegments() - return p } func (p *processor) close() { p.closeWaker.Assert() } -func (p *processor) wait() { - p.wg.Wait() -} - func (p *processor) queueEndpoint(ep *endpoint) { // Queue an endpoint for processing by the processor goroutine. p.epQ.enqueue(ep) p.newEndpointWaker.Assert() } -func (p *processor) handleSegments() { - const newEndpointWaker = 1 - const closeWaker = 2 - s := sleep.Sleeper{} - s.AddWaker(&p.newEndpointWaker, newEndpointWaker) - s.AddWaker(&p.closeWaker, closeWaker) - defer s.Done() +const ( + newEndpointWaker = 1 + closeWaker = 2 +) + +func (p *processor) start(wg *sync.WaitGroup) { + defer wg.Done() + defer p.sleeper.Done() + for { - id, ok := s.Fetch(true) - if ok && id == closeWaker { - p.wg.Done() - return + if id, _ := p.sleeper.Fetch(true); id == closeWaker { + break } - for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() { + for { + ep := p.epQ.dequeue() + if ep == nil { + break + } if ep.segmentQueue.empty() { continue } - // If socket has transitioned out of connected state - // then just let the worker handle the packet. + // If socket has transitioned out of connected state then just let the + // worker handle the packet. // - // NOTE: We read this outside of e.mu lock which means - // that by the time we get to handleSegments the - // endpoint may not be in ESTABLISHED. But this should - // be fine as all normal shutdown states are handled by - // handleSegments and if the endpoint moves to a - // CLOSED/ERROR state then handleSegments is a noop. - if ep.EndpointState() != StateEstablished { - ep.newSegmentWaker.Assert() - continue - } - - if !ep.mu.TryLock() { - ep.newSegmentWaker.Assert() - continue - } - // If the endpoint is in a connected state then we do - // direct delivery to ensure low latency and avoid - // scheduler interactions. - if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose { - // Send any active resets if required. - if err != nil { + // NOTE: We read this outside of e.mu lock which means that by the time + // we get to handleSegments the endpoint may not be in ESTABLISHED. But + // this should be fine as all normal shutdown states are handled by + // handleSegments and if the endpoint moves to a CLOSED/ERROR state + // then handleSegments is a noop. 
+ if ep.EndpointState() == StateEstablished && ep.mu.TryLock() { + // If the endpoint is in a connected state then we do direct delivery + // to ensure low latency and avoid scheduler interactions. + switch err := ep.handleSegments(true /* fastPath */); { + case err != nil: + // Send any active resets if required. ep.resetConnectionLocked(err) + fallthrough + case ep.EndpointState() == StateClose: + ep.notifyProtocolGoroutine(notifyTickleWorker) + case !ep.segmentQueue.empty(): + p.epQ.enqueue(ep) } - ep.notifyProtocolGoroutine(notifyTickleWorker) ep.mu.Unlock() - continue - } - - if !ep.segmentQueue.empty() { - p.epQ.enqueue(ep) + } else { + ep.newSegmentWaker.Assert() } - - ep.mu.Unlock() } } } @@ -159,31 +140,36 @@ func (p *processor) handleSegments() { // hash of the endpoint id to ensure that delivery for the same endpoint happens // in-order. type dispatcher struct { - processors []*processor + processors []processor seed uint32 -} - -func newDispatcher(nProcessors int) *dispatcher { - processors := []*processor{} - for i := 0; i < nProcessors; i++ { - processors = append(processors, newProcessor(i)) - } - return &dispatcher{ - processors: processors, - seed: generateRandUint32(), + wg sync.WaitGroup +} + +func (d *dispatcher) init(nProcessors int) { + d.close() + d.wait() + d.processors = make([]processor, nProcessors) + d.seed = generateRandUint32() + for i := range d.processors { + p := &d.processors[i] + p.sleeper.AddWaker(&p.newEndpointWaker, newEndpointWaker) + p.sleeper.AddWaker(&p.closeWaker, closeWaker) + d.wg.Add(1) + // NB: sleeper-waker registration must happen synchronously to avoid races + // with `close`. It's possible to pull all this logic into `start`, but + // that results in a heap-allocated function literal. + go p.start(&d.wg) } } func (d *dispatcher) close() { - for _, p := range d.processors { - p.close() + for i := range d.processors { + d.processors[i].close() } } func (d *dispatcher) wait() { - for _, p := range d.processors { - p.wait() - } + d.wg.Wait() } func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { @@ -231,20 +217,18 @@ func generateRandUint32() uint32 { if _, err := rand.Read(b); err != nil { panic(err) } - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + return binary.LittleEndian.Uint32(b) } func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor { - payload := []byte{ - byte(id.LocalPort), - byte(id.LocalPort >> 8), - byte(id.RemotePort), - byte(id.RemotePort >> 8)} + var payload [4]byte + binary.LittleEndian.PutUint16(payload[0:], id.LocalPort) + binary.LittleEndian.PutUint16(payload[2:], id.RemotePort) h := jenkins.Sum32(d.seed) - h.Write(payload) + h.Write(payload[:]) h.Write([]byte(id.LocalAddress)) h.Write([]byte(id.RemoteAddress)) - return d.processors[h.Sum32()%uint32(len(d.processors))] + return &d.processors[h.Sum32()%uint32(len(d.processors))] } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 99a691815..bd3ec5a8d 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1212,6 +1212,16 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { // Read reads data from the endpoint. func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.LockUser() + defer e.UnlockUser() + + // When in SYN-SENT state, let the caller block on the receive. 
+ // An application can initiate a non-blocking connect and then block + // on a receive. It can expect to read any data after the handshake + // is complete. RFC793, section 3.9, p58. + if e.EndpointState() == StateSynSent { + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrWouldBlock + } + // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. Also note that a RST being received // would cause the state to become StateError so we should allow the @@ -1221,7 +1231,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { e.rcvListMu.Unlock() he := e.HardError - e.UnlockUser() if s == StateError { return buffer.View{}, tcpip.ControlMessages{}, he } @@ -1231,7 +1240,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, v, err := e.readLocked() e.rcvListMu.Unlock() - e.UnlockUser() if err == tcpip.ErrClosedForReceive { e.stats.ReadErrors.ReadClosed.Increment() diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index f2ae6ce50..b34e47bbd 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -174,7 +174,7 @@ type protocol struct { maxRetries uint32 synRcvdCount synRcvdCounter synRetries uint8 - dispatcher *dispatcher + dispatcher dispatcher } // Number returns the tcp protocol number. @@ -515,7 +515,7 @@ func (*protocol) Parse(pkt *stack.PacketBuffer) bool { // NewProtocol returns a TCP transport protocol. func NewProtocol() stack.TransportProtocol { - return &protocol{ + p := protocol{ sendBufferSize: SendBufferSizeOption{ Min: MinBufferSize, Default: DefaultSendBufferSize, @@ -531,10 +531,11 @@ func NewProtocol() stack.TransportProtocol { tcpLingerTimeout: DefaultTCPLingerTimeout, tcpTimeWaitTimeout: DefaultTCPTimeWaitTimeout, synRcvdCount: synRcvdCounter{threshold: SynRcvdCountThreshold}, - dispatcher: newDispatcher(runtime.GOMAXPROCS(0)), synRetries: DefaultSynRetries, minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, } + p.dispatcher.init(runtime.GOMAXPROCS(0)) + return &p } diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index acacb42e4..5862c32f2 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -833,25 +833,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se panic("Netstack queues FIN segments without data.") } - segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - // If the entire segment cannot be accomodated in the receiver - // advertized window, skip splitting and sending of the segment. - // ref: net/ipv4/tcp_output.c::tcp_snd_wnd_test() - // - // Linux checks this for all segment transmits not triggered - // by a probe timer. On this condition, it defers the segment - // split and transmit to a short probe timer. - // ref: include/net/tcp.h::tcp_check_probe_timer() - // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() - // - // Instead of defining a new transmit timer, we attempt to split the - // segment right here if there are no pending segments. - // If there are pending segments, segment transmits are deferred - // to the retransmit timer handler. 
- if s.sndUna != s.sndNxt && !segEnd.LessThan(end) { - return false - } - if !seg.sequenceNumber.LessThan(end) { return false } @@ -861,14 +842,48 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se return false } - // The segment size limit is computed as a function of sender congestion - // window and MSS. When sender congestion window is > 1, this limit can - // be larger than MSS. Ensure that the currently available send space - // is not greater than minimum of this limit and MSS. + // If the whole segment or at least 1MSS sized segment cannot + // be accomodated in the receiver advertized window, skip + // splitting and sending of the segment. ref: + // net/ipv4/tcp_output.c::tcp_snd_wnd_test() + // + // Linux checks this for all segment transmits not triggered by + // a probe timer. On this condition, it defers the segment split + // and transmit to a short probe timer. + // + // ref: include/net/tcp.h::tcp_check_probe_timer() + // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() + // + // Instead of defining a new transmit timer, we attempt to split + // the segment right here if there are no pending segments. If + // there are pending segments, segment transmits are deferred to + // the retransmit timer handler. + if s.sndUna != s.sndNxt { + switch { + case available >= seg.data.Size(): + // OK to send, the whole segments fits in the + // receiver's advertised window. + case available >= s.maxPayloadSize: + // OK to send, at least 1 MSS sized segment fits + // in the receiver's advertised window. + default: + return false + } + } + + // The segment size limit is computed as a function of sender + // congestion window and MSS. When sender congestion window is > + // 1, this limit can be larger than MSS. Ensure that the + // currently available send space is not greater than minimum of + // this limit and MSS. if available > limit { available = limit } - if available > s.maxPayloadSize { + + // If GSO is not in use then cap available to + // maxPayloadSize. When GSO is in use the gVisor GSO logic or + // the host GSO logic will cap the segment to the correct size. + if s.ep.gso == nil && available > s.maxPayloadSize { available = s.maxPayloadSize } |
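The ToAllocateMode helper introduced in pkg/p9/p9.go above maps raw fallocate(2) mode bits onto the 9P AllocateMode structure that the gofer's new regularFileFD.Allocate passes to d.handle.file.allocate. A minimal, hypothetical usage sketch follows; it assumes the gVisor module at this revision is on the module path and a Linux build target, since the FALLOC_FL_* constants come from golang.org/x/sys/unix.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/p9"
)

func main() {
	// Mode bits as a guest application might pass them to fallocate(2):
	// punch a hole without changing the file size.
	mode := uint64(unix.FALLOC_FL_PUNCH_HOLE | unix.FALLOC_FL_KEEP_SIZE)

	am := p9.ToAllocateMode(mode)
	fmt.Printf("KeepSize=%t PunchHole=%t ZeroRange=%t\n",
		am.KeepSize, am.PunchHole, am.ZeroRange)
	// Expected: KeepSize=true PunchHole=true ZeroRange=false

	// ToLinux (already present in pkg/p9/p9.go) converts the structure back
	// into a fallocate(2)-compatible mode value, so the flags round-trip.
	fmt.Printf("round-trip mode: %#x\n", am.ToLinux())
}

ToLinux is presumably what the server side of the 9P connection uses when it forwards the allocation to the host; the sentry-side callers in this change only ever construct the structure via ToAllocateMode.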