diff options
Diffstat (limited to 'pkg')
36 files changed, 992 insertions, 291 deletions
diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 9612f072e..ffc918846 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -6,6 +6,7 @@ go_library( name = "amutex", srcs = ["amutex.go"], visibility = ["//:sandbox"], + deps = ["//pkg/syserror"], ) go_test( diff --git a/pkg/amutex/amutex.go b/pkg/amutex/amutex.go index 1c4fd1784..a078a31db 100644 --- a/pkg/amutex/amutex.go +++ b/pkg/amutex/amutex.go @@ -18,6 +18,8 @@ package amutex import ( "sync/atomic" + + "gvisor.dev/gvisor/pkg/syserror" ) // Sleeper must be implemented by users of the abortable mutex to allow for @@ -53,6 +55,21 @@ func (NoopSleeper) SleepFinish(success bool) {} // Interrupted implements Sleeper.Interrupted. func (NoopSleeper) Interrupted() bool { return false } +// Block blocks until either receiving from ch succeeds (in which case it +// returns nil) or sleeper is interrupted (in which case it returns +// syserror.ErrInterrupted). +func Block(sleeper Sleeper, ch <-chan struct{}) error { + cancel := sleeper.SleepStart() + select { + case <-ch: + sleeper.SleepFinish(true) + return nil + case <-cancel: + sleeper.SleepFinish(false) + return syserror.ErrInterrupted + } +} + // AbortableMutex is an abortable mutex. It allows Lock() to be aborted while it // waits to acquire the mutex. type AbortableMutex struct { diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 1cb1adf8c..642c79dda 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) @@ -134,6 +135,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt c.Regs.Regs[1] = uint64(infoAddr) c.Regs.Regs[2] = uint64(ucAddr) c.Regs.Regs[30] = uint64(act.Restorer) + + // Save the thread's floating point state. + c.sigFPState = append(c.sigFPState, c.aarch64FPState) + // Signal handler gets a clean floating point state. 
+ c.aarch64FPState = newAarch64FPState() return nil } @@ -155,5 +161,21 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalSt c.Regs.Sp = uc.MContext.Sp c.Regs.Pstate = uc.MContext.Pstate + // Restore floating point state. + l := len(c.sigFPState) + if l > 0 { + c.aarch64FPState = c.sigFPState[l-1] + // NOTE(cl/133042258): State save requires that any slice + // elements from '[len:cap]' to be zero value. + c.sigFPState[l-1] = nil + c.sigFPState = c.sigFPState[0 : l-1] + } else { + // This might happen if sigreturn(2) calls are unbalanced with + // respect to signal handler entries. This is not expected so + // don't bother to do anything fancy with the floating point + // state. + log.Warningf("sigreturn unable to restore application fpstate") + } + return uc.Sigset, uc.Stack, nil } diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 9d41fcbdb..8ae2d78d7 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -60,8 +60,7 @@ const ( limitHostFDTranslationKey = "limit_host_fd_translation" // overlayfsStaleRead if present closes cached readonly file after the first - // write. This is done to workaround a limitation of overlayfs in kernels - // before 4.19 where open FDs are not updated after the file is copied up. + // write. This is done to workaround a limitation of Linux overlayfs. overlayfsStaleRead = "overlayfs_stale_read" ) diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index a35c3a23d..cf9800100 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -16,7 +16,6 @@ package gofer import ( "fmt" - "syscall" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -68,7 +67,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string // Get a p9.File for name. 
qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) if err != nil { - if err == syscall.ENOENT { + if err == syserror.ENOENT { if cp.cacheNegativeDirents() { // Return a negative Dirent. It will stay cached until something // is created over it. @@ -207,7 +206,7 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t targetOpts, ok := target.InodeOperations.(*inodeOperations) if !ok { - return syscall.EXDEV + return syserror.EXDEV } if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { @@ -251,7 +250,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, } if i.session().overrides == nil { - return nil, syscall.EOPNOTSUPP + return nil, syserror.EOPNOTSUPP } // Stabilize the override map while creation is in progress. @@ -280,7 +279,7 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st // N.B. FIFOs use major/minor numbers 0. if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - if i.session().overrides == nil || err != syscall.EPERM { + if i.session().overrides == nil || err != syserror.EPERM { return err } // If gofer doesn't support mknod, check if we can create an internal fifo. @@ -427,17 +426,16 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent return syserror.ENAMETOOLONG } - // Unwrap the new parent to a *inodeOperations. - newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations) - if !ok { - return syscall.EXDEV + // Don't allow renames across different mounts. + if newParent.MountSource != oldParent.MountSource { + return syserror.EXDEV } + // Unwrap the new parent to a *inodeOperations. + newParentInodeOperations := newParent.InodeOperations.(*inodeOperations) + // Unwrap the old parent to a *inodeOperations. 
- oldParentInodeOperations, ok := oldParent.InodeOperations.(*inodeOperations) - if !ok { - return syscall.EXDEV - } + oldParentInodeOperations := oldParent.InodeOperations.(*inodeOperations) // Do the rename. if err := i.fileState.file.rename(ctx, newParentInodeOperations.fileState.file, newName); err != nil { diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 3c2b583ae..b095312fe 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -39,14 +39,13 @@ var fsInfo = fs.Info{ // rename implements fs.InodeOperations.Rename for tmpfs nodes. func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { - op, ok := oldParent.InodeOperations.(*Dir) - if !ok { - return syserror.EXDEV - } - np, ok := newParent.InodeOperations.(*Dir) - if !ok { + // Don't allow renames across different mounts. + if newParent.MountSource != oldParent.MountSource { return syserror.EXDEV } + + op := oldParent.InodeOperations.(*Dir) + np := newParent.InodeOperations.(*Dir) return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName, replacement) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 4a32821bd..7f2181216 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" @@ -835,6 +837,9 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if d.isSynthetic() { return nil, syserror.ENXIO } + if d.fs.iopts.OpenSocketsByConnecting { + return d.connectSocketLocked(ctx, opts) + } case linux.S_IFIFO: if d.isSynthetic() { 
return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) @@ -843,10 +848,28 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return d.openSpecialFileLocked(ctx, mnt, opts) } +func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, err + } + fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{ + HaveFlags: true, + Flags: opts.Flags, + }) + if err != nil { + fdObj.Close() + return nil, err + } + fdObj.Release() + return fd, nil +} + func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - // Treat as a special file. This is done for non-synthetic pipes as well as - // regular files when d.fs.opts.regularFilesUseSpecialFileFD is true. 
if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } @@ -854,10 +877,15 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts if err != nil { return nil, err } + seekable := d.fileType() == linux.S_IFREG fd := &specialFileFD{ - handle: h, + handle: h, + seekable: seekable, } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { h.close(ctx) return nil, err } @@ -888,7 +916,11 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } creds := rp.Credentials() name := rp.Component() - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // Filter file creation flags and O_LARGEFILE out; the create RPC already + // has the semantics of O_CREAT|O_EXCL, while some servers will choke on + // O_LARGEFILE. + createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE)) + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) if err != nil { dirfile.close(ctx) return nil, err @@ -896,7 +928,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // Then we need to walk to the file we just created to get a non-open fid // representing it, and to get its metadata. This must use d.file since, as // explained above, dirfile was invalidated by dirfile.Create(). 
- walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) if err != nil { openFile.close(ctx) if fdobj != nil { @@ -904,17 +936,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } - // Sanity-check that we walked to the file we created. - if createQID.Path != walkQID.Path { - // Probably due to concurrent remote filesystem mutation? - ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) - nonOpenFile.close(ctx) - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() - } - return nil, syserror.EAGAIN - } // Construct the new dentry. child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) @@ -960,16 +981,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } else { + seekable := child.fileType() == linux.S_IFREG fd := &specialFileFD{ handle: handle{ file: openFile, fd: -1, }, + seekable: seekable, } if fdobj != nil { fd.handle.fd = int32(fdobj.Release()) } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { fd.handle.close(ctx) return nil, err } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index e68e37ebc..353e2cf5b 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -143,9 +143,12 @@ type filesystemOptions struct { // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote // filesystem may not be coherent with writable host FDs opened later, so - // mappings of the former must be replaced by mappings of the latter. 
This - // is usually only the case when the remote filesystem is an overlayfs - // mount on Linux < 4.19. + // all uses of the former must be replaced by uses of the latter. This is + // usually only the case when the remote filesystem is a Linux overlayfs + // mount. (Prior to Linux 4.18, patch series centered on commit + // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were + // incoherent between pre-copy-up and post-copy-up FDs; after that patch + // series, only memory mappings are incoherent.) overlayfsStaleRead bool // If regularFilesUseSpecialFileFD is true, application FDs representing @@ -221,6 +224,10 @@ type InternalFilesystemOptions struct { // which servers can handle only a single client and report failure if that // client disconnects. LeakConnection bool + + // If OpenSocketsByConnecting is true, silently translate attempts to open + // files identifying as sockets to connect RPCs. + OpenSocketsByConnecting bool } // Name implements vfs.FilesystemType.Name. diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 507e0e276..a464e6a94 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -33,13 +33,14 @@ import ( type specialFileFD struct { fileDescription - // handle is immutable. + // handle is used for file I/O. handle is immutable. handle handle - // off is the file offset. off is protected by mu. (POSIX 2.9.7 only - // requires operations using the file offset to be atomic for regular files - // and symlinks; however, since specialFileFD may be used for regular - // files, we apply this atomicity unconditionally.) + // seekable is true if this file description represents a file for which + // file offset is significant, i.e. a regular file. seekable is immutable. + seekable bool + + // If seekable is true, off is the file offset. off is protected by mu. 
mu sync.Mutex off int64 } @@ -63,7 +64,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { @@ -91,6 +92,10 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // Read implements vfs.FileDescriptionImpl.Read. func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if !fd.seekable { + return fd.PRead(ctx, dst, -1, opts) + } + fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n @@ -100,14 +105,14 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } - if fd.dentry().fileType() == linux.S_IFREG { + if fd.seekable { limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, err @@ -130,6 +135,10 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Write implements vfs.FileDescriptionImpl.Write. func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + if !fd.seekable { + return fd.PWrite(ctx, src, -1, opts) + } + fd.mu.Lock() n, err := fd.PWrite(ctx, src, fd.off, opts) fd.off += n @@ -139,6 +148,9 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // Seek implements vfs.FileDescriptionImpl.Seek. 
func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if !fd.seekable { + return 0, syserror.ESPIPE + } fd.mu.Lock() defer fd.mu.Unlock() switch whence { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 36bceeaa4..8caf55a1b 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -40,8 +40,20 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { +// NewFDOptions contains options to NewFD. +type NewFDOptions struct { + // If IsTTY is true, the file descriptor is a TTY. + IsTTY bool + + // If HaveFlags is true, use Flags for the new file description. Otherwise, + // the new file description will inherit flags from hostFD. + HaveFlags bool + Flags uint32 +} + +// NewFD returns a vfs.FileDescription representing the given host file +// descriptor. mnt must be Kernel.HostMount(). +func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { fs, ok := mnt.Filesystem().Impl().(*filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) @@ -53,10 +65,14 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs return nil, err } - // Get flags for the imported FD. - flags, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0) - if err != nil { - return nil, err + flags := opts.Flags + if !opts.HaveFlags { + // Get flags for the imported FD. + flagsInt, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0) + if err != nil { + return nil, err + } + flags = uint32(flagsInt) } fileMode := linux.FileMode(s.Mode) @@ -65,13 +81,13 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs // Determine if hostFD is seekable. 
If not, this syscall will return ESPIPE // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character // devices. - _, err = unix.Seek(hostFD, 0, linux.SEEK_CUR) + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) seekable := err != syserror.ESPIPE i := &inode{ hostFD: hostFD, seekable: seekable, - isTTY: isTTY, + isTTY: opts.IsTTY, canMap: canMap(uint32(fileType)), wouldBlock: wouldBlock(uint32(fileType)), ino: fs.NextIno(), @@ -101,7 +117,14 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs // i.open will take a reference on d. defer d.DecRef() - return i.open(ctx, d.VFSDentry(), mnt, uint32(flags)) + return i.open(ctx, d.VFSDentry(), mnt, flags) +} + +// ImportFD sets up and returns a vfs.FileDescription from a donated fd. +func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { + return NewFD(ctx, mnt, hostFD, &NewFDOptions{ + IsTTY: isTTY, + }) } // filesystemType implements vfs.FilesystemType. 
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index a2d9649e7..007be1572 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -52,7 +52,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", - "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", @@ -96,6 +95,7 @@ go_test( "pipe_test.go", "regular_file_test.go", "stat_test.go", + "tmpfs_test.go", ], library = ":tmpfs", deps = [ @@ -105,7 +105,6 @@ go_test( "//pkg/sentry/contexttest", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/contexttest", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 36ffcb592..80fa7b29d 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -16,6 +16,7 @@ package tmpfs import ( "fmt" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -24,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -76,8 +78,8 @@ afterSymlink: return nil, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issue/1197): Symlink traversals updates - // access time. + // Symlink traversal updates access time. + atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -361,8 +363,8 @@ afterTrailingSymlink: } // Do we need to resolve a trailing symlink? if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issue/1197): Symlink traversals updates - // access time. + // Symlink traversal updates access time. 
+ atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -636,12 +638,19 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { + if _, err := resolveLocked(rp); err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issue/1197): Actually implement statfs. - return linux.Statfs{}, syserror.ENOSYS + statfs := linux.Statfs{ + Type: linux.TMPFS_MAGIC, + BlockSize: usermem.PageSize, + FragmentSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + // TODO(b/29637826): Allow configuring a tmpfs size and enforce it. + Blocks: 0, + BlocksFree: 0, + } + return statfs, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -763,5 +772,24 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + if d.name != "" { + // This must be an anonymous memfd file. 
+ b.PrependComponent("/" + d.name) + return vfs.PrependPathSyntheticError{} + } + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 57e5e28ec..3f433d666 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -88,6 +88,7 @@ type regularFile struct { func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { file := &regularFile{ memFile: fs.memFile, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, creds, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory @@ -577,3 +578,44 @@ exitLoop: return done, retErr } + +// GetSeals returns the current set of seals on a memfd inode. +func GetSeals(fd *vfs.FileDescription) (uint32, error) { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return 0, syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + return rf.seals, nil +} + +// AddSeals adds new file seals to a memfd inode. +func AddSeals(fd *vfs.FileDescription, val uint32) error { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + if rf.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. + return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if rf.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed.
+ rf.seals |= val + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 0399725cf..64e1c40ad 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -18,152 +18,16 @@ import ( "bytes" "fmt" "io" - "sync/atomic" "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// nextFileID is used to generate unique file names. -var nextFileID int64 - -// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error -// is not nil, then cleanup should be called when the root is no longer needed. -func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { - creds := auth.CredentialsFromContext(ctx) - - vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) - } - - vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) - if err != nil { - return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) - } - root := mntns.Root() - return vfsObj, root, func() { - root.DecRef() - mntns.DecRef() - }, nil -} - -// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If -// the returned err is not nil, then cleanup should be called when the FD is no -// longer needed. 
-func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the file that will be write/read. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: linux.ModeRegular | mode, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) - } - - return fd, cleanup, nil -} - -// newDirFD is like newFileFD, but for directories. -func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the dir. - if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(dirname), - }, &vfs.MkdirOptions{ - Mode: linux.ModeDirectory | mode, - }); err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) - } - - // Open the dir and return it. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(dirname), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY | linux.O_DIRECTORY, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) - } - - return fd, cleanup, nil -} - -// newPipeFD is like newFileFD, but for pipes. 
-func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the pipe. - if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(pipename), - }, &vfs.MknodOptions{ - Mode: linux.ModeNamedPipe | mode, - }); err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err) - } - - // Open the pipe and return it. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(pipename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err) - } - - return fd, cleanup, nil -} - // Test that we can write some data to a file and read it back.` func TestSimpleWriteRead(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index 60c2c980e..f7ee4aab2 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -19,8 +19,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -29,7 +29,6 @@ func TestStatAfterCreate(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -175,7 +174,6 @@ func TestSetStat(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. 
- // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 405928bd0..1e781aecd 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -94,7 +94,7 @@ type FilesystemOpts struct { } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") @@ -139,6 +139,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return &fs.vfsfs, &root.vfsd, nil } +// NewFilesystem returns a new tmpfs filesystem. +func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) { + return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{}) +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) @@ -658,3 +663,34 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) } + +// NewMemfd creates a new tmpfs regular file and file description that can back +// an anonymous fd created by memfd_create. 
+func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("NewMemfd() called with non-tmpfs mount") + } + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. + mode := linux.FileMode(0777) + inode := fs.newRegularFile(creds, mode) + rf := inode.impl.(*regularFile) + if allowSeals { + rf.seals = 0 + } + + d := fs.newDentry(inode) + defer d.DecRef() + d.name = name + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + var fd regularFileFD + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go new file mode 100644 index 000000000..a240fb276 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -0,0 +1,156 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// nextFileID is used to generate unique file names. 
+var nextFileID int64 + +// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error +// is nil, then cleanup should be called when the root is no longer needed. +func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { + creds := auth.CredentialsFromContext(ctx) + + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + } + root := mntns.Root() + return vfsObj, root, func() { + root.DecRef() + mntns.DecRef() + }, nil +} + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the file that will be written and read. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: linux.ModeRegular | mode, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) + } + + return fd, cleanup, nil +} + +// newDirFD is like newFileFD, but for directories. 
+func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the dir. + if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.MkdirOptions{ + Mode: linux.ModeDirectory | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) + } + + // Open the dir and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) + } + + return fd, cleanup, nil +} + +// newPipeFD is like newFileFD, but for pipes. 
+func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + name := fmt.Sprintf("tmpfs-test-%d", atomic.AddInt64(&nextFileID, 1)) + + if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.MknodOptions{ + Mode: linux.ModeNamedPipe | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create pipe %q: %v", name, err) + } + + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open pipe %q: %v", name, err) + } + + return fd, cleanup, nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 8104f50f3..a28eab8b8 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -173,6 +173,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 3617da8c6..5efeb3767 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -53,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -259,6 +260,10 @@ type Kernel struct { // syscalls (as opposed to named pipes created by mknod()). 
pipeMount *vfs.Mount + // shmMount is the Mount used for anonymous files created by the + // memfd_create() syscall. It is analogous to Linux's shm_mnt. + shmMount *vfs.Mount + // socketMount is the Mount used for sockets created by the socket() and // socketpair() syscalls. There are several cases where a socket dentry will // not be contained in socketMount: @@ -330,6 +335,9 @@ func (k *Kernel) Init(args InitKernelArgs) error { if args.Timekeeper == nil { return fmt.Errorf("Timekeeper is nil") } + if args.Timekeeper.clocks == nil { + return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") } @@ -384,6 +392,18 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.pipeMount = pipeMount + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + if err != nil { + return fmt.Errorf("failed to create tmpfs filesystem: %v", err) + } + defer tmpfsFilesystem.DecRef() + defer tmpfsRoot.DecRef() + shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create tmpfs mount: %v", err) + } + k.shmMount = shmMount + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) @@ -1656,6 +1676,11 @@ func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } +// ShmMount returns the tmpfs mount. +func (k *Kernel) ShmMount() *vfs.Mount { + return k.shmMount +} + // SocketMount returns the sockfs mount. 
func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 41a1ce031..789bb94c8 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -59,6 +59,13 @@ type metadata struct { // developing iptables, but can pollute sentry logs otherwise. const enableLogging = false +// emptyFilter is for comparison with a rule's filters to determine whether it +// is also empty. It is immutable. +var emptyFilter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00", +} + // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...interface{}) { if enableLogging && log.IsLogging(log.Debug) { @@ -484,7 +491,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { } if offset == replace.Underflow[hook] { if !validUnderflow(table.Rules[ruleIdx]) { - nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP") + nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx) return syserr.ErrInvalidArgument } table.Underflows[hk] = ruleIdx @@ -547,7 +554,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // make sure all other chains point to ACCEPT rules. 
for hook, ruleIdx := range table.BuiltinChains { if hook == stack.Forward || hook == stack.Postrouting { - if _, ok := table.Rules[ruleIdx].Target.(stack.AcceptTarget); !ok { + if !isUnconditionalAccept(table.Rules[ruleIdx]) { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } @@ -776,6 +783,9 @@ func validUnderflow(rule stack.Rule) bool { if len(rule.Matchers) != 0 { return false } + if rule.Filter != emptyFilter { + return false + } switch rule.Target.(type) { case stack.AcceptTarget, stack.DropTarget: return true @@ -784,6 +794,14 @@ func validUnderflow(rule stack.Rule) bool { } } +func isUnconditionalAccept(rule stack.Rule) bool { + if !validUnderflow(rule) { + return false + } + _, ok := rule.Target.(stack.AcceptTarget) + return ok +} + func hookFromLinux(hook int) stack.Hook { switch hook { case linux.NF_INET_PRE_ROUTING: diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go index 5949a7c29..3863293c7 100644 --- a/pkg/sentry/socket/netfilter/owner_matcher.go +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -45,14 +45,18 @@ func (ownerMarshaler) marshal(mr stack.Matcher) []byte { GID: matcher.gid, } - // Support for UID match. - // TODO(gvisor.dev/issue/170): Need to support gid match. + // Support for UID and GID match. if matcher.matchUID { iptOwnerInfo.Match = linux.XT_OWNER_UID - } else if matcher.matchGID { - panic("GID match is not supported.") - } else { - panic("UID match is not set.") + if matcher.invertUID { + iptOwnerInfo.Invert = linux.XT_OWNER_UID + } + } + if matcher.matchGID { + iptOwnerInfo.Match |= linux.XT_OWNER_GID + if matcher.invertGID { + iptOwnerInfo.Invert |= linux.XT_OWNER_GID + } } buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo) @@ -71,31 +75,34 @@ func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack. 
binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], usermem.ByteOrder, &matchData) nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData) - if matchData.Invert != 0 { - return nil, fmt.Errorf("invert flag is not supported for owner match") - } - - // Support for UID match. - // TODO(gvisor.dev/issue/170): Need to support gid match. - if matchData.Match&linux.XT_OWNER_UID != linux.XT_OWNER_UID { - return nil, fmt.Errorf("owner match is only supported for uid") - } - - // Check Flags. var owner OwnerMatcher owner.uid = matchData.UID owner.gid = matchData.GID - owner.matchUID = true + + // Check flags. + if matchData.Match&linux.XT_OWNER_UID != 0 { + owner.matchUID = true + if matchData.Invert&linux.XT_OWNER_UID != 0 { + owner.invertUID = true + } + } + if matchData.Match&linux.XT_OWNER_GID != 0 { + owner.matchGID = true + if matchData.Invert&linux.XT_OWNER_GID != 0 { + owner.invertGID = true + } + } return &owner, nil } type OwnerMatcher struct { - uid uint32 - gid uint32 - matchUID bool - matchGID bool - invert uint8 + uid uint32 + gid uint32 + matchUID bool + matchGID bool + invertUID bool + invertGID bool } // Name implements Matcher.Name. @@ -112,16 +119,30 @@ func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interface } // If the packet owner is not set, drop the packet. - // Support for uid match. - // TODO(gvisor.dev/issue/170): Need to support gid match. - if pkt.Owner == nil || !om.matchUID { + if pkt.Owner == nil { return false, true } - // TODO(gvisor.dev/issue/170): Need to add tests to verify - // drop rule when packet UID does not match owner matcher UID. - if pkt.Owner.UID() != om.uid { - return false, false + var matches bool + // Check for UID match. + if om.matchUID { + if pkt.Owner.UID() == om.uid { + matches = true + } + if matches == om.invertUID { + return false, false + } + } + + // Check for GID match. 
+ if om.matchGID { + matches = false + if pkt.Owner.GID() == om.gid { + matches = true + } + if matches == om.invertGID { + return false, false + } } return true, false diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 6129fb83d..333e0042e 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -18,6 +18,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/amutex", "//pkg/binary", "//pkg/context", "//pkg/log", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 81053d8ef..9dea2b5ff 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -34,6 +34,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -553,11 +554,9 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } if resCh != nil { - t := kernel.TaskFromContext(ctx) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) } @@ -626,11 +625,9 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader } if resCh != nil { - t := kernel.TaskFromContext(ctx) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{ Atomic: true, // See above. 
}) @@ -1324,6 +1321,29 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / time.Second), nil + case linux.TCP_SYNCNT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.TCP_WINDOW_CLAMP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil default: emitUnimplementedEventTCP(t, name) } @@ -1793,6 +1813,22 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + case linux.TCP_SYNCNT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) + + case linux.TCP_WINDOW_CLAMP: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 191970d41..fcd8013c0 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -16,6 +16,7 @@ package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" @@ -89,11 +90,6 @@ func (s *SocketVFS2) EventUnregister(e 
*waiter.Entry) { s.socketOpsCommon.EventUnregister(e) } -// PRead implements vfs.FileDescriptionImpl. -func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE -} - // Read implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. @@ -115,11 +111,6 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. return int64(n), nil } -// PWrite implements vfs.FileDescriptionImpl. -func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE -} - // Write implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. 
@@ -135,11 +126,9 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs } if resCh != nil { - t := kernel.TaskFromContext(ctx) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index c32f942fb..f882ef840 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -13,6 +13,7 @@ go_library( "fscontext.go", "getdents.go", "ioctl.go", + "memfd.go", "mmap.go", "path.go", "pipe.go", @@ -43,6 +44,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 8181d80f4..ca0f7fd1e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" @@ -157,6 +158,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.EBADF } return uintptr(pipefile.PipeSize()), nil, nil + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.IsWritable() { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file, args[2].Uint()) + return 0, nil, err default: // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. 
return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go new file mode 100644 index 000000000..bbe248d17 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + memfdPrefix = "memfd:" + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. 
+ return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, memfdMaxNameLen) + if err != nil { + return 0, nil, err + } + + shmMount := t.Kernel().ShmMount() + file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + if err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 9c04677f1..ec8da7f06 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -158,7 +158,7 @@ func Override() { s.Table[306] = syscalls.Supported("syncfs", Syncfs) s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg) s.Table[316] = syscalls.Supported("renameat2", Renameat2) - delete(s.Table, 319) // memfd_create + s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate) s.Table[322] = syscalls.Supported("execveat", Execveat) s.Table[327] = syscalls.Supported("preadv2", Preadv2) s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index f01217c91..9a3c5d6c3 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -59,6 +59,9 @@ func (v *View) Reader() bytes.Reader { // ToVectorisedView returns a VectorisedView containing the receiver. func (v View) ToVectorisedView() VectorisedView { + if len(v) == 0 { + return VectorisedView{} + } return NewVectorisedView(len(v), []View{v}) } @@ -229,6 +232,9 @@ func (vv *VectorisedView) Append(vv2 VectorisedView) { // AppendView appends the given view into this vectorised view. 
func (vv *VectorisedView) AppendView(v View) { + if len(v) == 0 { + return + } vv.views = append(vv.views, v) vv.size += len(v) } diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go index c56795c7b..726e54de9 100644 --- a/pkg/tcpip/buffer/view_test.go +++ b/pkg/tcpip/buffer/view_test.go @@ -483,3 +483,39 @@ func TestPullUp(t *testing.T) { } } } + +func TestToVectorisedView(t *testing.T) { + testCases := []struct { + in View + want VectorisedView + }{ + {nil, VectorisedView{}}, + {View{}, VectorisedView{}}, + {View{'a'}, VectorisedView{size: 1, views: []View{{'a'}}}}, + } + for _, tc := range testCases { + if got, want := tc.in.ToVectorisedView(), tc.want; !reflect.DeepEqual(got, want) { + t.Errorf("(%v).ToVectorisedView failed got: %+v, want: %+v", tc.in, got, want) + } + } +} + +func TestAppendView(t *testing.T) { + testCases := []struct { + vv VectorisedView + in View + want VectorisedView + }{ + {VectorisedView{}, nil, VectorisedView{}}, + {VectorisedView{}, View{}, VectorisedView{}}, + {VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, nil, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}}, + {VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}}, + {VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{'e'}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}, {'e'}}, 5}}, + } + for _, tc := range testCases { + tc.vv.AppendView(tc.in) + if got, want := tc.vv, tc.want; !reflect.DeepEqual(got, want) { + t.Errorf("(%v).ToVectorisedView failed got: %+v, want: %+v", tc.in, got, want) + } + } +} diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go index be9fec3b3..54432194d 100644 --- a/pkg/tcpip/link/qdisc/fifo/endpoint.go +++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go @@ -163,7 +163,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne // WritePackets implements stack.LinkEndpoint.WritePackets. 
// -// Being a batch API each packet in pkts should have the following fields +// Being a batch API, each packet in pkts should have the following fields // populated: // - pkt.EgressRoute // - pkt.GSOOptions diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 1ca4088c9..45e930ad8 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -622,6 +622,19 @@ const ( // // A zero value indicates the default. TTLOption + + // TCPSynCountOption is used by SetSockOpt/GetSockOpt to specify the number of + // SYN retransmits that TCP should send before aborting the attempt to + // connect. It cannot exceed 255. + // + // NOTE: This option is currently only stubbed out and is a no-op. + TCPSynCountOption + + // TCPWindowClampOption is used by SetSockOpt/GetSockOpt to bound the size + // of the advertised window to this value. + // + // NOTE: This option is currently only stubbed out and is a no-op. + TCPWindowClampOption ) // ErrorOption is used in GetSockOpt to specify that the last error reported by @@ -685,11 +698,23 @@ type TCPDeferAcceptOption time.Duration // default MinRTO used by the Stack. type TCPMinRTOOption time.Duration +// TCPMaxRTOOption is used by SetSockOpt/GetSockOpt to allow overriding +// default MaxRTO used by the Stack. +type TCPMaxRTOOption time.Duration + +// TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the +// maximum number of retransmits after which we time out the connection. +type TCPMaxRetriesOption uint64 + // TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify // the number of endpoints that can be in SYN-RCVD state before the stack // switches to using SYN cookies. type TCPSynRcvdCountThresholdOption uint64 +// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide +// default for number of times SYN is retransmitted before aborting a connect. 
+type TCPSynRetriesOption uint8 + // MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a // default interface for multicast. type MulticastInterfaceOption struct { diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 07d3e64c8..71735029e 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -470,6 +470,17 @@ type endpoint struct { // for this endpoint using the TCP_MAXSEG setsockopt. userMSS uint16 + // maxSynRetries is the maximum number of SYN retransmits that TCP should + // send before aborting the attempt to connect. It cannot exceed 255. + // + // NOTE: This is currently a no-op and does not change the SYN + // retransmissions. + maxSynRetries uint8 + + // windowClamp is used to bound the size of the advertised window to + // this value. + windowClamp uint32 + // The following fields are used to manage the send buffer. When // segments are ready to be sent, they are added to sndQueue and the // protocol goroutine is signaled via sndWaker. 
@@ -795,8 +806,10 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue interval: 75 * time.Second, count: 9, }, - uniqueID: s.UniqueID(), - txHash: s.Rand().Uint32(), + uniqueID: s.UniqueID(), + txHash: s.Rand().Uint32(), + windowClamp: DefaultReceiveBufferSize, + maxSynRetries: DefaultSynRetries, } var ss SendBufferSizeOption @@ -829,6 +842,11 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue e.tcpLingerTimeout = time.Duration(tcpLT) } + var synRetries tcpip.TCPSynRetriesOption + if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { + e.maxSynRetries = uint8(synRetries) + } + if p := s.GetTCPProbe(); p != nil { e.probe = p } @@ -1603,6 +1621,36 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { e.ttl = uint8(v) e.UnlockUser() + case tcpip.TCPSynCountOption: + if v < 1 || v > 255 { + return tcpip.ErrInvalidOptionValue + } + e.LockUser() + e.maxSynRetries = uint8(v) + e.UnlockUser() + + case tcpip.TCPWindowClampOption: + if v == 0 { + e.LockUser() + switch e.EndpointState() { + case StateClose, StateInitial: + e.windowClamp = 0 + e.UnlockUser() + return nil + default: + e.UnlockUser() + return tcpip.ErrInvalidOptionValue + } + } + var rs ReceiveBufferSizeOption + if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { + if v < rs.Min/2 { + v = rs.Min / 2 + } + } + e.LockUser() + e.windowClamp = uint32(v) + e.UnlockUser() } return nil } @@ -1826,6 +1874,18 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.UnlockUser() return v, nil + case tcpip.TCPSynCountOption: + e.LockUser() + v := int(e.maxSynRetries) + e.UnlockUser() + return v, nil + + case tcpip.TCPWindowClampOption: + e.LockUser() + v := int(e.windowClamp) + e.UnlockUser() + return v, nil + default: return -1, tcpip.ErrUnknownProtocolOption } diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 
cfd9a4e8e..2a2a7ddeb 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -64,6 +64,10 @@ const ( // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger // in TIME_WAIT state before being marked closed. DefaultTCPTimeWaitTimeout = 60 * time.Second + + // DefaultSynRetries is the default value for the number of SYN retransmits + // before a connect is aborted. + DefaultSynRetries = 6 ) // SACKEnabled option can be used to enable SACK support in the TCP @@ -163,7 +167,10 @@ type protocol struct { tcpLingerTimeout time.Duration tcpTimeWaitTimeout time.Duration minRTO time.Duration + maxRTO time.Duration + maxRetries uint32 synRcvdCount synRcvdCounter + synRetries uint8 dispatcher *dispatcher } @@ -340,12 +347,36 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { p.mu.Unlock() return nil + case tcpip.TCPMaxRTOOption: + if v < 0 { + v = tcpip.TCPMaxRTOOption(MaxRTO) + } + p.mu.Lock() + p.maxRTO = time.Duration(v) + p.mu.Unlock() + return nil + + case tcpip.TCPMaxRetriesOption: + p.mu.Lock() + p.maxRetries = uint32(v) + p.mu.Unlock() + return nil + case tcpip.TCPSynRcvdCountThresholdOption: p.mu.Lock() p.synRcvdCount.SetThreshold(uint64(v)) p.mu.Unlock() return nil + case tcpip.TCPSynRetriesOption: + if v < 1 || v > 255 { + return tcpip.ErrInvalidOptionValue + } + p.mu.Lock() + p.synRetries = uint8(v) + p.mu.Unlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -414,12 +445,30 @@ func (p *protocol) Option(option interface{}) *tcpip.Error { p.mu.RUnlock() return nil + case *tcpip.TCPMaxRTOOption: + p.mu.RLock() + *v = tcpip.TCPMaxRTOOption(p.maxRTO) + p.mu.RUnlock() + return nil + + case *tcpip.TCPMaxRetriesOption: + p.mu.RLock() + *v = tcpip.TCPMaxRetriesOption(p.maxRetries) + p.mu.RUnlock() + return nil + case *tcpip.TCPSynRcvdCountThresholdOption: p.mu.RLock() *v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold()) p.mu.RUnlock() return nil + case 
*tcpip.TCPSynRetriesOption: + p.mu.RLock() + *v = tcpip.TCPSynRetriesOption(p.synRetries) + p.mu.RUnlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -452,6 +501,9 @@ func NewProtocol() stack.TransportProtocol { tcpTimeWaitTimeout: DefaultTCPTimeWaitTimeout, synRcvdCount: synRcvdCounter{threshold: SynRcvdCountThreshold}, dispatcher: newDispatcher(runtime.GOMAXPROCS(0)), + synRetries: DefaultSynRetries, minRTO: MinRTO, + maxRTO: MaxRTO, + maxRetries: MaxRetries, } } diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 9e547a221..06dc9b7d7 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -43,7 +43,8 @@ const ( nDupAckThreshold = 3 // MaxRetries is the maximum number of probe retries sender does - // before timing out the connection, Linux default TCP_RETR2. + // before timing out the connection. + // Linux default TCP_RETR2, net.ipv4.tcp_retries2. MaxRetries = 15 ) @@ -165,6 +166,12 @@ type sender struct { // minRTO is the minimum permitted value for sender.rto. minRTO time.Duration + // maxRTO is the maximum permitted value for sender.rto. + maxRTO time.Duration + + // maxRetries is the maximum permitted retransmissions. + maxRetries uint32 + // maxPayloadSize is the maximum size of the payload of a given segment. // It is initialized on demand. maxPayloadSize int @@ -276,12 +283,24 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint // etc. s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss) - // Get Stack wide minRTO. - var v tcpip.TCPMinRTOOption - if err := ep.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { + // Get Stack wide config. 
+ var minRTO tcpip.TCPMinRTOOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) } - s.minRTO = time.Duration(v) + s.minRTO = time.Duration(minRTO) + + var maxRTO tcpip.TCPMaxRTOOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { + panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) + } + s.maxRTO = time.Duration(maxRTO) + + var maxRetries tcpip.TCPMaxRetriesOption + if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { + panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) + } + s.maxRetries = uint32(maxRetries) return s } @@ -485,7 +504,7 @@ func (s *sender) retransmitTimerExpired() bool { } elapsed := time.Since(s.firstRetransmittedSegXmitTime) - remaining := MaxRTO + remaining := s.maxRTO if uto != 0 { // Cap to the user specified timeout if one is specified. remaining = uto - elapsed @@ -494,24 +513,17 @@ func (s *sender) retransmitTimerExpired() bool { // Always honor the user-timeout irrespective of whether the zero // window probes were acknowledged. // net/ipv4/tcp_timer.c::tcp_probe_timer() - if remaining <= 0 || s.unackZeroWindowProbes >= MaxRetries { + if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { return false } - if s.rto >= MaxRTO { - // RFC 1122 section: 4.2.2.17 - // A TCP MAY keep its offered receive window closed - // indefinitely. As long as the receiving TCP continues to - // send acknowledgments in response to the probe segments, the - // sending TCP MUST allow the connection to stay open. - if !(s.zeroWindowProbing && s.unackZeroWindowProbes == 0) { - return false - } - } - // Set new timeout. The timer will be restarted by the call to sendData // below. s.rto *= 2 + // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 + if s.rto > s.maxRTO { + s.rto = s.maxRTO + } // Cap RTO to remaining time. 
if s.rto > remaining { @@ -565,9 +577,20 @@ func (s *sender) retransmitTimerExpired() bool { // send. if s.zeroWindowProbing { s.sendZeroWindowProbe() + // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed + // indefinitely. As long as the receiving TCP continues to send + // acknowledgments in response to the probe segments, the sending TCP + // MUST allow the connection to stay open. return true } + seg := s.writeNext + // RFC 1122 4.2.3.5: Close the connection when the number of + // retransmissions for this segment is beyond a limit. + if seg != nil && seg.xmitCount > s.maxRetries { + return false + } + s.sendData() return true diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index d2c90ebd5..0b4512c65 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -2994,6 +2994,101 @@ func TestSendOnResetConnection(t *testing.T) { } } +// TestMaxRetransmitsTimeout tests if the connection is timed out after +// a segment has been retransmitted MaxRetries times. +func TestMaxRetransmitsTimeout(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + const numRetries = 2 + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRetriesOption(numRetries)); err != nil { + t.Fatalf("could not set protocol option MaxRetries.\n") + } + + c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) + + waitEntry, notifyCh := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&waitEntry, waiter.EventHUp) + defer c.WQ.EventUnregister(&waitEntry) + + _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %v", err) + } + + // Expect first transmit and MaxRetries retransmits. 
+ for i := 0; i < numRetries+1; i++ { + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh), + ), + ) + } + // Wait for the connection to timeout after MaxRetries retransmits. + initRTO := 1 * time.Second + select { + case <-notifyCh: + case <-time.After((2 << numRetries) * initRTO): + t.Fatalf("connection still alive after maximum retransmits.\n") + } + + // Send an ACK and expect a RST as the connection would have been closed. + c.SendPacket(nil, &context.Headers{ + SrcPort: context.TestPort, + DstPort: c.Port, + Flags: header.TCPFlagAck, + }) + + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagRst), + ), + ) + + if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 { + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got) + } +} + +// TestMaxRTO tests if the retransmit interval caps to MaxRTO. 
+func TestMaxRTO(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + rto := 1 * time.Second + if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRTOOption(rto)); err != nil { + t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPMaxRTO(%d) failed: %s", rto, err) + } + + c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) + + _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed: %v", err) + } + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + const numRetransmits = 2 + for i := 0; i < numRetransmits; i++ { + start := time.Now() + checker.IPv4(t, c.GetPacket(), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + if time.Since(start).Round(time.Second).Seconds() != rto.Seconds() { + t.Errorf("Retransmit interval not capped to MaxRTO.\n") + } + } +} + func TestFinImmediately(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -6605,9 +6700,16 @@ func TestTCPUserTimeout(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) + waitEntry, notifyCh := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&waitEntry, waiter.EventHUp) + defer c.WQ.EventUnregister(&waitEntry) + origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value() - userTimeout := 50 * time.Millisecond + // Ensure that on the next retransmit timer fire, the user timeout has + // expired. + initRTO := 1 * time.Second + userTimeout := initRTO / 2 c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)) // Send some data and wait before ACKing it. 
@@ -6627,9 +6729,13 @@ func TestTCPUserTimeout(t *testing.T) { ), ) - // Wait for a little over the minimum retransmit timeout of 200ms for - // the retransmitTimer to fire and close the connection. - time.Sleep(tcp.MinRTO + 10*time.Millisecond) + // Wait for the retransmit timer to be fired and the user timeout to cause + // close of the connection. + select { + case <-notifyCh: + case <-time.After(2 * initRTO): + t.Fatalf("connection still alive after %s, should have been closed after :%s", 2*initRTO, userTimeout) + } // No packet should be received as the connection should be silently // closed due to timeout. |