summaryrefslogtreecommitdiffhomepage
path: root/pkg
diff options
context:
space:
mode:
Diffstat (limited to 'pkg')
-rw-r--r--pkg/abi/linux/ioctl.go2
-rw-r--r--pkg/sentry/devices/memdev/full.go4
-rw-r--r--pkg/sentry/devices/memdev/null.go4
-rw-r--r--pkg/sentry/devices/memdev/random.go4
-rw-r--r--pkg/sentry/devices/memdev/zero.go4
-rw-r--r--pkg/sentry/devices/ttydev/ttydev.go2
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts.go8
-rw-r--r--pkg/sentry/fsimpl/devpts/master.go6
-rw-r--r--pkg/sentry/fsimpl/devpts/replica.go97
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs.go4
-rw-r--r--pkg/sentry/fsimpl/eventfd/eventfd.go2
-rw-r--r--pkg/sentry/fsimpl/ext/dentry.go2
-rw-r--r--pkg/sentry/fsimpl/ext/directory.go8
-rw-r--r--pkg/sentry/fsimpl/ext/ext.go2
-rw-r--r--pkg/sentry/fsimpl/ext/extent_file.go2
-rw-r--r--pkg/sentry/fsimpl/ext/filesystem.go4
-rw-r--r--pkg/sentry/fsimpl/ext/inode.go2
-rw-r--r--pkg/sentry/fsimpl/ext/regular_file.go6
-rw-r--r--pkg/sentry/fsimpl/ext/symlink.go4
-rw-r--r--pkg/sentry/fsimpl/fuse/connection.go25
-rw-r--r--pkg/sentry/fsimpl/fuse/dev.go16
-rw-r--r--pkg/sentry/fsimpl/fuse/fusefs.go55
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go3
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go39
-rw-r--r--pkg/sentry/fsimpl/gofer/handle.go2
-rw-r--r--pkg/sentry/fsimpl/gofer/regular_file.go7
-rw-r--r--pkg/sentry/fsimpl/gofer/socket.go4
-rw-r--r--pkg/sentry/fsimpl/gofer/special_file.go6
-rw-r--r--pkg/sentry/fsimpl/host/host.go12
-rw-r--r--pkg/sentry/fsimpl/host/mmap.go6
-rw-r--r--pkg/sentry/fsimpl/host/tty.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go8
-rw-r--r--pkg/sentry/fsimpl/kernfs/fd_impl_util.go11
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go126
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go72
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go38
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs_test.go20
-rw-r--r--pkg/sentry/fsimpl/kernfs/symlink.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/synthetic_directory.go14
-rw-r--r--pkg/sentry/fsimpl/overlay/directory.go3
-rw-r--r--pkg/sentry/fsimpl/overlay/non_directory.go3
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go16
-rw-r--r--pkg/sentry/fsimpl/pipefs/pipefs.go4
-rw-r--r--pkg/sentry/fsimpl/proc/filesystem.go7
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks.go1
-rw-r--r--pkg/sentry/fsimpl/proc/task.go2
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go1
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go5
-rw-r--r--pkg/sentry/fsimpl/proc/task_net.go1
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go2
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_files.go6
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys.go1
-rw-r--r--pkg/sentry/fsimpl/signalfd/signalfd.go4
-rw-r--r--pkg/sentry/fsimpl/sockfs/sockfs.go5
-rw-r--r--pkg/sentry/fsimpl/sys/kcov.go3
-rw-r--r--pkg/sentry/fsimpl/sys/sys.go9
-rw-r--r--pkg/sentry/fsimpl/timerfd/timerfd.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/device_file.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go4
-rw-r--r--pkg/sentry/fsimpl/tmpfs/named_pipe.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go7
-rw-r--r--pkg/sentry/fsimpl/tmpfs/socket_file.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/symlink.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go26
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go14
-rw-r--r--pkg/sentry/kernel/kernel.go13
-rw-r--r--pkg/sentry/socket/unix/unix_vfs2.go2
-rw-r--r--pkg/sentry/state/state.go6
-rw-r--r--pkg/sentry/vfs/anonfs.go5
-rw-r--r--pkg/sentry/vfs/dentry.go2
-rw-r--r--pkg/sentry/vfs/device.go3
-rw-r--r--pkg/sentry/vfs/epoll.go9
-rw-r--r--pkg/sentry/vfs/file_description.go10
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go28
-rw-r--r--pkg/sentry/vfs/filesystem.go6
-rw-r--r--pkg/sentry/vfs/filesystem_type.go2
-rw-r--r--pkg/sentry/vfs/genericfstree/genericfstree.go2
-rw-r--r--pkg/sentry/vfs/inotify.go2
-rw-r--r--pkg/sentry/vfs/lock.go2
-rw-r--r--pkg/sentry/vfs/mount.go9
-rw-r--r--pkg/sentry/vfs/mount_test.go26
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go11
-rw-r--r--pkg/sentry/vfs/options.go30
-rw-r--r--pkg/sentry/vfs/permissions.go2
-rw-r--r--pkg/sentry/vfs/resolving_path.go5
-rw-r--r--pkg/sentry/vfs/vfs.go2
-rw-r--r--pkg/state/types.go14
-rw-r--r--pkg/tcpip/checker/checker.go60
-rw-r--r--pkg/tcpip/stack/stack.go5
-rw-r--r--pkg/tcpip/transport/tcp/BUILD1
-rw-r--r--pkg/tcpip/transport/tcp/connect.go19
-rw-r--r--pkg/tcpip/transport/tcp/dual_stack_test.go16
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go165
-rw-r--r--pkg/tcpip/transport/tcp/endpoint_state.go4
-rw-r--r--pkg/tcpip/transport/tcp/rcv.go50
-rw-r--r--pkg/tcpip/transport/tcp/segment.go45
-rw-r--r--pkg/tcpip/transport/tcp/segment_queue.go52
-rw-r--r--pkg/tcpip/transport/tcp/tcp_test.go861
-rw-r--r--pkg/tcpip/transport/tcp/tcp_timestamp_test.go19
-rw-r--r--pkg/tcpip/transport/tcp/testing/context/context.go85
-rw-r--r--pkg/tcpip/transport/udp/udp_test.go12
101 files changed, 1479 insertions, 872 deletions
diff --git a/pkg/abi/linux/ioctl.go b/pkg/abi/linux/ioctl.go
index 3356a2b4a..dc9ac7e7c 100644
--- a/pkg/abi/linux/ioctl.go
+++ b/pkg/abi/linux/ioctl.go
@@ -115,7 +115,7 @@ const (
// Constants from uapi/linux/fs.h.
const (
- FS_IOC_GETFLAGS = 2147771905
+ FS_IOC_GETFLAGS = 2148034049
FS_VERITY_FL = 1048576
)
diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go
index 511179e31..fece3e762 100644
--- a/pkg/sentry/devices/memdev/full.go
+++ b/pkg/sentry/devices/memdev/full.go
@@ -24,6 +24,8 @@ import (
const fullDevMinor = 7
// fullDevice implements vfs.Device for /dev/full.
+//
+// +stateify savable
type fullDevice struct{}
// Open implements vfs.Device.Open.
@@ -38,6 +40,8 @@ func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// fullFD implements vfs.FileDescriptionImpl for /dev/full.
+//
+// +stateify savable
type fullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go
index 4918dbeeb..ff5837747 100644
--- a/pkg/sentry/devices/memdev/null.go
+++ b/pkg/sentry/devices/memdev/null.go
@@ -25,6 +25,8 @@ import (
const nullDevMinor = 3
// nullDevice implements vfs.Device for /dev/null.
+//
+// +stateify savable
type nullDevice struct{}
// Open implements vfs.Device.Open.
@@ -39,6 +41,8 @@ func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// nullFD implements vfs.FileDescriptionImpl for /dev/null.
+//
+// +stateify savable
type nullFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go
index 5e7fe0280..ac943e3ba 100644
--- a/pkg/sentry/devices/memdev/random.go
+++ b/pkg/sentry/devices/memdev/random.go
@@ -30,6 +30,8 @@ const (
)
// randomDevice implements vfs.Device for /dev/random and /dev/urandom.
+//
+// +stateify savable
type randomDevice struct{}
// Open implements vfs.Device.Open.
@@ -44,6 +46,8 @@ func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry,
}
// randomFD implements vfs.FileDescriptionImpl for /dev/random.
+//
+// +stateify savable
type randomFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 60cfea888..1929e41cd 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -27,6 +27,8 @@ import (
const zeroDevMinor = 5
// zeroDevice implements vfs.Device for /dev/zero.
+//
+// +stateify savable
type zeroDevice struct{}
// Open implements vfs.Device.Open.
@@ -41,6 +43,8 @@ func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// zeroFD implements vfs.FileDescriptionImpl for /dev/zero.
+//
+// +stateify savable
type zeroFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go
index 664e54498..a287c65ca 100644
--- a/pkg/sentry/devices/ttydev/ttydev.go
+++ b/pkg/sentry/devices/ttydev/ttydev.go
@@ -30,6 +30,8 @@ const (
)
// ttyDevice implements vfs.Device for /dev/tty.
+//
+// +stateify savable
type ttyDevice struct{}
// Open implements vfs.Device.Open.
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 77d1f493d..903135fae 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -35,6 +35,8 @@ import (
const Name = "devpts"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// Name implements vfs.FilesystemType.Name.
@@ -58,6 +60,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -110,6 +113,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// rootInode is the root directory inode for the devpts mounts.
+//
+// +stateify savable
type rootInode struct {
implStatFS
kernfs.AlwaysValid
@@ -131,7 +136,7 @@ type rootInode struct {
root *rootInode
// mu protects the fields below.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// replicas maps pty ids to replica inodes.
replicas map[uint32]*replicaInode
@@ -242,6 +247,7 @@ func (i *rootInode) DecRef(context.Context) {
i.rootInodeRefs.DecRef(i.Destroy)
}
+// +stateify savable
type implStatFS struct{}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 98d4ffb22..69c2fe951 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -31,6 +31,8 @@ import (
)
// masterInode is the inode for the master end of the Terminal.
+//
+// +stateify savable
type masterInode struct {
implStatFS
kernfs.InodeAttrs
@@ -56,14 +58,12 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernf
return nil, err
}
- mi.IncRef()
fd := &masterFileDescription{
inode: mi,
t: t,
}
fd.LockFD.Init(&mi.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
- mi.DecRef(ctx)
return nil, err
}
return &fd.vfsfd, nil
@@ -89,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
}
+// +stateify savable
type masterFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -103,7 +104,6 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
// Release implements vfs.FileDescriptionImpl.Release.
func (mfd *masterFileDescription) Release(ctx context.Context) {
mfd.inode.root.masterClose(mfd.t)
- mfd.inode.DecRef(ctx)
}
// EventRegister implements waiter.Waitable.EventRegister.
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
index 816bac80c..6515c5536 100644
--- a/pkg/sentry/fsimpl/devpts/replica.go
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -30,6 +30,8 @@ import (
)
// replicaInode is the inode for the replica end of the Terminal.
+//
+// +stateify savable
type replicaInode struct {
implStatFS
kernfs.InodeAttrs
@@ -52,14 +54,12 @@ type replicaInode struct {
var _ kernfs.Inode = (*replicaInode)(nil)
// Open implements kernfs.Inode.Open.
-func (si *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- si.IncRef()
+func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &replicaFileDescription{
- inode: si,
+ inode: ri,
}
- fd.LockFD.Init(&si.locks)
+ fd.LockFD.Init(&ri.locks)
if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
- si.DecRef(ctx)
return nil, err
}
return &fd.vfsfd, nil
@@ -67,34 +67,35 @@ func (si *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kern
}
// Valid implements kernfs.Inode.Valid.
-func (si *replicaInode) Valid(context.Context) bool {
+func (ri *replicaInode) Valid(context.Context) bool {
// Return valid if the replica still exists.
- si.root.mu.Lock()
- defer si.root.mu.Unlock()
- _, ok := si.root.replicas[si.t.n]
+ ri.root.mu.Lock()
+ defer ri.root.mu.Unlock()
+ _, ok := ri.root.replicas[ri.t.n]
return ok
}
// Stat implements kernfs.Inode.Stat.
-func (si *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
- statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
+func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+ statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts)
if err != nil {
return linux.Statx{}, err
}
statx.Blksize = 1024
statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR
- statx.RdevMinor = si.t.n
+ statx.RdevMinor = ri.t.n
return statx, nil
}
// SetStat implements kernfs.Inode.SetStat
-func (si *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
if opts.Stat.Mask&linux.STATX_SIZE != 0 {
return syserror.EINVAL
}
- return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+ return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
}
+// +stateify savable
type replicaFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -106,37 +107,35 @@ type replicaFileDescription struct {
var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil)
// Release implements fs.FileOperations.Release.
-func (sfd *replicaFileDescription) Release(ctx context.Context) {
- sfd.inode.DecRef(ctx)
-}
+func (rfd *replicaFileDescription) Release(ctx context.Context) {}
// EventRegister implements waiter.Waitable.EventRegister.
-func (sfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
- sfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
+func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
- sfd.inode.t.ld.replicaWaiter.EventUnregister(e)
+func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
+ rfd.inode.t.ld.replicaWaiter.EventUnregister(e)
}
// Readiness implements waiter.Waitable.Readiness.
-func (sfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
- return sfd.inode.t.ld.replicaReadiness()
+func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return rfd.inode.t.ld.replicaReadiness()
}
// Read implements vfs.FileDescriptionImpl.Read.
-func (sfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
- return sfd.inode.t.ld.inputQueueRead(ctx, dst)
+func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+ return rfd.inode.t.ld.inputQueueRead(ctx, dst)
}
// Write implements vfs.FileDescriptionImpl.Write.
-func (sfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
- return sfd.inode.t.ld.outputQueueWrite(ctx, src)
+func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+ return rfd.inode.t.ld.outputQueueWrite(ctx, src)
}
// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
-func (sfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
t := kernel.TaskFromContext(ctx)
if t == nil {
// ioctl(2) may only be called from a task goroutine.
@@ -146,35 +145,35 @@ func (sfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, arg
switch cmd := args[1].Uint(); cmd {
case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
// Get the number of bytes in the input queue read buffer.
- return 0, sfd.inode.t.ld.inputQueueReadSize(t, io, args)
+ return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args)
case linux.TCGETS:
- return sfd.inode.t.ld.getTermios(t, args)
+ return rfd.inode.t.ld.getTermios(t, args)
case linux.TCSETS:
- return sfd.inode.t.ld.setTermios(t, args)
+ return rfd.inode.t.ld.setTermios(t, args)
case linux.TCSETSW:
// TODO(b/29356795): This should drain the output queue first.
- return sfd.inode.t.ld.setTermios(t, args)
+ return rfd.inode.t.ld.setTermios(t, args)
case linux.TIOCGPTN:
- nP := primitive.Uint32(sfd.inode.t.n)
+ nP := primitive.Uint32(rfd.inode.t.n)
_, err := nP.CopyOut(t, args[2].Pointer())
return 0, err
case linux.TIOCGWINSZ:
- return 0, sfd.inode.t.ld.windowSize(t, args)
+ return 0, rfd.inode.t.ld.windowSize(t, args)
case linux.TIOCSWINSZ:
- return 0, sfd.inode.t.ld.setWindowSize(t, args)
+ return 0, rfd.inode.t.ld.setWindowSize(t, args)
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- return 0, sfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
+ return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCNOTTY:
// Release this process's controlling terminal.
- return 0, sfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
+ return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
case linux.TIOCGPGRP:
// Get the foreground process group.
- return sfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
+ return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
case linux.TIOCSPGRP:
// Set the foreground process group.
- return sfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
+ return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
@@ -182,24 +181,24 @@ func (sfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, arg
}
// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (sfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
- fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return sfd.inode.SetStat(ctx, fs, creds, opts)
+ fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return rfd.inode.SetStat(ctx, fs, creds, opts)
}
// Stat implements vfs.FileDescriptionImpl.Stat.
-func (sfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
- fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
- return sfd.inode.Stat(ctx, fs, opts)
+func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return rfd.inode.Stat(ctx, fs, opts)
}
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (sfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
- return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
+func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+ return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block)
}
// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (sfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
- return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
+func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+ return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence)
}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index a23094e54..6d1753080 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -33,8 +33,10 @@ import (
const Name = "devtmpfs"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct {
- initOnce sync.Once
+ initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported.
initErr error
// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index bb0bf3a07..1c27ad700 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -33,6 +33,8 @@ import (
// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
// notification (eventfd). Eventfds are usually internal to the Sentry but in
// certain situations they may be converted into a host-backed eventfd.
+//
+// +stateify savable
type EventFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 7a1b4219f..9bfed883a 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -20,6 +20,8 @@ import (
)
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 0fc01668d..452450d82 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -28,6 +28,8 @@ import (
)
// directory represents a directory inode. It holds the childList in memory.
+//
+// +stateify savable
type directory struct {
inode inode
@@ -39,7 +41,7 @@ type directory struct {
// Lock Order (outermost locks must be taken first):
// directory.mu
// filesystem.mu
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// childList is a list containing (1) child dirents and (2) fake dirents
// (with diskDirent == nil) that represent the iteration position of
@@ -120,6 +122,8 @@ func (i *inode) isDir() bool {
}
// dirent is the directory.childList node.
+//
+// +stateify savable
type dirent struct {
diskDirent disklayout.Dirent
@@ -129,6 +133,8 @@ type dirent struct {
// directoryFD represents a directory file description. It implements
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index 08ffc2834..aca258d40 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -34,6 +34,8 @@ import (
const Name = "ext"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// Compiles only if FilesystemType implements vfs.FilesystemType.
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index c36225a7c..04917d762 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -24,6 +24,8 @@ import (
)
// extentFile is a type of regular file which uses extents to store file data.
+//
+// +stateify savable
type extentFile struct {
regFile regularFile
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index a4a6d8c55..917f1873d 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -38,11 +38,13 @@ var (
)
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
// dev represents the underlying fs device. It does not require protection
// because io.ReaderAt permits concurrent read calls to it. It translates to
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 30636cf66..9009ba3c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -37,6 +37,8 @@ import (
// |-- regular--
// |-- extent file
// |-- block map file
+//
+// +stateify savable
type inode struct {
// refs is a reference count. refs is accessed using atomic memory operations.
refs int64
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index e73e740d6..4a5539b37 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -31,6 +31,8 @@ import (
// regularFile represents a regular file's inode. This too follows the
// inheritance pattern prevelant in the vfs layer described in
// pkg/sentry/vfs/README.md.
+//
+// +stateify savable
type regularFile struct {
inode inode
@@ -67,6 +69,8 @@ func (in *inode) isRegular() bool {
// directoryFD represents a directory file description. It implements
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type regularFileFD struct {
fileDescription
vfs.LockFD
@@ -75,7 +79,7 @@ type regularFileFD struct {
off int64
// offMu serializes operations that may mutate off.
- offMu sync.Mutex
+ offMu sync.Mutex `state:"nosave"`
}
// Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index f33592d59..5e2bcc837 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -23,6 +23,8 @@ import (
)
// symlink represents a symlink inode.
+//
+// +stateify savable
type symlink struct {
inode inode
target string // immutable
@@ -64,6 +66,8 @@ func (in *inode) isSymlink() bool {
// symlinkFD represents a symlink file description and implements
// vfs.FileDescriptionImpl. which may only be used if open options contains
// O_PATH. For this reason most of the functions return EBADF.
+//
+// +stateify savable
type symlinkFD struct {
fileDescription
vfs.NoLockFD
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
index dbc5e1954..8ccda1264 100644
--- a/pkg/sentry/fsimpl/fuse/connection.go
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -39,15 +39,18 @@ const (
)
// connection is the struct by which the sentry communicates with the FUSE server daemon.
+//
// Lock order:
// - conn.fd.mu
// - conn.mu
// - conn.asyncMu
+//
+// +stateify savable
type connection struct {
fd *DeviceFD
// mu protects access to struct memebers.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// attributeVersion is the version of connection's attributes.
attributeVersion uint64
@@ -75,7 +78,7 @@ type connection struct {
initialized int32
// initializedChan is used to block requests before initialization.
- initializedChan chan struct{}
+ initializedChan chan struct{} `state:".(bool)"`
// connected (connection established) when a new FUSE file system is created.
// Set to false when:
@@ -113,7 +116,7 @@ type connection struct {
// i.e. `!request.noReply`
// asyncMu protects the async request fields.
- asyncMu sync.Mutex
+ asyncMu sync.Mutex `state:"nosave"`
// asyncNum is the number of async requests.
// Protected by asyncMu.
@@ -174,6 +177,22 @@ type connection struct {
noOpen bool
}
+func (conn *connection) saveInitializedChan() bool {
+ select {
+ case <-conn.initializedChan:
+ return true // Closed.
+ default:
+ return false // Not closed.
+ }
+}
+
+func (conn *connection) loadInitializedChan(closed bool) {
+ conn.initializedChan = make(chan struct{}, 1)
+ if closed {
+ close(conn.initializedChan)
+ }
+}
+
// newFUSEConnection creates a FUSE connection to fd.
func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index f690ef5ad..1b86a4b4c 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -31,6 +31,8 @@ import (
const fuseDevMinor = 229
// fuseDevice implements vfs.Device for /dev/fuse.
+//
+// +stateify savable
type fuseDevice struct{}
// Open implements vfs.Device.Open.
@@ -49,6 +51,8 @@ func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
}
// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+//
+// +stateify savable
type DeviceFD struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -79,7 +83,7 @@ type DeviceFD struct {
writeCursorFR *futureResponse
// mu protects all the queues, maps, buffers and cursors and nextOpID.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// waitQueue is used to notify interested parties when the device becomes
// readable or writable.
@@ -88,12 +92,20 @@ type DeviceFD struct {
// fullQueueCh is a channel used to synchronize the readers with the writers.
// Writers (inbound requests to the filesystem) block if there are too many
// unprocessed in-flight requests.
- fullQueueCh chan struct{}
+ fullQueueCh chan struct{} `state:".(int)"`
// fs is the FUSE filesystem that this FD is being used for.
fs *filesystem
}
+func (fd *DeviceFD) saveFullQueueCh() int {
+ return cap(fd.fullQueueCh)
+}
+
+func (fd *DeviceFD) loadFullQueueCh(capacity int) {
+ fd.fullQueueCh = make(chan struct{}, capacity)
+}
+
// Release implements vfs.FileDescriptionImpl.Release.
func (fd *DeviceFD) Release(ctx context.Context) {
if fd.fs != nil {
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 1d42a51f4..65786e42a 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -41,8 +41,11 @@ const Name = "fuse"
const maxActiveRequestsDefault = 10000
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
+// +stateify savable
type filesystemOptions struct {
// userID specifies the numeric uid of the mount owner.
// This option should not be specified by the filesystem owner.
@@ -73,6 +76,8 @@ type filesystemOptions struct {
}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
devMinor uint32
@@ -240,6 +245,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
inodeRefs
kernfs.InodeAttrs
@@ -419,7 +426,7 @@ func (*inode) Valid(ctx context.Context) bool {
}
// NewFile implements kernfs.Inode.NewFile.
-func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) {
kernelTask := kernel.TaskFromContext(ctx)
if kernelTask == nil {
log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
@@ -433,15 +440,11 @@ func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions)
},
Name: name,
}
- d, err := i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
- if err != nil {
- return nil, err
- }
- return d.VFSDentry(), nil
+ return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
}
// NewNode implements kernfs.Inode.NewNode.
-func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*kernfs.Dentry, error) {
in := linux.FUSEMknodIn{
MknodMeta: linux.FUSEMknodMeta{
Mode: uint32(opts.Mode),
@@ -450,28 +453,20 @@ func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions)
},
Name: name,
}
- d, err := i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
- if err != nil {
- return nil, err
- }
- return d.VFSDentry(), nil
+ return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
}
// NewSymlink implements kernfs.Inode.NewSymlink.
-func (i *inode) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (*kernfs.Dentry, error) {
in := linux.FUSESymLinkIn{
Name: name,
Target: target,
}
- d, err := i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
- if err != nil {
- return nil, err
- }
- return d.VFSDentry(), nil
+ return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
}
// Unlink implements kernfs.Inode.Unlink.
-func (i *inode) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+func (i *inode) Unlink(ctx context.Context, name string, child *kernfs.Dentry) error {
kernelTask := kernel.TaskFromContext(ctx)
if kernelTask == nil {
log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
@@ -494,7 +489,7 @@ func (i *inode) Unlink(ctx context.Context, name string, child *vfs.Dentry) erro
}
// NewDir implements kernfs.Inode.NewDir.
-func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) {
in := linux.FUSEMkdirIn{
MkdirMeta: linux.FUSEMkdirMeta{
Mode: uint32(opts.Mode),
@@ -502,15 +497,11 @@ func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions)
},
Name: name,
}
- d, err := i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
- if err != nil {
- return nil, err
- }
- return d.VFSDentry(), nil
+ return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
}
// RmDir implements kernfs.Inode.RmDir.
-func (i *inode) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+func (i *inode) RmDir(ctx context.Context, name string, child *kernfs.Dentry) error {
fusefs := i.fs
task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
@@ -528,12 +519,7 @@ func (i *inode) RmDir(ctx context.Context, name string, child *vfs.Dentry) error
return err
}
- // TODO(Before merging): When creating new nodes, should we add nodes to the ordered children?
- // If so we'll probably need to call this. We will also need to add them with the writable flag when
- // appropriate.
- // return i.OrderedChildren.RmDir(ctx, name, child)
-
- return nil
+ return i.dentry.RemoveChildLocked(name, child)
}
// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response.
@@ -563,11 +549,6 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo
return nil, syserror.EIO
}
child := i.fs.newInode(out.NodeID, out.Attr)
- if opcode == linux.FUSE_LOOKUP {
- i.dentry.InsertChildLocked(name, child)
- } else {
- i.dentry.InsertChild(name, child)
- }
return child, nil
}
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 91d2ae199..18c884b59 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -117,11 +117,12 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
d.syntheticChildren++
}
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
dirents []vfs.Dirent
}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index aaad9c0d9..8608471f8 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -62,9 +62,13 @@ import (
const Name = "9p"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -77,7 +81,7 @@ type filesystem struct {
iopts InternalFilesystemOptions
// client is the client used by this filesystem. client is immutable.
- client *p9.Client
+ client *p9.Client `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// clock is a realtime clock used to set timestamps in file operations.
clock ktime.Clock
@@ -95,7 +99,7 @@ type filesystem struct {
// reference count (such that it is usable as vfs.ResolvingPath.Start() or
// is reachable from its children), or if it is a child dentry (such that
// it is reachable from its parent).
- renameMu sync.RWMutex
+ renameMu sync.RWMutex `state:"nosave"`
// cachedDentries contains all dentries with 0 references. (Due to race
// conditions, it may also contain dentries with non-zero references.)
@@ -107,7 +111,7 @@ type filesystem struct {
// syncableDentries contains all dentries in this filesystem for which
// !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
// These fields are protected by syncMu.
- syncMu sync.Mutex
+ syncMu sync.Mutex `state:"nosave"`
syncableDentries map[*dentry]struct{}
specialFileFDs map[*specialFileFD]struct{}
@@ -120,6 +124,8 @@ type filesystem struct {
// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
// have have their inodeNumber generated sequentially, with the MSB reserved to
// prevent conflicts with regular dentries.
+//
+// +stateify savable
type inodeNumber uint64
// Reserve MSB for synthetic mounts.
@@ -132,6 +138,7 @@ func inoFromPath(path uint64) inodeNumber {
return inodeNumber(path &^ syntheticInoMask)
}
+// +stateify savable
type filesystemOptions struct {
// "Standard" 9P options.
fd int
@@ -177,6 +184,8 @@ type filesystemOptions struct {
// InteropMode controls the client's interaction with other remote filesystem
// users.
+//
+// +stateify savable
type InteropMode uint32
const (
@@ -235,6 +244,8 @@ const (
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
type InternalFilesystemOptions struct {
// If LeakConnection is true, do not close the connection to the server
// when the Filesystem is released. This is necessary for deployments in
@@ -534,6 +545,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -563,7 +576,7 @@ type dentry struct {
// If file.isNil(), this dentry represents a synthetic file, i.e. a file
// that does not exist on the remote filesystem. As of this writing, the
// only files that can be synthetic are sockets, pipes, and directories.
- file p9file
+ file p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// If deleted is non-zero, the file represented by this dentry has been
// deleted. deleted is accessed using atomic memory operations.
@@ -575,7 +588,7 @@ type dentry struct {
cached bool
dentryEntry
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
// If this dentry represents a directory, children contains:
//
@@ -607,7 +620,7 @@ type dentry struct {
// To mutate:
// - Lock metadataMu and use atomic operations to update because we might
// have atomic readers that don't hold the lock.
- metadataMu sync.Mutex
+ metadataMu sync.Mutex `state:"nosave"`
ino inodeNumber // immutable
mode uint32 // type is immutable, perms are mutable
uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
@@ -638,7 +651,7 @@ type dentry struct {
// other metadata fields.
nlink uint32
- mapsMu sync.Mutex
+ mapsMu sync.Mutex `state:"nosave"`
// If this dentry represents a regular file, mappings tracks mappings of
// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
@@ -662,12 +675,12 @@ type dentry struct {
// either p9.File transitions from closed (isNil() == true) to open
// (isNil() == false), it may be mutated with handleMu locked, but cannot
// be closed until the dentry is destroyed.
- handleMu sync.RWMutex
- readFile p9file
- writeFile p9file
+ handleMu sync.RWMutex `state:"nosave"`
+ readFile p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
+ writeFile p9file `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
hostFD int32
- dataMu sync.RWMutex
+ dataMu sync.RWMutex `state:"nosave"`
// If this dentry represents a regular file that is client-cached, cache
// maps offsets into the cached file to offsets into
@@ -1627,12 +1640,14 @@ func (d *dentry) decLinks() {
// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
vfs.LockFD
- lockLogging sync.Once
+ lockLogging sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
}
func (fd *fileDescription) filesystem() *filesystem {
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 104157512..a9ebe1206 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -25,6 +25,8 @@ import (
// handle represents a remote "open file descriptor", consisting of an opened
// fid (p9.File) and optionally a host file descriptor.
+//
+// These are explicitly not savable.
type handle struct {
file p9file
fd int32 // -1 if unavailable
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 24f03ee94..eeaf6e444 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -39,11 +39,12 @@ func (d *dentry) isRegularFile() bool {
return d.fileType() == linux.S_IFREG
}
+// +stateify savable
type regularFileFD struct {
fileDescription
// off is the file offset. off is protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
}
@@ -898,6 +899,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
// dentryPlatformFile is only used when a host FD representing the remote file
// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
// memory mappings (i.e. !filesystem.opts.forcePageCache).
+//
+// +stateify savable
type dentryPlatformFile struct {
*dentry
@@ -910,7 +913,7 @@ type dentryPlatformFile struct {
hostFileMapper fsutil.HostFileMapper
// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
- hostFileMapperInitOnce sync.Once
+ hostFileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
}
// IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
index 85d2bee72..326b940a7 100644
--- a/pkg/sentry/fsimpl/gofer/socket.go
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -36,12 +36,14 @@ func (d *dentry) isSocket() bool {
// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
// is called and either BoundEndpoint.BidirectionalConnect or
// BoundEndpoint.UnidirectionalConnect is called.
+//
+// +stateify savable
type endpoint struct {
// dentry is the filesystem dentry which produced this endpoint.
dentry *dentry
// file is the p9 file that contains a single unopened fid.
- file p9.File
+ file p9.File `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// path is the sentry path where this endpoint is bound.
path string
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 576c57491..71581736c 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -34,11 +34,13 @@ import (
// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
// in effect) regular files. specialFileFD differs from regularFileFD by using
// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
+//
+// +stateify savable
type specialFileFD struct {
fileDescription
// handle is used for file I/O. handle is immutable.
- handle handle
+ handle handle `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
// isRegularFile is true if this FD represents a regular file which is only
// possible when filesystemOptions.regularFilesUseSpecialFileFD is in
@@ -56,7 +58,7 @@ type specialFileFD struct {
queue waiter.Queue
// If seekable is true, off is the file offset. off is protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index c0cef3453..ffe4ddb32 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -137,6 +137,8 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs
}
// filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type filesystemType struct{}
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
@@ -166,6 +168,8 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -185,6 +189,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
kernfs.InodeNoStatFS
kernfs.InodeNotDirectory
@@ -233,7 +239,7 @@ type inode struct {
canMap bool
// mapsMu protects mappings.
- mapsMu sync.Mutex
+ mapsMu sync.Mutex `state:"nosave"`
// If canMap is true, mappings tracks mappings of hostFD into
// memmap.MappingSpaces.
@@ -511,6 +517,8 @@ func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flag
}
// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -525,7 +533,7 @@ type fileDescription struct {
inode *inode
// offsetMu protects offset.
- offsetMu sync.Mutex
+ offsetMu sync.Mutex `state:"nosave"`
// offset specifies the current file offset. It is only meaningful when
// inode.seekable is true.
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 65d3af38c..b51a17bed 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -27,11 +27,13 @@ import (
// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
//
// inodePlatformFile should only be used if inode.canMap is true.
+//
+// +stateify savable
type inodePlatformFile struct {
*inode
// fdRefsMu protects fdRefs.
- fdRefsMu sync.Mutex
+ fdRefsMu sync.Mutex `state:"nosave"`
// fdRefs counts references on memmap.File offsets. It is used solely for
// memory accounting.
@@ -41,7 +43,7 @@ type inodePlatformFile struct {
fileMapper fsutil.HostFileMapper
// fileMapperInitOnce is used to lazily initialize fileMapper.
- fileMapperInitOnce sync.Once
+ fileMapperInitOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
}
// IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index e02b9b8f6..f5c596fec 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -30,6 +30,8 @@ import (
// TTYFileDescription implements vfs.FileDescriptionImpl for a host file
// descriptor that wraps a TTY FD.
+//
+// +stateify savable
type TTYFileDescription struct {
fileDescription
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 7d040481f..b929118b1 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -58,7 +58,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint
// Open implements Inode.Open.
func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &DynamicBytesFD{}
- if err := fd.Init(rp.Mount(), d.VFSDentry(), f.data, &f.locks, opts.Flags); err != nil {
+ if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -87,12 +87,12 @@ type DynamicBytesFD struct {
}
// Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
fd.LockFD.Init(locks)
- if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
return err
}
- fd.inode = d.Impl().(*Dentry).inode
+ fd.inode = d.inode
fd.SetDataSource(data)
return nil
}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 2b4294228..0a4cd4057 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -29,6 +29,8 @@ import (
)
// SeekEndConfig describes the SEEK_END behaviour for FDs.
+//
+// +stateify savable
type SeekEndConfig int
// Constants related to SEEK_END behaviour for FDs.
@@ -41,6 +43,8 @@ const (
)
// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+//
+// +stateify savable
type GenericDirectoryFDOptions struct {
SeekEnd SeekEndConfig
}
@@ -56,6 +60,8 @@ type GenericDirectoryFDOptions struct {
// Must be initialize with Init before first use.
//
// Lock ordering: mu => children.mu.
+//
+// +stateify savable
type GenericDirectoryFD struct {
vfs.FileDescriptionDefaultImpl
vfs.DirectoryFileDescriptionDefaultImpl
@@ -68,7 +74,7 @@ type GenericDirectoryFD struct {
children *OrderedChildren
// mu protects the fields below.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// off is the current directory offset. Protected by "mu".
off int64
@@ -195,8 +201,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
// these.
childIdx := fd.off - 2
for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
- inode := it.Dentry.Impl().(*Dentry).inode
- stat, err := inode.Stat(ctx, fd.filesystem(), opts)
+ stat, err := it.Dentry.inode.Stat(ctx, fd.filesystem(), opts)
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index daad1e04a..11dc24b07 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -37,8 +37,7 @@ import (
// * !rp.Done().
//
// Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
- d := vfsd.Impl().(*Dentry)
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) {
if !d.isDir() {
return nil, syserror.ENOTDIR
}
@@ -55,20 +54,20 @@ afterSymlink:
// calls d_revalidate(), but walk_component() => handle_dots() does not.
if name == "." {
rp.Advance()
- return vfsd, nil
+ return d, nil
}
if name == ".." {
- if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil {
+ if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
return nil, err
} else if isRoot || d.parent == nil {
rp.Advance()
- return vfsd, nil
+ return d, nil
}
- if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil {
return nil, err
}
rp.Advance()
- return &d.parent.vfsd, nil
+ return d.parent, nil
}
if len(name) > linux.NAME_MAX {
return nil, syserror.ENAMETOOLONG
@@ -79,7 +78,7 @@ afterSymlink:
if err != nil {
return nil, err
}
- if err := rp.CheckMount(ctx, &next.vfsd); err != nil {
+ if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
return nil, err
}
// Resolve any symlink at current path component.
@@ -102,7 +101,7 @@ afterSymlink:
goto afterSymlink
}
rp.Advance()
- return &next.vfsd, nil
+ return next, nil
}
// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
@@ -122,7 +121,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
if !child.inode.Valid(ctx) {
delete(parent.children, name)
vfsObj.InvalidateDentry(ctx, &child.vfsd)
- fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+ fs.deferDecRef(child) // Reference from Lookup.
child = nil
}
}
@@ -133,7 +132,8 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
if err != nil {
return nil, err
}
- // Reference on childVFSD dropped by a corresponding Valid.
+ // Reference on c (provided by Lookup) will be dropped when the dentry
+ // fails validation.
parent.InsertChildLocked(name, c)
child = c
}
@@ -149,15 +149,14 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
- vfsd := rp.Start()
+ d := rp.Start().Impl().(*Dentry)
for !rp.Done() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+ d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
if err != nil {
return nil, err
}
}
- d := vfsd.Impl().(*Dentry)
if rp.MustBeDir() && !d.isDir() {
return nil, syserror.ENOTDIR
}
@@ -176,20 +175,19 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
// * !rp.Done().
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
- vfsd := rp.Start()
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+ d := rp.Start().Impl().(*Dentry)
for !rp.Final() {
var err error
- vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+ d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
if err != nil {
- return nil, nil, err
+ return nil, err
}
}
- d := vfsd.Impl().(*Dentry)
if !d.isDir() {
- return nil, nil, syserror.ENOTDIR
+ return nil, syserror.ENOTDIR
}
- return vfsd, d.inode, nil
+ return d, nil
}
// checkCreateLocked checks that a file named rp.Component() may be created in
@@ -197,10 +195,9 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
//
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
-// * parentInode == parentVFSD.Impl().(*Dentry).Inode.
// * isDir(parentInode) == true.
-func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return "", err
}
pc := rp.Component()
@@ -211,10 +208,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
return "", syserror.ENAMETOOLONG
}
// FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
- if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
+ if _, ok := parent.children[pc]; ok {
return "", syserror.EEXIST
}
- if parentVFSD.IsDead() {
+ if parent.VFSDentry().IsDead() {
return "", syserror.ENOENT
}
return pc, nil
@@ -288,12 +285,12 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
fs.mu.RLock()
defer fs.processDeferredDecRefs(ctx)
defer fs.mu.RUnlock()
- vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+ d, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return nil, err
}
- vfsd.IncRef() // Ownership transferred to caller.
- return vfsd, nil
+ d.IncRef() // Ownership transferred to caller.
+ return d.VFSDentry(), nil
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
@@ -303,12 +300,12 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -325,11 +322,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EPERM
}
- childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
+ child, err := parent.inode.NewLink(ctx, pc, d.inode)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ parent.InsertChild(pc, child)
return nil
}
@@ -340,12 +337,12 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -353,14 +350,14 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- childVFSD, err := parentInode.NewDir(ctx, pc, opts)
+ child, err := parent.inode.NewDir(ctx, pc, opts)
if err != nil {
if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
return err
}
- childVFSD = newSyntheticDirectory(rp.Credentials(), opts.Mode)
+ child = newSyntheticDirectory(rp.Credentials(), opts.Mode)
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ parent.InsertChild(pc, child)
return nil
}
@@ -371,12 +368,12 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -384,11 +381,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
defer rp.Mount().EndWrite()
- newVFSD, err := parentInode.NewNode(ctx, pc, opts)
+ newD, err := parent.inode.NewNode(ctx, pc, opts)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
+ parent.InsertChild(pc, newD)
return nil
}
@@ -450,13 +447,13 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
return d.inode.Open(ctx, rp, d, opts)
}
afterTrailingSymlink:
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return nil, err
}
// Check for search permission in the parent directory.
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
// Reject attempts to open directories with O_CREAT.
@@ -471,10 +468,10 @@ afterTrailingSymlink:
return nil, syserror.ENAMETOOLONG
}
// Determine whether or not we need to create a file.
- childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
+ child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */)
if err == syserror.ENOENT {
// Already checked for searchability above; now check for writability.
- if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -482,12 +479,11 @@ afterTrailingSymlink:
}
defer rp.Mount().EndWrite()
// Create and open the child.
- childVFSD, err = parentInode.NewFile(ctx, pc, opts)
+ child, err := parent.inode.NewFile(ctx, pc, opts)
if err != nil {
return nil, err
}
- child := childVFSD.Impl().(*Dentry)
- parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+ parent.InsertChild(pc, child)
child.inode.IncRef()
defer child.inode.DecRef(ctx)
unlock()
@@ -500,7 +496,6 @@ afterTrailingSymlink:
if mustCreate {
return nil, syserror.EEXIST
}
- child := childVFSD.Impl().(*Dentry)
if rp.ShouldFollowSymlink() && child.isSymlink() {
targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
if err != nil {
@@ -559,11 +554,10 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Resolve the destination directory first to verify that it's on this
// Mount.
- dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+ dstDir, err := fs.walkParentDirLocked(ctx, rp)
if err != nil {
return err
}
- dstDir := dstDirVFSD.Impl().(*Dentry)
mnt := rp.Mount()
if mnt != oldParentVD.Mount() {
return syserror.EXDEV
@@ -589,7 +583,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Can we create the dst dentry?
var dst *Dentry
- pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
+ pc, err := checkCreateLocked(ctx, rp, dstDir)
switch err {
case nil:
// Ok, continue with rename as replacement.
@@ -600,14 +594,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
dst = dstDir.children[pc]
if dst == nil {
- panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+ panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir))
}
default:
return err
}
var dstVFSD *vfs.Dentry
if dst != nil {
- dstVFSD = &dst.vfsd
+ dstVFSD = dst.VFSDentry()
}
mntns := vfs.MountNamespaceFromContext(ctx)
@@ -627,14 +621,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
return err
}
- replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
+ replaced, err := srcDir.inode.Rename(ctx, src.name, pc, src, dstDir)
if err != nil {
virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
return err
}
delete(srcDir.children, src.name)
if srcDir != dstDir {
- fs.deferDecRef(srcDirVFSD)
+ fs.deferDecRef(srcDir)
dstDir.IncRef()
}
src.parent = dstDir
@@ -643,7 +637,11 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
dstDir.children = make(map[string]*Dentry)
}
dstDir.children[pc] = src
- virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced)
+ var replaceVFSD *vfs.Dentry
+ if replaced != nil {
+ replaceVFSD = replaced.VFSDentry()
+ }
+ virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD)
return nil
}
@@ -682,7 +680,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
- if err := parentDentry.inode.RmDir(ctx, d.name, d.VFSDentry()); err != nil {
+ if err := parentDentry.inode.RmDir(ctx, d.name, d); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
@@ -736,12 +734,12 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
}
fs.mu.Lock()
defer fs.mu.Unlock()
- parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ parent, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
return err
}
- pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+ pc, err := checkCreateLocked(ctx, rp, parent)
if err != nil {
return err
}
@@ -749,11 +747,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return err
}
defer rp.Mount().EndWrite()
- childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
+ child, err := parent.inode.NewSymlink(ctx, pc, target)
if err != nil {
return err
}
- parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+ parent.InsertChild(pc, child)
return nil
}
@@ -787,7 +785,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.Unlink(ctx, d.name, d.VFSDentry()); err != nil {
+ if err := parentDentry.inode.Unlink(ctx, d.name, d); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index 5cc196980..49210e748 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -31,6 +31,8 @@ import (
// count for inodes, performing no extra actions when references are obtained or
// released. This is suitable for simple file inodes that don't reference any
// resources.
+//
+// +stateify savable
type InodeNoopRefCount struct {
}
@@ -50,30 +52,32 @@ func (InodeNoopRefCount) TryIncRef() bool {
// InodeDirectoryNoNewChildren partially implements the Inode interface.
// InodeDirectoryNoNewChildren represents a directory inode which does not
// support creation of new children.
+//
+// +stateify savable
type InodeDirectoryNoNewChildren struct{}
// NewFile implements Inode.NewFile.
-func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewDir implements Inode.NewDir.
-func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewLink implements Inode.NewLink.
-func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewNode implements Inode.NewNode.
-func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
@@ -81,6 +85,8 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt
// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
// represent directories can embed this to provide no-op implementations for
// directory-related functions.
+//
+// +stateify savable
type InodeNotDirectory struct {
}
@@ -90,42 +96,42 @@ func (InodeNotDirectory) HasChildren() bool {
}
// NewFile implements Inode.NewFile.
-func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*Dentry, error) {
panic("NewFile called on non-directory inode")
}
// NewDir implements Inode.NewDir.
-func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*Dentry, error) {
panic("NewDir called on non-directory inode")
}
// NewLink implements Inode.NewLinkink.
-func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*Dentry, error) {
panic("NewLink called on non-directory inode")
}
// NewSymlink implements Inode.NewSymlink.
-func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*Dentry, error) {
panic("NewSymlink called on non-directory inode")
}
// NewNode implements Inode.NewNode.
-func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*Dentry, error) {
panic("NewNode called on non-directory inode")
}
// Unlink implements Inode.Unlink.
-func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, *Dentry) error {
panic("Unlink called on non-directory inode")
}
// RmDir implements Inode.RmDir.
-func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, *Dentry) error {
panic("RmDir called on non-directory inode")
}
// Rename implements Inode.Rename.
-func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, *Dentry, *Dentry) (*Dentry, error) {
panic("Rename called on non-directory inode")
}
@@ -149,6 +155,8 @@ func (InodeNotDirectory) Valid(context.Context) bool {
// dymanic entries (i.e. entries that are not "hashed" into the
// vfs.Dentry.children) can embed this to provide no-op implementations for
// functions related to dynamic entries.
+//
+// +stateify savable
type InodeNoDynamicLookup struct{}
// Lookup implements Inode.Lookup.
@@ -169,6 +177,8 @@ func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
// InodeNotSymlink partially implements the Inode interface, specifically the
// inodeSymlink sub interface. All inodes that are not symlinks may embed this
// to return the appropriate errors from symlink-related functions.
+//
+// +stateify savable
type InodeNotSymlink struct{}
// Readlink implements Inode.Readlink.
@@ -186,6 +196,8 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry,
// inode attributes.
//
// Must be initialized by Init prior to first use.
+//
+// +stateify savable
type InodeAttrs struct {
devMajor uint32
devMinor uint32
@@ -330,13 +342,16 @@ func (a *InodeAttrs) DecLinks() {
}
}
+// +stateify savable
type slot struct {
Name string
- Dentry *vfs.Dentry
+ Dentry *Dentry
slotEntry
}
// OrderedChildrenOptions contains initialization options for OrderedChildren.
+//
+// +stateify savable
type OrderedChildrenOptions struct {
// Writable indicates whether vfs.FilesystemImpl methods implemented by
// OrderedChildren may modify the tracked children. This applies to
@@ -352,12 +367,14 @@ type OrderedChildrenOptions struct {
// directories.
//
// Must be initialize with Init before first use.
+//
+// +stateify savable
type OrderedChildren struct {
// Can children be modified by user syscalls? It set to false, interface
// methods that would modify the children return EPERM. Immutable.
writable bool
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
order slotList
set map[string]*slot
}
@@ -390,7 +407,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3
if child.isDir() {
links++
}
- if err := o.Insert(name, child.VFSDentry()); err != nil {
+ if err := o.Insert(name, child); err != nil {
panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
}
d.InsertChild(name, child)
@@ -407,7 +424,7 @@ func (o *OrderedChildren) HasChildren() bool {
// Insert inserts child into o. This ignores the writability of o, as this is
// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
-func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) Insert(name string, child *Dentry) error {
o.mu.Lock()
defer o.mu.Unlock()
if _, ok := o.set[name]; ok {
@@ -431,10 +448,10 @@ func (o *OrderedChildren) removeLocked(name string) {
}
// Precondition: caller must hold o.mu for writing.
-func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+func (o *OrderedChildren) replaceChildLocked(name string, new *Dentry) *Dentry {
if s, ok := o.set[name]; ok {
// Existing slot with given name, simply replace the dentry.
- var old *vfs.Dentry
+ var old *Dentry
old, s.Dentry = s.Dentry, new
return old
}
@@ -450,7 +467,7 @@ func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.
}
// Precondition: caller must hold o.mu for reading or writing.
-func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) checkExistingLocked(name string, child *Dentry) error {
s, ok := o.set[name]
if !ok {
return syserror.ENOENT
@@ -462,7 +479,7 @@ func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) er
}
// Unlink implements Inode.Unlink.
-func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *Dentry) error {
if !o.writable {
return syserror.EPERM
}
@@ -478,12 +495,13 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
}
// Rmdir implements Inode.Rmdir.
-func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *Dentry) error {
// We're not responsible for checking that child is a directory, that it's
// empty, or updating any link counts; so this is the same as unlink.
return o.Unlink(ctx, name, child)
}
+// +stateify savable
type renameAcrossDifferentImplementationsError struct{}
func (renameAcrossDifferentImplementationsError) Error() string {
@@ -499,8 +517,8 @@ func (renameAcrossDifferentImplementationsError) Error() string {
// that will support Rename.
//
// Postcondition: reference on any replaced dentry transferred to caller.
-func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
- dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (*Dentry, error) {
+ dst, ok := dstDir.inode.(interface{}).(*OrderedChildren)
if !ok {
return nil, renameAcrossDifferentImplementationsError{}
}
@@ -542,6 +560,8 @@ func (o *OrderedChildren) nthLocked(i int64) *slot {
}
// InodeSymlink partially implements Inode interface for symlinks.
+//
+// +stateify savable
type InodeSymlink struct {
InodeNotDirectory
}
@@ -615,6 +635,8 @@ func (s *StaticDirectory) DecRef(context.Context) {
}
// AlwaysValid partially implements kernfs.inodeDynamicLookup.
+//
+// +stateify savable
type AlwaysValid struct{}
// Valid implements kernfs.inodeDynamicLookup.Valid.
@@ -624,6 +646,8 @@ func (*AlwaysValid) Valid(context.Context) bool {
// InodeNoStatFS partially implements the Inode interface, where the client
// filesystem doesn't support statfs(2).
+//
+// +stateify savable
type InodeNoStatFS struct{}
// StatFS implements Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 14bf43ede..f543a2065 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -66,15 +66,17 @@ import (
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
+//
+// +stateify savable
type Filesystem struct {
vfsfs vfs.Filesystem
- droppedDentriesMu sync.Mutex
+ droppedDentriesMu sync.Mutex `state:"nosave"`
// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
// used to defer dentry destruction until mu can be acquired for
// writing. Protected by droppedDentriesMu.
- droppedDentries []*vfs.Dentry
+ droppedDentries []*Dentry
// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
// for reading guarantees continued existence of any resolved dentries, but
@@ -97,7 +99,7 @@ type Filesystem struct {
// defer fs.mu.RUnlock()
// ...
// fs.deferDecRef(dentry)
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
// nextInoMinusOne is used to to allocate inode numbers on this
// filesystem. Must be accessed by atomic operations.
@@ -108,7 +110,7 @@ type Filesystem struct {
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
//
// Precondition: d must not already be pending destruction.
-func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+func (fs *Filesystem) deferDecRef(d *Dentry) {
fs.droppedDentriesMu.Lock()
fs.droppedDentries = append(fs.droppedDentries, d)
fs.droppedDentriesMu.Unlock()
@@ -160,6 +162,8 @@ const (
// to, and child dentries hold a reference on their parent.
//
// Must be initialized by Init prior to first use.
+//
+// +stateify savable
type Dentry struct {
DentryRefs
@@ -173,7 +177,7 @@ type Dentry struct {
name string
// dirMu protects children and the names of child Dentries.
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
children map[string]*Dentry
inode Inode
@@ -273,7 +277,7 @@ func (d *Dentry) InsertChildLocked(name string, child *Dentry) {
// isn't sufficient to remove a child from a directory.
//
// Precondition: d must represent a directory inode.
-func (d *Dentry) RemoveChild(name string, child *vfs.Dentry) error {
+func (d *Dentry) RemoveChild(name string, child *Dentry) error {
d.dirMu.Lock()
defer d.dirMu.Unlock()
return d.RemoveChildLocked(name, child)
@@ -283,7 +287,7 @@ func (d *Dentry) RemoveChild(name string, child *vfs.Dentry) error {
// preconditions.
//
// Precondition: d.dirMu must be locked.
-func (d *Dentry) RemoveChildLocked(name string, child *vfs.Dentry) error {
+func (d *Dentry) RemoveChildLocked(name string, child *Dentry) error {
if !d.isDir() {
panic(fmt.Sprintf("RemoveChild called on non-directory Dentry: %+v.", d))
}
@@ -291,7 +295,7 @@ func (d *Dentry) RemoveChildLocked(name string, child *vfs.Dentry) error {
if !ok {
return syserror.ENOENT
}
- if &c.vfsd != child {
+ if c != child {
panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! Child: %+v, vfs: %+v", c, child))
}
delete(d.children, name)
@@ -346,7 +350,7 @@ type Inode interface {
// Open creates a file description for the filesystem object represented by
// this inode. The returned file description should hold a reference on the
- // inode for its lifetime.
+ // dentry for its lifetime.
//
// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
// the inode on which Open() is being called.
@@ -400,30 +404,30 @@ type inodeDirectory interface {
HasChildren() bool
// NewFile creates a new regular file inode.
- NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+ NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error)
// NewDir creates a new directory inode.
- NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+ NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error)
// NewLink creates a new hardlink to a specified inode in this
// directory. Implementations should create a new kernfs Dentry pointing to
// target, and update target's link count.
- NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+ NewLink(ctx context.Context, name string, target Inode) (*Dentry, error)
// NewSymlink creates a new symbolic link inode.
- NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+ NewSymlink(ctx context.Context, name, target string) (*Dentry, error)
// NewNode creates a new filesystem node for a mknod syscall.
- NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+ NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error)
// Unlink removes a child dentry from this directory inode.
- Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+ Unlink(ctx context.Context, name string, child *Dentry) error
// RmDir removes an empty child directory from this directory
// inode. Implementations must update the parent directory's link count,
// if required. Implementations are not responsible for checking that child
// is a directory, checking for an empty directory.
- RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+ RmDir(ctx context.Context, name string, child *Dentry) error
// Rename is called on the source directory containing an inode being
// renamed. child should point to the resolved child in the source
@@ -431,7 +435,7 @@ type inodeDirectory interface {
// should return the replaced dentry or nil otherwise.
//
// Precondition: Caller must serialize concurrent calls to Rename.
- Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
+ Rename(ctx context.Context, oldname, newname string, child, dstDir *Dentry) (replaced *Dentry, err error)
}
type inodeDynamicLookup interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index c9f81d734..e413242dc 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -176,38 +176,36 @@ func (d *dir) DecRef(context.Context) {
d.dirRefs.DecRef(d.Destroy)
}
-func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*kernfs.Dentry, error) {
creds := auth.CredentialsFromContext(ctx)
dir := d.fs.newDir(creds, opts.Mode, nil)
- dirVFSD := dir.VFSDentry()
- if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+ if err := d.OrderedChildren.Insert(name, dir); err != nil {
dir.DecRef(ctx)
return nil, err
}
d.IncLinks(1)
- return dirVFSD, nil
+ return dir, nil
}
-func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*kernfs.Dentry, error) {
creds := auth.CredentialsFromContext(ctx)
f := d.fs.newFile(creds, "")
- fVFSD := f.VFSDentry()
- if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+ if err := d.OrderedChildren.Insert(name, f); err != nil {
f.DecRef(ctx)
return nil, err
}
- return fVFSD, nil
+ return f, nil
}
-func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (*kernfs.Dentry, error) {
return nil, syserror.EPERM
}
-func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (*dir) NewSymlink(context.Context, string, string) (*kernfs.Dentry, error) {
return nil, syserror.EPERM
}
-func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*kernfs.Dentry, error) {
return nil, syserror.EPERM
}
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 443121c99..58a93eaac 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -24,6 +24,8 @@ import (
// StaticSymlink provides an Inode implementation for symlinks that point to
// a immutable target.
+//
+// +stateify savable
type StaticSymlink struct {
InodeAttrs
InodeNoopRefCount
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
index 845de6c1d..ea7f073eb 100644
--- a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -41,12 +41,12 @@ type syntheticDirectory struct {
var _ Inode = (*syntheticDirectory)(nil)
-func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *vfs.Dentry {
+func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) *Dentry {
inode := &syntheticDirectory{}
inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
d := &Dentry{}
d.Init(inode)
- return &d.vfsd
+ return d
}
func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
@@ -69,12 +69,12 @@ func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath,
}
// NewFile implements Inode.NewFile.
-func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewDir implements Inode.NewDir.
-func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*Dentry, error) {
if !opts.ForSyntheticMountpoint {
return nil, syserror.EPERM
}
@@ -87,16 +87,16 @@ func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs
}
// NewLink implements Inode.NewLink.
-func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) {
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewSymlink implements Inode.NewSymlink.
-func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (*Dentry, error) {
return nil, syserror.EPERM
}
// NewNode implements Inode.NewNode.
-func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*Dentry, error) {
return nil, syserror.EPERM
}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
index 7ab42e71e..aea599460 100644
--- a/pkg/sentry/fsimpl/overlay/directory.go
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -100,12 +100,13 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string
return whiteouts, readdirErr
}
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
vfs.DentryMetadataFileDescriptionImpl
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
off int64
dirents []vfs.Dirent
}
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
index 6e04705c7..50b4a50f1 100644
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -39,6 +39,7 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
})
}
+// +stateify savable
type nonDirectoryFD struct {
fileDescription
@@ -47,7 +48,7 @@ type nonDirectoryFD struct {
// fileDescription.dentry().upperVD. cachedFlags is the last known value of
// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
// protected by mu.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
copiedUp bool
cachedFD *vfs.FileDescription
cachedFlags uint32
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index d0d26185e..b03fa9121 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -51,6 +51,8 @@ import (
const Name = "overlay"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// Name implements vfs.FilesystemType.Name.
@@ -60,6 +62,8 @@ func (FilesystemType) Name() string {
// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem.
+//
+// +stateify savable
type FilesystemOptions struct {
// Callers passing FilesystemOptions to
// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
@@ -76,6 +80,8 @@ type FilesystemOptions struct {
}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -98,7 +104,7 @@ type filesystem struct {
// renameMu synchronizes renaming with non-renaming operations in order to
// ensure consistent lock ordering between dentry.dirMu in different
// dentries.
- renameMu sync.RWMutex
+ renameMu sync.RWMutex `state:"nosave"`
// lastDirIno is the last inode number assigned to a directory. lastDirIno
// is accessed using atomic memory operations.
@@ -367,6 +373,8 @@ func (fs *filesystem) newDirIno() uint64 {
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -399,7 +407,7 @@ type dentry struct {
// and dirents (if not nil) is a cache of dirents as returned by
// directoryFDs representing this directory. children is protected by
// dirMu.
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
children map[string]*dentry
dirents []vfs.Dirent
@@ -409,7 +417,7 @@ type dentry struct {
// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
// be copied up) with copyMu locked for writing; otherwise, it is
// immutable. lowerVDs is always immutable.
- copyMu sync.RWMutex
+ copyMu sync.RWMutex `state:"nosave"`
upperVD vfs.VirtualDentry
lowerVDs []vfs.VirtualDentry
@@ -652,6 +660,8 @@ func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
// fileDescription is embedded by overlay implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index 33e247578..4e2da4810 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type filesystemType struct{}
// Name implements vfs.FilesystemType.Name.
@@ -43,6 +44,7 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
panic("pipefs.filesystemType.GetFilesystem should never be called")
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -76,6 +78,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
kernfs.InodeNotDirectory
kernfs.InodeNotSymlink
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 03b5941b9..05d7948ea 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -41,6 +41,7 @@ func (FilesystemType) Name() string {
return Name
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -84,6 +85,8 @@ func (fs *filesystem) Release(ctx context.Context) {
// dynamicInode is an overfitted interface for common Inodes with
// dynamicByteSource types used in procfs.
+//
+// +stateify savable
type dynamicInode interface {
kernfs.Inode
vfs.DynamicBytesSource
@@ -99,6 +102,7 @@ func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.
return d
}
+// +stateify savable
type staticFile struct {
kernfs.DynamicBytesFile
vfs.StaticData
@@ -118,10 +122,13 @@ func newStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64
// InternalData contains internal data passed in to the procfs mount via
// vfs.GetFilesystemOptions.InternalData.
+//
+// +stateify savable
type InternalData struct {
Cgroups map[string]string
}
+// +stateify savable
type implStatFS struct{}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 47dc0ac9a..47ecd941c 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -116,6 +116,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
return offset, nil
}
+// +stateify savable
type subtasksFD struct {
kernfs.GenericDirectoryFD
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index ae461bb48..1f99183eb 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -129,6 +129,8 @@ func (i *taskInode) DecRef(context.Context) {
// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
// effective user and group.
+//
+// +stateify savable
type taskOwnedInode struct {
kernfs.Inode
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index c7104458f..0866cea2b 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -51,6 +51,7 @@ func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
return true
}
+// +stateify savable
type fdDir struct {
locks vfs.FileLocks
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index feed5bc3f..b81c8279e 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -785,6 +785,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
+// +stateify savable
type namespaceSymlink struct {
kernfs.StaticSymlink
@@ -832,6 +833,8 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir
// namespaceInode is a synthetic inode created to represent a namespace in
// /proc/[pid]/ns/*.
+//
+// +stateify savable
type namespaceInode struct {
implStatFS
kernfs.InodeAttrs
@@ -865,6 +868,8 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *ker
// namespace FD is a synthetic file that represents a namespace in
// /proc/[pid]/ns/*.
+//
+// +stateify savable
type namespaceFD struct {
vfs.FileDescriptionDefaultImpl
vfs.LockFD
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 1607eac19..e7f748655 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -616,6 +616,7 @@ type netSnmpData struct {
var _ dynamicInode = (*netSnmpData)(nil)
+// +stateify savable
type snmpLine struct {
prefix string
header string
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index ad08c3626..d8f5dd509 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -236,6 +236,8 @@ func (i *tasksInode) DecRef(context.Context) {
// staticFileSetStat implements a special static file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
+//
+// +stateify savable
type staticFileSetStat struct {
dynamicBytesFileSetAttr
vfs.StaticData
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 459a8e52e..f268c59b0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type selfSymlink struct {
implStatFS
kernfs.InodeAttrs
@@ -74,6 +75,7 @@ func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials,
return syserror.EPERM
}
+// +stateify savable
type threadSelfSymlink struct {
implStatFS
kernfs.InodeAttrs
@@ -121,6 +123,8 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden
// dynamicBytesFileSetAttr implements a special file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
+//
+// +stateify savable
type dynamicBytesFileSetAttr struct {
kernfs.DynamicBytesFile
}
@@ -131,6 +135,8 @@ func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesyste
}
// cpuStats contains the breakdown of CPU time for /proc/stat.
+//
+// +stateify savable
type cpuStats struct {
// user is time spent in userspace tasks with non-positive niceness.
user uint64
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index a3ffbb15e..3312b0418 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type tcpMemDir int
const (
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index 3c02af8c9..bf11b425a 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -27,6 +27,8 @@ import (
)
// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
+//
+// +stateify savable
type SignalFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -43,7 +45,7 @@ type SignalFileDescription struct {
target *kernel.Task
// mu protects mask.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// mask is the signal mask. Protected by mu.
mask linux.SignalSet
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index f1e75e277..29e5371d6 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -28,6 +28,8 @@ import (
)
// filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type filesystemType struct{}
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
@@ -44,6 +46,7 @@ func (filesystemType) Name() string {
return "sockfs"
}
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -80,6 +83,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
// inode implements kernfs.Inode.
+//
+// +stateify savable
type inode struct {
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
index 252f75d26..b75d70ae6 100644
--- a/pkg/sentry/fsimpl/sys/kcov.go
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -36,6 +36,8 @@ func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials)
}
// kcovInode implements kernfs.Inode.
+//
+// +stateify savable
type kcovInode struct {
kernfs.InodeAttrs
kernfs.InodeNoopRefCount
@@ -63,6 +65,7 @@ func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.D
return &fd.vfsfd, nil
}
+// +stateify savable
type kcovFD struct {
vfs.FileDescriptionDefaultImpl
vfs.NoLockFD
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 8bcefc103..1568c581f 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -34,9 +34,13 @@ const Name = "sysfs"
const defaultSysDirMode = linux.FileMode(0755)
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
kernfs.Filesystem
@@ -117,6 +121,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// dir implements kernfs.Inode.
+//
+// +stateify savable
type dir struct {
dirRefs
kernfs.InodeAttrs
@@ -169,6 +175,8 @@ func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, err
}
// cpuFile implements kernfs.Inode.
+//
+// +stateify savable
type cpuFile struct {
implStatFS
kernfs.DynamicBytesFile
@@ -190,6 +198,7 @@ func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode li
return d
}
+// +stateify savable
type implStatFS struct{}
// StatFS implements kernfs.Inode.StatFS.
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index ac8a4e3bb..8853c8ad2 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -28,6 +28,8 @@ import (
// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
// implements ktime.TimerListener.
+//
+// +stateify savable
type TimerFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index ac54d420d..9129d35b7 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
+// +stateify savable
type deviceFile struct {
inode inode
kind vfs.DeviceKind
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 070c75e68..e90669cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -25,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
+// +stateify savable
type directory struct {
// Since directories can't be hard-linked, each directory can only be
// associated with a single dentry, which we can store in the directory
@@ -44,7 +45,7 @@ type directory struct {
// (with inode == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
// efficiently. childList is protected by iterMu.
- iterMu sync.Mutex
+ iterMu sync.Mutex `state:"nosave"`
childList dentryList
}
@@ -86,6 +87,7 @@ func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
}
+// +stateify savable
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 5b0471ff4..d772db9e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// +stateify savable
type namedPipe struct {
inode inode
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index b8699d064..a199eb33d 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -36,6 +36,8 @@ import (
)
// regularFile is a regular (=S_IFREG) tmpfs file.
+//
+// +stateify savable
type regularFile struct {
inode inode
@@ -66,7 +68,7 @@ type regularFile struct {
writableMappingPages uint64
// dataMu protects the fields below.
- dataMu sync.RWMutex
+ dataMu sync.RWMutex `state:"nosave"`
// data maps offsets into the file to offsets into memFile that store
// the file's data.
@@ -325,13 +327,14 @@ func (*regularFile) InvalidateUnsavable(context.Context) error {
return nil
}
+// +stateify savable
type regularFileFD struct {
fileDescription
// off is the file offset. off is accessed using atomic memory operations.
// offMu serializes operations that may mutate off.
off int64
- offMu sync.Mutex
+ offMu sync.Mutex `state:"nosave"`
}
// Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 3ed650474..5699d5975 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -21,6 +21,8 @@ import (
)
// socketFile is a socket (=S_IFSOCK) tmpfs file.
+//
+// +stateify savable
type socketFile struct {
inode inode
ep transport.BoundEndpoint
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index b0de5fabe..a102a2ee2 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
+// +stateify savable
type symlink struct {
inode inode
target string // immutable
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 4658e1533..cefec8fde 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -51,9 +51,13 @@ import (
const Name = "tmpfs"
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -67,7 +71,7 @@ type filesystem struct {
devMinor uint32
// mu serializes changes to the Dentry tree.
- mu sync.RWMutex
+ mu sync.RWMutex `state:"nosave"`
nextInoMinusOne uint64 // accessed using atomic memory operations
}
@@ -78,6 +82,8 @@ func (FilesystemType) Name() string {
}
// FilesystemOpts is used to pass configuration data to tmpfs.
+//
+// +stateify savable
type FilesystemOpts struct {
// RootFileType is the FileType of the filesystem root. Valid values
// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
@@ -221,6 +227,8 @@ var globalStatfs = linux.Statfs{
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -300,6 +308,8 @@ func (d *dentry) Watches() *vfs.Watches {
func (d *dentry) OnZeroWatches(context.Context) {}
// inode represents a filesystem object.
+//
+// +stateify savable
type inode struct {
// fs is the owning filesystem. fs is immutable.
fs *filesystem
@@ -316,12 +326,12 @@ type inode struct {
// Inode metadata. Writing multiple fields atomically requires holding
// mu, othewise atomic operations can be used.
- mu sync.Mutex
- mode uint32 // file type and mode
- nlink uint32 // protected by filesystem.mu instead of inode.mu
- uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
- gid uint32 // auth.KGID, but ...
- ino uint64 // immutable
+ mu sync.Mutex `state:"nosave"`
+ mode uint32 // file type and mode
+ nlink uint32 // protected by filesystem.mu instead of inode.mu
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but ...
+ ino uint64 // immutable
// Linux's tmpfs has no concept of btime.
atime int64 // nanoseconds
@@ -668,6 +678,8 @@ func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats
// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index a81e7f714..3129f290d 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -74,9 +74,13 @@ var noCrashOnVerificationFailure bool
var verityMu sync.RWMutex
// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
type FilesystemType struct{}
// filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
type filesystem struct {
vfsfs vfs.Filesystem
@@ -101,11 +105,13 @@ type filesystem struct {
// renameMu synchronizes renaming with non-renaming operations in order
// to ensure consistent lock ordering between dentry.dirMu in different
// dentries.
- renameMu sync.RWMutex
+ renameMu sync.RWMutex `state:"nosave"`
}
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
type InternalFilesystemOptions struct {
// RootMerkleFileName is the name of the verity root Merkle tree file.
RootMerkleFileName string
@@ -259,6 +265,8 @@ func (fs *filesystem) Release(ctx context.Context) {
}
// dentry implements vfs.DentryImpl.
+//
+// +stateify savable
type dentry struct {
vfsd vfs.Dentry
@@ -285,7 +293,7 @@ type dentry struct {
// and dirents (if not nil) is a cache of dirents as returned by
// directoryFDs representing this directory. children is protected by
// dirMu.
- dirMu sync.Mutex
+ dirMu sync.Mutex `state:"nosave"`
children map[string]*dentry
// lowerVD is the VirtualDentry in the underlying file system.
@@ -429,6 +437,8 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
// FileDescription is a wrapper of the underlying lowerFD, with support to build
// Merkle trees through the Linux fs-verity API to verify contents read from
// lowerFD.
+//
+// +stateify savable
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 22f9bb006..08bb5bd12 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -507,6 +507,10 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
+ if VFS2Enabled {
+ return nil // Not relevant.
+ }
+
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
@@ -533,11 +537,6 @@ func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
//
// Precondition: Must be called with the kernel paused.
func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
- // TODO(gvisor.dev/issue/1663): Add save support for VFS2.
- if VFS2Enabled {
- return nil
- }
-
ts.mu.RLock()
defer ts.mu.RUnlock()
for t := range ts.Root.tids {
@@ -556,6 +555,10 @@ func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.Fi
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
+ if VFS2Enabled {
+ return nil
+ }
+
return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
if flags := file.Flags(); !flags.Write {
return nil
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 3688f22d2..b76c0e6fb 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -37,6 +37,8 @@ import (
// SocketVFS2 implements socket.SocketVFS2 (and by extension,
// vfs.FileDescriptionImpl) for Unix sockets.
+//
+// +stateify savable
type SocketVFS2 struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go
index a06c9b8ab..245d2c5cf 100644
--- a/pkg/sentry/state/state.go
+++ b/pkg/sentry/state/state.go
@@ -61,8 +61,10 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error {
log.Infof("Sandbox save started, pausing all tasks.")
k.Pause()
k.ReceiveTaskStates()
- defer k.Unpause()
- defer log.Infof("Tasks resumed after save.")
+ defer func() {
+ k.Unpause()
+ log.Infof("Tasks resumed after save.")
+ }()
w.Stop()
defer w.Start()
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 9c4db3047..bdfd3ca8f 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -52,6 +52,8 @@ const (
)
// anonFilesystemType implements FilesystemType.
+//
+// +stateify savable
type anonFilesystemType struct{}
// GetFilesystem implements FilesystemType.GetFilesystem.
@@ -69,12 +71,15 @@ func (anonFilesystemType) Name() string {
//
// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
// methods that would require an anonDentry to be a directory return ENOTDIR.
+//
+// +stateify savable
type anonFilesystem struct {
vfsfs Filesystem
devMinor uint32
}
+// +stateify savable
type anonDentry struct {
vfsd Dentry
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index a69a5b2f1..320ab7ce1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -89,6 +89,8 @@ func (d *Dentry) Impl() DentryImpl {
// DentryImpl contains implementation details for a Dentry. Implementations of
// DentryImpl should contain their associated Dentry by value as their first
// field.
+//
+// +stateify savable
type DentryImpl interface {
// IncRef increments the Dentry's reference count. A Dentry with a non-zero
// reference count must remain coherent with the state of the filesystem.
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 1e9dffc8f..dde2ad79b 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -22,6 +22,8 @@ import (
)
// DeviceKind indicates whether a device is a block or character device.
+//
+// +stateify savable
type DeviceKind uint32
const (
@@ -44,6 +46,7 @@ func (kind DeviceKind) String() string {
}
}
+// +stateify savable
type devTuple struct {
kind DeviceKind
major uint32
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 754e76aec..8f36c3e3b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -27,6 +27,8 @@ import (
var epollCycleMu sync.Mutex
// EpollInstance represents an epoll instance, as described by epoll(7).
+//
+// +stateify savable
type EpollInstance struct {
vfsfd FileDescription
FileDescriptionDefaultImpl
@@ -38,11 +40,11 @@ type EpollInstance struct {
// interest is the set of file descriptors that are registered with the
// EpollInstance for monitoring. interest is protected by interestMu.
- interestMu sync.Mutex
+ interestMu sync.Mutex `state:"nosave"`
interest map[epollInterestKey]*epollInterest
// mu protects fields in registered epollInterests.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// ready is the set of file descriptors that may be "ready" for I/O. Note
// that this must be an ordered list, not a map: "If more than maxevents
@@ -55,6 +57,7 @@ type EpollInstance struct {
ready epollInterestList
}
+// +stateify savable
type epollInterestKey struct {
// file is the registered FileDescription. No reference is held on file;
// instead, when the last reference is dropped, FileDescription.DecRef()
@@ -67,6 +70,8 @@ type epollInterestKey struct {
}
// epollInterest represents an EpollInstance's interest in a file descriptor.
+//
+// +stateify savable
type epollInterest struct {
// epoll is the owning EpollInstance. epoll is immutable.
epoll *EpollInstance
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 73bb36d3e..1eba0270f 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -37,11 +37,13 @@ import (
// FileDescription methods require that a reference is held.
//
// FileDescription is analogous to Linux's struct file.
+//
+// +stateify savable
type FileDescription struct {
FileDescriptionRefs
// flagsMu protects statusFlags and asyncHandler below.
- flagsMu sync.Mutex
+ flagsMu sync.Mutex `state:"nosave"`
// statusFlags contains status flags, "initialized by open(2) and possibly
// modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
@@ -56,7 +58,7 @@ type FileDescription struct {
// epolls is the set of epollInterests registered for this FileDescription.
// epolls is protected by epollMu.
- epollMu sync.Mutex
+ epollMu sync.Mutex `state:"nosave"`
epolls map[*epollInterest]struct{}
// vd is the filesystem location at which this FileDescription was opened.
@@ -88,6 +90,8 @@ type FileDescription struct {
}
// FileDescriptionOptions contains options to FileDescription.Init().
+//
+// +stateify savable
type FileDescriptionOptions struct {
// If AllowDirectIO is true, allow O_DIRECT to be set on the file.
AllowDirectIO bool
@@ -451,6 +455,8 @@ type FileDescriptionImpl interface {
}
// Dirent holds the information contained in struct linux_dirent64.
+//
+// +stateify savable
type Dirent struct {
// Name is the filename.
Name string
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 78da16bac..48ca9de44 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -42,6 +42,8 @@ import (
// FileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
// methods with default behavior analogous to Linux's.
+//
+// +stateify savable
type FileDescriptionDefaultImpl struct{}
// OnClose implements FileDescriptionImpl.OnClose analogously to
@@ -166,6 +168,8 @@ func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string)
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
+//
+// +stateify savable
type DirectoryFileDescriptionDefaultImpl struct{}
// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate.
@@ -196,6 +200,8 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
// DentryMetadataFileDescriptionImpl may be embedded by implementations of
// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
// true to obtain implementations of Stat and SetStat that panic.
+//
+// +stateify savable
type DentryMetadataFileDescriptionImpl struct{}
// Stat implements FileDescriptionImpl.Stat.
@@ -210,12 +216,16 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt
// DynamicBytesSource represents a data source for a
// DynamicBytesFileDescriptionImpl.
+//
+// +stateify savable
type DynamicBytesSource interface {
// Generate writes the file's contents to buf.
Generate(ctx context.Context, buf *bytes.Buffer) error
}
// StaticData implements DynamicBytesSource over a static string.
+//
+// +stateify savable
type StaticData struct {
Data string
}
@@ -242,14 +252,24 @@ type WritableDynamicBytesSource interface {
//
// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
// use.
+//
+// +stateify savable
type DynamicBytesFileDescriptionImpl struct {
data DynamicBytesSource // immutable
- mu sync.Mutex // protects the following fields
- buf bytes.Buffer
+ mu sync.Mutex `state:"nosave"` // protects the following fields
+ buf bytes.Buffer `state:".([]byte)"`
off int64
lastRead int64 // offset at which the last Read, PRead, or Seek ended
}
+func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte {
+ return fd.buf.Bytes()
+}
+
+func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) {
+ fd.buf.Write(p)
+}
+
// SetDataSource must be called exactly once on fd before first use.
func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
fd.data = data
@@ -382,6 +402,8 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M
// LockFD may be used by most implementations of FileDescriptionImpl.Lock*
// functions. Caller must call Init().
+//
+// +stateify savable
type LockFD struct {
locks *FileLocks
}
@@ -409,6 +431,8 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface
// returning ENOLCK.
+//
+// +stateify savable
type NoLockFD struct{}
// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 7dae4e7e8..c93d94634 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -506,6 +506,8 @@ type FilesystemImpl interface {
// PrependPathAtVFSRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+//
+// +stateify savable
type PrependPathAtVFSRootError struct{}
// Error implements error.Error.
@@ -516,6 +518,8 @@ func (PrependPathAtVFSRootError) Error() string {
// PrependPathAtNonMountRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter an independent ancestor
// Dentry that is not the Mount root.
+//
+// +stateify savable
type PrependPathAtNonMountRootError struct{}
// Error implements error.Error.
@@ -526,6 +530,8 @@ func (PrependPathAtNonMountRootError) Error() string {
// PrependPathSyntheticError is returned by implementations of
// FilesystemImpl.PrependPath() for which prepended names do not represent real
// paths.
+//
+// +stateify savable
type PrependPathSyntheticError struct{}
// Error implements error.Error.
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index 82ae35fdc..bc19db1d5 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -55,6 +55,8 @@ type registeredFilesystemType struct {
// RegisterFilesystemTypeOptions contains options to
// VirtualFilesystem.RegisterFilesystem().
+//
+// +stateify savable
type RegisterFilesystemTypeOptions struct {
// AllowUserMount determines whether users are allowed to mount a file system
// of this type, i.e. through mount(2). If AllowUserMount is true, allow calls
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
index 8882fa84a..2d27d9d35 100644
--- a/pkg/sentry/vfs/genericfstree/genericfstree.go
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -27,6 +27,8 @@ import (
)
// Dentry is a required type parameter that is a struct with the given fields.
+//
+// +stateify savable
type Dentry struct {
// vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
vfsd vfs.Dentry
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
index aff220a61..3f0b8f45b 100644
--- a/pkg/sentry/vfs/inotify.go
+++ b/pkg/sentry/vfs/inotify.go
@@ -37,6 +37,8 @@ const inotifyEventBaseSize = 16
//
// The way events are labelled appears somewhat arbitrary, but they must match
// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+//
+// +stateify savable
type EventType uint8
// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go
index 42666eebf..55783d4eb 100644
--- a/pkg/sentry/vfs/lock.go
+++ b/pkg/sentry/vfs/lock.go
@@ -33,6 +33,8 @@ import (
// Note that in Linux these two types of locks are _not_ cooperative, because
// race and deadlock conditions make merging them prohibitive. We do the same
// and keep them oblivious to each other.
+//
+// +stateify savable
type FileLocks struct {
// bsd is a set of BSD-style advisory file wide locks, see flock(2).
bsd fslock.Locks
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 9da09d4c1..dfc3ae6c0 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -65,7 +65,7 @@ type Mount struct {
//
// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
// key.parent.fs.
- key mountKey
+ key mountKey `state:".(VirtualDentry)"`
// ns is the namespace in which this Mount was mounted. ns is protected by
// VirtualFilesystem.mountMu.
@@ -345,6 +345,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
return nil
}
+// +stateify savable
type umountRecursiveOptions struct {
// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
// on umounted mounts fail.
@@ -414,7 +415,7 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
}
}
mnt.IncRef() // dropped by callers of umountRecursiveLocked
- mnt.storeKey(vd)
+ mnt.setKey(vd)
if vd.mount.children == nil {
vd.mount.children = make(map[*Mount]struct{})
}
@@ -439,13 +440,13 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
// * vfs.mounts.seq must be in a writer critical section.
// * mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
- vd := mnt.loadKey()
+ vd := mnt.getKey()
if checkInvariants {
if vd.mount != nil {
panic("VFS.disconnectLocked called on disconnected mount")
}
}
- mnt.storeKey(VirtualDentry{})
+ mnt.loadKey(VirtualDentry{})
delete(vd.mount.children, mnt)
atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
mnt.ns.mountpoints[vd.dentry]--
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3335e4057..cb8c56bd3 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -38,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) {
mt.Init()
mount := &Mount{}
- mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+ mount.setKey(VirtualDentry{&Mount{}, &Dentry{}})
mt.Insert(mount)
if m := mt.Lookup(mount.parent(), mount.point()); m != mount {
@@ -79,7 +79,7 @@ const enableComparativeBenchmarks = false
func newBenchMount() *Mount {
mount := &Mount{}
- mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+ mount.loadKey(VirtualDentry{&Mount{}, &Dentry{}})
return mount
}
@@ -94,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
mt.Insert(mount)
- keys = append(keys, mount.loadKey())
+ keys = append(keys, mount.saveKey())
}
var ready sync.WaitGroup
@@ -146,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := mount.loadKey()
+ key := mount.saveKey()
ms[key] = mount
keys = append(keys, key)
}
@@ -201,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := mount.loadKey()
+ key := mount.getKey()
ms.Store(key, mount)
keys = append(keys, key)
}
@@ -283,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms[mount.loadKey()] = mount
+ ms[mount.getKey()] = mount
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -318,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
var ms sync.Map
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -372,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms[mount.loadKey()] = mount
+ ms[mount.saveKey()] = mount
}
}
@@ -392,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
}
@@ -425,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := range mounts {
mount := mounts[i]
- ms[mount.loadKey()] = mount
+ ms[mount.saveKey()] = mount
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- delete(ms, mount.loadKey())
+ delete(ms, mount.saveKey())
}
}
@@ -447,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) {
var ms sync.Map
for i := range mounts {
mount := mounts[i]
- ms.Store(mount.loadKey(), mount)
+ ms.Store(mount.saveKey(), mount)
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Delete(mount.loadKey())
+ ms.Delete(mount.saveKey())
}
}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index da2a2e9c4..b7d122d22 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -34,6 +34,8 @@ import (
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
+//
+// This is explicitly not savable.
type mountKey struct {
parent unsafe.Pointer // *Mount
point unsafe.Pointer // *Dentry
@@ -47,19 +49,23 @@ func (mnt *Mount) point() *Dentry {
return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}
-func (mnt *Mount) loadKey() VirtualDentry {
+func (mnt *Mount) getKey() VirtualDentry {
return VirtualDentry{
mount: mnt.parent(),
dentry: mnt.point(),
}
}
+func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() }
+
// Invariant: mnt.key.parent == nil. vd.Ok().
-func (mnt *Mount) storeKey(vd VirtualDentry) {
+func (mnt *Mount) setKey(vd VirtualDentry) {
atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
}
+func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) }
+
// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
@@ -92,6 +98,7 @@ type mountTable struct {
// length and cap in separate uint32s) for ~free.
size uint64
+ // FIXME(gvisor.dev/issue/1663): Slots need to be saved.
slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 413cfb101..bc79e5ecc 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -21,6 +21,8 @@ import (
// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
// FilesystemImpl.GetDentryAt().
+//
+// +stateify savable
type GetDentryOptions struct {
// If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
// the returned Dentry is a directory for which creds has search
@@ -30,6 +32,8 @@ type GetDentryOptions struct {
// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
// FilesystemImpl.MkdirAt().
+//
+// +stateify savable
type MkdirOptions struct {
// Mode is the file mode bits for the created directory.
Mode linux.FileMode
@@ -56,6 +60,8 @@ type MkdirOptions struct {
// MknodOptions contains options to VirtualFilesystem.MknodAt() and
// FilesystemImpl.MknodAt().
+//
+// +stateify savable
type MknodOptions struct {
// Mode is the file type and mode bits for the created file.
Mode linux.FileMode
@@ -72,6 +78,8 @@ type MknodOptions struct {
// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+//
+// +stateify savable
type MountFlags struct {
// NoExec is equivalent to MS_NOEXEC.
NoExec bool
@@ -93,6 +101,8 @@ type MountFlags struct {
}
// MountOptions contains options to VirtualFilesystem.MountAt().
+//
+// +stateify savable
type MountOptions struct {
// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
Flags MountFlags
@@ -112,6 +122,8 @@ type MountOptions struct {
// OpenOptions contains options to VirtualFilesystem.OpenAt() and
// FilesystemImpl.OpenAt().
+//
+// +stateify savable
type OpenOptions struct {
// Flags contains access mode and flags as specified for open(2).
//
@@ -137,6 +149,8 @@ type OpenOptions struct {
// ReadOptions contains options to FileDescription.PRead(),
// FileDescriptionImpl.PRead(), FileDescription.Read(), and
// FileDescriptionImpl.Read().
+//
+// +stateify savable
type ReadOptions struct {
// Flags contains flags as specified for preadv2(2).
Flags uint32
@@ -144,6 +158,8 @@ type ReadOptions struct {
// RenameOptions contains options to VirtualFilesystem.RenameAt() and
// FilesystemImpl.RenameAt().
+//
+// +stateify savable
type RenameOptions struct {
// Flags contains flags as specified for renameat2(2).
Flags uint32
@@ -155,6 +171,8 @@ type RenameOptions struct {
// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
// FileDescriptionImpl.SetStat().
+//
+// +stateify savable
type SetStatOptions struct {
// Stat is the metadata that should be set. Only fields indicated by
// Stat.Mask should be set.
@@ -176,6 +194,8 @@ type SetStatOptions struct {
// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
// and FilesystemImpl.BoundEndpointAt().
+//
+// +stateify savable
type BoundEndpointOptions struct {
// Addr is the path of the file whose socket endpoint is being retrieved.
// It is generally irrelevant: most endpoints are stored at a dentry that
@@ -195,6 +215,8 @@ type BoundEndpointOptions struct {
// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(),
// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and
// FileDescriptionImpl.GetXattr().
+//
+// +stateify savable
type GetXattrOptions struct {
// Name is the name of the extended attribute to retrieve.
Name string
@@ -209,6 +231,8 @@ type GetXattrOptions struct {
// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(),
// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and
// FileDescriptionImpl.SetXattr().
+//
+// +stateify savable
type SetXattrOptions struct {
// Name is the name of the extended attribute being mutated.
Name string
@@ -223,6 +247,8 @@ type SetXattrOptions struct {
// StatOptions contains options to VirtualFilesystem.StatAt(),
// FilesystemImpl.StatAt(), FileDescription.Stat(), and
// FileDescriptionImpl.Stat().
+//
+// +stateify savable
type StatOptions struct {
// Mask is the set of fields in the returned Statx that the FilesystemImpl
// or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
@@ -240,6 +266,8 @@ type StatOptions struct {
}
// UmountOptions contains options to VirtualFilesystem.UmountAt().
+//
+// +stateify savable
type UmountOptions struct {
// Flags contains flags as specified for umount2(2).
Flags uint32
@@ -248,6 +276,8 @@ type UmountOptions struct {
// WriteOptions contains options to FileDescription.PWrite(),
// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
// FileDescriptionImpl.Write().
+//
+// +stateify savable
type WriteOptions struct {
// Flags contains flags as specified for pwritev2(2).
Flags uint32
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 00eeb8842..d48520d58 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -26,6 +26,8 @@ import (
)
// AccessTypes is a bitmask of Unix file permissions.
+//
+// +stateify savable
type AccessTypes uint16
// Bits in AccessTypes.
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 3304372d9..e4fd55012 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -35,6 +35,8 @@ import (
// FilesystemImpl methods.
//
// ResolvingPath is loosely analogous to Linux's struct nameidata.
+//
+// +stateify savable
type ResolvingPath struct {
vfs *VirtualFilesystem
root VirtualDentry // refs borrowed from PathOperation
@@ -88,6 +90,7 @@ func init() {
// so error "constants" are really mutable vars, necessitating somewhat
// expensive interface object comparisons.
+// +stateify savable
type resolveMountRootOrJumpError struct{}
// Error implements error.Error.
@@ -95,6 +98,7 @@ func (resolveMountRootOrJumpError) Error() string {
return "resolving mount root or jump"
}
+// +stateify savable
type resolveMountPointError struct{}
// Error implements error.Error.
@@ -102,6 +106,7 @@ func (resolveMountPointError) Error() string {
return "resolving mount point"
}
+// +stateify savable
type resolveAbsSymlinkError struct{}
// Error implements error.Error.
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 1ebf355ef..5bd756ea5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -163,6 +163,8 @@ func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
+//
+// +stateify savable
type PathOperation struct {
// Root is the VFS root. References on Root are borrowed from the provider
// of the PathOperation.
diff --git a/pkg/state/types.go b/pkg/state/types.go
index 215ef80f8..84aed8732 100644
--- a/pkg/state/types.go
+++ b/pkg/state/types.go
@@ -107,6 +107,14 @@ func lookupNameFields(typ reflect.Type) (string, []string, bool) {
}
return name, nil, true
}
+ // Sanity check the type.
+ if raceEnabled {
+ if _, ok := reverseTypeDatabase[typ]; !ok {
+ // The type was not registered? Must be an embedded
+ // structure or something else.
+ return "", nil, false
+ }
+ }
// Extract the name from the object.
name := t.StateTypeName()
fields := t.StateFields()
@@ -313,6 +321,9 @@ var primitiveTypeDatabase = func() map[string]reflect.Type {
// globalTypeDatabase is used for dispatching interfaces on decode.
var globalTypeDatabase = map[string]reflect.Type{}
+// reverseTypeDatabase is a reverse mapping.
+var reverseTypeDatabase = map[reflect.Type]string{}
+
// Register registers a type.
//
// This must be called on init and only done once.
@@ -358,4 +369,7 @@ func Register(t Type) {
Failf("conflicting name for %T: matches interfaceType", t)
}
globalTypeDatabase[name] = typ
+ if raceEnabled {
+ reverseTypeDatabase[typ] = name
+ }
}
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index b769094dc..19627fa9b 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -339,7 +339,7 @@ func NoChecksum(noChecksum bool) TransportChecker {
udp, ok := h.(header.UDP)
if !ok {
- return
+ t.Fatalf("UDP header not found in h: %T", h)
}
if b := udp.Checksum() == 0; b != noChecksum {
@@ -348,14 +348,14 @@ func NoChecksum(noChecksum bool) TransportChecker {
}
}
-// SeqNum creates a checker that checks the sequence number.
-func SeqNum(seq uint32) TransportChecker {
+// TCPSeqNum creates a checker that checks the sequence number.
+func TCPSeqNum(seq uint32) TransportChecker {
return func(t *testing.T, h header.Transport) {
t.Helper()
tcp, ok := h.(header.TCP)
if !ok {
- return
+ t.Fatalf("TCP header not found in h: %T", h)
}
if s := tcp.SequenceNumber(); s != seq {
@@ -364,14 +364,14 @@ func SeqNum(seq uint32) TransportChecker {
}
}
-// AckNum creates a checker that checks the ack number.
-func AckNum(seq uint32) TransportChecker {
+// TCPAckNum creates a checker that checks the ack number.
+func TCPAckNum(seq uint32) TransportChecker {
return func(t *testing.T, h header.Transport) {
t.Helper()
tcp, ok := h.(header.TCP)
if !ok {
- return
+ t.Fatalf("TCP header not found in h: %T", h)
}
if s := tcp.AckNumber(); s != seq {
@@ -380,18 +380,52 @@ func AckNum(seq uint32) TransportChecker {
}
}
-// Window creates a checker that checks the tcp window.
-func Window(window uint16) TransportChecker {
+// TCPWindow creates a checker that checks the tcp window.
+func TCPWindow(window uint16) TransportChecker {
return func(t *testing.T, h header.Transport) {
t.Helper()
tcp, ok := h.(header.TCP)
if !ok {
- return
+ t.Fatalf("TCP header not found in hdr : %T", h)
}
if w := tcp.WindowSize(); w != window {
- t.Errorf("Bad window, got 0x%x, want 0x%x", w, window)
+ t.Errorf("Bad window, got %d, want %d", w, window)
+ }
+ }
+}
+
+// TCPWindowGreaterThanEq creates a checker that checks that the TCP window
+// is greater than or equal to the provided value.
+func TCPWindowGreaterThanEq(window uint16) TransportChecker {
+ return func(t *testing.T, h header.Transport) {
+ t.Helper()
+
+ tcp, ok := h.(header.TCP)
+ if !ok {
+ t.Fatalf("TCP header not found in h: %T", h)
+ }
+
+ if w := tcp.WindowSize(); w < window {
+ t.Errorf("Bad window, got %d, want > %d", w, window)
+ }
+ }
+}
+
+// TCPWindowLessThanEq creates a checker that checks that the tcp window
+// is less than or equal to the provided value.
+func TCPWindowLessThanEq(window uint16) TransportChecker {
+ return func(t *testing.T, h header.Transport) {
+ t.Helper()
+
+ tcp, ok := h.(header.TCP)
+ if !ok {
+ t.Fatalf("TCP header not found in h: %T", h)
+ }
+
+ if w := tcp.WindowSize(); w > window {
+ t.Errorf("Bad window, got %d, want < %d", w, window)
}
}
}
@@ -403,7 +437,7 @@ func TCPFlags(flags uint8) TransportChecker {
tcp, ok := h.(header.TCP)
if !ok {
- return
+ t.Fatalf("TCP header not found in h: %T", h)
}
if f := tcp.Flags(); f != flags {
@@ -420,7 +454,7 @@ func TCPFlagsMatch(flags, mask uint8) TransportChecker {
tcp, ok := h.(header.TCP)
if !ok {
- return
+ t.Fatalf("TCP header not found in h: %T", h)
}
if f := tcp.Flags(); (f & mask) != (flags & mask) {
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 68cf77de2..e7b7e95d4 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -144,10 +144,7 @@ type TCPReceiverState struct {
// PendingBufUsed is the number of bytes pending in the receive
// queue.
- PendingBufUsed seqnum.Size
-
- // PendingBufSize is the size of the socket receive buffer.
- PendingBufSize seqnum.Size
+ PendingBufUsed int
}
// TCPSenderState holds a copy of the internal state of the sender for
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 4778e7b1c..518449602 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -94,6 +94,7 @@ go_test(
shard_count = 10,
deps = [
":tcp",
+ "//pkg/rand",
"//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 881752371..6891fd245 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -898,7 +898,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
// sendRaw sends a TCP segment to the endpoint's peer.
func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
var sackBlocks []header.SACKBlock
- if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+ if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) {
sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
}
options := e.makeOptions(sackBlocks)
@@ -1003,9 +1003,8 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
// (indicated by a negative send window scale).
e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
- rcvBufSize := seqnum.Size(e.receiveBufferSize())
e.rcvListMu.Lock()
- e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
+ e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale())
// Bootstrap the auto tuning algorithm. Starting at zero will
// result in a really large receive window after the first auto
// tuning adjustment.
@@ -1136,12 +1135,11 @@ func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
}
cont, err := e.handleSegment(s)
+ s.decRef()
if err != nil {
- s.decRef()
return err
}
if !cont {
- s.decRef()
return nil
}
}
@@ -1221,6 +1219,12 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
return true, nil
}
+ // Increase counter if after processing the segment we would potentially
+ // advertise a zero window.
+ if crossed, above := e.windowCrossedACKThresholdLocked(-s.segMemSize()); crossed && !above {
+ e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
+ }
+
// Now check if the received segment has caused us to transition
// to a CLOSED state, if yes then terminate processing and do
// not invoke the sender.
@@ -1233,7 +1237,6 @@ func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
// or a notification from the protocolMainLoop (caller goroutine).
// This means that with this return, the segment dequeue below can
// never occur on a closed endpoint.
- s.decRef()
return false, nil
}
@@ -1425,10 +1428,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
e.rcv.nonZeroWindow()
}
- if n&notifyReceiveWindowChanged != 0 {
- e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize())
- }
-
if n&notifyMTUChanged != 0 {
e.sndBufMu.Lock()
count := e.packetTooBigCount
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index 94207c141..560b4904c 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -78,8 +78,8 @@ func testV4Connect(t *testing.T, c *context.Context, checkers ...checker.Network
ackCheckers := append(checkers, checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(iss)+1),
))
checker.IPv4(t, c.GetPacket(), ackCheckers...)
@@ -185,8 +185,8 @@ func testV6Connect(t *testing.T, c *context.Context, checkers ...checker.Network
ackCheckers := append(checkers, checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(iss)+1),
))
checker.IPv6(t, c.GetV6Packet(), ackCheckers...)
@@ -283,7 +283,7 @@ func TestV4RefuseOnV6Only(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
- checker.AckNum(uint32(irs)+1),
+ checker.TCPAckNum(uint32(irs)+1),
),
)
}
@@ -319,7 +319,7 @@ func TestV6RefuseOnBoundToV4Mapped(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
- checker.AckNum(uint32(irs)+1),
+ checker.TCPAckNum(uint32(irs)+1),
),
)
}
@@ -352,7 +352,7 @@ func testV4Accept(t *testing.T, c *context.Context) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
- checker.AckNum(uint32(irs)+1),
+ checker.TCPAckNum(uint32(irs)+1),
),
)
@@ -492,7 +492,7 @@ func TestV6AcceptOnV6(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
- checker.AckNum(uint32(irs)+1),
+ checker.TCPAckNum(uint32(irs)+1),
),
)
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 120483838..87db13720 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -63,6 +63,17 @@ const (
StateClosing
)
+const (
+ // rcvAdvWndScale is used to split the available socket buffer into
+ // application buffer and the window to be advertised to the peer. This is
+ // currently hard coded to split the available space equally.
+ rcvAdvWndScale = 1
+
+ // SegOverheadFactor is used to multiply the value provided by the
+ // user on a SetSockOpt for setting the socket send/receive buffer sizes.
+ SegOverheadFactor = 2
+)
+
// connected returns true when s is one of the states representing an
// endpoint connected to a peer.
func (s EndpointState) connected() bool {
@@ -149,7 +160,6 @@ func (s EndpointState) String() string {
// Reasons for notifying the protocol goroutine.
const (
notifyNonZeroReceiveWindow = 1 << iota
- notifyReceiveWindowChanged
notifyClose
notifyMTUChanged
notifyDrain
@@ -384,13 +394,26 @@ type endpoint struct {
// to indicate to users that no more data is coming.
//
// rcvListMu can be taken after the endpoint mu below.
- rcvListMu sync.Mutex `state:"nosave"`
- rcvList segmentList `state:"wait"`
- rcvClosed bool
- rcvBufSize int
+ rcvListMu sync.Mutex `state:"nosave"`
+ rcvList segmentList `state:"wait"`
+ rcvClosed bool
+ // rcvBufSize is the total size of the receive buffer.
+ rcvBufSize int
+ // rcvBufUsed is the actual number of payload bytes held in the receive buffer
+ // not counting any overheads of the segments itself. NOTE: This will always
+ // be strictly <= rcvMemUsed below.
rcvBufUsed int
rcvAutoParams rcvBufAutoTuneParams
+ // rcvMemUsed tracks the total amount of memory in use by received segments
+ // held in rcvList, pendingRcvdSegments and the segment queue. This is used to
+ // compute the window and the actual available buffer space. This is distinct
+ // from rcvBufUsed above which is the actual number of payload bytes held in
+ // the buffer not including any segment overheads.
+ //
+ // rcvMemUsed must be accessed atomically.
+ rcvMemUsed int32
+
// mu protects all endpoint fields unless documented otherwise. mu must
// be acquired before interacting with the endpoint fields.
mu sync.Mutex `state:"nosave"`
@@ -891,7 +914,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
e.probe = p
}
- e.segmentQueue.setLimit(MaxUnprocessedSegments)
+ e.segmentQueue.ep = e
e.tsOffset = timeStampOffset()
e.acceptCond = sync.NewCond(&e.acceptMu)
@@ -1129,10 +1152,16 @@ func (e *endpoint) cleanupLocked() {
tcpip.DeleteDanglingEndpoint(e)
}
+// wndFromSpace returns the window that we can advertise based on the available
+// receive buffer space.
+func wndFromSpace(space int) int {
+ return space / (1 << rcvAdvWndScale)
+}
+
// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *endpoint) initialReceiveWindow() int {
- rcvWnd := e.receiveBufferAvailable()
+ rcvWnd := wndFromSpace(e.receiveBufferAvailable())
if rcvWnd > math.MaxUint16 {
rcvWnd = math.MaxUint16
}
@@ -1209,14 +1238,12 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
// reject valid data that might already be in flight as the
// acceptable window will shrink.
if rcvWnd > e.rcvBufSize {
- availBefore := e.receiveBufferAvailableLocked()
+ availBefore := wndFromSpace(e.receiveBufferAvailableLocked())
e.rcvBufSize = rcvWnd
- availAfter := e.receiveBufferAvailableLocked()
- mask := uint32(notifyReceiveWindowChanged)
+ availAfter := wndFromSpace(e.receiveBufferAvailableLocked())
if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
- mask |= notifyNonZeroReceiveWindow
+ e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
}
- e.notifyProtocolGoroutine(mask)
}
// We only update prevCopied when we grow the buffer because in cases
@@ -1293,18 +1320,22 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
v := views[s.viewToDeliver]
s.viewToDeliver++
+ var delta int
if s.viewToDeliver >= len(views) {
e.rcvList.Remove(s)
+ // We only free up receive buffer space when the segment is released as the
+ // segment is still holding on to the views even though some views have been
+ // read out to the user.
+ delta = s.segMemSize()
s.decRef()
}
e.rcvBufUsed -= len(v)
-
// If the window was small before this read and if the read freed up
// enough buffer space, to either fit an aMSS or half a receive buffer
// (whichever smaller), then notify the protocol goroutine to send a
// window update.
- if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
+ if crossed, above := e.windowCrossedACKThresholdLocked(delta); crossed && above {
e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
}
@@ -1481,11 +1512,11 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
}
// windowCrossedACKThresholdLocked checks if the receive window to be announced
-// now would be under aMSS or under half receive buffer, whichever smaller. This
-// is useful as a receive side silly window syndrome prevention mechanism. If
-// window grows to reasonable value, we should send ACK to the sender to inform
-// the rx space is now large. We also want ensure a series of small read()'s
-// won't trigger a flood of spurious tiny ACK's.
+// would be under aMSS or under the window derived from half receive buffer,
+// whichever smaller. This is useful as a receive side silly window syndrome
+// prevention mechanism. If window grows to reasonable value, we should send ACK
+// to the sender to inform the rx space is now large. We also want ensure a
+// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
@@ -1496,17 +1527,18 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
//
// Precondition: e.mu and e.rcvListMu must be held.
func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
- newAvail := e.receiveBufferAvailableLocked()
+ newAvail := wndFromSpace(e.receiveBufferAvailableLocked())
oldAvail := newAvail - deltaBefore
if oldAvail < 0 {
oldAvail = 0
}
-
threshold := int(e.amss)
- if threshold > e.rcvBufSize/2 {
- threshold = e.rcvBufSize / 2
+ // rcvBufFraction is the inverse of the fraction of receive buffer size that
+ // is used to decide if the available buffer space is now above it.
+ const rcvBufFraction = 2
+ if wndThreshold := wndFromSpace(e.rcvBufSize / rcvBufFraction); threshold > wndThreshold {
+ threshold = wndThreshold
}
-
switch {
case oldAvail < threshold && newAvail >= threshold:
return true, true
@@ -1636,17 +1668,23 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
// Make sure the receive buffer size is within the min and max
// allowed.
var rs tcpip.TCPReceiveBufferSizeRangeOption
- if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+ if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
+ panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &rs, err))
+ }
+
+ if v > rs.Max {
+ v = rs.Max
+ }
+
+ if v < math.MaxInt32/SegOverheadFactor {
+ v *= SegOverheadFactor
if v < rs.Min {
v = rs.Min
}
- if v > rs.Max {
- v = rs.Max
- }
+ } else {
+ v = math.MaxInt32
}
- mask := uint32(notifyReceiveWindowChanged)
-
e.LockUser()
e.rcvListMu.Lock()
@@ -1660,14 +1698,9 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
v = 1 << scale
}
- // Make sure 2*size doesn't overflow.
- if v > math.MaxInt32/2 {
- v = math.MaxInt32 / 2
- }
-
- availBefore := e.receiveBufferAvailableLocked()
+ availBefore := wndFromSpace(e.receiveBufferAvailableLocked())
e.rcvBufSize = v
- availAfter := e.receiveBufferAvailableLocked()
+ availAfter := wndFromSpace(e.receiveBufferAvailableLocked())
e.rcvAutoParams.disabled = true
@@ -1675,24 +1708,31 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
// syndrome prevetion, when our available space grows above aMSS
// or half receive buffer, whichever smaller.
if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
- mask |= notifyNonZeroReceiveWindow
+ e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
}
e.rcvListMu.Unlock()
e.UnlockUser()
- e.notifyProtocolGoroutine(mask)
case tcpip.SendBufferSizeOption:
// Make sure the send buffer size is within the min and max
// allowed.
var ss tcpip.TCPSendBufferSizeRangeOption
- if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
+ if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err != nil {
+ panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &ss, err))
+ }
+
+ if v > ss.Max {
+ v = ss.Max
+ }
+
+ if v < math.MaxInt32/SegOverheadFactor {
+ v *= SegOverheadFactor
if v < ss.Min {
v = ss.Min
}
- if v > ss.Max {
- v = ss.Max
- }
+ } else {
+ v = math.MaxInt32
}
e.sndBufMu.Lock()
@@ -2699,13 +2739,8 @@ func (e *endpoint) updateSndBufferUsage(v int) {
func (e *endpoint) readyToRead(s *segment) {
e.rcvListMu.Lock()
if s != nil {
+ e.rcvBufUsed += s.payloadSize()
s.incRef()
- e.rcvBufUsed += s.data.Size()
- // Increase counter if the receive window falls down below MSS
- // or half receive buffer size, whichever smaller.
- if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
- e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
- }
e.rcvList.PushBack(s)
} else {
e.rcvClosed = true
@@ -2720,15 +2755,17 @@ func (e *endpoint) readyToRead(s *segment) {
func (e *endpoint) receiveBufferAvailableLocked() int {
// We may use more bytes than the buffer size when the receive buffer
// shrinks.
- if e.rcvBufUsed >= e.rcvBufSize {
+ memUsed := e.receiveMemUsed()
+ if memUsed >= e.rcvBufSize {
return 0
}
- return e.rcvBufSize - e.rcvBufUsed
+ return e.rcvBufSize - memUsed
}
// receiveBufferAvailable calculates how many bytes are still available in the
-// receive buffer.
+// receive buffer based on the actual memory used by all segments held in
+// receive buffer/pending and segment queue.
func (e *endpoint) receiveBufferAvailable() int {
e.rcvListMu.Lock()
available := e.receiveBufferAvailableLocked()
@@ -2736,14 +2773,35 @@ func (e *endpoint) receiveBufferAvailable() int {
return available
}
+// receiveBufferUsed returns the amount of in-use receive buffer.
+func (e *endpoint) receiveBufferUsed() int {
+ e.rcvListMu.Lock()
+ used := e.rcvBufUsed
+ e.rcvListMu.Unlock()
+ return used
+}
+
+// receiveBufferSize returns the current size of the receive buffer.
func (e *endpoint) receiveBufferSize() int {
e.rcvListMu.Lock()
size := e.rcvBufSize
e.rcvListMu.Unlock()
-
return size
}
+// receiveMemUsed returns the total memory in use by segments held by this
+// endpoint.
+func (e *endpoint) receiveMemUsed() int {
+ return int(atomic.LoadInt32(&e.rcvMemUsed))
+}
+
+// updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
+func (e *endpoint) updateReceiveMemUsed(delta int) {
+ atomic.AddInt32(&e.rcvMemUsed, int32(delta))
+}
+
+// maxReceiveBufferSize returns the stack wide maximum receive buffer size for
+// an endpoint.
func (e *endpoint) maxReceiveBufferSize() int {
var rs tcpip.TCPReceiveBufferSizeRangeOption
if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
@@ -2894,7 +2952,6 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
RcvAcc: e.rcv.rcvAcc,
RcvWndScale: e.rcv.rcvWndScale,
PendingBufUsed: e.rcv.pendingBufUsed,
- PendingBufSize: e.rcv.pendingBufSize,
}
// Copy sender state.
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 41d0050f3..b25431467 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -44,7 +44,7 @@ func (e *endpoint) drainSegmentLocked() {
// beforeSave is invoked by stateify.
func (e *endpoint) beforeSave() {
// Stop incoming packets.
- e.segmentQueue.setLimit(0)
+ e.segmentQueue.freeze()
e.mu.Lock()
defer e.mu.Unlock()
@@ -178,7 +178,7 @@ func (e *endpoint) afterLoad() {
// Resume implements tcpip.ResumableEndpoint.Resume.
func (e *endpoint) Resume(s *stack.Stack) {
e.stack = s
- e.segmentQueue.setLimit(MaxUnprocessedSegments)
+ e.segmentQueue.thaw()
epState := e.origEndpointState
switch epState {
case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index cfd43b5e3..4aafb4d22 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -47,22 +47,24 @@ type receiver struct {
closed bool
+ // pendingRcvdSegments is bounded by the receive buffer size of the
+ // endpoint.
pendingRcvdSegments segmentHeap
- pendingBufUsed seqnum.Size
- pendingBufSize seqnum.Size
+ // pendingBufUsed tracks the total number of bytes (including segment
+ // overhead) currently queued in pendingRcvdSegments.
+ pendingBufUsed int
// Time when the last ack was received.
lastRcvdAckTime time.Time `state:".(unixTime)"`
}
-func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
+func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8) *receiver {
return &receiver{
ep: ep,
rcvNxt: irs + 1,
rcvAcc: irs.Add(rcvWnd + 1),
rcvWnd: rcvWnd,
rcvWndScale: rcvWndScale,
- pendingBufSize: pendingBufSize,
lastRcvdAckTime: time.Now(),
}
}
@@ -85,15 +87,23 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
// getSendParams returns the parameters needed by the sender when building
// segments to send.
func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
- // Calculate the window size based on the available buffer space.
- receiveBufferAvailable := r.ep.receiveBufferAvailable()
- acc := r.rcvNxt.Add(seqnum.Size(receiveBufferAvailable))
- if r.rcvAcc.LessThan(acc) {
- r.rcvAcc = acc
+ avail := wndFromSpace(r.ep.receiveBufferAvailable())
+ acc := r.rcvNxt.Add(seqnum.Size(avail))
+ newWnd := r.rcvNxt.Size(acc)
+ curWnd := r.rcvNxt.Size(r.rcvAcc)
+
+ // Update rcvAcc only if new window is > previously advertised window. We
+ // should never shrink the acceptable sequence space once it has been
+ // advertised the peer. If we shrink the acceptable sequence space then we
+ // would end up dropping bytes that might already be in flight.
+ if newWnd > curWnd {
+ r.rcvAcc = r.rcvNxt.Add(newWnd)
+ } else {
+ newWnd = curWnd
}
// Stash away the non-scaled receive window as we use it for measuring
// receiver's estimated RTT.
- r.rcvWnd = r.rcvNxt.Size(r.rcvAcc)
+ r.rcvWnd = newWnd
return r.rcvNxt, r.rcvWnd >> r.rcvWndScale
}
@@ -195,7 +205,9 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
}
for i := first; i < len(r.pendingRcvdSegments); i++ {
+ r.pendingBufUsed -= r.pendingRcvdSegments[i].segMemSize()
r.pendingRcvdSegments[i].decRef()
+
// Note that slice truncation does not allow garbage collection of
// truncated items, thus truncated items must be set to nil to avoid
// memory leaks.
@@ -384,10 +396,16 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
// Defer segment processing if it can't be consumed now.
if !r.consumeSegment(s, segSeq, segLen) {
if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
- // We only store the segment if it's within our buffer
- // size limit.
- if r.pendingBufUsed < r.pendingBufSize {
- r.pendingBufUsed += seqnum.Size(s.segMemSize())
+ // We only store the segment if it's within our buffer size limit.
+ //
+ // Only use 75% of the receive buffer queue for out-of-order
+ // segments. This ensures that we always leave some space for the inorder
+ // segments to arrive allowing pending segments to be processed and
+ // delivered to the user.
+ if r.ep.receiveBufferAvailable() > 0 && r.pendingBufUsed < r.ep.receiveBufferSize()>>2 {
+ r.ep.rcvListMu.Lock()
+ r.pendingBufUsed += s.segMemSize()
+ r.ep.rcvListMu.Unlock()
s.incRef()
heap.Push(&r.pendingRcvdSegments, s)
UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt)
@@ -421,7 +439,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
}
heap.Pop(&r.pendingRcvdSegments)
- r.pendingBufUsed -= seqnum.Size(s.segMemSize())
+ r.ep.rcvListMu.Lock()
+ r.pendingBufUsed -= s.segMemSize()
+ r.ep.rcvListMu.Unlock()
s.decRef()
}
return false, nil
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 94307d31a..13acaf753 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -15,6 +15,7 @@
package tcp
import (
+ "fmt"
"sync/atomic"
"time"
@@ -24,6 +25,15 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
+// queueFlags are used to indicate which queue of an endpoint a particular segment
+// belongs to. This is used to track memory accounting correctly.
+type queueFlags uint8
+
+const (
+ recvQ queueFlags = 1 << iota
+ sendQ
+)
+
// segment represents a TCP segment. It holds the payload and parsed TCP segment
// information, and can be added to intrusive lists.
// segment is mostly immutable, the only field allowed to change is viewToDeliver.
@@ -32,6 +42,8 @@ import (
type segment struct {
segmentEntry
refCnt int32
+ ep *endpoint
+ qFlags queueFlags
id stack.TransportEndpointID `state:"manual"`
route stack.Route `state:"manual"`
data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
@@ -100,6 +112,8 @@ func (s *segment) clone() *segment {
rcvdTime: s.rcvdTime,
xmitTime: s.xmitTime,
xmitCount: s.xmitCount,
+ ep: s.ep,
+ qFlags: s.qFlags,
}
t.data = s.data.Clone(t.views[:])
return t
@@ -115,8 +129,34 @@ func (s *segment) flagsAreSet(flags uint8) bool {
return s.flags&flags == flags
}
+// setOwner sets the owning endpoint for this segment. Its required
+// to be called to ensure memory accounting for receive/send buffer
+// queues is done properly.
+func (s *segment) setOwner(ep *endpoint, qFlags queueFlags) {
+ switch qFlags {
+ case recvQ:
+ ep.updateReceiveMemUsed(s.segMemSize())
+ case sendQ:
+ // no memory account for sendQ yet.
+ default:
+ panic(fmt.Sprintf("unexpected queue flag %b", qFlags))
+ }
+ s.ep = ep
+ s.qFlags = qFlags
+}
+
func (s *segment) decRef() {
if atomic.AddInt32(&s.refCnt, -1) == 0 {
+ if s.ep != nil {
+ switch s.qFlags {
+ case recvQ:
+ s.ep.updateReceiveMemUsed(-s.segMemSize())
+ case sendQ:
+ // no memory accounting for sendQ yet.
+ default:
+ panic(fmt.Sprintf("unexpected queue flag %b set for segment", s.qFlags))
+ }
+ }
s.route.Release()
}
}
@@ -138,6 +178,11 @@ func (s *segment) logicalLen() seqnum.Size {
return l
}
+// payloadSize is the size of s.data.
+func (s *segment) payloadSize() int {
+ return s.data.Size()
+}
+
// segMemSize is the amount of memory used to hold the segment data and
// the associated metadata.
func (s *segment) segMemSize() int {
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index 48a257137..54545a1b1 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -22,16 +22,16 @@ import (
//
// +stateify savable
type segmentQueue struct {
- mu sync.Mutex `state:"nosave"`
- list segmentList `state:"wait"`
- limit int
- used int
+ mu sync.Mutex `state:"nosave"`
+ list segmentList `state:"wait"`
+ ep *endpoint
+ frozen bool
}
// emptyLocked determines if the queue is empty.
// Preconditions: q.mu must be held.
func (q *segmentQueue) emptyLocked() bool {
- return q.used == 0
+ return q.list.Empty()
}
// empty determines if the queue is empty.
@@ -43,14 +43,6 @@ func (q *segmentQueue) empty() bool {
return r
}
-// setLimit updates the limit. No segments are immediately dropped in case the
-// queue becomes full due to the new limit.
-func (q *segmentQueue) setLimit(limit int) {
- q.mu.Lock()
- q.limit = limit
- q.mu.Unlock()
-}
-
// enqueue adds the given segment to the queue.
//
// Returns true when the segment is successfully added to the queue, in which
@@ -58,15 +50,23 @@ func (q *segmentQueue) setLimit(limit int) {
// false if the queue is full, in which case ownership is retained by the
// caller.
func (q *segmentQueue) enqueue(s *segment) bool {
+ // q.ep.receiveBufferParams() must be called without holding q.mu to
+ // avoid lock order inversion.
+ bufSz := q.ep.receiveBufferSize()
+ used := q.ep.receiveMemUsed()
q.mu.Lock()
- r := q.used < q.limit
- if r {
+ // Allow zero sized segments (ACK/FIN/RSTs etc even if the segment queue
+ // is currently full).
+ allow := (used <= bufSz || s.payloadSize() == 0) && !q.frozen
+
+ if allow {
q.list.PushBack(s)
- q.used++
+ // Set the owner now that the endpoint owns the segment.
+ s.setOwner(q.ep, recvQ)
}
q.mu.Unlock()
- return r
+ return allow
}
// dequeue removes and returns the next segment from queue, if one exists.
@@ -77,9 +77,25 @@ func (q *segmentQueue) dequeue() *segment {
s := q.list.Front()
if s != nil {
q.list.Remove(s)
- q.used--
}
q.mu.Unlock()
return s
}
+
+// freeze prevents any more segments from being added to the queue. i.e all
+// future segmentQueue.enqueue will return false and not add the segment to the
+// queue till the queue is unfroze with a corresponding segmentQueue.thaw call.
+func (q *segmentQueue) freeze() {
+ q.mu.Lock()
+ q.frozen = true
+ q.mu.Unlock()
+}
+
+// thaw unfreezes a previously frozen queue using segmentQueue.freeze() and
+// allows new segments to be queued again.
+func (q *segmentQueue) thaw() {
+ q.mu.Lock()
+ q.frozen = false
+ q.mu.Unlock()
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 8b2217a98..8326736dc 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -21,6 +21,7 @@ import (
"testing"
"time"
+ "gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -349,8 +350,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(uint32(iss)+1),
checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
finHeaders := &context.Headers{
SrcPort: context.TestPort,
@@ -380,8 +381,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(0),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(0),
checker.TCPFlags(header.TCPFlagRst)))
}
@@ -479,8 +480,8 @@ func TestConnectResetAfterClose(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -521,8 +522,8 @@ func TestConnectResetAfterClose(t *testing.T) {
// RST is always generated with sndNxt which if the FIN
// has been sent will be 1 higher than the sequence number
// of the FIN itself.
- checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(0),
+ checker.TCPSeqNum(uint32(c.IRS)+2),
+ checker.TCPAckNum(0),
checker.TCPFlags(header.TCPFlagRst),
),
)
@@ -561,8 +562,8 @@ func TestCurrentConnectedIncrement(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -597,8 +598,8 @@ func TestCurrentConnectedIncrement(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(791),
+ checker.TCPSeqNum(uint32(c.IRS)+2),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -644,8 +645,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(791),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -665,8 +666,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(791),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -725,8 +726,8 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(0),
+ checker.TCPSeqNum(uint32(c.IRS)+2),
+ checker.TCPAckNum(0),
checker.TCPFlags(header.TCPFlagRst),
),
)
@@ -777,8 +778,8 @@ func TestSimpleReceive(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+len(data))),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1030,7 +1031,7 @@ func TestSendRstOnListenerRxSynAckV4(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst),
- checker.SeqNum(200)))
+ checker.TCPSeqNum(200)))
}
func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
@@ -1058,7 +1059,7 @@ func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
checker.IPv6(t, c.GetV6Packet(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst),
- checker.SeqNum(200)))
+ checker.TCPSeqNum(200)))
}
// TestTCPAckBeforeAcceptV4 tests that once the 3-way handshake is complete,
@@ -1095,8 +1096,8 @@ func TestTCPAckBeforeAcceptV4(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(iss+1)),
- checker.AckNum(uint32(irs+5))))
+ checker.TCPSeqNum(uint32(iss+1)),
+ checker.TCPAckNum(uint32(irs+5))))
}
// TestTCPAckBeforeAcceptV6 tests that once the 3-way handshake is complete,
@@ -1133,8 +1134,8 @@ func TestTCPAckBeforeAcceptV6(t *testing.T) {
checker.IPv6(t, c.GetV6Packet(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(iss+1)),
- checker.AckNum(uint32(irs+5))))
+ checker.TCPSeqNum(uint32(iss+1)),
+ checker.TCPAckNum(uint32(irs+5))))
}
func TestSendRstOnListenerRxAckV4(t *testing.T) {
@@ -1162,7 +1163,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst),
- checker.SeqNum(200)))
+ checker.TCPSeqNum(200)))
}
func TestSendRstOnListenerRxAckV6(t *testing.T) {
@@ -1190,7 +1191,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
checker.IPv6(t, c.GetV6Packet(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst),
- checker.SeqNum(200)))
+ checker.TCPSeqNum(200)))
}
// TestListenShutdown tests for the listening endpoint replying with RST
@@ -1306,8 +1307,8 @@ func TestTOSV4(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790), // Acknum is initial sequence number + 1
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790), // Acknum is initial sequence number + 1
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
checker.TOS(tos, 0),
@@ -1355,8 +1356,8 @@ func TestTrafficClassV6(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
checker.TOS(tos, 0),
@@ -1546,8 +1547,8 @@ func TestOutOfOrderReceive(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1597,8 +1598,8 @@ func TestOutOfOrderReceive(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+len(data))),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1608,8 +1609,8 @@ func TestOutOfOrderFlood(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
- // Create a new connection with initial window size of 10.
- c.CreateConnected(789, 30000, 10)
+ rcvBufSz := math.MaxUint16
+ c.CreateConnected(789, 30000, rcvBufSz)
if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
@@ -1630,8 +1631,8 @@ func TestOutOfOrderFlood(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1651,8 +1652,8 @@ func TestOutOfOrderFlood(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1671,8 +1672,8 @@ func TestOutOfOrderFlood(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(793),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(793),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1713,8 +1714,8 @@ func TestRstOnCloseWithUnreadData(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+len(data))),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1728,7 +1729,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) {
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
// We shouldn't consume a sequence number on RST.
- checker.SeqNum(uint32(c.IRS)+1),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
))
// The RST puts the endpoint into an error state.
if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
@@ -1782,8 +1783,8 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+len(data))),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -1796,7 +1797,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
- checker.SeqNum(uint32(c.IRS)+1),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
))
if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
@@ -1815,7 +1816,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
// RST is always generated with sndNxt which if the FIN
// has been sent will be 1 higher than the sequence
// number of the FIN itself.
- checker.SeqNum(uint32(c.IRS)+2),
+ checker.TCPSeqNum(uint32(c.IRS)+2),
))
// The RST puts the endpoint into an error state.
if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
@@ -1861,7 +1862,8 @@ func TestFullWindowReceive(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
- c.CreateConnected(789, 30000, 10)
+ const rcvBufSz = 10
+ c.CreateConnected(789, 30000, rcvBufSz)
we, ch := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&we, waiter.EventIn)
@@ -1872,8 +1874,13 @@ func TestFullWindowReceive(t *testing.T) {
t.Fatalf("Read failed: %s", err)
}
- // Fill up the window.
- data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+ // Fill up the window w/ tcp.SegOverheadFactor*rcvBufSz as netstack multiplies
+ // the provided buffer value by tcp.SegOverheadFactor to calculate the actual
+ // receive buffer size.
+ data := make([]byte, tcp.SegOverheadFactor*rcvBufSz)
+ for i := range data {
+ data[i] = byte(i % 255)
+ }
c.SendPacket(data, &context.Headers{
SrcPort: context.TestPort,
DstPort: c.Port,
@@ -1894,10 +1901,10 @@ func TestFullWindowReceive(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+len(data))),
checker.TCPFlags(header.TCPFlagAck),
- checker.Window(0),
+ checker.TCPWindow(0),
),
)
@@ -1920,10 +1927,10 @@ func TestFullWindowReceive(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+len(data))),
checker.TCPFlags(header.TCPFlagAck),
- checker.Window(10),
+ checker.TCPWindow(10),
),
)
}
@@ -1932,12 +1939,15 @@ func TestNoWindowShrinking(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
- // Start off with a window size of 10, then shrink it to 5.
- c.CreateConnected(789, 30000, 10)
-
- if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil {
- t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %s", err)
- }
+ // Start off with a certain receive buffer then cut it in half and verify that
+ // the right edge of the window does not shrink.
+ // NOTE: Netstack doubles the value specified here.
+ rcvBufSize := 65536
+ iss := seqnum.Value(789)
+ // Enable window scaling with a scale of zero from our end.
+ c.CreateConnectedWithRawOptions(iss, 30000, rcvBufSize, []byte{
+ header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
+ })
we, ch := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&we, waiter.EventIn)
@@ -1946,14 +1956,15 @@ func TestNoWindowShrinking(t *testing.T) {
if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
}
-
- // Send 3 bytes, check that the peer acknowledges them.
- data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
- c.SendPacket(data[:3], &context.Headers{
+ // Send a 1 byte payload so that we can record the current receive window.
+ // Send a payload of half the size of rcvBufSize.
+ seqNum := iss.Add(1)
+ payload := []byte{1}
+ c.SendPacket(payload, &context.Headers{
SrcPort: context.TestPort,
DstPort: c.Port,
Flags: header.TCPFlagAck,
- SeqNum: 790,
+ SeqNum: seqNum,
AckNum: c.IRS.Add(1),
RcvWnd: 30000,
})
@@ -1965,46 +1976,93 @@ func TestNoWindowShrinking(t *testing.T) {
t.Fatalf("Timed out waiting for data to arrive")
}
- // Check that data is acknowledged, and that window doesn't go to zero
- // just yet because it was previously set to 10. It must go to 7 now.
- checker.IPv4(t, c.GetPacket(),
+ // Read the 1 byte payload we just sent.
+ v, _, err := c.EP.Read(nil)
+ if err != nil {
+ t.Fatalf("Read failed: %s", err)
+ }
+ if got, want := payload, v; !bytes.Equal(got, want) {
+ t.Fatalf("got data: %v, want: %v", got, want)
+ }
+
+ seqNum = seqNum.Add(1)
+ // Verify that the ACK does not shrink the window.
+ pkt := c.GetPacket()
+ checker.IPv4(t, pkt,
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(793),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(seqNum)),
checker.TCPFlags(header.TCPFlagAck),
- checker.Window(7),
),
)
+ // Stash the initial window.
+ initialWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale
+ initialLastAcceptableSeq := seqNum.Add(seqnum.Size(initialWnd))
+ // Now shrink the receive buffer to half its original size.
+ if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufSize/2); err != nil {
+ t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %s", err)
+ }
- // Send 7 more bytes, check that the window fills up.
- c.SendPacket(data[3:], &context.Headers{
+ data := generateRandomPayload(t, rcvBufSize)
+ // Send a payload of half the size of rcvBufSize.
+ c.SendPacket(data[:rcvBufSize/2], &context.Headers{
SrcPort: context.TestPort,
DstPort: c.Port,
Flags: header.TCPFlagAck,
- SeqNum: 793,
+ SeqNum: seqNum,
AckNum: c.IRS.Add(1),
RcvWnd: 30000,
})
+ seqNum = seqNum.Add(seqnum.Size(rcvBufSize / 2))
- select {
- case <-ch:
- case <-time.After(5 * time.Second):
- t.Fatalf("Timed out waiting for data to arrive")
+ // Verify that the ACK does not shrink the window.
+ pkt = c.GetPacket()
+ checker.IPv4(t, pkt,
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(seqNum)),
+ checker.TCPFlags(header.TCPFlagAck),
+ ),
+ )
+ newWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize() << c.RcvdWindowScale
+ newLastAcceptableSeq := seqNum.Add(seqnum.Size(newWnd))
+ if newLastAcceptableSeq.LessThan(initialLastAcceptableSeq) {
+ t.Fatalf("receive window shrunk unexpectedly got: %d, want >= %d", newLastAcceptableSeq, initialLastAcceptableSeq)
}
+ // Send another payload of half the size of rcvBufSize. This should fill up the
+ // socket receive buffer and we should see a zero window.
+ c.SendPacket(data[rcvBufSize/2:], &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: c.Port,
+ Flags: header.TCPFlagAck,
+ SeqNum: seqNum,
+ AckNum: c.IRS.Add(1),
+ RcvWnd: 30000,
+ })
+ seqNum = seqNum.Add(seqnum.Size(rcvBufSize / 2))
+
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(seqNum)),
checker.TCPFlags(header.TCPFlagAck),
- checker.Window(0),
+ checker.TCPWindow(0),
),
)
+ // Wait for receive to be notified.
+ select {
+ case <-ch:
+ case <-time.After(5 * time.Second):
+ t.Fatalf("Timed out waiting for data to arrive")
+ }
+
// Receive data and check it.
- read := make([]byte, 0, 10)
+ read := make([]byte, 0, rcvBufSize)
for len(read) < len(data) {
v, _, err := c.EP.Read(nil)
if err != nil {
@@ -2018,15 +2076,15 @@ func TestNoWindowShrinking(t *testing.T) {
t.Fatalf("got data = %v, want = %v", read, data)
}
- // Check that we get an ACK for the newly non-zero window, which is the
- // new size.
+ // Check that we get an ACK for the newly non-zero window, which is the new
+ // receive buffer size we set after the connection was established.
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(seqNum)),
checker.TCPFlags(header.TCPFlagAck),
- checker.Window(5),
+ checker.TCPWindow(uint16(rcvBufSize/2)>>c.RcvdWindowScale),
),
)
}
@@ -2051,8 +2109,8 @@ func TestSimpleSend(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2093,8 +2151,8 @@ func TestZeroWindowSend(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2115,8 +2173,8 @@ func TestZeroWindowSend(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2155,16 +2213,16 @@ func TestScaledWindowConnect(t *testing.T) {
t.Fatalf("Write failed: %s", err)
}
- // Check that data is received, and that advertised window is 0xbfff,
+ // Check that data is received, and that advertised window is 0x5fff,
// that is, that it is scaled.
b := c.GetPacket()
checker.IPv4(t, b,
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
- checker.Window(0xbfff),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
+ checker.TCPWindow(0x5fff),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2194,9 +2252,9 @@ func TestNonScaledWindowConnect(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
- checker.Window(0xffff),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
+ checker.TCPWindow(0xffff),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2230,7 +2288,8 @@ func TestScaledWindowAccept(t *testing.T) {
}
// Do 3-way handshake.
- c.PassiveConnectWithOptions(100, 2, header.TCPSynOptions{MSS: defaultIPv4MSS})
+ // wndScale expected is 3 as 65535 * 3 * 2 < 65535 * 2^3 but > 65535 *2 *2
+ c.PassiveConnectWithOptions(100, 3 /* wndScale */, header.TCPSynOptions{MSS: defaultIPv4MSS})
// Try to accept the connection.
we, ch := waiter.NewChannelEntry(nil)
@@ -2260,16 +2319,16 @@ func TestScaledWindowAccept(t *testing.T) {
t.Fatalf("Write failed: %s", err)
}
- // Check that data is received, and that advertised window is 0xbfff,
+ // Check that data is received, and that advertised window is 0x5fff,
// that is, that it is scaled.
b := c.GetPacket()
checker.IPv4(t, b,
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
- checker.Window(0xbfff),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
+ checker.TCPWindow(0x5fff),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2341,9 +2400,9 @@ func TestNonScaledWindowAccept(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
- checker.Window(0xffff),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
+ checker.TCPWindow(0xffff),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2356,18 +2415,19 @@ func TestZeroScaledWindowReceive(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
- // Set the window size such that a window scale of 4 will be used.
- const wnd = 65535 * 10
- const ws = uint32(4)
- c.CreateConnectedWithRawOptions(789, 30000, wnd, []byte{
+ // Set the buffer size such that a window scale of 5 will be used.
+ const bufSz = 65535 * 10
+ const ws = uint32(5)
+ c.CreateConnectedWithRawOptions(789, 30000, bufSz, []byte{
header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
})
// Write chunks of 50000 bytes.
- remain := wnd
+ remain := 0
sent := 0
data := make([]byte, 50000)
- for remain > len(data) {
+ // Keep writing till the window drops below len(data).
+ for {
c.SendPacket(data, &context.Headers{
SrcPort: context.TestPort,
DstPort: c.Port,
@@ -2377,21 +2437,25 @@ func TestZeroScaledWindowReceive(t *testing.T) {
RcvWnd: 30000,
})
sent += len(data)
- remain -= len(data)
- checker.IPv4(t, c.GetPacket(),
+ pkt := c.GetPacket()
+ checker.IPv4(t, pkt,
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+sent)),
- checker.Window(uint16(remain>>ws)),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+sent)),
checker.TCPFlags(header.TCPFlagAck),
),
)
+ // Don't reduce window to zero here.
+ if wnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()); wnd<<ws < len(data) {
+ remain = wnd << ws
+ break
+ }
}
// Make the window non-zero, but the scaled window zero.
- if remain >= 16 {
+ for remain >= 16 {
data = data[:remain-15]
c.SendPacket(data, &context.Headers{
SrcPort: context.TestPort,
@@ -2402,22 +2466,35 @@ func TestZeroScaledWindowReceive(t *testing.T) {
RcvWnd: 30000,
})
sent += len(data)
- remain -= len(data)
- checker.IPv4(t, c.GetPacket(),
+ pkt := c.GetPacket()
+ checker.IPv4(t, pkt,
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+sent)),
- checker.Window(0),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+sent)),
checker.TCPFlags(header.TCPFlagAck),
),
)
+ // Since the receive buffer is split between window advertisement and
+ // application data buffer the window does not always reflect the space
+ // available and actual space available can be a bit more than what is
+ // advertised in the window.
+ wnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize())
+ if wnd == 0 {
+ break
+ }
+ remain = wnd << ws
}
- // Read at least 1MSS of data. An ack should be sent in response to that.
+ // Read at least 2MSS of data. An ack should be sent in response to that.
+ // Since buffer space is now split in half between window and application
+ // data we need to read more than 1 MSS(65536) of data for a non-zero window
+ // update to be sent. For 1MSS worth of window to be available we need to
+ // read at least 128KB. Since our segments above were 50KB each it means
+ // we need to read at 3 packets.
sz := 0
- for sz < defaultMTU {
+ for sz < defaultMTU*2 {
v, _, err := c.EP.Read(nil)
if err != nil {
t.Fatalf("Read failed: %s", err)
@@ -2429,9 +2506,9 @@ func TestZeroScaledWindowReceive(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+sent)),
- checker.Window(uint16(sz>>ws)),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+sent)),
+ checker.TCPWindowGreaterThanEq(uint16(defaultMTU>>ws)),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -2498,8 +2575,8 @@ func TestSegmentMerging(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize+1),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+uint32(i)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+uint32(i)+1),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2521,8 +2598,8 @@ func TestSegmentMerging(t *testing.T) {
checker.PayloadLen(len(allData)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+11),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+11),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2569,8 +2646,8 @@ func TestDelay(t *testing.T) {
checker.PayloadLen(len(want)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(seq)),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(seq)),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2616,8 +2693,8 @@ func TestUndelay(t *testing.T) {
checker.PayloadLen(len(allData[0])+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(seq)),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(seq)),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2639,8 +2716,8 @@ func TestUndelay(t *testing.T) {
checker.PayloadLen(len(allData[1])+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(seq)),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(seq)),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2701,8 +2778,8 @@ func TestMSSNotDelayed(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(seq)),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(seq)),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2753,8 +2830,8 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1+uint32(bytesReceived)),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1+uint32(bytesReceived)),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -2996,7 +3073,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
// Set the buffer size to a deterministic size so that we can check the
// window scaling option.
const rcvBufferSize = 0x20000
- const wndScale = 2
+ const wndScale = 3
if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed failed: %s", rcvBufferSize, err)
}
@@ -3031,7 +3108,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagSyn),
checker.SrcPort(tcpHdr.SourcePort()),
- checker.SeqNum(tcpHdr.SequenceNumber()),
+ checker.TCPSeqNum(tcpHdr.SequenceNumber()),
checker.TCPSynOptions(header.TCPSynOptions{MSS: mss, WS: wndScale}),
),
)
@@ -3052,8 +3129,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(iss)+1),
),
)
@@ -3346,8 +3423,8 @@ func TestFinImmediately(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -3367,8 +3444,8 @@ func TestFinImmediately(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(791),
+ checker.TCPSeqNum(uint32(c.IRS)+2),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -3389,8 +3466,8 @@ func TestFinRetransmit(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -3400,8 +3477,8 @@ func TestFinRetransmit(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -3421,8 +3498,8 @@ func TestFinRetransmit(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(791),
+ checker.TCPSeqNum(uint32(c.IRS)+2),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -3445,8 +3522,8 @@ func TestFinWithNoPendingData(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3470,8 +3547,8 @@ func TestFinWithNoPendingData(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -3492,8 +3569,8 @@ func TestFinWithNoPendingData(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(791),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -3520,8 +3597,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3539,8 +3616,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3559,8 +3636,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -3580,8 +3657,8 @@ func TestFinWithPendingDataCwndFull(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(791),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -3604,8 +3681,8 @@ func TestFinWithPendingData(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3629,8 +3706,8 @@ func TestFinWithPendingData(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3645,8 +3722,8 @@ func TestFinWithPendingData(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -3666,8 +3743,8 @@ func TestFinWithPendingData(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(791),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -3691,8 +3768,8 @@ func TestFinWithPartialAck(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3712,8 +3789,8 @@ func TestFinWithPartialAck(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(791),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(791),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3727,8 +3804,8 @@ func TestFinWithPartialAck(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(791),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(791),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3743,8 +3820,8 @@ func TestFinWithPartialAck(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(791),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(791),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -3835,8 +3912,8 @@ func scaledSendWindow(t *testing.T, scale uint8) {
checker.PayloadLen((1<<scale)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -3974,7 +4051,7 @@ func TestReceivedSegmentQueuing(t *testing.T) {
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -4025,8 +4102,8 @@ func TestReadAfterClosedState(t *testing.T) {
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
),
)
@@ -4050,8 +4127,8 @@ func TestReadAfterClosedState(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(uint32(791+len(data))),
+ checker.TCPSeqNum(uint32(c.IRS)+2),
+ checker.TCPAckNum(uint32(791+len(data))),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -4312,14 +4389,14 @@ func TestMinMaxBufferSizes(t *testing.T) {
}
}
- // Set values below the min.
- if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 199); err != nil {
+ // Set values below the min/2.
+ if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 99); err != nil {
t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 199) failed: %s", err)
}
checkRecvBufferSize(t, ep, 200)
- if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 299); err != nil {
+ if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 149); err != nil {
t.Fatalf("SetSockOptInt(SendBufferSizeOption, 299) failed: %s", err)
}
@@ -4330,13 +4407,15 @@ func TestMinMaxBufferSizes(t *testing.T) {
t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
}
- checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20)
+ // Values above max are capped at max and then doubled.
+ checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20*2)
if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil {
t.Fatalf("SetSockOptInt(SendBufferSizeOption) failed: %s", err)
}
- checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30)
+ // Values above max are capped at max and then doubled.
+ checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30*2)
}
func TestBindToDeviceOption(t *testing.T) {
@@ -4678,8 +4757,8 @@ func TestPathMTUDiscovery(t *testing.T) {
checker.PayloadLen(size+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(seqNum),
- checker.AckNum(790),
+ checker.TCPSeqNum(seqNum),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -4930,8 +5009,8 @@ func TestKeepalive(t *testing.T) {
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)),
- checker.AckNum(uint32(790)),
+ checker.TCPSeqNum(uint32(c.IRS)),
+ checker.TCPAckNum(uint32(790)),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -4964,8 +5043,8 @@ func TestKeepalive(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -4976,8 +5055,8 @@ func TestKeepalive(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh),
),
)
@@ -5002,8 +5081,8 @@ func TestKeepalive(t *testing.T) {
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(next-1)),
- checker.AckNum(uint32(790)),
+ checker.TCPSeqNum(uint32(next-1)),
+ checker.TCPAckNum(uint32(790)),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -5029,8 +5108,8 @@ func TestKeepalive(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(next)),
- checker.AckNum(uint32(0)),
+ checker.TCPSeqNum(uint32(next)),
+ checker.TCPAckNum(uint32(0)),
checker.TCPFlags(header.TCPFlagRst),
),
)
@@ -5070,7 +5149,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
checker.SrcPort(context.StackPort),
checker.DstPort(srcPort),
checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
- checker.AckNum(uint32(irs) + 1),
+ checker.TCPAckNum(uint32(irs) + 1),
}
if synCookieInUse {
@@ -5114,7 +5193,7 @@ func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCoo
checker.SrcPort(context.StackPort),
checker.DstPort(srcPort),
checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
- checker.AckNum(uint32(irs) + 1),
+ checker.TCPAckNum(uint32(irs) + 1),
}
if synCookieInUse {
@@ -5348,7 +5427,7 @@ func TestListenNoAcceptNonUnicastV4(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
- checker.AckNum(uint32(irs)+1)))
+ checker.TCPAckNum(uint32(irs)+1)))
})
}
}
@@ -5448,7 +5527,7 @@ func TestListenNoAcceptNonUnicastV6(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
- checker.AckNum(uint32(irs)+1)))
+ checker.TCPAckNum(uint32(irs)+1)))
})
}
}
@@ -5496,7 +5575,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
- checker.AckNum(uint32(irs) + 1),
+ checker.TCPAckNum(uint32(irs) + 1),
}
checker.IPv4(t, b, checker.TCP(tcpCheckers...))
@@ -5674,7 +5753,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
- checker.AckNum(uint32(irs) + 1),
+ checker.TCPAckNum(uint32(irs) + 1),
}
checker.IPv4(t, b, checker.TCP(tcpCheckers...))
@@ -5695,8 +5774,8 @@ func TestSynRcvdBadSeqNumber(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.AckNum(uint32(irs) + 1),
- checker.SeqNum(uint32(iss + 1)),
+ checker.TCPAckNum(uint32(irs) + 1),
+ checker.TCPSeqNum(uint32(iss + 1)),
}
checker.IPv4(t, b, checker.TCP(tcpCheckers...))
@@ -5994,16 +6073,14 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
time.Sleep(latency)
rawEP.SendPacketWithTS([]byte{1}, tsVal)
- // Verify that the ACK has the expected window.
- wantRcvWnd := receiveBufferSize
- wantRcvWnd = (wantRcvWnd >> uint32(c.WindowScale))
- rawEP.VerifyACKRcvWnd(uint16(wantRcvWnd - 1))
+ pkt := rawEP.VerifyAndReturnACKWithTS(tsVal)
+ rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize()
time.Sleep(25 * time.Millisecond)
// Allocate a large enough payload for the test.
- b := make([]byte, int(receiveBufferSize)*2)
- offset := 0
- payloadSize := receiveBufferSize - 1
+ payloadSize := receiveBufferSize * 2
+ b := make([]byte, int(payloadSize))
+
worker := (c.EP).(interface {
StopWork()
ResumeWork()
@@ -6012,11 +6089,15 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
// Stop the worker goroutine.
worker.StopWork()
- start := offset
- end := offset + payloadSize
+ start := 0
+ end := payloadSize / 2
packetsSent := 0
for ; start < end; start += mss {
- rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
+ packetEnd := start + mss
+ if start+mss > end {
+ packetEnd = end
+ }
+ rawEP.SendPacketWithTS(b[start:packetEnd], tsVal)
packetsSent++
}
@@ -6024,29 +6105,20 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
// are waiting to be read.
worker.ResumeWork()
- // Since we read no bytes the window should goto zero till the
- // application reads some of the data.
- // Discard all intermediate acks except the last one.
- if packetsSent > 100 {
- for i := 0; i < (packetsSent / 100); i++ {
- _ = c.GetPacket()
- }
+ // Since we sent almost the full receive buffer worth of data (some may have
+ // been dropped due to segment overheads), we should get a zero window back.
+ pkt = c.GetPacket()
+ tcpHdr := header.TCP(header.IPv4(pkt).Payload())
+ gotRcvWnd := tcpHdr.WindowSize()
+ wantAckNum := tcpHdr.AckNumber()
+ if got, want := int(gotRcvWnd), 0; got != want {
+ t.Fatalf("got rcvWnd: %d, want: %d", got, want)
}
- rawEP.VerifyACKRcvWnd(0)
time.Sleep(25 * time.Millisecond)
- // Verify that sending more data when window is closed is dropped and
- // not acked.
+ // Verify that sending more data when receiveBuffer is exhausted.
rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
- // Verify that the stack sends us back an ACK with the sequence number
- // of the last packet sent indicating it was dropped.
- p := c.GetPacket()
- checker.IPv4(t, p, checker.TCP(
- checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)),
- checker.Window(0),
- ))
-
// Now read all the data from the endpoint and verify that advertised
// window increases to the full available buffer size.
for {
@@ -6059,23 +6131,26 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
// Verify that we receive a non-zero window update ACK. When running
// under thread santizer this test can end up sending more than 1
// ack, 1 for the non-zero window
- p = c.GetPacket()
+ p := c.GetPacket()
checker.IPv4(t, p, checker.TCP(
- checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)),
+ checker.TCPAckNum(uint32(wantAckNum)),
func(t *testing.T, h header.Transport) {
tcp, ok := h.(header.TCP)
if !ok {
return
}
- if w := tcp.WindowSize(); w == 0 || w > uint16(wantRcvWnd) {
- t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w)
+ // We use 10% here as the error margin upwards as the initial window we
+ // got was afer 1 segment was already in the receive buffer queue.
+ tolerance := 1.1
+ if w := tcp.WindowSize(); w == 0 || w > uint16(float64(rcvWnd)*tolerance) {
+ t.Errorf("expected a non-zero window: got %d, want <= %d", w, uint16(float64(rcvWnd)*tolerance))
}
},
))
}
-// This test verifies that the auto tuning does not grow the receive buffer if
-// the application is not reading the data actively.
+// This test verifies that the advertised window is auto-tuned up as the
+// application is reading the data that is being received.
func TestReceiveBufferAutoTuning(t *testing.T) {
const mtu = 1500
const mss = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
@@ -6085,9 +6160,6 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
// Enable Auto-tuning.
stk := c.Stack()
- // Set lower limits for auto-tuning tests. This is required because the
- // test stops the worker which can cause packets to be dropped because
- // the segment queue holding unprocessed packets is limited to 300.
const receiveBufferSize = 80 << 10 // 80KB.
const maxReceiveBufferSize = receiveBufferSize * 10
{
@@ -6109,8 +6181,10 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
c.WindowScale = uint8(tcp.FindWndScale(maxReceiveBufferSize))
rawEP := c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, WS: 4})
-
- wantRcvWnd := receiveBufferSize
+ tsVal := uint32(rawEP.TSVal)
+ rawEP.SendPacketWithTS([]byte{1}, tsVal)
+ pkt := rawEP.VerifyAndReturnACKWithTS(tsVal)
+ curRcvWnd := int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) << c.WindowScale
scaleRcvWnd := func(rcvWnd int) uint16 {
return uint16(rcvWnd >> uint16(c.WindowScale))
}
@@ -6127,14 +6201,8 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
StopWork()
ResumeWork()
})
- tsVal := rawEP.TSVal
- // We are going to do our own computation of what the moderated receive
- // buffer should be based on sent/copied data per RTT and verify that
- // the advertised window by the stack matches our calculations.
- prevCopied := 0
- done := false
latency := 1 * time.Millisecond
- for i := 0; !done; i++ {
+ for i := 0; i < 5; i++ {
tsVal++
// Stop the worker goroutine.
@@ -6156,15 +6224,20 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
// Give 1ms for the worker to process the packets.
time.Sleep(1 * time.Millisecond)
- // Verify that the advertised window on the ACK is reduced by
- // the total bytes sent.
- expectedWnd := wantRcvWnd - totalSent
- if packetsSent > 100 {
- for i := 0; i < (packetsSent / 100); i++ {
- _ = c.GetPacket()
+ lastACK := c.GetPacket()
+ // Discard any intermediate ACKs and only check the last ACK we get in a
+ // short time period of few ms.
+ for {
+ time.Sleep(1 * time.Millisecond)
+ pkt := c.GetPacketNonBlocking()
+ if pkt == nil {
+ break
}
+ lastACK = pkt
+ }
+ if got, want := int(header.TCP(header.IPv4(lastACK).Payload()).WindowSize()), int(scaleRcvWnd(curRcvWnd)); got > want {
+ t.Fatalf("advertised window got: %d, want <= %d", got, want)
}
- rawEP.VerifyACKRcvWnd(scaleRcvWnd(expectedWnd))
// Now read all the data from the endpoint and invoke the
// moderation API to allow for receive buffer auto-tuning
@@ -6189,35 +6262,20 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
rawEP.NextSeqNum--
rawEP.SendPacketWithTS(nil, tsVal)
rawEP.NextSeqNum++
-
if i == 0 {
// In the first iteration the receiver based RTT is not
// yet known as a result the moderation code should not
// increase the advertised window.
- rawEP.VerifyACKRcvWnd(scaleRcvWnd(wantRcvWnd))
- prevCopied = totalCopied
+ rawEP.VerifyACKRcvWnd(scaleRcvWnd(curRcvWnd))
} else {
- rttCopied := totalCopied
- if i == 1 {
- // The moderation code accumulates copied bytes till
- // RTT is established. So add in the bytes sent in
- // the first iteration to the total bytes for this
- // RTT.
- rttCopied += prevCopied
- // Now reset it to the initial value used by the
- // auto tuning logic.
- prevCopied = tcp.InitialCwnd * mss * 2
- }
- newWnd := rttCopied<<1 + 16*mss
- grow := (newWnd * (rttCopied - prevCopied)) / prevCopied
- newWnd += (grow << 1)
- if newWnd > maxReceiveBufferSize {
- newWnd = maxReceiveBufferSize
- done = true
+ pkt := c.GetPacket()
+ curRcvWnd = int(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) << c.WindowScale
+ // If thew new current window is close maxReceiveBufferSize then terminate
+ // the loop. This can happen before all iterations are done due to timing
+ // differences when running the test.
+ if int(float64(curRcvWnd)*1.1) > maxReceiveBufferSize/2 {
+ break
}
- rawEP.VerifyACKRcvWnd(scaleRcvWnd(newWnd))
- wantRcvWnd = newWnd
- prevCopied = rttCopied
// Increase the latency after first two iterations to
// establish a low RTT value in the receiver since it
// only tracks the lowest value. This ensures that when
@@ -6230,6 +6288,12 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
offset += payloadSize
payloadSize *= 2
}
+ // Check that at the end of our iterations the receive window grew close to the maximum
+ // permissible size of maxReceiveBufferSize/2
+ if got, want := int(float64(curRcvWnd)*1.1), maxReceiveBufferSize/2; got < want {
+ t.Fatalf("unexpected rcvWnd got: %d, want > %d", got, want)
+ }
+
}
func TestDelayEnabled(t *testing.T) {
@@ -6381,8 +6445,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(uint32(iss)+1),
checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
finHeaders := &context.Headers{
@@ -6399,8 +6463,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+2)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+2)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
// Now send a RST and this should be ignored and not
@@ -6428,8 +6492,8 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+2)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+2)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
}
@@ -6500,8 +6564,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(uint32(iss)+1),
checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
finHeaders := &context.Headers{
@@ -6518,8 +6582,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+2)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+2)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
// Out of order ACK should generate an immediate ACK in
@@ -6535,8 +6599,8 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+2)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+2)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
}
@@ -6607,8 +6671,8 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(uint32(iss)+1),
checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
finHeaders := &context.Headers{
@@ -6625,8 +6689,8 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+2)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+2)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
// Send a SYN request w/ sequence number lower than
@@ -6764,8 +6828,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(uint32(iss)+1),
checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
finHeaders := &context.Headers{
@@ -6782,8 +6846,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+2)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+2)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
time.Sleep(2 * time.Second)
@@ -6797,8 +6861,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+2)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+2)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
// Sleep for 4 seconds so at this point we are 1 second past the
@@ -6826,8 +6890,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(ackHeaders.AckNum)),
- checker.AckNum(0),
+ checker.TCPSeqNum(uint32(ackHeaders.AckNum)),
+ checker.TCPAckNum(0),
checker.TCPFlags(header.TCPFlagRst)))
if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
@@ -6926,8 +6990,8 @@ func TestTCPCloseWithData(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(iss)+2),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(uint32(iss)+2),
checker.TCPFlags(header.TCPFlagAck)))
// Now write a few bytes and then close the endpoint.
@@ -6945,8 +7009,8 @@ func TestTCPCloseWithData(t *testing.T) {
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(iss)+2), // Acknum is initial sequence number + 1
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(iss)+2), // Acknum is initial sequence number + 1
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -6960,8 +7024,8 @@ func TestTCPCloseWithData(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))),
- checker.AckNum(uint32(iss+2)),
+ checker.TCPSeqNum(uint32(c.IRS+1)+uint32(len(data))),
+ checker.TCPAckNum(uint32(iss+2)),
checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
// First send a partial ACK.
@@ -7006,8 +7070,8 @@ func TestTCPCloseWithData(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(ackHeaders.AckNum)),
- checker.AckNum(0),
+ checker.TCPSeqNum(uint32(ackHeaders.AckNum)),
+ checker.TCPAckNum(0),
checker.TCPFlags(header.TCPFlagRst)))
}
@@ -7043,8 +7107,8 @@ func TestTCPUserTimeout(t *testing.T) {
checker.PayloadLen(len(view)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(next),
- checker.AckNum(790),
+ checker.TCPSeqNum(next),
+ checker.TCPAckNum(790),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -7078,8 +7142,8 @@ func TestTCPUserTimeout(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(next)),
- checker.AckNum(uint32(0)),
+ checker.TCPSeqNum(uint32(next)),
+ checker.TCPAckNum(uint32(0)),
checker.TCPFlags(header.TCPFlagRst),
),
)
@@ -7140,8 +7204,8 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)),
- checker.AckNum(uint32(790)),
+ checker.TCPSeqNum(uint32(c.IRS)),
+ checker.TCPAckNum(uint32(790)),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -7166,8 +7230,8 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(0)),
+ checker.TCPSeqNum(uint32(c.IRS+1)),
+ checker.TCPAckNum(uint32(0)),
checker.TCPFlags(header.TCPFlagRst),
),
)
@@ -7183,9 +7247,9 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
}
}
-func TestIncreaseWindowOnReceive(t *testing.T) {
+func TestIncreaseWindowOnRead(t *testing.T) {
// This test ensures that the endpoint sends an ack,
- // after recv() when the window grows to more than 1 MSS.
+ // after read() when the window grows by more than 1 MSS.
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -7194,10 +7258,9 @@ func TestIncreaseWindowOnReceive(t *testing.T) {
// Write chunks of ~30000 bytes. It's important that two
// payloads make it equal or longer than MSS.
- remain := rcvBuf
+ remain := rcvBuf * 2
sent := 0
data := make([]byte, defaultMTU/2)
- lastWnd := uint16(0)
for remain > len(data) {
c.SendPacket(data, &context.Headers{
@@ -7210,46 +7273,43 @@ func TestIncreaseWindowOnReceive(t *testing.T) {
})
sent += len(data)
remain -= len(data)
-
- lastWnd = uint16(remain)
- if remain > 0xffff {
- lastWnd = 0xffff
- }
- checker.IPv4(t, c.GetPacket(),
+ pkt := c.GetPacket()
+ checker.IPv4(t, pkt,
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+sent)),
- checker.Window(lastWnd),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+sent)),
checker.TCPFlags(header.TCPFlagAck),
),
)
+ // Break once the window drops below defaultMTU/2
+ if wnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize(); wnd < defaultMTU/2 {
+ break
+ }
}
- if lastWnd == 0xffff || lastWnd == 0 {
- t.Fatalf("expected small, non-zero window: %d", lastWnd)
- }
-
- // We now have < 1 MSS in the buffer space. Read the data! An
- // ack should be sent in response to that. The window was not
- // zero, but it grew to larger than MSS.
- if _, _, err := c.EP.Read(nil); err != nil {
- t.Fatalf("Read failed: %s", err)
- }
-
- if _, _, err := c.EP.Read(nil); err != nil {
- t.Fatalf("Read failed: %s", err)
+ // We now have < 1 MSS in the buffer space. Read at least > 2 MSS
+ // worth of data as receive buffer space
+ read := 0
+ // defaultMTU is a good enough estimate for the MSS used for this
+ // connection.
+ for read < defaultMTU*2 {
+ v, _, err := c.EP.Read(nil)
+ if err != nil {
+ t.Fatalf("Read failed: %s", err)
+ }
+ read += len(v)
}
- // After reading two packets, we surely crossed MSS. See the ack:
+ // After reading > MSS worth of data, we surely crossed MSS. See the ack:
checker.IPv4(t, c.GetPacket(),
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+sent)),
- checker.Window(uint16(0xffff)),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+sent)),
+ checker.TCPWindow(uint16(0xffff)),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -7266,10 +7326,9 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
// Write chunks of ~30000 bytes. It's important that two
// payloads make it equal or longer than MSS.
- remain := rcvBuf
+ remain := rcvBuf * 2
sent := 0
data := make([]byte, defaultMTU/2)
- lastWnd := uint16(0)
for remain > len(data) {
c.SendPacket(data, &context.Headers{
@@ -7283,38 +7342,29 @@ func TestIncreaseWindowOnBufferResize(t *testing.T) {
sent += len(data)
remain -= len(data)
- lastWnd = uint16(remain)
- if remain > 0xffff {
- lastWnd = 0xffff
- }
checker.IPv4(t, c.GetPacket(),
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+sent)),
- checker.Window(lastWnd),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+sent)),
+ checker.TCPWindowLessThanEq(0xffff),
checker.TCPFlags(header.TCPFlagAck),
),
)
}
- if lastWnd == 0xffff || lastWnd == 0 {
- t.Fatalf("expected small, non-zero window: %d", lastWnd)
- }
-
// Increasing the buffer from should generate an ACK,
// since window grew from small value to larger equal MSS
c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBuf*2)
- // After reading two packets, we surely crossed MSS. See the ack:
checker.IPv4(t, c.GetPacket(),
checker.PayloadLen(header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(790+sent)),
- checker.Window(uint16(0xffff)),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(790+sent)),
+ checker.TCPWindow(uint16(0xffff)),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -7359,8 +7409,8 @@ func TestTCPDeferAccept(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(iss+1)),
- checker.AckNum(uint32(irs+5))))
+ checker.TCPSeqNum(uint32(iss+1)),
+ checker.TCPAckNum(uint32(irs+5))))
// Give a bit of time for the socket to be delivered to the accept queue.
time.Sleep(50 * time.Millisecond)
@@ -7374,8 +7424,8 @@ func TestTCPDeferAccept(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
- checker.SeqNum(uint32(iss+1)),
- checker.AckNum(uint32(irs+5))))
+ checker.TCPSeqNum(uint32(iss+1)),
+ checker.TCPAckNum(uint32(irs+5))))
}
func TestTCPDeferAcceptTimeout(t *testing.T) {
@@ -7412,7 +7462,7 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
- checker.AckNum(uint32(irs)+1)))
+ checker.TCPAckNum(uint32(irs)+1)))
// Send data. This should result in an acceptable endpoint.
c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
@@ -7428,8 +7478,8 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(iss+1)),
- checker.AckNum(uint32(irs+5))))
+ checker.TCPSeqNum(uint32(iss+1)),
+ checker.TCPAckNum(uint32(irs+5))))
// Give sometime for the endpoint to be delivered to the accept queue.
time.Sleep(50 * time.Millisecond)
@@ -7444,8 +7494,8 @@ func TestTCPDeferAcceptTimeout(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
- checker.SeqNum(uint32(iss+1)),
- checker.AckNum(uint32(irs+5))))
+ checker.TCPSeqNum(uint32(iss+1)),
+ checker.TCPAckNum(uint32(irs+5))))
}
func TestResetDuringClose(t *testing.T) {
@@ -7470,8 +7520,8 @@ func TestResetDuringClose(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(irs.Add(1))),
- checker.AckNum(uint32(iss.Add(5)))))
+ checker.TCPSeqNum(uint32(irs.Add(1))),
+ checker.TCPAckNum(uint32(iss.Add(5)))))
// Close in a separate goroutine so that we can trigger
// a race with the RST we send below. This should not
@@ -7552,3 +7602,14 @@ func TestSetStackTimeWaitReuse(t *testing.T) {
}
}
}
+
+// generateRandomPayload generates a random byte slice of the specified length
+// causing a fatal test failure if it is unable to do so.
+func generateRandomPayload(t *testing.T, n int) []byte {
+ t.Helper()
+ buf := make([]byte, n)
+ if _, err := rand.Read(buf); err != nil {
+ t.Fatalf("rand.Read(buf) failed: %s", err)
+ }
+ return buf
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
index 44593ed98..0f9ed06cd 100644
--- a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -159,9 +159,9 @@ func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndS
checker.PayloadLen(len(data)+header.TCPMinimumSize+12),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
- checker.Window(wndSize),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
+ checker.TCPWindow(wndSize),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
checker.TCPTimestampChecker(true, 0, tsVal+1),
),
@@ -181,7 +181,8 @@ func TestTimeStampEnabledAccept(t *testing.T) {
wndSize uint16
}{
{true, -1, 0xffff}, // When cookie is used window scaling is disabled.
- {false, 5, 0x8000}, // DefaultReceiveBufferSize is 1MB >> 5.
+ // DefaultReceiveBufferSize is 1MB >> 5. Advertised window will be 1/2 of that.
+ {false, 5, 0x4000},
}
for _, tc := range testCases {
timeStampEnabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize)
@@ -219,9 +220,9 @@ func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wnd
checker.PayloadLen(len(data)+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(context.TestPort),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(790),
- checker.Window(wndSize),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(790),
+ checker.TCPWindow(wndSize),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
checker.TCPTimestampChecker(false, 0, 0),
),
@@ -237,7 +238,9 @@ func TestTimeStampDisabledAccept(t *testing.T) {
wndSize uint16
}{
{true, -1, 0xffff}, // When cookie is used window scaling is disabled.
- {false, 5, 0x8000}, // DefaultReceiveBufferSize is 1MB >> 5.
+ // DefaultReceiveBufferSize is 1MB >> 5. Advertised window will be half of
+ // that.
+ {false, 5, 0x4000},
}
for _, tc := range testCases {
timeStampDisabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize)
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 059c13821..ebbae6e2f 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -145,6 +145,10 @@ type Context struct {
// WindowScale is the expected window scale in SYN packets sent by
// the stack.
WindowScale uint8
+
+ // RcvdWindowScale is the actual window scale sent by the stack in
+ // SYN/SYN-ACK.
+ RcvdWindowScale uint8
}
// New allocates and initializes a test context containing a new
@@ -261,18 +265,17 @@ func (c *Context) CheckNoPacket(errMsg string) {
c.CheckNoPacketTimeout(errMsg, 1*time.Second)
}
-// GetPacket reads a packet from the link layer endpoint and verifies
+// GetPacketWithTimeout reads a packet from the link layer endpoint and verifies
// that it is an IPv4 packet with the expected source and destination
-// addresses. It will fail with an error if no packet is received for
-// 2 seconds.
-func (c *Context) GetPacket() []byte {
+// addresses. If no packet is received in the specified timeout it will return
+// nil.
+func (c *Context) GetPacketWithTimeout(timeout time.Duration) []byte {
c.t.Helper()
- ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
p, ok := c.linkEP.ReadContext(ctx)
if !ok {
- c.t.Fatalf("Packet wasn't written out")
return nil
}
@@ -299,6 +302,21 @@ func (c *Context) GetPacket() []byte {
return b
}
+// GetPacket reads a packet from the link layer endpoint and verifies
+// that it is an IPv4 packet with the expected source and destination
+// addresses.
+func (c *Context) GetPacket() []byte {
+ c.t.Helper()
+
+ p := c.GetPacketWithTimeout(5 * time.Second)
+ if p == nil {
+ c.t.Fatalf("Packet wasn't written out")
+ return nil
+ }
+
+ return p
+}
+
// GetPacketNonBlocking reads a packet from the link layer endpoint
// and verifies that it is an IPv4 packet with the expected source
// and destination address. If no packet is available it will return
@@ -486,8 +504,8 @@ func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, op
checker.PayloadLen(size+header.TCPMinimumSize+optlen),
checker.TCP(
checker.DstPort(TestPort),
- checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
- checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
+ checker.TCPSeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
+ checker.TCPAckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -513,8 +531,8 @@ func (c *Context) ReceiveNonBlockingAndCheckPacket(data []byte, offset, size int
checker.PayloadLen(size+header.TCPMinimumSize),
checker.TCP(
checker.DstPort(TestPort),
- checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
- checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
+ checker.TCPSeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
+ checker.TCPAckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
),
)
@@ -652,6 +670,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
}
tcpHdr := header.TCP(header.IPv4(b).Payload())
+ synOpts := header.ParseSynOptions(tcpHdr.Options(), false /* isAck */)
c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
c.SendPacket(nil, &Headers{
@@ -669,8 +688,8 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
checker.TCP(
checker.DstPort(TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(c.IRS)+1),
- checker.AckNum(uint32(iss)+1),
+ checker.TCPSeqNum(uint32(c.IRS)+1),
+ checker.TCPAckNum(uint32(iss)+1),
),
)
@@ -687,6 +706,7 @@ func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte)
c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
}
+ c.RcvdWindowScale = uint8(synOpts.WS)
c.Port = tcpHdr.SourcePort()
}
@@ -758,17 +778,18 @@ func (r *RawEndpoint) SendPacket(payload []byte, opts []byte) {
r.NextSeqNum = r.NextSeqNum.Add(seqnum.Size(len(payload)))
}
-// VerifyACKWithTS verifies that the tsEcr field in the ack matches the provided
-// tsVal.
-func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) {
+// VerifyAndReturnACKWithTS verifies that the tsEcr field int he ACK matches
+// the provided tsVal as well as returns the original packet.
+func (r *RawEndpoint) VerifyAndReturnACKWithTS(tsVal uint32) []byte {
+ r.C.t.Helper()
// Read ACK and verify that tsEcr of ACK packet is [1,2,3,4]
ackPacket := r.C.GetPacket()
checker.IPv4(r.C.t, ackPacket,
checker.TCP(
checker.DstPort(r.SrcPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(r.AckNum)),
- checker.AckNum(uint32(r.NextSeqNum)),
+ checker.TCPSeqNum(uint32(r.AckNum)),
+ checker.TCPAckNum(uint32(r.NextSeqNum)),
checker.TCPTimestampChecker(true, 0, tsVal),
),
)
@@ -776,19 +797,28 @@ func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) {
tcpSeg := header.TCP(header.IPv4(ackPacket).Payload())
opts := tcpSeg.ParsedOptions()
r.RecentTS = opts.TSVal
+ return ackPacket
+}
+
+// VerifyACKWithTS verifies that the tsEcr field in the ack matches the provided
+// tsVal.
+func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) {
+ r.C.t.Helper()
+ _ = r.VerifyAndReturnACKWithTS(tsVal)
}
// VerifyACKRcvWnd verifies that the window advertised by the incoming ACK
// matches the provided rcvWnd.
func (r *RawEndpoint) VerifyACKRcvWnd(rcvWnd uint16) {
+ r.C.t.Helper()
ackPacket := r.C.GetPacket()
checker.IPv4(r.C.t, ackPacket,
checker.TCP(
checker.DstPort(r.SrcPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(r.AckNum)),
- checker.AckNum(uint32(r.NextSeqNum)),
- checker.Window(rcvWnd),
+ checker.TCPSeqNum(uint32(r.AckNum)),
+ checker.TCPAckNum(uint32(r.NextSeqNum)),
+ checker.TCPWindow(rcvWnd),
),
)
}
@@ -807,8 +837,8 @@ func (r *RawEndpoint) VerifyACKHasSACK(sackBlocks []header.SACKBlock) {
checker.TCP(
checker.DstPort(r.SrcPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(r.AckNum)),
- checker.AckNum(uint32(r.NextSeqNum)),
+ checker.TCPSeqNum(uint32(r.AckNum)),
+ checker.TCPAckNum(uint32(r.NextSeqNum)),
checker.TCPSACKBlockChecker(sackBlocks),
),
)
@@ -900,8 +930,8 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
tcpCheckers := []checker.TransportChecker{
checker.DstPort(TestPort),
checker.TCPFlags(header.TCPFlagAck),
- checker.SeqNum(uint32(c.IRS) + 1),
- checker.AckNum(uint32(iss) + 1),
+ checker.TCPSeqNum(uint32(c.IRS) + 1),
+ checker.TCPAckNum(uint32(iss) + 1),
}
// Verify that tsEcr of ACK packet is wantOptions.TSVal if the
@@ -936,7 +966,7 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
// Mark in context that timestamp option is enabled for this endpoint.
c.TimeStampEnabled = true
-
+ c.RcvdWindowScale = uint8(synOptions.WS)
return &RawEndpoint{
C: c,
SrcPort: tcpSeg.DestinationPort(),
@@ -1029,6 +1059,7 @@ func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCP
// value of the window scaling option to be sent in the SYN. If synOptions.WS >
// 0 then we send the WindowScale option.
func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint {
+ c.t.Helper()
opts := make([]byte, header.TCPOptionsMaximumSize)
offset := 0
offset += header.EncodeMSSOption(uint32(maxPayload), opts)
@@ -1067,13 +1098,14 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions
// are present.
b := c.GetPacket()
tcp := header.TCP(header.IPv4(b).Payload())
+ rcvdSynOptions := header.ParseSynOptions(tcp.Options(), true /* isAck */)
c.IRS = seqnum.Value(tcp.SequenceNumber())
tcpCheckers := []checker.TransportChecker{
checker.SrcPort(StackPort),
checker.DstPort(TestPort),
checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
- checker.AckNum(uint32(iss) + 1),
+ checker.TCPAckNum(uint32(iss) + 1),
checker.TCPSynOptions(header.TCPSynOptions{MSS: synOptions.MSS, WS: wndScale, SACKPermitted: synOptions.SACKPermitted && c.SACKEnabled()}),
}
@@ -1116,6 +1148,7 @@ func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions
// Send ACK.
c.SendPacket(nil, ackHeaders)
+ c.RcvdWindowScale = uint8(rcvdSynOptions.WS)
c.Port = StackPort
return &RawEndpoint{
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 64a5fc696..6813d57aa 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -1516,18 +1516,6 @@ func TestSetTTL(t *testing.T) {
c.t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err)
}
- var p stack.NetworkProtocol
- if flow.isV4() {
- p = ipv4.NewProtocol()
- } else {
- p = ipv6.NewProtocol()
- }
- ep := p.NewEndpoint(0, nil, nil, nil, nil, stack.New(stack.Options{
- NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
- TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
- }))
- ep.Close()
-
testWrite(c, flow, checker.TTL(wantTTL))
})
}