-rw-r--r--  Makefile | 16
-rwxr-xr-x  images/jekyll/build.sh | 3
-rw-r--r--  pkg/sentry/fsimpl/fuse/dev.go | 7
-rw-r--r--  pkg/sentry/fsimpl/fuse/fusefs.go | 1
-rw-r--r--  pkg/sentry/fsimpl/gofer/filesystem.go | 2
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer.go | 29
-rw-r--r--  pkg/sentry/fsimpl/gofer/regular_file.go | 27
-rw-r--r--  pkg/sentry/fsimpl/gofer/special_file.go | 11
-rw-r--r--  pkg/sentry/fsimpl/host/host.go | 7
-rw-r--r--  pkg/sentry/fsimpl/kernfs/filesystem.go | 10
-rw-r--r--  pkg/sentry/fsimpl/overlay/non_directory.go | 10
-rw-r--r--  pkg/sentry/fsimpl/verity/filesystem.go | 6
-rw-r--r--  pkg/sentry/fsimpl/verity/verity.go | 7
-rw-r--r--  pkg/sentry/kernel/pipe/vfs.go | 5
-rw-r--r--  pkg/sentry/socket/hostinet/socket_vfs2.go | 5
-rw-r--r--  pkg/sentry/socket/netfilter/netfilter.go | 4
-rw-r--r--  pkg/sentry/socket/netfilter/tcp_matcher.go | 32
-rw-r--r--  pkg/sentry/socket/netfilter/udp_matcher.go | 32
-rw-r--r--  pkg/sentry/socket/netstack/netstack.go | 12
-rw-r--r--  pkg/sentry/socket/unix/transport/unix.go | 18
-rw-r--r--  pkg/sentry/syscalls/linux/sys_sched.go | 2
-rw-r--r--  pkg/sentry/vfs/file_description.go | 3
-rw-r--r--  pkg/sentry/vfs/file_description_impl_util.go | 6
-rw-r--r--  pkg/tcpip/header/parse/BUILD | 15
-rw-r--r--  pkg/tcpip/header/parse/parse.go | 166
-rw-r--r--  pkg/tcpip/link/sniffer/BUILD | 1
-rw-r--r--  pkg/tcpip/link/sniffer/sniffer.go | 60
-rw-r--r--  pkg/tcpip/network/arp/BUILD | 1
-rw-r--r--  pkg/tcpip/network/arp/arp.go | 7
-rw-r--r--  pkg/tcpip/network/ipv4/BUILD | 1
-rw-r--r--  pkg/tcpip/network/ipv4/ipv4.go | 46
-rw-r--r--  pkg/tcpip/network/ipv4/ipv4_test.go | 208
-rw-r--r--  pkg/tcpip/network/ipv6/BUILD | 1
-rw-r--r--  pkg/tcpip/network/ipv6/ipv6.go | 142
-rw-r--r--  pkg/tcpip/network/ipv6/ipv6_test.go | 208
-rw-r--r--  pkg/tcpip/stack/iptables.go | 120
-rw-r--r--  pkg/tcpip/stack/iptables_types.go | 65
-rw-r--r--  pkg/tcpip/stack/nic.go | 3
-rw-r--r--  pkg/tcpip/stack/packet_buffer.go | 2
-rw-r--r--  pkg/tcpip/transport/icmp/endpoint.go | 22
-rw-r--r--  pkg/tcpip/transport/packet/endpoint.go | 23
-rw-r--r--  pkg/tcpip/transport/raw/endpoint.go | 23
-rw-r--r--  pkg/tcpip/transport/tcp/BUILD | 1
-rw-r--r--  pkg/tcpip/transport/tcp/endpoint.go | 19
-rw-r--r--  pkg/tcpip/transport/tcp/protocol.go | 18
-rw-r--r--  pkg/tcpip/transport/udp/BUILD | 1
-rw-r--r--  pkg/tcpip/transport/udp/endpoint.go | 13
-rw-r--r--  pkg/tcpip/transport/udp/protocol.go | 4
-rw-r--r--  pkg/test/testutil/testutil.go | 4
-rw-r--r--  runsc/boot/BUILD | 1
-rw-r--r--  runsc/boot/filter/config_amd64.go | 1
-rw-r--r--  runsc/boot/vfs.go | 184
-rw-r--r--  runsc/cmd/gofer.go | 15
-rw-r--r--  runsc/container/container.go | 3
-rw-r--r--  runsc/container/container_test.go | 18
-rw-r--r--  runsc/fsgofer/filter/config_amd64.go | 1
-rw-r--r--  runsc/fsgofer/fsgofer.go | 3
-rw-r--r--  runsc/fsgofer/fsgofer_test.go | 23
-rw-r--r--  test/e2e/integration_test.go | 44
-rw-r--r--  test/fuse/linux/rmdir_test.cc | 27
-rw-r--r--  test/fuse/linux/unlink_test.cc | 24
-rw-r--r--  test/image/image_test.go | 64
-rw-r--r--  test/iptables/iptables_test.go | 52
-rw-r--r--  test/packetimpact/tests/BUILD | 10
-rw-r--r--  test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go | 133
-rw-r--r--  test/runtimes/exclude/nodejs12.4.0.csv | 1
-rw-r--r--  test/syscalls/linux/fallocate.cc | 6
-rw-r--r--  test/syscalls/linux/packet_socket_raw.cc | 21
-rw-r--r--  test/syscalls/linux/raw_socket_icmp.cc | 22
-rw-r--r--  test/syscalls/linux/socket_ip_udp_generic.cc | 12
-rw-r--r--  test/syscalls/linux/socket_unix_stream.cc | 18
-rw-r--r--  test/syscalls/linux/vdso_clock_gettime.cc | 64
-rw-r--r--  tools/bazel.mk | 4
-rw-r--r--  website/_config.yml | 3
-rw-r--r--  website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png | bin 0 -> 48602 bytes
-rw-r--r--  website/blog/2019-11-18-security-basics.md | 4
-rw-r--r--  website/blog/2020-09-18-containing-a-real-vulnerability.md | 224
-rw-r--r--  website/blog/BUILD | 10
78 files changed, 1893 insertions, 528 deletions
diff --git a/Makefile b/Makefile
index 4f18501a5..d0d1840f6 100644
--- a/Makefile
+++ b/Makefile
@@ -129,10 +129,9 @@ tests: unit-tests
@$(call submake,test TARGETS="test/syscalls/...")
.PHONY: tests
-
integration-tests: ## Run all standard integration tests.
integration-tests: docker-tests overlay-tests hostnet-tests swgso-tests
-integration-tests: do-tests kvm-tests root-tests containerd-tests
+integration-tests: do-tests kvm-tests containerd-test-1.3.4
.PHONY: integration-tests
network-tests: ## Run all networking integration tests.
@@ -186,6 +185,7 @@ swgso-tests: load-basic-images
@$(call submake,install-test-runtime RUNTIME="swgso" ARGS="--software-gso=true --gso=false")
@$(call submake,test-runtime RUNTIME="swgso" TARGETS="$(INTEGRATION_TARGETS)")
.PHONY: swgso-tests
+
hostnet-tests: load-basic-images
@$(call submake,install-test-runtime RUNTIME="hostnet" ARGS="--network=host")
@$(call submake,test-runtime RUNTIME="hostnet" OPTIONS="--test_arg=-checkpoint=false" TARGETS="$(INTEGRATION_TARGETS)")
@@ -200,6 +200,8 @@ kvm-tests: load-basic-images
.PHONY: kvm-tests
iptables-tests: load-iptables
+ @sudo modprobe iptable_filter
+ @sudo modprobe ip6table_filter
@$(call submake,test-runtime RUNTIME="runc" TARGETS="//test/iptables:iptables_test")
@$(call submake,install-test-runtime RUNTIME="iptables" ARGS="--net-raw")
@$(call submake,test-runtime RUNTIME="iptables" TARGETS="//test/iptables:iptables_test")
@@ -217,16 +219,12 @@ packetimpact-tests: load-packetimpact
@$(call submake,test-runtime OPTIONS="--jobs=HOST_CPUS*3 --local_test_jobs=HOST_CPUS*3" RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')")
.PHONY: packetimpact-tests
-root-tests: load-basic-images
- @$(call submake,install-test-runtime)
- @$(call submake,sudo TARGETS="//test/root:root_test" ARGS="-test.v")
-.PHONY: root-tests
-
# Specific containerd version tests.
-containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd install-test-runtime
+containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd load-basic_ubuntu
+ @$(call submake,install-test-runtime RUNTIME="root")
@CONTAINERD_VERSION=$* $(MAKE) sudo TARGETS="tools/installers:containerd"
@$(MAKE) sudo TARGETS="tools/installers:shim"
- @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="-test.v"
+ @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="--runtime=root -test.v"
# Note that we can't run containerd-test-1.1.8 tests here.
#
diff --git a/images/jekyll/build.sh b/images/jekyll/build.sh
index bfceb2781..010972ea6 100755
--- a/images/jekyll/build.sh
+++ b/images/jekyll/build.sh
@@ -18,4 +18,5 @@ set -euxo pipefail
# Generate the syntax highlighting css file.
/usr/gem/bin/rougify style github >/input/_sass/syntax.css
-/usr/gem/bin/jekyll build -t -s /input -d /output
+# Build website including pages irrespective of date.
+/usr/gem/bin/jekyll build --future -t -s /input -d /output
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index 5539466ff..f690ef5ad 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -95,9 +95,14 @@ type DeviceFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DeviceFD) Release(context.Context) {
+func (fd *DeviceFD) Release(ctx context.Context) {
if fd.fs != nil {
+ fd.fs.conn.mu.Lock()
fd.fs.conn.connected = false
+ fd.fs.conn.mu.Unlock()
+
+ fd.fs.VFSFilesystem().DecRef(ctx)
+ fd.fs = nil
}
}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index a768b1a80..b3573f80d 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -218,6 +218,7 @@ func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
conn: conn,
}
+ fs.VFSFilesystem().IncRef()
fuseFD.fs = fs
return fs, nil
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 03ce51df7..16abc59dc 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1026,7 +1026,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
// step is required even if !d.cachedMetadataAuthoritative() because
// d.mappings has to be updated.
// d.metadataMu has already been acquired if trunc == true.
- d.updateFileSizeLocked(0)
+ d.updateSizeLocked(0)
if d.cachedMetadataAuthoritative() {
d.touchCMtimeLocked()
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 0e21c31a4..aaad9c0d9 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -833,7 +833,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
}
if mask.Size {
- d.updateFileSizeLocked(attr.Size)
+ d.updateSizeLocked(attr.Size)
}
}
@@ -987,7 +987,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
// d.size should be kept up to date, and privatized
// copy-on-write mappings of truncated pages need to be
// invalidated, even if InteropModeShared is in effect.
- d.updateFileSizeLocked(stat.Size)
+ d.updateSizeLocked(stat.Size)
}
}
if d.fs.opts.interop == InteropModeShared {
@@ -1024,8 +1024,31 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
return nil
}
+// doAllocate performs an allocate operation on d. Note that d.metadataMu will
+// be held when allocate is called.
+func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+
+ // Allocating a smaller size is a noop.
+ size := offset + length
+ if d.cachedMetadataAuthoritative() && size <= d.size {
+ return nil
+ }
+
+ err := allocate()
+ if err != nil {
+ return err
+ }
+ d.updateSizeLocked(size)
+ if d.cachedMetadataAuthoritative() {
+ d.touchCMtimeLocked()
+ }
+ return nil
+}
+
// Preconditions: d.metadataMu must be locked.
-func (d *dentry) updateFileSizeLocked(newSize uint64) {
+func (d *dentry) updateSizeLocked(newSize uint64) {
d.dataMu.Lock()
oldSize := d.size
atomic.StoreUint64(&d.size, newSize)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index a2e9342d5..24f03ee94 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -79,28 +79,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
d := fd.dentry()
- d.metadataMu.Lock()
- defer d.metadataMu.Unlock()
-
- // Allocating a smaller size is a noop.
- size := offset + length
- if d.cachedMetadataAuthoritative() && size <= d.size {
- return nil
- }
-
- d.handleMu.RLock()
- err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
- d.handleMu.RUnlock()
- if err != nil {
- return err
- }
- d.dataMu.Lock()
- atomic.StoreUint64(&d.size, size)
- d.dataMu.Unlock()
- if d.cachedMetadataAuthoritative() {
- d.touchCMtimeLocked()
- }
- return nil
+ return d.doAllocate(ctx, offset, length, func() error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
}
// PRead implements vfs.FileDescriptionImpl.PRead.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 3c39aa9b7..dc960e5bf 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -135,6 +136,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
fd.fileDescription.EventUnregister(e)
}
+func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if fd.isRegularFile {
+ d := fd.dentry()
+ return d.doAllocate(ctx, offset, length, func() error {
+ return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ })
+ }
+ return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
if fd.seekable && offset < 0 {
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index afd57e015..db8536f26 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -560,12 +560,7 @@ func (f *fileDescription) Release(context.Context) {
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
- if !f.inode.seekable {
- return syserror.ESPIPE
- }
-
- // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds.
- return syserror.EOPNOTSUPP
+ return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
}
// PRead implements vfs.FileDescriptionImpl.PRead.
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 4a3dee631..89ed265dc 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -658,9 +658,6 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
fs.mu.Lock()
defer fs.mu.Unlock()
- // Store the name before walkExistingLocked as rp will be advanced past the
- // name in the following call.
- name := rp.Component()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
@@ -691,7 +688,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
- if err := parentDentry.inode.RmDir(ctx, name, vfsd); err != nil {
+ if err := parentDentry.inode.RmDir(ctx, d.name, vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
@@ -771,9 +768,6 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
fs.mu.Lock()
defer fs.mu.Unlock()
- // Store the name before walkExistingLocked as rp will be advanced past the
- // name in the following call.
- name := rp.Component()
vfsd, _, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked(ctx)
if err != nil {
@@ -799,7 +793,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
- if err := parentDentry.inode.Unlink(ctx, name, vfsd); err != nil {
+ if err := parentDentry.inode.Unlink(ctx, d.name, vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
index 74cfd3799..6e04705c7 100644
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -147,6 +147,16 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux
return stat, nil
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ wrappedFD, err := fd.getCurrentFD(ctx)
+ if err != nil {
+ return err
+ }
+ defer wrappedFD.DecRef(ctx)
+ return wrappedFD.Allocate(ctx, mode, offset, length)
+}
+
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
d := fd.dentry()
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index a1fe906ed..26b117ca4 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -185,8 +185,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
Start: child.lowerMerkleVD,
}, &vfs.GetXattrOptions{
Name: merkleOffsetInParentXattr,
- // Offset is a 32 bit integer.
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
@@ -227,7 +226,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// the size of all its children's root hashes.
dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
Name: merkleSizeXattr,
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
@@ -372,6 +371,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
Path: fspath.Parse(childMerkleFilename),
}, &vfs.OpenOptions{
Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
})
if err != nil {
return nil, err
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 3e0bcd02b..9182df317 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -57,8 +57,9 @@ const merkleOffsetInParentXattr = "user.merkle.offset"
// whole file. For a directory, it's the size of all its children's root hashes.
const merkleSizeXattr = "user.merkle.size"
-// sizeOfInt32 is the size in bytes for a 32 bit integer in extended attributes.
-const sizeOfInt32 = 4
+// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
+// extended attributes. The maximum value of a 32 bit integer is 10 digits.
+const sizeOfStringInt32 = 10
// noCrashOnVerificationFailure indicates whether the sandbox should panic
// whenever verification fails. If true, an error is returned instead of
@@ -636,7 +637,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
// dataSize is the size of the whole file.
dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
Name: merkleSizeXattr,
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
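Aside on the constant above: sizeOfStringInt32 is 10 because the widest 32 bit value, 4294967295, takes 10 decimal digits when stored as a string. A standalone Go check (illustrative only, not part of this change):

package main

import (
	"fmt"
	"math"
	"strconv"
)

func main() {
	// The largest 32 bit value rendered in decimal is "4294967295" (10 bytes),
	// so a 10-byte xattr buffer can hold any 32 bit size stored as a string.
	fmt.Println(len(strconv.FormatUint(math.MaxUint32, 10))) // 10
}
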
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index f223d59e1..f61039f5b 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -67,6 +67,11 @@ func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlag
return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
}
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
+ return syserror.ESPIPE
+}
+
// Open opens the pipe represented by vp.
func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
vp.mu.Lock()
diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go
index 8a1d52ebf..97bc6027f 100644
--- a/pkg/sentry/socket/hostinet/socket_vfs2.go
+++ b/pkg/sentry/socket/hostinet/socket_vfs2.go
@@ -97,11 +97,6 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal
return ioctl(ctx, s.fd, uio, args)
}
-// Allocate implements vfs.FileDescriptionImpl.Allocate.
-func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error {
- return syserror.ENODEV
-}
-
// PRead implements vfs.FileDescriptionImpl.PRead.
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
return 0, syserror.ESPIPE
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3e1735079..871ea80ee 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -146,6 +146,10 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
case stack.FilterTable:
table = stack.EmptyFilterTable()
case stack.NATTable:
+ if ipv6 {
+ nflog("IPv6 redirection not yet supported (gvisor.dev/issue/3549)")
+ return syserr.ErrInvalidArgument
+ }
table = stack.EmptyNATTable()
default:
nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 0bfd6c1f4..844acfede 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -97,17 +97,33 @@ func (*TCPMatcher) Name() string {
// Match implements Matcher.Match.
func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
+ // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+ // into the stack.Check codepath as matchers are added.
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
+ }
- if netHeader.TransportProtocol() != header.TCPProtocolNumber {
- return false, false
- }
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
+ if frag == 1 {
+ return false, true
+ }
+ return false, false
+ }
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
}
+
+ default:
+ // We don't know the network protocol.
return false, false
}
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 7ed05461d..63201201c 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -94,19 +94,33 @@ func (*UDPMatcher) Name() string {
// Match implements Matcher.Match.
func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
-
// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
// into the stack.Check codepath as matchers are added.
- if netHeader.TransportProtocol() != header.UDPProtocolNumber {
- return false, false
- }
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
+ }
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
+ if frag == 1 {
+ return false, true
+ }
+ return false, false
}
+
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
+ }
+
+ default:
+ // We don't know the network protocol.
return false, false
}
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index b462924af..816c89cfa 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -1185,7 +1185,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
var v tcpip.LingerOption
var linger linux.Linger
if err := ep.GetSockOpt(&v); err != nil {
- return &linger, nil
+ return nil, syserr.TranslateNetstackError(err)
}
if v.Enabled {
@@ -1768,10 +1768,16 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int
case linux.SOL_IP:
return setSockOptIP(t, s, ep, name, optVal)
+ case linux.SOL_PACKET:
+ // gVisor doesn't support any SOL_PACKET options just return not
+ // supported. Returning nil here will result in tcpdump thinking AF_PACKET
+ // features are supported and proceed to use them and break.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return syserr.ErrProtocolNotAvailable
+
case linux.SOL_UDP,
linux.SOL_ICMPV6,
- linux.SOL_RAW,
- linux.SOL_PACKET:
+ linux.SOL_RAW:
t.Kernel().EmitUnimplementedEvent(t)
}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 08504560c..d6fc03520 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -746,6 +746,9 @@ type baseEndpoint struct {
// path is not empty if the endpoint has been bound,
// or may be used if the endpoint is connected.
path string
+
+ // linger is used for SO_LINGER socket option.
+ linger tcpip.LingerOption
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -841,8 +844,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
return n, err
}
-// SetSockOpt sets a socket option. Currently not supported.
-func (e *baseEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error {
+// SetSockOpt sets a socket option.
+func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+ switch v := opt.(type) {
+ case *tcpip.LingerOption:
+ e.Lock()
+ e.linger = *v
+ e.Unlock()
+ }
return nil
}
@@ -945,8 +954,11 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
- switch opt.(type) {
+ switch o := opt.(type) {
case *tcpip.LingerOption:
+ e.Lock()
+ *o = e.linger
+ e.Unlock()
return nil
default:
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
index cd6f4dd94..bfcf44b6f 100644
--- a/pkg/sentry/syscalls/linux/sys_sched.go
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -30,7 +30,7 @@ const (
//
// +marshal
type SchedParam struct {
- schedPriority int64
+ schedPriority int32
}
// SchedGetparam implements linux syscall sched_getparam(2).
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 2b29a3c3f..73bb36d3e 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -326,6 +326,9 @@ type FileDescriptionImpl interface {
// Allocate grows the file to offset + length bytes.
// Only mode == 0 is supported currently.
//
+ // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on
+ // other files where it is not supported.
+ //
// Preconditions: The FileDescription was opened for writing.
Allocate(ctx context.Context, mode, offset, length uint64) error
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 68b80a951..78da16bac 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -57,7 +57,11 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err
}
// Allocate implements FileDescriptionImpl.Allocate analogously to
-// fallocate called on regular file, directory or FIFO in Linux.
+// fallocate called on an invalid type of file in Linux.
+//
+// Note that directories can rely on this implementation even though they
+// should technically return EISDIR. Allocate should never be called for a
+// directory, because it requires a writable fd.
func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
return syserror.ENODEV
}
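The comment above leans on the fact that fallocate(2) needs a writable file descriptor and directories cannot be opened for writing. A minimal Go illustration of that Linux behavior (an assumption added for illustration, not part of this change; /tmp stands in for any directory):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// Opening a directory with O_WRONLY fails with EISDIR, so a writable fd
	// (which Allocate requires) can never refer to a directory.
	_, err := syscall.Open("/tmp", syscall.O_WRONLY, 0)
	fmt.Println(err == syscall.EISDIR) // true on Linux
}
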
diff --git a/pkg/tcpip/header/parse/BUILD b/pkg/tcpip/header/parse/BUILD
new file mode 100644
index 000000000..2adee9288
--- /dev/null
+++ b/pkg/tcpip/header/parse/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "parse",
+ srcs = ["parse.go"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/header/parse/parse.go b/pkg/tcpip/header/parse/parse.go
new file mode 100644
index 000000000..522135557
--- /dev/null
+++ b/pkg/tcpip/header/parse/parse.go
@@ -0,0 +1,166 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package parse provides utilities to parse packets.
+package parse
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// ARP populates pkt's network header with an ARP header found in
+// pkt.Data.
+//
+// Returns true if the header was successfully parsed.
+func ARP(pkt *stack.PacketBuffer) bool {
+ _, ok := pkt.NetworkHeader().Consume(header.ARPSize)
+ if ok {
+ pkt.NetworkProtocolNumber = header.ARPProtocolNumber
+ }
+ return ok
+}
+
+// IPv4 parses an IPv4 packet found in pkt.Data and populates pkt's network
+// header with the IPv4 header.
+//
+// Returns true if the header was successfully parsed.
+func IPv4(pkt *stack.PacketBuffer) bool {
+ hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+ if !ok {
+ return false
+ }
+ ipHdr := header.IPv4(hdr)
+
+ // Header may have options, determine the true header length.
+ headerLen := int(ipHdr.HeaderLength())
+ if headerLen < header.IPv4MinimumSize {
+ // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in
+ // order for the packet to be valid. Figure out if we want to reject this
+ // case.
+ headerLen = header.IPv4MinimumSize
+ }
+ hdr, ok = pkt.NetworkHeader().Consume(headerLen)
+ if !ok {
+ return false
+ }
+ ipHdr = header.IPv4(hdr)
+
+ pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
+ pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr))
+ return true
+}
+
+// IPv6 parses an IPv6 packet found in pkt.Data and populates pkt's network
+// header with the IPv6 header.
+func IPv6(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, fragID uint32, fragOffset uint16, fragMore bool, ok bool) {
+ hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+ if !ok {
+ return 0, 0, 0, false, false
+ }
+ ipHdr := header.IPv6(hdr)
+
+ // dataClone consists of:
+ // - Any IPv6 header bytes after the first 40 (i.e. extensions).
+ // - The transport header, if present.
+ // - Any other payload data.
+ views := [8]buffer.View{}
+ dataClone := pkt.Data.Clone(views[:])
+ dataClone.TrimFront(header.IPv6MinimumSize)
+ it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
+
+ // Iterate over the IPv6 extensions to find their length.
+ var nextHdr tcpip.TransportProtocolNumber
+ var extensionsSize int
+
+traverseExtensions:
+ for {
+ extHdr, done, err := it.Next()
+ if err != nil {
+ break
+ }
+
+ // If we exhaust the extension list, the entire packet is the IPv6 header
+ // and (possibly) extensions.
+ if done {
+ extensionsSize = dataClone.Size()
+ break
+ }
+
+ switch extHdr := extHdr.(type) {
+ case header.IPv6FragmentExtHdr:
+ if fragID == 0 && fragOffset == 0 && !fragMore {
+ fragID = extHdr.ID()
+ fragOffset = extHdr.FragmentOffset()
+ fragMore = extHdr.More()
+ }
+
+ case header.IPv6RawPayloadHeader:
+ // We've found the payload after any extensions.
+ extensionsSize = dataClone.Size() - extHdr.Buf.Size()
+ nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
+ break traverseExtensions
+
+ default:
+ // Any other extension is a no-op, keep looping until we find the payload.
+ }
+ }
+
+ // Put the IPv6 header with extensions in pkt.NetworkHeader().
+ hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize)
+ if !ok {
+ panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
+ }
+ ipHdr = header.IPv6(hdr)
+ pkt.Data.CapLength(int(ipHdr.PayloadLength()))
+ pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
+
+ return nextHdr, fragID, fragOffset, fragMore, true
+}
+
+// UDP parses a UDP packet found in pkt.Data and populates pkt's transport
+// header with the UDP header.
+//
+// Returns true if the header was successfully parsed.
+func UDP(pkt *stack.PacketBuffer) bool {
+ _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize)
+ return ok
+}
+
+// TCP parses a TCP packet found in pkt.Data and populates pkt's transport
+// header with the TCP header.
+//
+// Returns true if the header was successfully parsed.
+func TCP(pkt *stack.PacketBuffer) bool {
+ // TCP header is variable length, peek at it first.
+ hdrLen := header.TCPMinimumSize
+ hdr, ok := pkt.Data.PullUp(hdrLen)
+ if !ok {
+ return false
+ }
+
+ // If the header has options, pull those up as well.
+ if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() {
+ // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of
+ // packets.
+ hdrLen = offset
+ }
+
+ _, ok = pkt.TransportHeader().Consume(hdrLen)
+ return ok
+}
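For orientation, a rough sketch of how a caller can drive the new parse helpers on a raw IPv4 frame. The logIPv4 helper and its raw argument are assumptions made for illustration; the parse, header, and stack calls mirror the ones used elsewhere in this change:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// logIPv4 parses a raw IPv4 frame into a PacketBuffer and, if the payload is
// UDP, prints the destination port.
func logIPv4(raw []byte) {
	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		Data: buffer.View(raw).ToVectorisedView(),
	})
	if !parse.IPv4(pkt) {
		return // too short to contain an IPv4 header
	}
	ipHdr := header.IPv4(pkt.NetworkHeader().View())
	if ipHdr.TransportProtocol() == header.UDPProtocolNumber && parse.UDP(pkt) {
		udpHdr := header.UDP(pkt.TransportHeader().View())
		fmt.Println(udpHdr.DestinationPort())
	}
}

func main() {
	logIPv4(nil) // with no bytes, parse.IPv4 fails and nothing is logged
}
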
diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD
index 7cbc305e7..4aac12a8c 100644
--- a/pkg/tcpip/link/sniffer/BUILD
+++ b/pkg/tcpip/link/sniffer/BUILD
@@ -14,6 +14,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
+ "//pkg/tcpip/header/parse",
"//pkg/tcpip/link/nested",
"//pkg/tcpip/stack",
],
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index 4fb127978..560477926 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -31,6 +31,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/link/nested"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -195,49 +196,52 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
var transProto uint8
src := tcpip.Address("unknown")
dst := tcpip.Address("unknown")
- id := 0
- size := uint16(0)
+ var size uint16
+ var id uint32
var fragmentOffset uint16
var moreFragments bool
- // Examine the packet using a new VV. Backing storage must not be written.
- vv := buffer.NewVectorisedView(pkt.Size(), pkt.Views())
-
+ // Clone the packet buffer to not modify the original.
+ //
+ // We don't clone the original packet buffer so that the new packet buffer
+ // does not have any of its headers set.
+ pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views())})
switch protocol {
case header.IPv4ProtocolNumber:
- hdr, ok := vv.PullUp(header.IPv4MinimumSize)
- if !ok {
+ if ok := parse.IPv4(pkt); !ok {
return
}
- ipv4 := header.IPv4(hdr)
+
+ ipv4 := header.IPv4(pkt.NetworkHeader().View())
fragmentOffset = ipv4.FragmentOffset()
moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
src = ipv4.SourceAddress()
dst = ipv4.DestinationAddress()
transProto = ipv4.Protocol()
size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
- vv.TrimFront(int(ipv4.HeaderLength()))
- id = int(ipv4.ID())
+ id = uint32(ipv4.ID())
case header.IPv6ProtocolNumber:
- hdr, ok := vv.PullUp(header.IPv6MinimumSize)
+ proto, fragID, fragOffset, fragMore, ok := parse.IPv6(pkt)
if !ok {
return
}
- ipv6 := header.IPv6(hdr)
+
+ ipv6 := header.IPv6(pkt.NetworkHeader().View())
src = ipv6.SourceAddress()
dst = ipv6.DestinationAddress()
- transProto = ipv6.NextHeader()
+ transProto = uint8(proto)
size = ipv6.PayloadLength()
- vv.TrimFront(header.IPv6MinimumSize)
+ id = fragID
+ moreFragments = fragMore
+ fragmentOffset = fragOffset
case header.ARPProtocolNumber:
- hdr, ok := vv.PullUp(header.ARPSize)
- if !ok {
+ if parse.ARP(pkt) {
return
}
- vv.TrimFront(header.ARPSize)
- arp := header.ARP(hdr)
+
+ arp := header.ARP(pkt.NetworkHeader().View())
log.Infof(
"%s arp %s (%s) -> %s (%s) valid:%t",
prefix,
@@ -259,7 +263,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
switch tcpip.TransportProtocolNumber(transProto) {
case header.ICMPv4ProtocolNumber:
transName = "icmp"
- hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
+ hdr, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
if !ok {
break
}
@@ -296,7 +300,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
case header.ICMPv6ProtocolNumber:
transName = "icmp"
- hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
+ hdr, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize)
if !ok {
break
}
@@ -331,11 +335,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
case header.UDPProtocolNumber:
transName = "udp"
- hdr, ok := vv.PullUp(header.UDPMinimumSize)
- if !ok {
+ if ok := parse.UDP(pkt); !ok {
break
}
- udp := header.UDP(hdr)
+
+ udp := header.UDP(pkt.TransportHeader().View())
if fragmentOffset == 0 {
srcPort = udp.SourcePort()
dstPort = udp.DestinationPort()
@@ -345,19 +349,19 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P
case header.TCPProtocolNumber:
transName = "tcp"
- hdr, ok := vv.PullUp(header.TCPMinimumSize)
- if !ok {
+ if ok := parse.TCP(pkt); !ok {
break
}
- tcp := header.TCP(hdr)
+
+ tcp := header.TCP(pkt.TransportHeader().View())
if fragmentOffset == 0 {
offset := int(tcp.DataOffset())
if offset < header.TCPMinimumSize {
details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
break
}
- if offset > vv.Size() && !moreFragments {
- details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size())
+ if size := pkt.Data.Size() + len(tcp); offset > size && !moreFragments {
+ details += fmt.Sprintf("invalid packet: tcp data offset %d larger than tcp packet length %d", offset, size)
break
}
diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
index 82c073e32..b40dde96b 100644
--- a/pkg/tcpip/network/arp/BUILD
+++ b/pkg/tcpip/network/arp/BUILD
@@ -10,6 +10,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
+ "//pkg/tcpip/header/parse",
"//pkg/tcpip/stack",
],
)
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index 7aaee08c4..cb9225bd7 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -29,6 +29,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -234,11 +235,7 @@ func (*protocol) Wait() {}
// Parse implements stack.NetworkProtocol.Parse.
func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
- _, ok = pkt.NetworkHeader().Consume(header.ARPSize)
- if !ok {
- return 0, false, false
- }
- return 0, false, true
+ return 0, false, parse.ARP(pkt)
}
// NewProtocol returns an ARP network protocol.
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
index c82593e71..f9c2aa980 100644
--- a/pkg/tcpip/network/ipv4/BUILD
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -13,6 +13,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
+ "//pkg/tcpip/header/parse",
"//pkg/tcpip/network/fragmentation",
"//pkg/tcpip/network/hash",
"//pkg/tcpip/stack",
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index f4394749d..59c3101b5 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
"gvisor.dev/gvisor/pkg/tcpip/network/hash"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -238,11 +239,13 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
return nil
}
- // If the packet is manipulated as per NAT Ouput rules, handle packet
- // based on destination address and do not send the packet to link layer.
- // TODO(gvisor.dev/issue/170): We should do this for every packet, rather than
- // only NATted packets, but removing this check short circuits broadcasts
- // before they are sent out to other hosts.
+ // If the packet is manipulated as per NAT Output rules, handle packet
+ // based on destination address and do not send the packet to link
+ // layer.
+ //
+ // TODO(gvisor.dev/issue/170): We should do this for every
+ // packet, rather than only NATted packets, but removing this check
+ // short circuits broadcasts before they are sent out to other hosts.
if pkt.NatDone {
netHeader := header.IPv4(pkt.NetworkHeader().View())
ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress())
@@ -298,7 +301,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
return n, err
}
- // Slow Path as we are dropping some packets in the batch degrade to
+ // Slow path as we are dropping some packets in the batch degrade to
// emitting one packet at a time.
n := 0
for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
@@ -527,37 +530,14 @@ func (*protocol) Close() {}
// Wait implements stack.TransportProtocol.Wait.
func (*protocol) Wait() {}
-// Parse implements stack.TransportProtocol.Parse.
+// Parse implements stack.NetworkProtocol.Parse.
func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
- hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
- if !ok {
- return 0, false, false
- }
- ipHdr := header.IPv4(hdr)
-
- // Header may have options, determine the true header length.
- headerLen := int(ipHdr.HeaderLength())
- if headerLen < header.IPv4MinimumSize {
- // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in
- // order for the packet to be valid. Figure out if we want to reject this
- // case.
- headerLen = header.IPv4MinimumSize
- }
- hdr, ok = pkt.NetworkHeader().Consume(headerLen)
- if !ok {
+ if ok := parse.IPv4(pkt); !ok {
return 0, false, false
}
- ipHdr = header.IPv4(hdr)
-
- // If this is a fragment, don't bother parsing the transport header.
- parseTransportHeader := true
- if ipHdr.More() || ipHdr.FragmentOffset() != 0 {
- parseTransportHeader = false
- }
- pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
- pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr))
- return ipHdr.TransportProtocol(), parseTransportHeader, true
+ ipHdr := header.IPv4(pkt.NetworkHeader().View())
+ return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true
}
// calculateMTU calculates the network-layer payload MTU based on the link-layer
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
index 5e50558e8..c1a560914 100644
--- a/pkg/tcpip/network/ipv4/ipv4_test.go
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -1046,3 +1046,211 @@ func TestReceiveFragments(t *testing.T) {
})
}
}
+
+func TestWritePacketsStats(t *testing.T) {
+ const nPackets = 3
+ tests := []struct {
+ name string
+ setup func(*testing.T, *stack.Stack)
+ linkEP stack.LinkEndpoint
+ expectSent int
+ }{
+ {
+ name: "Accept all",
+ // No setup needed, tables accept everything by default.
+ setup: func(*testing.T, *stack.Stack) {},
+ linkEP: &limitedEP{nPackets},
+ expectSent: nPackets,
+ }, {
+ name: "Accept all with error",
+ // No setup needed, tables accept everything by default.
+ setup: func(*testing.T, *stack.Stack) {},
+ linkEP: &limitedEP{nPackets - 1},
+ expectSent: nPackets - 1,
+ }, {
+ name: "Drop all",
+ setup: func(t *testing.T, stk *stack.Stack) {
+ // Install Output DROP rule.
+ t.Helper()
+ ipt := stk.IPTables()
+ filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */)
+ if !ok {
+ t.Fatalf("failed to find filter table")
+ }
+ ruleIdx := filter.BuiltinChains[stack.Output]
+ filter.Rules[ruleIdx].Target = stack.DropTarget{}
+ if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil {
+ t.Fatalf("failed to replace table: %v", err)
+ }
+ },
+ linkEP: &limitedEP{nPackets},
+ expectSent: 0,
+ }, {
+ name: "Drop some",
+ setup: func(t *testing.T, stk *stack.Stack) {
+ // Install Output DROP rule that matches only 1
+ // of the 3 packets.
+ t.Helper()
+ ipt := stk.IPTables()
+ filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */)
+ if !ok {
+ t.Fatalf("failed to find filter table")
+ }
+ // We'll match and DROP the last packet.
+ ruleIdx := filter.BuiltinChains[stack.Output]
+ filter.Rules[ruleIdx].Target = stack.DropTarget{}
+ filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}}
+ // Make sure the next rule is ACCEPT.
+ filter.Rules[ruleIdx+1].Target = stack.AcceptTarget{}
+ if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil {
+ t.Fatalf("failed to replace table: %v", err)
+ }
+ },
+ linkEP: &limitedEP{nPackets},
+ expectSent: nPackets - 1,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ rt := buildRoute(t, nil, test.linkEP)
+
+ var pbl stack.PacketBufferList
+ for i := 0; i < nPackets; i++ {
+ pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+ ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()),
+ Data: buffer.NewView(1).ToVectorisedView(),
+ })
+ pkt.TransportHeader().Push(header.UDPMinimumSize)
+ pbl.PushBack(pkt)
+ }
+
+ test.setup(t, rt.Stack())
+
+ nWritten, err := rt.WritePackets(nil, pbl, stack.NetworkHeaderParams{})
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ got := int(rt.Stats().IP.PacketsSent.Value())
+ if got != test.expectSent {
+ t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent)
+ }
+ if got != nWritten {
+ t.Errorf("sent %d packets, WritePackets returned %d", got, nWritten)
+ }
+ })
+ }
+}
+
+func buildRoute(t *testing.T, packetCollectorErrors []*tcpip.Error, linkEP stack.LinkEndpoint) stack.Route {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+ })
+ s.CreateNIC(1, linkEP)
+ const (
+ src = "\x10\x00\x00\x01"
+ dst = "\x10\x00\x00\x02"
+ )
+ s.AddAddress(1, ipv4.ProtocolNumber, src)
+ {
+ subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask(header.IPv4Broadcast))
+ if err != nil {
+ t.Fatal(err)
+ }
+ s.SetRouteTable([]tcpip.Route{{
+ Destination: subnet,
+ NIC: 1,
+ }})
+ }
+ rt, err := s.FindRoute(0, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("s.FindRoute got %v, want %v", err, nil)
+ }
+ return rt
+}
+
+// limitedEP is a link endpoint that writes up to a certain number of packets
+// before returning errors.
+type limitedEP struct {
+ limit int
+}
+
+// MTU implements LinkEndpoint.MTU.
+func (*limitedEP) MTU() uint32 { return 0 }
+
+// Capabilities implements LinkEndpoint.Capabilities.
+func (*limitedEP) Capabilities() stack.LinkEndpointCapabilities { return 0 }
+
+// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength.
+func (*limitedEP) MaxHeaderLength() uint16 { return 0 }
+
+// LinkAddress implements LinkEndpoint.LinkAddress.
+func (*limitedEP) LinkAddress() tcpip.LinkAddress { return "" }
+
+// WritePacket implements LinkEndpoint.WritePacket.
+func (ep *limitedEP) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error {
+ if ep.limit == 0 {
+ return tcpip.ErrInvalidEndpointState
+ }
+ ep.limit--
+ return nil
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (ep *limitedEP) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ if ep.limit == 0 {
+ return 0, tcpip.ErrInvalidEndpointState
+ }
+ nWritten := ep.limit
+ if nWritten > pkts.Len() {
+ nWritten = pkts.Len()
+ }
+ ep.limit -= nWritten
+ return nWritten, nil
+}
+
+// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
+func (ep *limitedEP) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error {
+ if ep.limit == 0 {
+ return tcpip.ErrInvalidEndpointState
+ }
+ ep.limit--
+ return nil
+}
+
+// Attach implements LinkEndpoint.Attach.
+func (*limitedEP) Attach(_ stack.NetworkDispatcher) {}
+
+// IsAttached implements LinkEndpoint.IsAttached.
+func (*limitedEP) IsAttached() bool { return false }
+
+// Wait implements LinkEndpoint.Wait.
+func (*limitedEP) Wait() {}
+
+// ARPHardwareType implements LinkEndpoint.ARPHardwareType.
+func (*limitedEP) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther }
+
+// AddHeader implements LinkEndpoint.AddHeader.
+func (*limitedEP) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) {
+}
+
+// limitedMatcher is an iptables matcher that matches after a certain number of
+// packets are checked against it.
+type limitedMatcher struct {
+ limit int
+}
+
+// Name implements Matcher.Name.
+func (*limitedMatcher) Name() string {
+ return "limitedMatcher"
+}
+
+// Match implements Matcher.Match.
+func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) {
+ if lm.limit == 0 {
+ return true, false
+ }
+ lm.limit--
+ return false, false
+}
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
index bcc64994e..cd5fe3ea8 100644
--- a/pkg/tcpip/network/ipv6/BUILD
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -13,6 +13,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
+ "//pkg/tcpip/header/parse",
"//pkg/tcpip/network/fragmentation",
"//pkg/tcpip/stack",
],
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index e821a8bff..a4a4d6a21 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -27,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
@@ -107,6 +108,31 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
e.addIPHeader(r, pkt, params)
+ // iptables filtering. All packets that reach here are locally
+ // generated.
+ nicName := e.stack.FindNICNameFromID(e.NICID())
+ ipt := e.stack.IPTables()
+ if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok {
+ // iptables is telling us to drop the packet.
+ return nil
+ }
+
+ // If the packet is manipulated as per NAT Output rules, handle packet
+ // based on destination address and do not send the packet to link
+ // layer.
+ //
+ // TODO(gvisor.dev/issue/170): We should do this for every
+ // packet, rather than only NATted packets, but removing this check
+ // short circuits broadcasts before they are sent out to other hosts.
+ if pkt.NatDone {
+ netHeader := header.IPv6(pkt.NetworkHeader().View())
+ if ep, err := e.stack.FindNetworkEndpoint(header.IPv6ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+ route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress())
+ ep.HandlePacket(&route, pkt)
+ return nil
+ }
+ }
+
if r.Loop&stack.PacketLoop != 0 {
loopedR := r.MakeLoopedRoute()
@@ -138,9 +164,46 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe
e.addIPHeader(r, pb, params)
}
- n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+ // iptables filtering. All packets that reach here are locally
+ // generated.
+ nicName := e.stack.FindNICNameFromID(e.NICID())
+ ipt := e.stack.IPTables()
+ dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName)
+ if len(dropped) == 0 && len(natPkts) == 0 {
+ // Fast path: If no packets are to be dropped then we can just invoke the
+ // faster WritePackets API directly.
+ n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+ r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+ return n, err
+ }
+
+ // Slow path as we are dropping some packets in the batch degrade to
+ // emitting one packet at a time.
+ n := 0
+ for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+ if _, ok := dropped[pkt]; ok {
+ continue
+ }
+ if _, ok := natPkts[pkt]; ok {
+ netHeader := header.IPv6(pkt.NetworkHeader().View())
+ if ep, err := e.stack.FindNetworkEndpoint(header.IPv6ProtocolNumber, netHeader.DestinationAddress()); err == nil {
+ src := netHeader.SourceAddress()
+ dst := netHeader.DestinationAddress()
+ route := r.ReverseRoute(src, dst)
+ ep.HandlePacket(&route, pkt)
+ n++
+ continue
+ }
+ }
+ if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+ r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
+ return n, err
+ }
+ n++
+ }
+
r.Stats().IP.PacketsSent.IncrementBy(uint64(n))
- return n, err
+ return n, nil
}
// WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet
@@ -169,6 +232,14 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv)
hasFragmentHeader := false
+ // iptables filtering. All packets that reach here are intended for
+ // this machine and will not be forwarded.
+ ipt := e.stack.IPTables()
+ if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok {
+ // iptables is telling us to drop the packet.
+ return
+ }
+
for firstHeader := true; ; firstHeader = false {
extHdr, done, err := it.Next()
if err != nil {
@@ -504,75 +575,14 @@ func (*protocol) Close() {}
// Wait implements stack.TransportProtocol.Wait.
func (*protocol) Wait() {}
-// Parse implements stack.TransportProtocol.Parse.
+// Parse implements stack.NetworkProtocol.Parse.
func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
- hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+ proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt)
if !ok {
return 0, false, false
}
- ipHdr := header.IPv6(hdr)
-
- // dataClone consists of:
- // - Any IPv6 header bytes after the first 40 (i.e. extensions).
- // - The transport header, if present.
- // - Any other payload data.
- views := [8]buffer.View{}
- dataClone := pkt.Data.Clone(views[:])
- dataClone.TrimFront(header.IPv6MinimumSize)
- it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
-
- // Iterate over the IPv6 extensions to find their length.
- //
- // Parsing occurs again in HandlePacket because we don't track the
- // extensions in PacketBuffer. Unfortunately, that means HandlePacket
- // has to do the parsing work again.
- var nextHdr tcpip.TransportProtocolNumber
- foundNext := true
- extensionsSize := 0
-traverseExtensions:
- for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() {
- if err != nil {
- break
- }
- // If we exhaust the extension list, the entire packet is the IPv6 header
- // and (possibly) extensions.
- if done {
- extensionsSize = dataClone.Size()
- foundNext = false
- break
- }
-
- switch extHdr := extHdr.(type) {
- case header.IPv6FragmentExtHdr:
- // If this is an atomic fragment, we don't have to treat it specially.
- if !extHdr.More() && extHdr.FragmentOffset() == 0 {
- continue
- }
- // This is a non-atomic fragment and has to be re-assembled before we can
- // examine the payload for a transport header.
- foundNext = false
-
- case header.IPv6RawPayloadHeader:
- // We've found the payload after any extensions.
- extensionsSize = dataClone.Size() - extHdr.Buf.Size()
- nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
- break traverseExtensions
-
- default:
- // Any other extension is a no-op, keep looping until we find the payload.
- }
- }
-
- // Put the IPv6 header with extensions in pkt.NetworkHeader().
- hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize)
- if !ok {
- panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
- }
- ipHdr = header.IPv6(hdr)
- pkt.Data.CapLength(int(ipHdr.PayloadLength()))
- pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
- return nextHdr, foundNext, true
+ return proto, !fragMore && fragOffset == 0, true
}
// calculateMTU calculates the network-layer payload MTU based on the link-layer
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
index 5f9822c49..354d3b60d 100644
--- a/pkg/tcpip/network/ipv6/ipv6_test.go
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -1709,3 +1709,211 @@ func TestInvalidIPv6Fragments(t *testing.T) {
})
}
}
+
+func TestWritePacketsStats(t *testing.T) {
+ const nPackets = 3
+ tests := []struct {
+ name string
+ setup func(*testing.T, *stack.Stack)
+ linkEP stack.LinkEndpoint
+ expectSent int
+ }{
+ {
+ name: "Accept all",
+ // No setup needed, tables accept everything by default.
+ setup: func(*testing.T, *stack.Stack) {},
+ linkEP: &limitedEP{nPackets},
+ expectSent: nPackets,
+ }, {
+ name: "Accept all with error",
+ // No setup needed, tables accept everything by default.
+ setup: func(*testing.T, *stack.Stack) {},
+ linkEP: &limitedEP{nPackets - 1},
+ expectSent: nPackets - 1,
+ }, {
+ name: "Drop all",
+ setup: func(t *testing.T, stk *stack.Stack) {
+ // Install Output DROP rule.
+ t.Helper()
+ ipt := stk.IPTables()
+ filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */)
+ if !ok {
+ t.Fatalf("failed to find filter table")
+ }
+ ruleIdx := filter.BuiltinChains[stack.Output]
+ filter.Rules[ruleIdx].Target = stack.DropTarget{}
+ if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil {
+ t.Fatalf("failed to replace table: %v", err)
+ }
+ },
+ linkEP: &limitedEP{nPackets},
+ expectSent: 0,
+ }, {
+ name: "Drop some",
+ setup: func(t *testing.T, stk *stack.Stack) {
+ // Install Output DROP rule that matches only 1
+ // of the 3 packets.
+ t.Helper()
+ ipt := stk.IPTables()
+ filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */)
+ if !ok {
+ t.Fatalf("failed to find filter table")
+ }
+ // We'll match and DROP the last packet.
+ ruleIdx := filter.BuiltinChains[stack.Output]
+ filter.Rules[ruleIdx].Target = stack.DropTarget{}
+ filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}}
+ // Make sure the next rule is ACCEPT.
+ filter.Rules[ruleIdx+1].Target = stack.AcceptTarget{}
+ if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil {
+ t.Fatalf("failed to replace table: %v", err)
+ }
+ },
+ linkEP: &limitedEP{nPackets},
+ expectSent: nPackets - 1,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ rt := buildRoute(t, nil, test.linkEP)
+
+ var pbl stack.PacketBufferList
+ for i := 0; i < nPackets; i++ {
+ pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+ ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()),
+ Data: buffer.NewView(1).ToVectorisedView(),
+ })
+ pkt.TransportHeader().Push(header.UDPMinimumSize)
+ pbl.PushBack(pkt)
+ }
+
+ test.setup(t, rt.Stack())
+
+ nWritten, err := rt.WritePackets(nil, pbl, stack.NetworkHeaderParams{})
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ got := int(rt.Stats().IP.PacketsSent.Value())
+ if got != test.expectSent {
+ t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent)
+ }
+ if got != nWritten {
+ t.Errorf("sent %d packets, WritePackets returned %d", got, nWritten)
+ }
+ })
+ }
+}
+
+func buildRoute(t *testing.T, packetCollectorErrors []*tcpip.Error, linkEP stack.LinkEndpoint) stack.Route {
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+ })
+ s.CreateNIC(1, linkEP)
+ const (
+ src = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+ dst = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+ )
+ s.AddAddress(1, ProtocolNumber, src)
+ {
+ subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask("\xfc\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"))
+ if err != nil {
+ t.Fatal(err)
+ }
+ s.SetRouteTable([]tcpip.Route{{
+ Destination: subnet,
+ NIC: 1,
+ }})
+ }
+ rt, err := s.FindRoute(0, src, dst, ProtocolNumber, false /* multicastLoop */)
+ if err != nil {
+ t.Fatalf("s.FindRoute got %v, want %v", err, nil)
+ }
+ return rt
+}
+
+// limitedEP is a link endpoint that writes up to a certain number of packets
+// before returning errors.
+type limitedEP struct {
+ limit int
+}
+
+// MTU implements LinkEndpoint.MTU.
+func (*limitedEP) MTU() uint32 { return 0 }
+
+// Capabilities implements LinkEndpoint.Capabilities.
+func (*limitedEP) Capabilities() stack.LinkEndpointCapabilities { return 0 }
+
+// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength.
+func (*limitedEP) MaxHeaderLength() uint16 { return 0 }
+
+// LinkAddress implements LinkEndpoint.LinkAddress.
+func (*limitedEP) LinkAddress() tcpip.LinkAddress { return "" }
+
+// WritePacket implements LinkEndpoint.WritePacket.
+func (ep *limitedEP) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error {
+ if ep.limit == 0 {
+ return tcpip.ErrInvalidEndpointState
+ }
+ ep.limit--
+ return nil
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (ep *limitedEP) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ if ep.limit == 0 {
+ return 0, tcpip.ErrInvalidEndpointState
+ }
+ nWritten := ep.limit
+ if nWritten > pkts.Len() {
+ nWritten = pkts.Len()
+ }
+ ep.limit -= nWritten
+ return nWritten, nil
+}
+
+// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
+func (ep *limitedEP) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error {
+ if ep.limit == 0 {
+ return tcpip.ErrInvalidEndpointState
+ }
+ ep.limit--
+ return nil
+}
+
+// Attach implements LinkEndpoint.Attach.
+func (*limitedEP) Attach(_ stack.NetworkDispatcher) {}
+
+// IsAttached implements LinkEndpoint.IsAttached.
+func (*limitedEP) IsAttached() bool { return false }
+
+// Wait implements LinkEndpoint.Wait.
+func (*limitedEP) Wait() {}
+
+// ARPHardwareType implements LinkEndpoint.ARPHardwareType.
+func (*limitedEP) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther }
+
+// AddHeader implements LinkEndpoint.AddHeader.
+func (*limitedEP) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) {
+}
+
+// limitedMatcher is an iptables matcher that matches after a certain number of
+// packets are checked against it.
+type limitedMatcher struct {
+ limit int
+}
+
+// Name implements Matcher.Name.
+func (*limitedMatcher) Name() string {
+ return "limitedMatcher"
+}
+
+// Match implements Matcher.Match.
+func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) {
+ if lm.limit == 0 {
+ return true, false
+ }
+ lm.limit--
+ return false, false
+}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 0e33cbe92..b6ef04d32 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -57,7 +57,72 @@ const reaperDelay = 5 * time.Second
// all packets.
func DefaultTables() *IPTables {
return &IPTables{
- tables: [numTables]Table{
+ v4Tables: [numTables]Table{
+ natID: Table{
+ Rules: []Rule{
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: ErrorTarget{}},
+ },
+ BuiltinChains: [NumHooks]int{
+ Prerouting: 0,
+ Input: 1,
+ Forward: HookUnset,
+ Output: 2,
+ Postrouting: 3,
+ },
+ Underflows: [NumHooks]int{
+ Prerouting: 0,
+ Input: 1,
+ Forward: HookUnset,
+ Output: 2,
+ Postrouting: 3,
+ },
+ },
+ mangleID: Table{
+ Rules: []Rule{
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: ErrorTarget{}},
+ },
+ BuiltinChains: [NumHooks]int{
+ Prerouting: 0,
+ Output: 1,
+ },
+ Underflows: [NumHooks]int{
+ Prerouting: 0,
+ Input: HookUnset,
+ Forward: HookUnset,
+ Output: 1,
+ Postrouting: HookUnset,
+ },
+ },
+ filterID: Table{
+ Rules: []Rule{
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: AcceptTarget{}},
+ Rule{Target: ErrorTarget{}},
+ },
+ BuiltinChains: [NumHooks]int{
+ Prerouting: HookUnset,
+ Input: 0,
+ Forward: 1,
+ Output: 2,
+ Postrouting: HookUnset,
+ },
+ Underflows: [NumHooks]int{
+ Prerouting: HookUnset,
+ Input: 0,
+ Forward: 1,
+ Output: 2,
+ Postrouting: HookUnset,
+ },
+ },
+ },
+ v6Tables: [numTables]Table{
natID: Table{
Rules: []Rule{
Rule{Target: AcceptTarget{}},
@@ -166,25 +231,20 @@ func EmptyNATTable() Table {
// GetTable returns a table by name.
func (it *IPTables) GetTable(name string, ipv6 bool) (Table, bool) {
- // TODO(gvisor.dev/issue/3549): Enable IPv6.
- if ipv6 {
- return Table{}, false
- }
id, ok := nameToID[name]
if !ok {
return Table{}, false
}
it.mu.RLock()
defer it.mu.RUnlock()
- return it.tables[id], true
+ if ipv6 {
+ return it.v6Tables[id], true
+ }
+ return it.v4Tables[id], true
}
// ReplaceTable replaces or inserts table by name.
func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Error {
- // TODO(gvisor.dev/issue/3549): Enable IPv6.
- if ipv6 {
- return tcpip.ErrInvalidOptionValue
- }
id, ok := nameToID[name]
if !ok {
return tcpip.ErrInvalidOptionValue
@@ -198,7 +258,11 @@ func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Err
it.startReaper(reaperDelay)
}
it.modified = true
- it.tables[id] = table
+ if ipv6 {
+ it.v6Tables[id] = table
+ } else {
+ it.v4Tables[id] = table
+ }
return nil
}
@@ -221,8 +285,17 @@ const (
// should continue traversing the network stack and false when it should be
// dropped.
//
+// TODO(gvisor.dev/issue/170): PacketBuffer should hold the GSO and route, from
+// which address and nicName can be gathered. Currently, address is only
+// needed for prerouting and nicName is only needed for output.
+//
+// TODO(gvisor.dev/issue/170): Dropped packets should be counted.
+//
// Precondition: pkt.NetworkHeader is set.
-func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool {
+func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) bool {
+ if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber {
+ return true
+ }
// Many users never configure iptables. Spare them the cost of rule
// traversal if rules have never been set.
it.mu.RLock()
@@ -243,9 +316,14 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
if tableID == natID && pkt.NatDone {
continue
}
- table := it.tables[tableID]
+ var table Table
+ if pkt.NetworkProtocolNumber == header.IPv6ProtocolNumber {
+ table = it.v6Tables[tableID]
+ } else {
+ table = it.v4Tables[tableID]
+ }
ruleIdx := table.BuiltinChains[hook]
- switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+ switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict {
// If the table returns Accept, move on to the next table.
case chainAccept:
continue
@@ -256,7 +334,7 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
// Any Return from a built-in chain means we have to
// call the underflow.
underflow := table.Rules[table.Underflows[hook]]
- switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, address); v {
+ switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr); v {
case RuleAccept:
continue
case RuleDrop:
@@ -351,11 +429,11 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *
// Preconditions:
// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
// * pkt.NetworkHeader is not nil.
-func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict {
+func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) chainVerdict {
// Start from ruleIdx and walk the list of rules until a rule gives us
// a verdict.
for ruleIdx < len(table.Rules) {
- switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+ switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict {
case RuleAccept:
return chainAccept
@@ -372,7 +450,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId
ruleIdx++
continue
}
- switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict {
+ switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, preroutingAddr, nicName); verdict {
case chainAccept:
return chainAccept
case chainDrop:
@@ -398,11 +476,11 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId
// Preconditions:
// * pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
// * pkt.NetworkHeader is not nil.
-func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) {
+func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) (RuleVerdict, int) {
rule := table.Rules[ruleIdx]
// Check whether the packet matches the IP header filter.
- if !rule.Filter.match(header.IPv4(pkt.NetworkHeader().View()), hook, nicName) {
+ if !rule.Filter.match(pkt, hook, nicName) {
// Continue on to the next rule.
return RuleJump, ruleIdx + 1
}
@@ -421,7 +499,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx
}
// All the matchers matched, so run the target.
- return rule.Target.Action(pkt, &it.connections, hook, gso, r, address)
+ return rule.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr)
}
// OriginalDst returns the original destination of redirected connections. It
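Note (hedged usage sketch): with ReplaceTable now accepting ipv6=true, the same rule can be installed in both address families. The sketch below mirrors the test above; s is an assumed *stack.Stack and error handling is elided:

	ipt := s.IPTables()
	for _, ipv6 := range []bool{false, true} {
		table, ok := ipt.GetTable(stack.FilterTable, ipv6)
		if !ok {
			continue
		}
		// Point the built-in Output chain at an unconditional DROP.
		table.Rules[table.BuiltinChains[stack.Output]].Target = stack.DropTarget{}
		if err := ipt.ReplaceTable(stack.FilterTable, table, ipv6); err != nil {
			// handle error
		}
	}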
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
index fbbd2f50f..093ee6881 100644
--- a/pkg/tcpip/stack/iptables_types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -15,6 +15,7 @@
package stack
import (
+ "fmt"
"strings"
"sync"
@@ -81,26 +82,25 @@ const (
//
// +stateify savable
type IPTables struct {
- // mu protects tables, priorities, and modified.
+ // mu protects v4Tables, v6Tables, and modified.
mu sync.RWMutex
-
- // tables maps tableIDs to tables. Holds builtin tables only, not user
- // tables. mu must be locked for accessing.
- tables [numTables]Table
-
- // priorities maps each hook to a list of table names. The order of the
- // list is the order in which each table should be visited for that
- // hook. mu needs to be locked for accessing.
- priorities [NumHooks][]tableID
-
+ // v4Tables and v6Tables map tableIDs to tables. They hold builtin
+ // tables only, not user tables. mu must be locked for accessing.
+ v4Tables [numTables]Table
+ v6Tables [numTables]Table
// modified is whether tables have been modified at least once. It is
// used to elide the iptables performance overhead for workloads that
// don't utilize iptables.
modified bool
+ // priorities maps each hook to a list of table names. The order of the
+ // list is the order in which each table should be visited for that
+ // hook. It is immutable.
+ priorities [NumHooks][]tableID
+
connections ConnTrack
- // reaperDone can be signalled to stop the reaper goroutine.
+ // reaperDone can be signaled to stop the reaper goroutine.
reaperDone chan struct{}
}
@@ -148,7 +148,7 @@ type Rule struct {
Target Target
}
-// IPHeaderFilter holds basic IP filtering data common to every rule.
+// IPHeaderFilter performs basic IP header matching common to every rule.
//
// +stateify savable
type IPHeaderFilter struct {
@@ -196,16 +196,43 @@ type IPHeaderFilter struct {
OutputInterfaceInvert bool
}
-// match returns whether hdr matches the filter.
-func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool {
- // TODO(gvisor.dev/issue/170): Support other fields of the filter.
+// match returns whether pkt matches the filter.
+//
+// Preconditions: pkt.NetworkHeader is set and is at least as long as the
+// minimal IPv4 or IPv6 header.
+func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, nicName string) bool {
+ // Extract header fields.
+ var (
+ // TODO(gvisor.dev/issue/170): Support other filter fields.
+ transProto tcpip.TransportProtocolNumber
+ dstAddr tcpip.Address
+ srcAddr tcpip.Address
+ )
+ switch proto := pkt.NetworkProtocolNumber; proto {
+ case header.IPv4ProtocolNumber:
+ hdr := header.IPv4(pkt.NetworkHeader().View())
+ transProto = hdr.TransportProtocol()
+ dstAddr = hdr.DestinationAddress()
+ srcAddr = hdr.SourceAddress()
+
+ case header.IPv6ProtocolNumber:
+ hdr := header.IPv6(pkt.NetworkHeader().View())
+ transProto = hdr.TransportProtocol()
+ dstAddr = hdr.DestinationAddress()
+ srcAddr = hdr.SourceAddress()
+
+ default:
+ panic(fmt.Sprintf("unknown network protocol with EtherType: %d", proto))
+ }
+
// Check the transport protocol.
- if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() {
+ if fl.CheckProtocol && fl.Protocol != transProto {
return false
}
- // Check the source and destination IPs.
- if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) {
+ // Check the addresses.
+ if !filterAddress(dstAddr, fl.DstMask, fl.Dst, fl.DstInvert) ||
+ !filterAddress(srcAddr, fl.SrcMask, fl.Src, fl.SrcInvert) {
return false
}
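Note (minimal sketch, assuming the CheckProtocol field shown above and zero values for the remaining fields): a rule filter that uses the new protocol-aware matching might look like this; the addresses are illustrative:

	fl := stack.IPHeaderFilter{
		Protocol:      header.TCPProtocolNumber,
		CheckProtocol: true,
		Dst:           "\x0a\x00\x00\x00", // 10.0.0.0
		DstMask:       "\xff\x00\x00\x00", // /8
	}
	// match (as defined above) now inspects either the IPv4 or the IPv6
	// header of pkt, depending on pkt.NetworkProtocolNumber.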
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 1f1a1426b..821d3feb9 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -1282,9 +1282,8 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp
return
}
- // TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
// Loopback traffic skips the prerouting chain.
- if protocol == header.IPv4ProtocolNumber && !n.isLoopback() {
+ if !n.isLoopback() {
// iptables filtering.
ipt := n.stack.IPTables()
address := n.primaryAddress(protocol)
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
index 17b8beebb..1932aaeb7 100644
--- a/pkg/tcpip/stack/packet_buffer.go
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -80,7 +80,7 @@ type PacketBuffer struct {
// data are held in the same underlying buffer storage.
header buffer.Prependable
- // NetworkProtocol is only valid when NetworkHeader is set.
+ // NetworkProtocolNumber is only valid when NetworkHeader is set.
// TODO(gvisor.dev/issue/3574): Remove the separately passed protocol
// numbers in registration APIs that take a PacketBuffer.
NetworkProtocolNumber tcpip.NetworkProtocolNumber
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index ad71ff3b6..31116309e 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -74,6 +74,8 @@ type endpoint struct {
route stack.Route `state:"manual"`
ttl uint8
stats tcpip.TransportEndpointStats `state:"nosave"`
+ // linger is used for SO_LINGER socket option.
+ linger tcpip.LingerOption
// owner is used to get uid and gid of the packet.
owner tcpip.PacketOwner
@@ -344,9 +346,14 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
// SetSockOpt sets a socket option.
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
- switch opt.(type) {
+ switch v := opt.(type) {
case *tcpip.SocketDetachFilterOption:
return nil
+
+ case *tcpip.LingerOption:
+ e.mu.Lock()
+ e.linger = *v
+ e.mu.Unlock()
}
return nil
}
@@ -415,8 +422,17 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
- return tcpip.ErrUnknownProtocolOption
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+ switch o := opt.(type) {
+ case *tcpip.LingerOption:
+ e.mu.Lock()
+ *o = e.linger
+ e.mu.Unlock()
+ return nil
+
+ default:
+ return tcpip.ErrUnknownProtocolOption
+ }
}
func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error {
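Note (hedged sketch): the same store-and-read SO_LINGER plumbing is repeated for the packet, raw, and UDP endpoints below. A caller reads the option back roughly like this, where ep is any of these endpoints:

	var linger tcpip.LingerOption
	if err := ep.GetSockOpt(&linger); err != nil {
		// Endpoints without support return tcpip.ErrUnknownProtocolOption.
	}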
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 8bd4e5e37..072601d2d 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -83,6 +83,8 @@ type endpoint struct {
stats tcpip.TransportEndpointStats `state:"nosave"`
bound bool
boundNIC tcpip.NICID
+ // linger is used for SO_LINGER socket option.
+ linger tcpip.LingerOption
// lastErrorMu protects lastError.
lastErrorMu sync.Mutex `state:"nosave"`
@@ -298,10 +300,16 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
// used with SetSockOpt, and this function always returns
// tcpip.ErrNotSupported.
func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
- switch opt.(type) {
+ switch v := opt.(type) {
case *tcpip.SocketDetachFilterOption:
return nil
+ case *tcpip.LingerOption:
+ ep.mu.Lock()
+ ep.linger = *v
+ ep.mu.Unlock()
+ return nil
+
default:
return tcpip.ErrUnknownProtocolOption
}
@@ -366,8 +374,17 @@ func (ep *endpoint) LastError() *tcpip.Error {
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
- return tcpip.ErrNotSupported
+func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+ switch o := opt.(type) {
+ case *tcpip.LingerOption:
+ ep.mu.Lock()
+ *o = ep.linger
+ ep.mu.Unlock()
+ return nil
+
+ default:
+ return tcpip.ErrNotSupported
+ }
}
// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index fb03e6047..e37c00523 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -84,6 +84,8 @@ type endpoint struct {
// Connect(), and is valid only when connected is true.
route stack.Route `state:"manual"`
stats tcpip.TransportEndpointStats `state:"nosave"`
+ // linger is used for SO_LINGER socket option.
+ linger tcpip.LingerOption
// owner is used to get uid and gid of the packet.
owner tcpip.PacketOwner
@@ -511,10 +513,16 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
- switch opt.(type) {
+ switch v := opt.(type) {
case *tcpip.SocketDetachFilterOption:
return nil
+ case *tcpip.LingerOption:
+ e.mu.Lock()
+ e.linger = *v
+ e.mu.Unlock()
+ return nil
+
default:
return tcpip.ErrUnknownProtocolOption
}
@@ -577,8 +585,17 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error {
- return tcpip.ErrUnknownProtocolOption
+func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
+ switch o := opt.(type) {
+ case *tcpip.LingerOption:
+ e.mu.Lock()
+ *o = e.linger
+ e.mu.Unlock()
+ return nil
+
+ default:
+ return tcpip.ErrUnknownProtocolOption
+ }
}
// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 234fb95ce..4778e7b1c 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -69,6 +69,7 @@ go_library(
"//pkg/tcpip/buffer",
"//pkg/tcpip/hash/jenkins",
"//pkg/tcpip/header",
+ "//pkg/tcpip/header/parse",
"//pkg/tcpip/ports",
"//pkg/tcpip/seqnum",
"//pkg/tcpip/stack",
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index faea7f2bb..120483838 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -1317,14 +1317,17 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
// indicating the reason why it's not writable.
// Caller must hold e.mu and e.sndBufMu
func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
- // The endpoint cannot be written to if it's not connected.
- if !e.EndpointState().connected() {
- switch e.EndpointState() {
- case StateError:
- return 0, e.HardError
- default:
- return 0, tcpip.ErrClosedForSend
- }
+ switch s := e.EndpointState(); {
+ case s == StateError:
+ return 0, e.HardError
+ case !s.connecting() && !s.connected():
+ return 0, tcpip.ErrClosedForSend
+ case s.connecting():
+ // As per RFC 793, page 56, a send request that arrives while the endpoint
+ // is still connecting can be queued and completed once the state becomes
+ // connected. Return an error code so the caller of endpoint Write retries
+ // until the connection handshake is complete.
+ return 0, tcpip.ErrWouldBlock
}
// Check if the connection has already been closed for sends.
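Note (illustrative only, not part of the change): the new writability classification, factored into a standalone helper using the same state predicates as above:

	func writableError(s EndpointState, hardError *tcpip.Error) *tcpip.Error {
		switch {
		case s == StateError:
			return hardError
		case s.connecting():
			// Queue the send; the caller retries until the handshake completes.
			return tcpip.ErrWouldBlock
		case !s.connected():
			return tcpip.ErrClosedForSend
		default:
			return nil
		}
	}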
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 63ec12be8..74a17af79 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -29,6 +29,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/seqnum"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
@@ -506,22 +507,7 @@ func (p *protocol) SynRcvdCounter() *synRcvdCounter {
// Parse implements stack.TransportProtocol.Parse.
func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
- // TCP header is variable length, peek at it first.
- hdrLen := header.TCPMinimumSize
- hdr, ok := pkt.Data.PullUp(hdrLen)
- if !ok {
- return false
- }
-
- // If the header has options, pull those up as well.
- if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() {
- // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of
- // packets.
- hdrLen = offset
- }
-
- _, ok = pkt.TransportHeader().Consume(hdrLen)
- return ok
+ return parse.TCP(pkt)
}
// NewProtocol returns a TCP transport protocol.
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index b5d2d0ba6..c78549424 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -32,6 +32,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
+ "//pkg/tcpip/header/parse",
"//pkg/tcpip/ports",
"//pkg/tcpip/stack",
"//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index b572c39db..518f636f0 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -154,6 +154,9 @@ type endpoint struct {
// owner is used to get uid and gid of the packet.
owner tcpip.PacketOwner
+
+ // linger is used for SO_LINGER socket option.
+ linger tcpip.LingerOption
}
// +stateify savable
@@ -810,6 +813,11 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
case *tcpip.SocketDetachFilterOption:
return nil
+
+ case *tcpip.LingerOption:
+ e.mu.Lock()
+ e.linger = *v
+ e.mu.Unlock()
}
return nil
}
@@ -966,6 +974,11 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
*o = tcpip.BindToDeviceOption(e.bindToDevice)
e.mu.RUnlock()
+ case *tcpip.LingerOption:
+ e.mu.RLock()
+ *o = e.linger
+ e.mu.RUnlock()
+
default:
return tcpip.ErrUnknownProtocolOption
}
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
index 3f87e8057..7d6b91a75 100644
--- a/pkg/tcpip/transport/udp/protocol.go
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
"gvisor.dev/gvisor/pkg/waiter"
@@ -219,8 +220,7 @@ func (*protocol) Wait() {}
// Parse implements stack.TransportProtocol.Parse.
func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
- _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize)
- return ok
+ return parse.UDP(pkt)
}
// NewProtocol returns a UDP transport protocol.
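Note (sketch of the assumed helper): the UDP and TCP Parse bodies removed here move into the shared parse package. For UDP the helper is expected to be equivalent to the deleted code above:

	func parseUDP(pkt *stack.PacketBuffer) bool {
		// UDP has a fixed-size header; simply move it into the transport
		// header region of the buffer.
		_, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize)
		return ok
	}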
diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go
index b7f873392..06fb823f6 100644
--- a/pkg/test/testutil/testutil.go
+++ b/pkg/test/testutil/testutil.go
@@ -332,13 +332,13 @@ func PollContext(ctx context.Context, cb func() error) error {
}
// WaitForHTTP tries GET requests on a port until the call succeeds or timeout.
-func WaitForHTTP(port int, timeout time.Duration) error {
+func WaitForHTTP(ip string, port int, timeout time.Duration) error {
cb := func() error {
c := &http.Client{
// Calculate timeout to be able to do minimum 5 attempts.
Timeout: timeout / 5,
}
- url := fmt.Sprintf("http://localhost:%d/", port)
+ url := fmt.Sprintf("http://%s:%d/", ip, port)
resp, err := c.Get(url)
if err != nil {
log.Printf("Waiting %s: %v", url, err)
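Note (usage sketch for the new signature, matching how the tests below call it; the port and timeout are illustrative):

	ip, err := d.FindIP(ctx, false /* ipv6 */)
	if err != nil {
		t.Fatalf("docker.FindIP failed: %v", err)
	}
	if err := testutil.WaitForHTTP(ip.String(), 80, 30*time.Second); err != nil {
		t.Fatalf("WaitForHTTP() timeout: %v", err)
	}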
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 01f62d50a..2d9517f4a 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -27,6 +27,7 @@ go_library(
"//pkg/abi",
"//pkg/abi/linux",
"//pkg/bpf",
+ "//pkg/cleanup",
"//pkg/context",
"//pkg/control/server",
"//pkg/cpuid",
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 9b1799416..24e13565e 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -25,7 +25,6 @@ import (
func init() {
allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
- seccomp.Rule{seccomp.EqualTo(linux.ARCH_GET_FS)},
seccomp.Rule{seccomp.EqualTo(linux.ARCH_SET_FS)},
)
}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 7844ea28c..e36664938 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -21,6 +21,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
@@ -168,26 +169,30 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create
}
rootProcArgs.MountNamespaceVFS2 = mns
+ root := mns.Root()
+ defer root.DecRef(rootCtx)
+ if root.Mount().ReadOnly() {
+ // Switch to ReadWrite while we setup submounts.
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+ return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+ }
+ // Restore back to ReadOnly at the end.
+ defer func() {
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+ panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+ }
+ }()
+ }
+
// Mount submounts.
if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
}
- if c.root.Readonly || conf.Overlay {
- // Switch to ReadOnly after all submounts were setup.
- root := mns.Root()
- defer root.DecRef(rootCtx)
- if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
- return nil, fmt.Errorf(`failed to set mount at "/" readonly: %v`, err)
- }
- }
-
return mns, nil
}
// createMountNamespaceVFS2 creates the container's root mount and namespace.
-// The mount is created ReadWrite to allow mount point for submounts to be
-// created. ** The caller is responsible to switch to ReadOnly if needed **
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
fd := c.fds.remove()
data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
@@ -201,21 +206,71 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *c
log.Infof("Mounting root over 9P, ioFD: %d", fd)
opts := &vfs.MountOptions{
- // Always mount as ReadWrite to allow other mounts on top of it. It'll be
- // made ReadOnly in the caller (if needed).
- ReadOnly: false,
+ ReadOnly: c.root.Readonly,
GetFilesystemOptions: vfs.GetFilesystemOptions{
Data: strings.Join(data, ","),
},
InternalMount: true,
}
- mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, opts)
+
+ fsName := gofer.Name
+ if conf.Overlay && !c.root.Readonly {
+ log.Infof("Adding overlay on top of root")
+ var err error
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting root with overlay: %w", err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
if err != nil {
return nil, fmt.Errorf("setting up mount namespace: %w", err)
}
return mns, nil
}
+// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
+// layer using tmpfs, and returns overlay mount options. "cleanup" must be called
+// after the options have been used to mount the overlay, to release refs on
+// lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+ // First copy options from lower layer to upper layer and overlay. Clear
+ // filesystem specific options.
+ upperOpts := *lowerOpts
+ upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ overlayOpts := *lowerOpts
+ overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ // Next mount upper and lower. Upper is a tmpfs mount to keep all
+ // modifications inside the sandbox.
+ upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+ }
+ cu := cleanup.Make(func() { upper.DecRef(ctx) })
+ defer cu.Clean()
+
+ // All writes go to the upper layer; be paranoid and make the lower layer read-only.
+ lowerOpts.ReadOnly = true
+ lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+ if err != nil {
+ return nil, nil, err
+ }
+ cu.Add(func() { lower.DecRef(ctx) })
+
+ // Configure overlay with both layers.
+ overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+ UpperRoot: vfs.MakeVirtualDentry(upper, upper.Root()),
+ LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())},
+ }
+ return &overlayOpts, cu.Release(), nil
+}
+
func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
mounts, err := c.prepareMountsVFS2()
if err != nil {
@@ -245,7 +300,7 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.
if mnt != nil && mnt.ReadOnly() {
// Switch to ReadWrite while we setup submounts.
if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
- return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err)
+ return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
}
// Restore back to ReadOnly at the end.
defer func() {
@@ -297,14 +352,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
}
func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
- root := mns.Root()
- defer root.DecRef(ctx)
- target := &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(submount.Destination),
- }
- fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
if err != nil {
return nil, fmt.Errorf("mountOptions failed: %w", err)
}
@@ -313,8 +361,27 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
return nil, nil
}
- if err := c.k.VFS().MakeSyntheticMountpoint(ctx, submount.Destination, root, creds); err != nil {
- return nil, err
+ if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
+ }
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of mount %q", submount.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(submount.Destination),
}
mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
if err != nil {
@@ -326,8 +393,9 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
// used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
fsName := m.Type
+ useOverlay := false
var data []string
// Find filesystem name and FS specific data field.
@@ -342,7 +410,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
var err error
data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
if err != nil {
- return "", nil, err
+ return "", nil, false, err
}
case bind:
@@ -350,13 +418,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
if m.fd == 0 {
// Check that an FD was provided to fail fast. Technically FD=0 is valid,
// but unlikely to be correct in this context.
- return "", nil, fmt.Errorf("9P mount requires a connection FD")
+ return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
}
data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
default:
log.Warningf("ignoring unknown filesystem type %q", m.Type)
- return "", nil, nil
+ return "", nil, false, nil
}
opts := &vfs.MountOptions{
@@ -381,11 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
}
}
- if conf.Overlay {
- // All writes go to upper, be paranoid and make lower readonly.
- opts.ReadOnly = true
- }
- return fsName, opts, nil
+ return fsName, opts, useOverlay, nil
}
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -488,13 +555,25 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf
// Map mount type to filesystem name, and parse out the options that we are
// capable of dealing with.
mntFD := &mountAndFD{Mount: hint.mount}
- fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
if err != nil {
return nil, err
}
if len(fsName) == 0 {
return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
}
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
}
@@ -505,7 +584,9 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
return nil, err
}
- _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+ // Ignore data and useOverlay because these were already applied to
+ // the master mount.
+ _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
if err != nil {
return nil, err
}
@@ -517,18 +598,39 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
root := mns.Root()
defer root.DecRef(ctx)
- if err := c.k.VFS().MakeSyntheticMountpoint(ctx, mount.Destination, root, creds); err != nil {
- return nil, err
- }
-
target := &vfs.PathOperation{
Root: root,
Start: root,
Path: fspath.Parse(mount.Destination),
}
+
+ if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+ }
+
if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
return nil, err
}
log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
return newMnt, nil
}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dest),
+ }
+ // First check if mount point exists. When overlay is enabled, gofer doesn't
+ // allow changes to the FS, making MakeSyntheticMountpoint() ineffective
+ // because MkdirAt fails with EROFS even if the mount point already exists.
+ vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+ if err == nil {
+ // File exists, we're done.
+ vd.DecRef(ctx)
+ return nil
+ }
+ return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
+}
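Note (hedged sketch of a call site for configureOverlay, mirroring the submount path above; target, lowerOpts, and error wrapping are illustrative):

	opts, cleanup, err := c.configureOverlay(ctx, creds, lowerOpts, gofer.Name)
	if err != nil {
		return fmt.Errorf("mounting volume with overlay: %w", err)
	}
	defer cleanup()
	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, overlay.Name, opts)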
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index bba00d551..371fcc0ae 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -62,9 +62,8 @@ type Gofer struct {
applyCaps bool
setUpRoot bool
- panicOnWrite bool
- specFD int
- mountsFD int
+ specFD int
+ mountsFD int
}
// Name implements subcommands.Command.
@@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) {
f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec")
f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
- f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected")
f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
@@ -168,8 +166,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Start with root mount, then add any other additional mount as needed.
ats := make([]p9.Attacher, 0, len(spec.Mounts)+1)
ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{
- ROMount: spec.Root.Readonly || conf.Overlay,
- PanicOnWrite: g.panicOnWrite,
+ ROMount: spec.Root.Readonly || conf.Overlay,
})
if err != nil {
Fatalf("creating attach point: %v", err)
@@ -181,9 +178,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
for _, m := range spec.Mounts {
if specutils.Is9PMount(m) {
cfg := fsgofer.Config{
- ROMount: isReadonlyMount(m.Options) || conf.Overlay,
- PanicOnWrite: g.panicOnWrite,
- HostUDS: conf.FSGoferHostUDS,
+ ROMount: isReadonlyMount(m.Options) || conf.Overlay,
+ HostUDS: conf.FSGoferHostUDS,
}
ap, err := fsgofer.NewAttachPoint(m.Destination, cfg)
if err != nil {
@@ -316,6 +312,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error {
if err != nil {
return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
}
+ log.Infof("Create working directory %q if needed", spec.Process.Cwd)
if err := os.MkdirAll(dst, 0755); err != nil {
return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 6e1d6a568..63478ba8c 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -902,9 +902,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu
}
args = append(args, "gofer", "--bundle", bundleDir)
- if conf.Overlay {
- args = append(args, "--panic-on-write=true")
- }
// Open the spec file to donate to the sandbox.
specFile, err := specutils.OpenSpec(bundleDir)
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index ad49f8b16..ff0e60283 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -317,22 +317,12 @@ func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
}
func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
- vfs1 := configs(t, opts...)
-
- var optsVFS2 []configOption
- for _, opt := range opts {
- // TODO(gvisor.dev/issue/1487): Enable overlay tests.
- if opt != overlay {
- optsVFS2 = append(optsVFS2, opt)
- }
- }
-
- for key, value := range configs(t, optsVFS2...) {
+ all := configs(t, opts...)
+ for key, value := range configs(t, opts...) {
value.VFS2 = true
- vfs1[key+"VFS2"] = value
+ all[key+"VFS2"] = value
}
-
- return vfs1
+ return all
}
// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go
index 53506b5e1..39f9851a8 100644
--- a/runsc/fsgofer/filter/config_amd64.go
+++ b/runsc/fsgofer/filter/config_amd64.go
@@ -25,7 +25,6 @@ import (
func init() {
allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
- {seccomp.EqualTo(linux.ARCH_GET_FS)},
{seccomp.EqualTo(linux.ARCH_SET_FS)},
}
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 4268d97a1..0b628c8ce 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -1181,9 +1181,6 @@ func extractErrno(err error) unix.Errno {
func (l *localFile) checkROMount() error {
if conf := l.attachPoint.conf; conf.ROMount {
- if conf.PanicOnWrite {
- panic("attempt to write to RO mount")
- }
return unix.EROFS
}
return nil
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index 0e4945b3d..a84206686 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -553,29 +553,6 @@ func TestROMountChecks(t *testing.T) {
})
}
-func TestROMountPanics(t *testing.T) {
- conf := Config{ROMount: true, PanicOnWrite: true}
- uid := p9.UID(os.Getuid())
- gid := p9.GID(os.Getgid())
-
- runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) {
- if s.fileType != unix.S_IFLNK {
- assertPanic(t, func() { s.file.Open(p9.WriteOnly) })
- }
- assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid) })
- assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, uid, gid) })
- assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") })
- assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", uid, gid) })
- assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) })
- assertPanic(t, func() { s.file.Link(s.file, "some_link") })
- assertPanic(t, func() { s.file.Mknod("some-nod", 0777, 1, 2, uid, gid) })
-
- valid := p9.SetAttrMask{Size: true}
- attr := p9.SetAttr{Size: 0}
- assertPanic(t, func() { s.file.SetAttr(valid, attr) })
- })
-}
-
func TestWalkNotFound(t *testing.T) {
runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT {
diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go
index 809244bab..8425abecb 100644
--- a/test/e2e/integration_test.go
+++ b/test/e2e/integration_test.go
@@ -64,9 +64,10 @@ func TestLifeCycle(t *testing.T) {
defer d.CleanUp(ctx)
// Start the container.
+ port := 80
if err := d.Create(ctx, dockerutil.RunOpts{
Image: "basic/nginx",
- Ports: []int{80},
+ Ports: []int{port},
}); err != nil {
t.Fatalf("docker create failed: %v", err)
}
@@ -74,16 +75,15 @@ func TestLifeCycle(t *testing.T) {
t.Fatalf("docker start failed: %v", err)
}
- // Test that container is working.
- port, err := d.FindPort(ctx, 80)
+ ip, err := d.FindIP(ctx, false)
if err != nil {
- t.Fatalf("docker.FindPort(80) failed: %v", err)
+ t.Fatalf("docker.FindIP failed: %v", err)
}
- if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
t.Fatalf("WaitForHTTP() timeout: %v", err)
}
client := http.Client{Timeout: defaultWait}
- if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+ if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
t.Errorf("http request failed: %v", err)
}
@@ -105,27 +105,28 @@ func TestPauseResume(t *testing.T) {
defer d.CleanUp(ctx)
// Start the container.
+ port := 8080
if err := d.Spawn(ctx, dockerutil.RunOpts{
Image: "basic/python",
- Ports: []int{8080}, // See Dockerfile.
+ Ports: []int{port}, // See Dockerfile.
}); err != nil {
t.Fatalf("docker run failed: %v", err)
}
- // Find where port 8080 is mapped to.
- port, err := d.FindPort(ctx, 8080)
+ // Find container IP address.
+ ip, err := d.FindIP(ctx, false)
if err != nil {
- t.Fatalf("docker.FindPort(8080) failed: %v", err)
+ t.Fatalf("docker.FindIP failed: %v", err)
}
// Wait until it's up and running.
- if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
t.Fatalf("WaitForHTTP() timeout: %v", err)
}
// Check that container is working.
client := http.Client{Timeout: defaultWait}
- if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+ if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
t.Error("http request failed:", err)
}
@@ -135,7 +136,7 @@ func TestPauseResume(t *testing.T) {
// Check if container is paused.
client = http.Client{Timeout: 10 * time.Millisecond} // Don't wait a minute.
- switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) {
+ switch _, err := client.Get(fmt.Sprintf("http://%s:%d", ip.String(), port)); v := err.(type) {
case nil:
t.Errorf("http req expected to fail but it succeeded")
case net.Error:
@@ -151,13 +152,13 @@ func TestPauseResume(t *testing.T) {
}
// Wait until it's up and running.
- if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
t.Fatalf("WaitForHTTP() timeout: %v", err)
}
// Check if container is working again.
client = http.Client{Timeout: defaultWait}
- if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+ if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
t.Error("http request failed:", err)
}
}
@@ -179,9 +180,10 @@ func TestCheckpointRestore(t *testing.T) {
defer d.CleanUp(ctx)
// Start the container.
+ port := 8080
if err := d.Spawn(ctx, dockerutil.RunOpts{
Image: "basic/python",
- Ports: []int{8080}, // See Dockerfile.
+ Ports: []int{port}, // See Dockerfile.
}); err != nil {
t.Fatalf("docker run failed: %v", err)
}
@@ -199,20 +201,20 @@ func TestCheckpointRestore(t *testing.T) {
t.Fatalf("docker restore failed: %v", err)
}
- // Find where port 8080 is mapped to.
- port, err := d.FindPort(ctx, 8080)
+ // Find container IP address.
+ ip, err := d.FindIP(ctx, false)
if err != nil {
- t.Fatalf("docker.FindPort(8080) failed: %v", err)
+ t.Fatalf("docker.FindIP failed: %v", err)
}
// Wait until it's up and running.
- if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
t.Fatalf("WaitForHTTP() timeout: %v", err)
}
// Check if container is working again.
client := http.Client{Timeout: defaultWait}
- if err := httpRequestSucceeds(client, "localhost", port); err != nil {
+ if err := httpRequestSucceeds(client, ip.String(), port); err != nil {
t.Error("http request failed:", err)
}
}
diff --git a/test/fuse/linux/rmdir_test.cc b/test/fuse/linux/rmdir_test.cc
index 913d3f910..e3200e446 100644
--- a/test/fuse/linux/rmdir_test.cc
+++ b/test/fuse/linux/rmdir_test.cc
@@ -38,6 +38,7 @@ namespace {
class RmDirTest : public FuseTest {
protected:
const std::string test_dir_name_ = "test_dir";
+ const std::string test_subdir_ = "test_subdir";
const mode_t test_dir_mode_ = S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO;
};
@@ -67,6 +68,32 @@ TEST_F(RmDirTest, NormalRmDir) {
EXPECT_EQ(std::string(actual_dirname.data()), test_dir_name_);
}
+TEST_F(RmDirTest, NormalRmDirSubdir) {
+ SetServerInodeLookup(test_subdir_, S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
+ const std::string test_dir_path_ =
+ JoinPath(mount_point_.path().c_str(), test_subdir_, test_dir_name_);
+ SetServerInodeLookup(test_dir_name_, test_dir_mode_);
+
+ // RmDir code.
+ struct fuse_out_header rmdir_header = {
+ .len = sizeof(struct fuse_out_header),
+ };
+
+ auto iov_out = FuseGenerateIovecs(rmdir_header);
+ SetServerResponse(FUSE_RMDIR, iov_out);
+
+ ASSERT_THAT(rmdir(test_dir_path_.c_str()), SyscallSucceeds());
+
+ struct fuse_in_header in_header;
+ std::vector<char> actual_dirname(test_dir_name_.length() + 1);
+ auto iov_in = FuseGenerateIovecs(in_header, actual_dirname);
+ GetServerActualRequest(iov_in);
+
+ EXPECT_EQ(in_header.len, sizeof(in_header) + test_dir_name_.length() + 1);
+ EXPECT_EQ(in_header.opcode, FUSE_RMDIR);
+ EXPECT_EQ(std::string(actual_dirname.data()), test_dir_name_);
+}
+
} // namespace
} // namespace testing
diff --git a/test/fuse/linux/unlink_test.cc b/test/fuse/linux/unlink_test.cc
index 5702e9b32..13efbf7c7 100644
--- a/test/fuse/linux/unlink_test.cc
+++ b/test/fuse/linux/unlink_test.cc
@@ -37,6 +37,7 @@ namespace {
class UnlinkTest : public FuseTest {
protected:
const std::string test_file_ = "test_file";
+ const std::string test_subdir_ = "test_subdir";
};
TEST_F(UnlinkTest, RegularFile) {
@@ -61,6 +62,29 @@ TEST_F(UnlinkTest, RegularFile) {
EXPECT_EQ(std::string(unlinked_file.data()), test_file_);
}
+TEST_F(UnlinkTest, RegularFileSubDir) {
+ SetServerInodeLookup(test_subdir_, S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
+ const std::string test_file_path =
+ JoinPath(mount_point_.path().c_str(), test_subdir_, test_file_);
+ SetServerInodeLookup(test_file_, S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO);
+
+ struct fuse_out_header out_header = {
+ .len = sizeof(struct fuse_out_header),
+ };
+ auto iov_out = FuseGenerateIovecs(out_header);
+ SetServerResponse(FUSE_UNLINK, iov_out);
+
+ ASSERT_THAT(unlink(test_file_path.c_str()), SyscallSucceeds());
+ struct fuse_in_header in_header;
+ std::vector<char> unlinked_file(test_file_.length() + 1);
+ auto iov_in = FuseGenerateIovecs(in_header, unlinked_file);
+ GetServerActualRequest(iov_in);
+
+ EXPECT_EQ(in_header.len, sizeof(in_header) + test_file_.length() + 1);
+ EXPECT_EQ(in_header.opcode, FUSE_UNLINK);
+ EXPECT_EQ(std::string(unlinked_file.data()), test_file_);
+}
+
TEST_F(UnlinkTest, NoFile) {
const std::string test_file_path =
JoinPath(mount_point_.path().c_str(), test_file_);
diff --git a/test/image/image_test.go b/test/image/image_test.go
index ac6186688..968e62f63 100644
--- a/test/image/image_test.go
+++ b/test/image/image_test.go
@@ -63,8 +63,8 @@ func TestHelloWorld(t *testing.T) {
}
}
-func runHTTPRequest(port int) error {
- url := fmt.Sprintf("http://localhost:%d/not-found", port)
+func runHTTPRequest(ip string, port int) error {
+ url := fmt.Sprintf("http://%s:%d/not-found", ip, port)
resp, err := http.Get(url)
if err != nil {
return fmt.Errorf("error reaching http server: %v", err)
@@ -73,7 +73,7 @@ func runHTTPRequest(port int) error {
return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want)
}
- url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port)
+ url = fmt.Sprintf("http://%s:%d/latin10k.txt", ip, port)
resp, err = http.Get(url)
if err != nil {
return fmt.Errorf("Error reaching http server: %v", err)
@@ -95,13 +95,13 @@ func runHTTPRequest(port int) error {
return nil
}
-func testHTTPServer(t *testing.T, port int) {
+func testHTTPServer(t *testing.T, ip string, port int) {
const requests = 10
ch := make(chan error, requests)
for i := 0; i < requests; i++ {
go func() {
start := time.Now()
- err := runHTTPRequest(port)
+ err := runHTTPRequest(ip, port)
log.Printf("Response time %v: %v", time.Since(start).String(), err)
ch <- err
}()
@@ -110,7 +110,7 @@ func testHTTPServer(t *testing.T, port int) {
for i := 0; i < requests; i++ {
err := <-ch
if err != nil {
- t.Errorf("testHTTPServer(%d) failed: %v", port, err)
+ t.Errorf("testHTTPServer(%s, %d) failed: %v", ip, port, err)
}
}
}
@@ -121,27 +121,28 @@ func TestHttpd(t *testing.T) {
defer d.CleanUp(ctx)
// Start the container.
+ port := 80
opts := dockerutil.RunOpts{
Image: "basic/httpd",
- Ports: []int{80},
+ Ports: []int{port},
}
d.CopyFiles(&opts, "/usr/local/apache2/htdocs", "test/image/latin10k.txt")
if err := d.Spawn(ctx, opts); err != nil {
t.Fatalf("docker run failed: %v", err)
}
- // Find where port 80 is mapped to.
- port, err := d.FindPort(ctx, 80)
+ // Find container IP address.
+ ip, err := d.FindIP(ctx, false)
if err != nil {
- t.Fatalf("FindPort(80) failed: %v", err)
+ t.Fatalf("docker.FindIP failed: %v", err)
}
// Wait until it's up and running.
- if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
t.Errorf("WaitForHTTP() timeout: %v", err)
}
- testHTTPServer(t, port)
+ testHTTPServer(t, ip.String(), port)
}
func TestNginx(t *testing.T) {
@@ -150,27 +151,28 @@ func TestNginx(t *testing.T) {
defer d.CleanUp(ctx)
// Start the container.
+ port := 80
opts := dockerutil.RunOpts{
Image: "basic/nginx",
- Ports: []int{80},
+ Ports: []int{port},
}
d.CopyFiles(&opts, "/usr/share/nginx/html", "test/image/latin10k.txt")
if err := d.Spawn(ctx, opts); err != nil {
t.Fatalf("docker run failed: %v", err)
}
- // Find where port 80 is mapped to.
- port, err := d.FindPort(ctx, 80)
+ // Find container IP address.
+ ip, err := d.FindIP(ctx, false)
if err != nil {
- t.Fatalf("FindPort(80) failed: %v", err)
+ t.Fatalf("docker.FindIP failed: %v", err)
}
// Wait until it's up and running.
- if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
t.Errorf("WaitForHTTP() timeout: %v", err)
}
- testHTTPServer(t, port)
+ testHTTPServer(t, ip.String(), port)
}
func TestMysql(t *testing.T) {
@@ -218,26 +220,27 @@ func TestTomcat(t *testing.T) {
defer d.CleanUp(ctx)
// Start the server.
+ port := 8080
if err := d.Spawn(ctx, dockerutil.RunOpts{
Image: "basic/tomcat",
- Ports: []int{8080},
+ Ports: []int{port},
}); err != nil {
t.Fatalf("docker run failed: %v", err)
}
- // Find where port 8080 is mapped to.
- port, err := d.FindPort(ctx, 8080)
+ // Find container IP address.
+ ip, err := d.FindIP(ctx, false)
if err != nil {
- t.Fatalf("FindPort(8080) failed: %v", err)
+ t.Fatalf("docker.FindIP failed: %v", err)
}
// Wait until it's up and running.
- if err := testutil.WaitForHTTP(port, defaultWait); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil {
t.Fatalf("WaitForHTTP() timeout: %v", err)
}
// Ensure that content is being served.
- url := fmt.Sprintf("http://localhost:%d", port)
+ url := fmt.Sprintf("http://%s:%d", ip.String(), port)
resp, err := http.Get(url)
if err != nil {
t.Errorf("Error reaching http server: %v", err)
@@ -253,28 +256,29 @@ func TestRuby(t *testing.T) {
defer d.CleanUp(ctx)
// Execute the ruby workload.
+ port := 8080
opts := dockerutil.RunOpts{
Image: "basic/ruby",
- Ports: []int{8080},
+ Ports: []int{port},
}
d.CopyFiles(&opts, "/src", "test/image/ruby.rb", "test/image/ruby.sh")
if err := d.Spawn(ctx, opts, "/src/ruby.sh"); err != nil {
t.Fatalf("docker run failed: %v", err)
}
- // Find where port 8080 is mapped to.
- port, err := d.FindPort(ctx, 8080)
+ // Find container IP address.
+ ip, err := d.FindIP(ctx, false)
if err != nil {
- t.Fatalf("FindPort(8080) failed: %v", err)
+ t.Fatalf("docker.FindIP failed: %v", err)
}
// Wait until it's up and running, 'gem install' can take some time.
- if err := testutil.WaitForHTTP(port, time.Minute); err != nil {
+ if err := testutil.WaitForHTTP(ip.String(), port, time.Minute); err != nil {
t.Fatalf("WaitForHTTP() timeout: %v", err)
}
// Ensure that content is being served.
- url := fmt.Sprintf("http://localhost:%d", port)
+ url := fmt.Sprintf("http://%s:%d", ip.String(), port)
resp, err := http.Get(url)
if err != nil {
t.Errorf("error reaching http server: %v", err)
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
index e2beb30d5..398f70ecd 100644
--- a/test/iptables/iptables_test.go
+++ b/test/iptables/iptables_test.go
@@ -48,6 +48,13 @@ func singleTest(t *testing.T, test TestCase) {
}
}
+// TODO(gvisor.dev/issue/3549): IPv6 NAT support.
+func ipv4Test(t *testing.T, test TestCase) {
+ t.Run("IPv4", func(t *testing.T) {
+ iptablesTest(t, test, false)
+ })
+}
+
func iptablesTest(t *testing.T, test TestCase, ipv6 bool) {
if _, ok := Tests[test.Name()]; !ok {
t.Fatalf("no test found with name %q. Has it been registered?", test.Name())
@@ -72,11 +79,6 @@ func iptablesTest(t *testing.T, test TestCase, ipv6 bool) {
d.CleanUp(context.Background())
}()
- // TODO(gvisor.dev/issue/170): Skipping IPv6 gVisor tests.
- if ipv6 && dockerutil.Runtime() != "runc" {
- t.Skip("gVisor ip6tables not yet implemented")
- }
-
// Create and start the container.
opts := dockerutil.RunOpts{
Image: "iptables",
@@ -314,75 +316,75 @@ func TestInputInvertDestination(t *testing.T) {
singleTest(t, FilterInputInvertDestination{})
}
-func TestOutputDestination(t *testing.T) {
+func TestFilterOutputDestination(t *testing.T) {
singleTest(t, FilterOutputDestination{})
}
-func TestOutputInvertDestination(t *testing.T) {
+func TestFilterOutputInvertDestination(t *testing.T) {
singleTest(t, FilterOutputInvertDestination{})
}
func TestNATPreRedirectUDPPort(t *testing.T) {
- singleTest(t, NATPreRedirectUDPPort{})
+ ipv4Test(t, NATPreRedirectUDPPort{})
}
func TestNATPreRedirectTCPPort(t *testing.T) {
- singleTest(t, NATPreRedirectTCPPort{})
+ ipv4Test(t, NATPreRedirectTCPPort{})
}
func TestNATPreRedirectTCPOutgoing(t *testing.T) {
- singleTest(t, NATPreRedirectTCPOutgoing{})
+ ipv4Test(t, NATPreRedirectTCPOutgoing{})
}
func TestNATOutRedirectTCPIncoming(t *testing.T) {
- singleTest(t, NATOutRedirectTCPIncoming{})
+ ipv4Test(t, NATOutRedirectTCPIncoming{})
}
func TestNATOutRedirectUDPPort(t *testing.T) {
- singleTest(t, NATOutRedirectUDPPort{})
+ ipv4Test(t, NATOutRedirectUDPPort{})
}
func TestNATOutRedirectTCPPort(t *testing.T) {
- singleTest(t, NATOutRedirectTCPPort{})
+ ipv4Test(t, NATOutRedirectTCPPort{})
}
func TestNATDropUDP(t *testing.T) {
- singleTest(t, NATDropUDP{})
+ ipv4Test(t, NATDropUDP{})
}
func TestNATAcceptAll(t *testing.T) {
- singleTest(t, NATAcceptAll{})
+ ipv4Test(t, NATAcceptAll{})
}
func TestNATOutRedirectIP(t *testing.T) {
- singleTest(t, NATOutRedirectIP{})
+ ipv4Test(t, NATOutRedirectIP{})
}
func TestNATOutDontRedirectIP(t *testing.T) {
- singleTest(t, NATOutDontRedirectIP{})
+ ipv4Test(t, NATOutDontRedirectIP{})
}
func TestNATOutRedirectInvert(t *testing.T) {
- singleTest(t, NATOutRedirectInvert{})
+ ipv4Test(t, NATOutRedirectInvert{})
}
func TestNATPreRedirectIP(t *testing.T) {
- singleTest(t, NATPreRedirectIP{})
+ ipv4Test(t, NATPreRedirectIP{})
}
func TestNATPreDontRedirectIP(t *testing.T) {
- singleTest(t, NATPreDontRedirectIP{})
+ ipv4Test(t, NATPreDontRedirectIP{})
}
func TestNATPreRedirectInvert(t *testing.T) {
- singleTest(t, NATPreRedirectInvert{})
+ ipv4Test(t, NATPreRedirectInvert{})
}
func TestNATRedirectRequiresProtocol(t *testing.T) {
- singleTest(t, NATRedirectRequiresProtocol{})
+ ipv4Test(t, NATRedirectRequiresProtocol{})
}
func TestNATLoopbackSkipsPrerouting(t *testing.T) {
- singleTest(t, NATLoopbackSkipsPrerouting{})
+ ipv4Test(t, NATLoopbackSkipsPrerouting{})
}
func TestInputSource(t *testing.T) {
@@ -419,9 +421,9 @@ func TestFilterAddrs(t *testing.T) {
}
func TestNATPreOriginalDst(t *testing.T) {
- singleTest(t, NATPreOriginalDst{})
+ ipv4Test(t, NATPreOriginalDst{})
}
func TestNATOutOriginalDst(t *testing.T) {
- singleTest(t, NATOutOriginalDst{})
+ ipv4Test(t, NATOutOriginalDst{})
}
diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD
index f850dfcd8..fbfea61e1 100644
--- a/test/packetimpact/tests/BUILD
+++ b/test/packetimpact/tests/BUILD
@@ -272,6 +272,16 @@ packetimpact_go_test(
)
packetimpact_go_test(
+ name = "tcp_queue_send_in_syn_sent",
+ srcs = ["tcp_queue_send_in_syn_sent_test.go"],
+ deps = [
+ "//pkg/tcpip/header",
+ "//test/packetimpact/testbench",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+packetimpact_go_test(
name = "icmpv6_param_problem",
srcs = ["icmpv6_param_problem_test.go"],
# TODO(b/153485026): Fix netstack then remove the line below.
diff --git a/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go
new file mode 100644
index 000000000..0ec8fd748
--- /dev/null
+++ b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_queue_send_in_syn_sent_test
+
+import (
+ "context"
+ "errors"
+ "flag"
+ "net"
+ "sync"
+ "syscall"
+ "testing"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/test/packetimpact/testbench"
+)
+
+func init() {
+ testbench.RegisterFlags(flag.CommandLine)
+}
+
+// TestQueueSendInSynSent tests send behavior when the TCP state
+// is SYN-SENT.
+// It tests two variants while in SYN-SENT state:
+// (1) the DUT blocks on send and the handshake then completes.
+// (2) the DUT blocks on send and then receives a TCP RST.
+func TestQueueSendInSynSent(t *testing.T) {
+ for _, tt := range []struct {
+ description string
+ reset bool
+ }{
+ {description: "Complete handshake", reset: false},
+ {description: "Send RST", reset: true},
+ } {
+ t.Run(tt.description, func(t *testing.T) {
+ dut := testbench.NewDUT(t)
+ defer dut.TearDown()
+
+ socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4))
+ conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort})
+ defer conn.Close(t)
+
+ sampleData := []byte("Sample Data")
+ samplePayload := &testbench.Payload{Bytes: sampleData}
+ dut.SetNonBlocking(t, socket, true)
+ if _, err := dut.ConnectWithErrno(context.Background(), t, socket, conn.LocalAddr(t)); !errors.Is(err, syscall.EINPROGRESS) {
+ t.Fatalf("failed to bring DUT to SYN-SENT, got: %s, want EINPROGRESS", err)
+ }
+ if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); err != nil {
+ t.Fatalf("expected a SYN from DUT, but got none: %s", err)
+ }
+ if _, err := dut.SendWithErrno(context.Background(), t, socket, sampleData, 0); err != syscall.Errno(unix.EWOULDBLOCK) {
+ t.Fatalf("expected error %s, got %s", syscall.Errno(unix.EWOULDBLOCK), err)
+ }
+
+ // Test blocking write.
+ dut.SetNonBlocking(t, socket, false)
+
+ var wg sync.WaitGroup
+ defer wg.Wait()
+ wg.Add(1)
+ var block sync.WaitGroup
+ block.Add(1)
+ go func() {
+ defer wg.Done()
+ ctx, cancel := context.WithTimeout(context.Background(), time.Second*3)
+ defer cancel()
+
+ block.Done()
+ // Issue the SEND call in SYN-SENT state; it should be queued
+ // for processing until the connection is established.
+ n, err := dut.SendWithErrno(ctx, t, socket, sampleData, 0)
+ if tt.reset {
+ if err != syscall.Errno(unix.ECONNREFUSED) {
+ t.Errorf("expected error %s, got %s", syscall.Errno(unix.ECONNREFUSED), err)
+ }
+ if n != -1 {
+ t.Errorf("expected return value %d, got %d", -1, n)
+ }
+ return
+ }
+ if n != int32(len(sampleData)) {
+ t.Errorf("failed to send on DUT: %s", err)
+ }
+ }()
+
+ // Wait until the goroutine has been scheduled, but before it
+ // blocks on the endpoint send.
+ block.Wait()
+ // The following sleep is used to prevent the connection
+ // from being established before we are blocked on send.
+ time.Sleep(100 * time.Millisecond)
+
+ if tt.reset {
+ conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)})
+ return
+ }
+
+ // Bring the connection to Established.
+ conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)})
+
+ // Expect the data from the DUT's enqueued send request.
+ //
+ // On Linux, this can be piggybacked with the ACK completing the
+ // handshake. On gVisor, getting such a piggyback is a bit more
+ // complicated because the actual data enqueuing occurs in the
+ // callers of endpoint Write.
+ if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, samplePayload, time.Second); err != nil {
+ t.Fatalf("expected payload was not received: %s", err)
+ }
+
+ // Send sample payload and expect an ACK to ensure connection is still ESTABLISHED.
+ conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData})
+ if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil {
+ t.Fatalf("expected an ACK from DUT, but got none: %s", err)
+ }
+ })
+ }
+}
diff --git a/test/runtimes/exclude/nodejs12.4.0.csv b/test/runtimes/exclude/nodejs12.4.0.csv
index 1740dbb76..749fb9482 100644
--- a/test/runtimes/exclude/nodejs12.4.0.csv
+++ b/test/runtimes/exclude/nodejs12.4.0.csv
@@ -1,4 +1,5 @@
test name,bug id,comment
+async-hooks/test-statwatcher.js,https://github.com/nodejs/node/issues/21425,Check for fix inclusion in nodejs releases after 2020-03-29
benchmark/test-benchmark-fs.js,,
benchmark/test-benchmark-napi.js,,
doctool/test-make-doc.js,b/68848110,Expected to fail.
diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc
index cabc2b751..edd23e063 100644
--- a/test/syscalls/linux/fallocate.cc
+++ b/test/syscalls/linux/fallocate.cc
@@ -179,6 +179,12 @@ TEST_F(AllocateTest, FallocateOtherFDs) {
auto sock0 = FileDescriptor(socks[0]);
auto sock1 = FileDescriptor(socks[1]);
EXPECT_THAT(fallocate(sock0.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV));
+
+ int pipefds[2];
+ ASSERT_THAT(pipe(pipefds), SyscallSucceeds());
+ EXPECT_THAT(fallocate(pipefds[1], 0, 0, 10), SyscallFailsWithErrno(ESPIPE));
+ close(pipefds[0]);
+ close(pipefds[1]);
}
} // namespace
diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc
index f3c1d6bc9..b558e3a01 100644
--- a/test/syscalls/linux/packet_socket_raw.cc
+++ b/test/syscalls/linux/packet_socket_raw.cc
@@ -643,6 +643,27 @@ TEST_P(RawPacketTest, GetSocketDetachFilter) {
SyscallFailsWithErrno(ENOPROTOOPT));
}
+TEST_P(RawPacketTest, SetAndGetSocketLinger) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int level = SOL_SOCKET;
+ int type = SO_LINGER;
+
+ struct linger sl;
+ sl.l_onoff = 1;
+ sl.l_linger = 5;
+ ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)),
+ SyscallSucceedsWithValue(0));
+
+ struct linger got_linger = {};
+ socklen_t length = sizeof(sl);
+ ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length),
+ SyscallSucceedsWithValue(0));
+
+ ASSERT_EQ(length, sizeof(got_linger));
+ EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
+}
+
INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest,
::testing::Values(ETH_P_IP, ETH_P_ALL));
diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc
index 3de898df7..1b9dbc584 100644
--- a/test/syscalls/linux/raw_socket_icmp.cc
+++ b/test/syscalls/linux/raw_socket_icmp.cc
@@ -416,6 +416,28 @@ TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) {
ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp));
}
+// Set and get SO_LINGER.
+TEST_F(RawSocketICMPTest, SetAndGetSocketLinger) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int level = SOL_SOCKET;
+ int type = SO_LINGER;
+
+ struct linger sl;
+ sl.l_onoff = 1;
+ sl.l_linger = 5;
+ ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)),
+ SyscallSucceedsWithValue(0));
+
+ struct linger got_linger = {};
+ socklen_t length = sizeof(sl);
+ ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length),
+ SyscallSucceedsWithValue(0));
+
+ ASSERT_EQ(length, sizeof(got_linger));
+ EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
+}
+
void RawSocketICMPTest::ExpectICMPSuccess(const struct icmphdr& icmp) {
// We're going to receive both the echo request and reply, but the order is
// indeterminate.
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 6e4ecd680..3f2c0fdf2 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -451,7 +451,7 @@ TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
}
// Test the SO_LINGER option can be set/get on udp socket.
-TEST_P(UDPSocketPairTest, SoLingerFail) {
+TEST_P(UDPSocketPairTest, SetAndGetSocketLinger) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
int level = SOL_SOCKET;
int type = SO_LINGER;
@@ -469,15 +469,7 @@ TEST_P(UDPSocketPairTest, SoLingerFail) {
SyscallSucceedsWithValue(0));
ASSERT_EQ(length, sizeof(got_linger));
- // Linux returns the values which are set in the SetSockOpt for SO_LINGER.
- // In gVisor, we do not store the linger values for UDP as SO_LINGER for UDP
- // is a no-op.
- if (IsRunningOnGvisor()) {
- struct linger want_linger = {};
- EXPECT_EQ(0, memcmp(&want_linger, &got_linger, length));
- } else {
- EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
- }
+ EXPECT_EQ(0, memcmp(&sl, &got_linger, length));
}
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc
index 99e77b89e..1edcb15a7 100644
--- a/test/syscalls/linux/socket_unix_stream.cc
+++ b/test/syscalls/linux/socket_unix_stream.cc
@@ -103,6 +103,24 @@ TEST_P(StreamUnixSocketPairTest, Sendto) {
SyscallFailsWithErrno(EISCONN));
}
+TEST_P(StreamUnixSocketPairTest, SetAndGetSocketLinger) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct linger sl = {1, 5};
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+ SyscallSucceedsWithValue(0));
+
+ struct linger got_linger = {};
+ socklen_t length = sizeof(sl);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+ &got_linger, &length),
+ SyscallSucceedsWithValue(0));
+
+ ASSERT_EQ(length, sizeof(got_linger));
+ EXPECT_EQ(0, memcmp(&got_linger, &sl, length));
+}
+
INSTANTIATE_TEST_SUITE_P(
AllUnixDomainSockets, StreamUnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc
index ce1899f45..52b67249d 100644
--- a/test/syscalls/linux/vdso_clock_gettime.cc
+++ b/test/syscalls/linux/vdso_clock_gettime.cc
@@ -38,8 +38,6 @@ std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) {
switch (info.param) {
case CLOCK_MONOTONIC:
return "CLOCK_MONOTONIC";
- case CLOCK_REALTIME:
- return "CLOCK_REALTIME";
case CLOCK_BOOTTIME:
return "CLOCK_BOOTTIME";
default:
@@ -47,59 +45,31 @@ std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) {
}
}
-class CorrectVDSOClockTest : public ::testing::TestWithParam<clockid_t> {};
+class MonotonicVDSOClockTest : public ::testing::TestWithParam<clockid_t> {};
-TEST_P(CorrectVDSOClockTest, IsCorrect) {
+TEST_P(MonotonicVDSOClockTest, IsCorrect) {
+ // Check that when we alternate readings from the clock_gettime syscall and
+ // the VDSO's implementation, we observe the combined sequence as being
+ // monotonic.
struct timespec tvdso, tsys;
absl::Time vdso_time, sys_time;
- uint64_t total_calls = 0;
-
- // It is expected that 82.5% of clock_gettime calls will be less than 100us
- // skewed from the system time.
- // Unfortunately this is not only influenced by the VDSO clock skew, but also
- // by arbitrary scheduling delays and the like. The test is therefore
- // regularly disabled.
- std::map<absl::Duration, std::tuple<double, uint64_t, uint64_t>> confidence =
- {
- {absl::Microseconds(100), std::make_tuple(0.825, 0, 0)},
- {absl::Microseconds(250), std::make_tuple(0.94, 0, 0)},
- {absl::Milliseconds(1), std::make_tuple(0.999, 0, 0)},
- };
-
- absl::Time start = absl::Now();
- while (absl::Now() < start + absl::Seconds(30)) {
- EXPECT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds());
- EXPECT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys),
- SyscallSucceeds());
-
+ ASSERT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys),
+ SyscallSucceeds());
+ sys_time = absl::TimeFromTimespec(tsys);
+ auto end = absl::Now() + absl::Seconds(10);
+ while (absl::Now() < end) {
+ ASSERT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds());
vdso_time = absl::TimeFromTimespec(tvdso);
-
- for (auto const& conf : confidence) {
- std::get<1>(confidence[conf.first]) +=
- (sys_time - vdso_time) < conf.first;
- }
-
+ EXPECT_LE(sys_time, vdso_time);
+ ASSERT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys),
+ SyscallSucceeds());
sys_time = absl::TimeFromTimespec(tsys);
-
- for (auto const& conf : confidence) {
- std::get<2>(confidence[conf.first]) +=
- (vdso_time - sys_time) < conf.first;
- }
-
- ++total_calls;
- }
-
- for (auto const& conf : confidence) {
- EXPECT_GE(std::get<1>(conf.second) / static_cast<double>(total_calls),
- std::get<0>(conf.second));
- EXPECT_GE(std::get<2>(conf.second) / static_cast<double>(total_calls),
- std::get<0>(conf.second));
+ EXPECT_LE(vdso_time, sys_time);
}
}
-INSTANTIATE_TEST_SUITE_P(ClockGettime, CorrectVDSOClockTest,
- ::testing::Values(CLOCK_MONOTONIC, CLOCK_REALTIME,
- CLOCK_BOOTTIME),
+INSTANTIATE_TEST_SUITE_P(ClockGettime, MonotonicVDSOClockTest,
+ ::testing::Values(CLOCK_MONOTONIC, CLOCK_BOOTTIME),
PrintClockId);
} // namespace
diff --git a/tools/bazel.mk b/tools/bazel.mk
index 3e27af7d1..5cc1cdea2 100644
--- a/tools/bazel.mk
+++ b/tools/bazel.mk
@@ -31,6 +31,7 @@ DOCKER_PRIVILEGED ?= --privileged
BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/)
GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/)
DOCKER_SOCKET := /var/run/docker.sock
+DOCKER_CONFIG := /etc/docker/daemon.json
# Bazel flags.
BAZEL := bazel $(STARTUP_OPTIONS)
@@ -56,6 +57,9 @@ endif
# Add docker passthrough options.
ifneq ($(DOCKER_PRIVILEGED),)
FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)"
+# TODO(gvisor.dev/issue/1624): Remove docker config volume. This is required
+# temporarily for checking VFS1 vs VFS2 by some tests.
+FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_CONFIG):$(DOCKER_CONFIG)"
FULL_DOCKER_RUN_OPTIONS += $(DOCKER_PRIVILEGED)
FULL_DOCKER_EXEC_OPTIONS += $(DOCKER_PRIVILEGED)
DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET))
diff --git a/website/_config.yml b/website/_config.yml
index b08602970..20fbb3d2d 100644
--- a/website/_config.yml
+++ b/website/_config.yml
@@ -34,3 +34,6 @@ authors:
igudger:
name: Ian Gudger
email: igudger@google.com
+ fvoznika:
+ name: Fabricio Voznika
+ email: fvoznika@google.com
diff --git a/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png
new file mode 100644
index 000000000..c750f0851
--- /dev/null
+++ b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png
Binary files differ
diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md
index 2256ee9d5..b6cf57a77 100644
--- a/website/blog/2019-11-18-security-basics.md
+++ b/website/blog/2019-11-18-security-basics.md
@@ -279,8 +279,10 @@ weaknesses of each gVisor component.
We will also use it to introduce Google's Vulnerability Reward Program[^14], and
other ways the community can contribute to help make gVisor safe, fast and
stable.
+<br>
+<br>
-## Notes
+--------------------------------------------------------------------------------
[^1]: [https://en.wikipedia.org/wiki/Secure_by_design](https://en.wikipedia.org/wiki/Secure_by_design)
[^2]: [https://gvisor.dev/docs/architecture_guide](https://gvisor.dev/docs/architecture_guide/)
diff --git a/website/blog/2020-09-18-containing-a-real-vulnerability.md b/website/blog/2020-09-18-containing-a-real-vulnerability.md
new file mode 100644
index 000000000..b71ef63d9
--- /dev/null
+++ b/website/blog/2020-09-18-containing-a-real-vulnerability.md
@@ -0,0 +1,224 @@
+# Containing a Real Vulnerability
+
+In the previous two posts we talked about gVisor's
+[security design principles](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/)
+as well as how those are applied in the
+[context of networking](https://gvisor.dev/blog/2020/04/02/gvisor-networking-security/).
+Recently, a new container escape vulnerability
+([CVE-2020-14386](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-14386))
+was announced that ties these topics well together. gVisor is
+[not vulnerable](https://seclists.org/oss-sec/2020/q3/168) to this specific
+issue, but it provides an interesting case study to continue our exploration of
+gVisor's security. While gVisor is not immune to vulnerabilities,
+[we take several steps](https://gvisor.dev/security/) to minimize the impact and
+remediate if a vulnerability is found.
+
+## Escaping the Container
+
+First, let’s describe how the discovered vulnerability works. There are numerous
+ways one can send and receive bytes over the network with Linux. One of the most
+performant ways is to use a ring buffer, which is a memory region shared by the
+application and the kernel. These rings are created by calling
+[setsockopt(2)](https://man7.org/linux/man-pages/man2/setsockopt.2.html) with
+[`PACKET_RX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for
+receiving and
+[`PACKET_TX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for
+sending packets.
+
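+As a concrete illustration, the sketch below (a minimal example written for
+this post, not code from the exploit or from gVisor) shows how an application
+typically reserves per-packet headroom and requests a receive ring; the sizes
+used are illustrative assumptions:
+
+```c
+// Hypothetical sketch: create an AF_PACKET socket, reserve headroom before
+// each packet, and request a shared receive ring. Sizes are illustrative.
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+int main(void) {
+  // Requires CAP_NET_RAW.
+  int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+  if (fd < 0) {
+    perror("socket");
+    return 1;
+  }
+
+  // Ask the kernel to leave some space before each packet in the ring.
+  unsigned int reserve = 128;
+  if (setsockopt(fd, SOL_PACKET, PACKET_RESERVE, &reserve, sizeof(reserve)) < 0)
+    perror("PACKET_RESERVE");
+
+  // Request the ring itself: 4 blocks of 4 KiB, split into 2 KiB frames.
+  struct tpacket_req req = {
+      .tp_block_size = 4096,
+      .tp_block_nr = 4,
+      .tp_frame_size = 2048,
+      .tp_frame_nr = 8,
+  };
+  if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
+    perror("PACKET_RX_RING");
+
+  // The ring is shared with the kernel by mapping it into the application.
+  void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
+                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  if (ring == MAP_FAILED)
+    perror("mmap");
+
+  close(fd);
+  return 0;
+}
+```
+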
+The vulnerability is in the code that reads packets when `PACKET_RX_RING` is
+enabled. There is another option
+([`PACKET_RESERVE`](https://man7.org/linux/man-pages/man7/packet.7.html)) that
+asks the kernel to leave some space in the ring buffer before each packet for
+anything the application needs, e.g. control structures. When a packet is
+received, the kernel calculates where to copy the packet to, taking the amount
+reserved before each packet into consideration. If the amount reserved is
+large, the kernel performs an incorrect calculation that can cause an overflow,
+leading to an out-of-bounds write of up to 10 bytes, controlled by the attacker.
+The data in the write is easily controlled using the loopback to send a crafted
+packet and receiving it using a `PACKET_RX_RING` with a carefully selected
+`PACKET_RESERVE` size.
+
+```c
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+// ...
+ if (sk->sk_type == SOCK_DGRAM) {
+ macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
+ po->tp_reserve;
+ } else {
+ unsigned int maclen = skb_network_offset(skb);
+ // tp_reserve is unsigned int, netoff is unsigned short. Addition can overflow netoff
+ netoff = TPACKET_ALIGN(po->tp_hdrlen +
+ (maclen < 16 ? 16 : maclen)) +
+ po->tp_reserve;
+ if (po->has_vnet_hdr) {
+ netoff += sizeof(struct virtio_net_hdr);
+ do_vnet = true;
+ }
+ // Attacker controls netoff and can make macoff be smaller than sizeof(struct virtio_net_hdr)
+ macoff = netoff - maclen;
+ }
+// ...
+ // "macoff - sizeof(struct virtio_net_hdr)" can be negative, resulting in a pointer before h.raw
+ if (do_vnet &&
+ virtio_net_hdr_from_skb(skb, h.raw + macoff -
+ sizeof(struct virtio_net_hdr),
+ vio_le(), true, 0)) {
+// ...
+```
+
+The [`CAP_NET_RAW`](https://man7.org/linux/man-pages/man7/capabilities.7.html)
+capability is required to create the socket above. However, in order to support
+common debugging tools like `ping` and `tcpdump`, Docker containers, including
+those created for Kubernetes, are given `CAP_NET_RAW` by default and thus may be
+able to trigger this vulnerability to elevate privileges and escape the
+container.
+
+Next, we are going to explore why this vulnerability doesn’t work in gVisor, and
+how gVisor could prevent the escape even if a similar vulnerability existed
+inside gVisor’s kernel.
+
+## Default Protections
+
+gVisor does not implement `PACKET_RX_RING`, but **does** support raw sockets,
+which are required for `PACKET_RX_RING`. Raw sockets are a controversial feature
+to support in a sandbox environment. While it allows great customizations for
+essential tools like `ping`, it may allow packets to be written to the network
+without any validation. In general, allowing an untrusted application to write
+crafted packets to the network is a questionable idea and a historical source of
+vulnerabilities. With that in mind, if `CAP_NET_RAW` is enabled by default, it
+would not be _secure by default_ to run untrusted applications.
+
+After multiple discussions when raw sockets were first implemented, we decided
+to disable raw sockets by default, **even if `CAP_NET_RAW` is given to the
+application**. Instead, enabling raw sockets in gVisor requires the admin to set
+`--net-raw` flag to runsc when configuring the runtime, in addition to requiring
+the `CAP_NET_RAW` capability in the application. It comes at the expense that
+some tools may not work out of the box, but as part of our
+[secure-by-default](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#secure-by-default)
+principle, we felt that it was important for the “less secure” configuration to
+be explicit.
+
+Since this bug was due to an overflow in the specific Linux implementation of
+the packet ring, gVisor's raw socket implementation is not affected. However, if
+there were a vulnerability in gVisor, containers would not be allowed to exploit
+it by default.
+
+As an alternative way to implement this same constraint, Kubernetes allows
+[admission controllers](https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/)
+to be configured to customize requests. Cloud providers can use this to
+implement more stringent policies. For example, GKE implements an admission
+controller for gVisor that
+[removes `CAP_NET_RAW` from gVisor pods](https://cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods#capabilities)
+unless it has been explicitly set in the pod spec.
+
+## Isolated Kernel
+
+gVisor has its own application kernel, called the Sentry, that is distinct from
+the host kernel. Just like what you would expect from a kernel, gVisor has a
+memory management subsystem, virtual file system, and a full network stack. The
+host network is only used as a transport to carry packets in and out of the
+sandbox[^1]. The loopback interface used in the exploit stays
+completely inside the sandbox, never reaching the host.
+
+Therefore, even if the Sentry were vulnerable to the attack, there would be two
+factors that would prevent a container escape from happening. First, the
+vulnerability would be limited to the Sentry, and the attacker would compromise
+only the application kernel, bound by a restricted set of
+[seccomp](https://en.wikipedia.org/wiki/Seccomp) filters, discussed more in
+depth below. Second, the Sentry is a distinct implementation of the API, written
+in Go, which provides bounds checking that would have likely prevented access
+past the bounds of the shared region (e.g. see
+[aio](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/syscalls/linux/vfs2/aio.go;l=210;drc=a11061d78a58ed75b10606d1a770b035ed944b66?q=file:aio&ss=gvisor%2Fgvisor)
+or
+[kcov](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/kernel/kcov.go;l=272?q=file:kcov&ss=gvisor%2Fgvisor),
+which have similar shared regions).
+
+Here, Kubernetes warrants slightly more explanation. gVisor makes pods the unit
+of isolation and a pod can run multiple containers. In other words, each pod is
+a gVisor instance, and each container is a set of processes running inside
+gVisor, isolated via Sentry-internal namespaces like regular containers inside a
+pod. If there were a vulnerability in gVisor, the privilege escalation would
+allow a container inside the pod to break out to other **containers inside the
+same pod**, but the container still **cannot break out of the pod**.
+
+## Defense in Depth
+
+gVisor follows a
+[common security principle used at Google](https://cloud.google.com/security/infrastructure/design/resources/google_infrastructure_whitepaper_fa.pdf)
+that the system should have two layers of protection, and those layers should
+require different compromises to be broken. We apply this principle by assuming
+that the Sentry (first layer of defense)
+[will be compromised and should not be trusted](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#defense-in-depth).
+In order to protect the host kernel from a compromised Sentry, we wrap it in
+many security and isolation features to ensure only the minimal set of
+functionality from the host kernel is exposed.
+
+![Figure 1](/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png "Protection layers.")
+
+First, the sandbox runs inside a cgroup that can limit and throttle host
+resources being used. Second, the sandbox joins empty namespaces, including user
+and mount, to further isolate from the host. Next, it changes the process root
+to a read-only directory that contains only `/proc` and nothing else. Then, it
+executes with the unprivileged user/group
+[`nobody`](https://en.wikipedia.org/wiki/Nobody_\(username\)) with all
+capabilities stripped. Last and most importantly, a seccomp filter is added to
+tightly restrict the parts of the Linux syscall surface that gVisor is allowed
+to access. The allowed host surface is a far smaller set of syscalls than the
+Sentry implements for applications to use. The filter not only restricts which
+syscalls can be called, but also checks that arguments to those syscalls are
+within the expected set. Dangerous syscalls like <code>execve(2)</code>,
+<code>open(2)</code>, and <code>socket(2)</code> are prohibited, so an
+attacker isn’t able to execute binaries or acquire new resources on the host.
+
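+To make this concrete, here is a minimal, hypothetical sketch of a
+default-deny syscall filter written with libseccomp. gVisor generates its own
+filters (in Go, under `runsc/boot/filter`); the particular syscalls and
+argument checks below are assumptions chosen only to show the shape of such a
+policy, not the actual rules runsc installs:
+
+```c
+// Hypothetical default-deny filter built with libseccomp (link with -lseccomp).
+#include <fcntl.h>
+#include <seccomp.h>
+
+int install_filter(void) {
+  // Any syscall not explicitly allowed kills the process.
+  scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_KILL);
+  if (ctx == NULL) return -1;
+
+  // Allow a small set of syscalls outright.
+  seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(read), 0);
+  seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(write), 0);
+  seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(futex), 0);
+  seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(exit_group), 0);
+
+  // Allow fcntl(2), but only with an expected command argument.
+  seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(fcntl), 1,
+                   SCMP_A1(SCMP_CMP_EQ, F_GETFL));
+
+  // execve(2), open(2) and socket(2) are never added, so the default
+  // action rejects them.
+  int rc = seccomp_load(ctx);
+  seccomp_release(ctx);
+  return rc;
+}
+```
+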
+If there were a vulnerability in gVisor that allowed an attacker to execute
+code inside the Sentry, the attacker would still have extremely limited
+privileges on the
+host. In fact, a compromised Sentry is much more restricted than a
+non-compromised regular container. For CVE-2020-14386 in particular, the attack
+would be blocked by more than one security layer: non-privileged user, no
+capability, and seccomp filters.
+
+Although the surface is drastically reduced, there is still a chance that there
+is a vulnerability in one of the allowed syscalls. That’s why it’s important to
+keep the surface small and carefully consider what syscalls are allowed. You can
+find the full set of allowed syscalls
+[here](https://cs.opensource.google/gvisor/gvisor/+/master:runsc/boot/filter/).
+
+Another possible attack vector is the set of resources present in the Sentry,
+like open file descriptors. The Sentry has file descriptors that an attacker
+could
+potentially use, such as log files, platform files (e.g. `/dev/kvm`), an RPC
+endpoint that allows external communication with the Sentry, and a Netstack
+endpoint that connects the sandbox to the network. The Netstack endpoint in
+particular is a concern because it gives direct access to the network. It’s an
+`AF_PACKET` socket that allows arbitrary L2 packets to be written to the
+network. In the normal case, Netstack assembles packets that go out the network,
+giving the container control over only the payload. But if the Sentry is
+compromised, an attacker can craft packets to the network. In many ways this is
+similar to anyone sending random packets over the internet, but still this is a
+place where the host kernel surface exposed is larger than we would like it to
+be.
+
+## Conclusion
+
+Security comes with many tradeoffs that are often hard to make, such as the
+decision to disable raw sockets by default. However, these tradeoffs have served
+us well, and we've found them to have paid off over time. CVE-2020-14386 offers
+great insight into how multiple layers of protection can be effective against
+such an attack.
+
+We cannot guarantee that a container escape will never happen in gVisor, but we
+do our best to make it as hard as we possibly can.
+
+If you have not tried gVisor yet, it’s easier than you think. Just follow the
+steps in the
+[Quick Start](https://gvisor.dev/docs/user_guide/quick_start/docker/) guide.
+<br>
+<br>
+
+--------------------------------------------------------------------------------
+
+[^1]: Those packets are eventually handled by the host, as it needs to route
+ them to local containers or send them out the NIC. The packet will be
+ handled by many switches, routers, proxies, servers, etc. along the way,
+ which may be subject to their own vulnerabilities.
diff --git a/website/blog/BUILD b/website/blog/BUILD
index 01c1f5a6e..865e403da 100644
--- a/website/blog/BUILD
+++ b/website/blog/BUILD
@@ -28,6 +28,16 @@ doc(
permalink = "/blog/2020/04/02/gvisor-networking-security/",
)
+doc(
+ name = "containing_a_real_vulnerability",
+ src = "2020-09-18-containing-a-real-vulnerability.md",
+ authors = [
+ "fvoznika",
+ ],
+ layout = "post",
+ permalink = "/blog/2020/09/18/containing-a-real-vulnerability/",
+)
+
docs(
name = "posts",
deps = [