diff options
78 files changed, 1893 insertions, 528 deletions
@@ -129,10 +129,9 @@ tests: unit-tests @$(call submake,test TARGETS="test/syscalls/...") .PHONY: tests - integration-tests: ## Run all standard integration tests. integration-tests: docker-tests overlay-tests hostnet-tests swgso-tests -integration-tests: do-tests kvm-tests root-tests containerd-tests +integration-tests: do-tests kvm-tests containerd-test-1.3.4 .PHONY: integration-tests network-tests: ## Run all networking integration tests. @@ -186,6 +185,7 @@ swgso-tests: load-basic-images @$(call submake,install-test-runtime RUNTIME="swgso" ARGS="--software-gso=true --gso=false") @$(call submake,test-runtime RUNTIME="swgso" TARGETS="$(INTEGRATION_TARGETS)") .PHONY: swgso-tests + hostnet-tests: load-basic-images @$(call submake,install-test-runtime RUNTIME="hostnet" ARGS="--network=host") @$(call submake,test-runtime RUNTIME="hostnet" OPTIONS="--test_arg=-checkpoint=false" TARGETS="$(INTEGRATION_TARGETS)") @@ -200,6 +200,8 @@ kvm-tests: load-basic-images .PHONY: kvm-tests iptables-tests: load-iptables + @sudo modprobe iptable_filter + @sudo modprobe ip6table_filter @$(call submake,test-runtime RUNTIME="runc" TARGETS="//test/iptables:iptables_test") @$(call submake,install-test-runtime RUNTIME="iptables" ARGS="--net-raw") @$(call submake,test-runtime RUNTIME="iptables" TARGETS="//test/iptables:iptables_test") @@ -217,16 +219,12 @@ packetimpact-tests: load-packetimpact @$(call submake,test-runtime OPTIONS="--jobs=HOST_CPUS*3 --local_test_jobs=HOST_CPUS*3" RUNTIME="packetimpact" TARGETS="$(shell $(MAKE) query TARGETS='attr(tags, packetimpact, tests(//...))')") .PHONY: packetimpact-tests -root-tests: load-basic-images - @$(call submake,install-test-runtime) - @$(call submake,sudo TARGETS="//test/root:root_test" ARGS="-test.v") -.PHONY: root-tests - # Specific containerd version tests. 
-containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd install-test-runtime +containerd-test-%: load-basic_alpine load-basic_python load-basic_busybox load-basic_resolv load-basic_httpd load-basic_ubuntu + @$(call submake,install-test-runtime RUNTIME="root") @CONTAINERD_VERSION=$* $(MAKE) sudo TARGETS="tools/installers:containerd" @$(MAKE) sudo TARGETS="tools/installers:shim" - @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="-test.v" + @$(MAKE) sudo TARGETS="test/root:root_test" ARGS="--runtime=root -test.v" # Note that we can't run containerd-test-1.1.8 tests here. # diff --git a/images/jekyll/build.sh b/images/jekyll/build.sh index bfceb2781..010972ea6 100755 --- a/images/jekyll/build.sh +++ b/images/jekyll/build.sh @@ -18,4 +18,5 @@ set -euxo pipefail # Generate the syntax highlighting css file. /usr/gem/bin/rougify style github >/input/_sass/syntax.css -/usr/gem/bin/jekyll build -t -s /input -d /output +# Build website including pages irrespective of date. +/usr/gem/bin/jekyll build --future -t -s /input -d /output diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 5539466ff..f690ef5ad 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -95,9 +95,14 @@ type DeviceFD struct { } // Release implements vfs.FileDescriptionImpl.Release. 
-func (fd *DeviceFD) Release(context.Context) { +func (fd *DeviceFD) Release(ctx context.Context) { if fd.fs != nil { + fd.fs.conn.mu.Lock() fd.fs.conn.connected = false + fd.fs.conn.mu.Unlock() + + fd.fs.VFSFilesystem().DecRef(ctx) + fd.fs = nil } } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index a768b1a80..b3573f80d 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -218,6 +218,7 @@ func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt conn: conn, } + fs.VFSFilesystem().IncRef() fuseFD.fs = fs return fs, nil diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 03ce51df7..16abc59dc 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1026,7 +1026,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // step is required even if !d.cachedMetadataAuthoritative() because // d.mappings has to be updated. // d.metadataMu has already been acquired if trunc == true. - d.updateFileSizeLocked(0) + d.updateSizeLocked(0) if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 0e21c31a4..aaad9c0d9 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -833,7 +833,7 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) } if mask.Size { - d.updateFileSizeLocked(attr.Size) + d.updateSizeLocked(attr.Size) } } @@ -987,7 +987,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // d.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. 
- d.updateFileSizeLocked(stat.Size) + d.updateSizeLocked(stat.Size) } } if d.fs.opts.interop == InteropModeShared { @@ -1024,8 +1024,31 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } +// doAllocate performs an allocate operation on d. Note that d.metadataMu will +// be held when allocate is called. +func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Allocating a smaller size is a noop. + size := offset + length + if d.cachedMetadataAuthoritative() && size <= d.size { + return nil + } + + err := allocate() + if err != nil { + return err + } + d.updateSizeLocked(size) + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + // Preconditions: d.metadataMu must be locked. -func (d *dentry) updateFileSizeLocked(newSize uint64) { +func (d *dentry) updateSizeLocked(newSize uint64) { d.dataMu.Lock() oldSize := d.size atomic.StoreUint64(&d.size, newSize) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index a2e9342d5..24f03ee94 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -79,28 +79,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() - - // Allocating a smaller size is a noop. 
- size := offset + length - if d.cachedMetadataAuthoritative() && size <= d.size { - return nil - } - - d.handleMu.RLock() - err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) - d.handleMu.RUnlock() - if err != nil { - return err - } - d.dataMu.Lock() - atomic.StoreUint64(&d.size, size) - d.dataMu.Unlock() - if d.cachedMetadataAuthoritative() { - d.touchCMtimeLocked() - } - return nil + return d.doAllocate(ctx, offset, length, func() error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) } // PRead implements vfs.FileDescriptionImpl.PRead. diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 3c39aa9b7..dc960e5bf 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -135,6 +136,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { fd.fileDescription.EventUnregister(e) } +func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + if fd.isRegularFile { + d := fd.dentry() + return d.doAllocate(ctx, offset, length, func() error { + return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) + } + return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + // PRead implements vfs.FileDescriptionImpl.PRead. 
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if fd.seekable && offset < 0 { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index afd57e015..db8536f26 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -560,12 +560,7 @@ func (f *fileDescription) Release(context.Context) { // Allocate implements vfs.FileDescriptionImpl.Allocate. func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { - if !f.inode.seekable { - return syserror.ESPIPE - } - - // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds. - return syserror.EOPNOTSUPP + return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) } // PRead implements vfs.FileDescriptionImpl.PRead. diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 4a3dee631..89ed265dc 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -658,9 +658,6 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() - // Store the name before walkExistingLocked as rp will be advanced past the - // name in the following call. - name := rp.Component() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { @@ -691,7 +688,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } - if err := parentDentry.inode.RmDir(ctx, name, vfsd); err != nil { + if err := parentDentry.inode.RmDir(ctx, d.name, vfsd); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } @@ -771,9 +768,6 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() - // Store the name before walkExistingLocked as rp will be advanced past the - // name in the following call. 
- name := rp.Component() vfsd, _, err := fs.walkExistingLocked(ctx, rp) fs.processDeferredDecRefsLocked(ctx) if err != nil { @@ -799,7 +793,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, name, vfsd); err != nil { + if err := parentDentry.inode.Unlink(ctx, d.name, vfsd); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index 74cfd3799..6e04705c7 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -147,6 +147,16 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux return stat, nil } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Allocate(ctx, mode, offset, length) +} + // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { d := fd.dentry() diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index a1fe906ed..26b117ca4 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -185,8 +185,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de Start: child.lowerMerkleVD, }, &vfs.GetXattrOptions{ Name: merkleOffsetInParentXattr, - // Offset is a 32 bit integer. 
- Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and @@ -227,7 +226,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de // the size of all its children's root hashes. dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{ Name: merkleSizeXattr, - Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and @@ -372,6 +371,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, Path: fspath.Parse(childMerkleFilename), }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, }) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 3e0bcd02b..9182df317 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -57,8 +57,9 @@ const merkleOffsetInParentXattr = "user.merkle.offset" // whole file. For a directory, it's the size of all its children's root hashes. const merkleSizeXattr = "user.merkle.size" -// sizeOfInt32 is the size in bytes for a 32 bit integer in extended attributes. -const sizeOfInt32 = 4 +// sizeOfStringInt32 is the size for a 32 bit integer stored as string in +// extended attributes. The maximum value of a 32 bit integer is 10 digits. +const sizeOfStringInt32 = 10 // noCrashOnVerificationFailure indicates whether the sandbox should panic // whenever verification fails. If true, an error is returned instead of @@ -636,7 +637,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of // dataSize is the size of the whole file. 
dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ Name: merkleSizeXattr, - Size: sizeOfInt32, + Size: sizeOfStringInt32, }) // The Merkle tree file for the child should have been created and diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index f223d59e1..f61039f5b 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -67,6 +67,11 @@ func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlag return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error { + return syserror.ESPIPE +} + // Open opens the pipe represented by vp. func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { vp.mu.Lock() diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 8a1d52ebf..97bc6027f 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -97,11 +97,6 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal return ioctl(ctx, s.fd, uio, args) } -// Allocate implements vfs.FileDescriptionImpl.Allocate. -func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.ENODEV -} - // PRead implements vfs.FileDescriptionImpl.PRead. 
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, syserror.ESPIPE diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 3e1735079..871ea80ee 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -146,6 +146,10 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { case stack.FilterTable: table = stack.EmptyFilterTable() case stack.NATTable: + if ipv6 { + nflog("IPv6 redirection not yet supported (gvisor.dev/issue/3549)") + return syserr.ErrInvalidArgument + } table = stack.EmptyNATTable() default: nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 0bfd6c1f4..844acfede 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -97,17 +97,33 @@ func (*TCPMatcher) Name() string { // Match implements Matcher.Match. func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader().View()) + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the stack.Check codepath as matchers are added. + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false + } - if netHeader.TransportProtocol() != header.TCPProtocolNumber { - return false, false - } + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } - // We dont't match fragments. 
- if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. + if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber { + return false, false } + + default: + // We don't know the network protocol. return false, false } diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index 7ed05461d..63201201c 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -94,19 +94,33 @@ func (*UDPMatcher) Name() string { // Match implements Matcher.Match. func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader().View()) - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. - if netHeader.TransportProtocol() != header.UDPProtocolNumber { - return false, false - } + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + return false, false + } - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false } + + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. + if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber { + return false, false + } + + default: + // We don't know the network protocol. 
return false, false } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index b462924af..816c89cfa 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -1185,7 +1185,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam var v tcpip.LingerOption var linger linux.Linger if err := ep.GetSockOpt(&v); err != nil { - return &linger, nil + return nil, syserr.TranslateNetstackError(err) } if v.Enabled { @@ -1768,10 +1768,16 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int case linux.SOL_IP: return setSockOptIP(t, s, ep, name, optVal) + case linux.SOL_PACKET: + // gVisor doesn't support any SOL_PACKET options just return not + // supported. Returning nil here will result in tcpdump thinking AF_PACKET + // features are supported and proceed to use them and break. + t.Kernel().EmitUnimplementedEvent(t) + return syserr.ErrProtocolNotAvailable + case linux.SOL_UDP, linux.SOL_ICMPV6, - linux.SOL_RAW, - linux.SOL_PACKET: + linux.SOL_RAW: t.Kernel().EmitUnimplementedEvent(t) } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 08504560c..d6fc03520 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -746,6 +746,9 @@ type baseEndpoint struct { // path is not empty if the endpoint has been bound, // or may be used if the endpoint is connected. path string + + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption } // EventRegister implements waiter.Waitable.EventRegister. @@ -841,8 +844,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return n, err } -// SetSockOpt sets a socket option. Currently not supported. -func (e *baseEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error { +// SetSockOpt sets a socket option. 
+func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { + switch v := opt.(type) { + case *tcpip.LingerOption: + e.Lock() + e.linger = *v + e.Unlock() + } return nil } @@ -945,8 +954,11 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch o := opt.(type) { case *tcpip.LingerOption: + e.Lock() + *o = e.linger + e.Unlock() return nil default: diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index cd6f4dd94..bfcf44b6f 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -30,7 +30,7 @@ const ( // // +marshal type SchedParam struct { - schedPriority int64 + schedPriority int32 } // SchedGetparam implements linux syscall sched_getparam(2). diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 2b29a3c3f..73bb36d3e 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -326,6 +326,9 @@ type FileDescriptionImpl interface { // Allocate grows the file to offset + length bytes. // Only mode == 0 is supported currently. // + // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on + // other files where it is not supported. + // // Preconditions: The FileDescription was opened for writing. 
Allocate(ctx context.Context, mode, offset, length uint64) error diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 68b80a951..78da16bac 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -57,7 +57,11 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err } // Allocate implements FileDescriptionImpl.Allocate analogously to -// fallocate called on regular file, directory or FIFO in Linux. +// fallocate called on an invalid type of file in Linux. +// +// Note that directories can rely on this implementation even though they +// should technically return EISDIR. Allocate should never be called for a +// directory, because it requires a writable fd. func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { return syserror.ENODEV } diff --git a/pkg/tcpip/header/parse/BUILD b/pkg/tcpip/header/parse/BUILD new file mode 100644 index 000000000..2adee9288 --- /dev/null +++ b/pkg/tcpip/header/parse/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "parse", + srcs = ["parse.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/header/parse/parse.go b/pkg/tcpip/header/parse/parse.go new file mode 100644 index 000000000..522135557 --- /dev/null +++ b/pkg/tcpip/header/parse/parse.go @@ -0,0 +1,166 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package parse provides utilities to parse packets. +package parse + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// ARP populates pkt's network header with an ARP header found in +// pkt.Data. +// +// Returns true if the header was successfully parsed. +func ARP(pkt *stack.PacketBuffer) bool { + _, ok := pkt.NetworkHeader().Consume(header.ARPSize) + if ok { + pkt.NetworkProtocolNumber = header.ARPProtocolNumber + } + return ok +} + +// IPv4 parses an IPv4 packet found in pkt.Data and populates pkt's network +// header with the IPv4 header. +// +// Returns true if the header was successfully parsed. +func IPv4(pkt *stack.PacketBuffer) bool { + hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) + if !ok { + return false + } + ipHdr := header.IPv4(hdr) + + // Header may have options, determine the true header length. + headerLen := int(ipHdr.HeaderLength()) + if headerLen < header.IPv4MinimumSize { + // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in + // order for the packet to be valid. Figure out if we want to reject this + // case. 
+ headerLen = header.IPv4MinimumSize + } + hdr, ok = pkt.NetworkHeader().Consume(headerLen) + if !ok { + return false + } + ipHdr = header.IPv4(hdr) + + pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber + pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr)) + return true +} + +// IPv6 parses an IPv6 packet found in pkt.Data and populates pkt's network +// header with the IPv6 header. +func IPv6(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, fragID uint32, fragOffset uint16, fragMore bool, ok bool) { + hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize) + if !ok { + return 0, 0, 0, false, false + } + ipHdr := header.IPv6(hdr) + + // dataClone consists of: + // - Any IPv6 header bytes after the first 40 (i.e. extensions). + // - The transport header, if present. + // - Any other payload data. + views := [8]buffer.View{} + dataClone := pkt.Data.Clone(views[:]) + dataClone.TrimFront(header.IPv6MinimumSize) + it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone) + + // Iterate over the IPv6 extensions to find their length. + var nextHdr tcpip.TransportProtocolNumber + var extensionsSize int + +traverseExtensions: + for { + extHdr, done, err := it.Next() + if err != nil { + break + } + + // If we exhaust the extension list, the entire packet is the IPv6 header + // and (possibly) extensions. + if done { + extensionsSize = dataClone.Size() + break + } + + switch extHdr := extHdr.(type) { + case header.IPv6FragmentExtHdr: + if fragID == 0 && fragOffset == 0 && !fragMore { + fragID = extHdr.ID() + fragOffset = extHdr.FragmentOffset() + fragMore = extHdr.More() + } + + case header.IPv6RawPayloadHeader: + // We've found the payload after any extensions. + extensionsSize = dataClone.Size() - extHdr.Buf.Size() + nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier) + break traverseExtensions + + default: + // Any other extension is a no-op, keep looping until we find the payload. 
+ } + } + + // Put the IPv6 header with extensions in pkt.NetworkHeader(). + hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize) + if !ok { + panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size())) + } + ipHdr = header.IPv6(hdr) + pkt.Data.CapLength(int(ipHdr.PayloadLength())) + pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber + + return nextHdr, fragID, fragOffset, fragMore, true +} + +// UDP parses a UDP packet found in pkt.Data and populates pkt's transport +// header with the UDP header. +// +// Returns true if the header was successfully parsed. +func UDP(pkt *stack.PacketBuffer) bool { + _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) + return ok +} + +// TCP parses a TCP packet found in pkt.Data and populates pkt's transport +// header with the TCP header. +// +// Returns true if the header was successfully parsed. +func TCP(pkt *stack.PacketBuffer) bool { + // TCP header is variable length, peek at it first. + hdrLen := header.TCPMinimumSize + hdr, ok := pkt.Data.PullUp(hdrLen) + if !ok { + return false + } + + // If the header has options, pull those up as well. + if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() { + // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of + // packets. 
+ hdrLen = offset + } + + _, ok = pkt.TransportHeader().Consume(hdrLen) + return ok +} diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 7cbc305e7..4aac12a8c 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -14,6 +14,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/link/nested", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index 4fb127978..560477926 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -195,49 +196,52 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P var transProto uint8 src := tcpip.Address("unknown") dst := tcpip.Address("unknown") - id := 0 - size := uint16(0) + var size uint16 + var id uint32 var fragmentOffset uint16 var moreFragments bool - // Examine the packet using a new VV. Backing storage must not be written. - vv := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) - + // Clone the packet buffer to not modify the original. + // + // We don't clone the original packet buffer so that the new packet buffer + // does not have any of its headers set. 
+ pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views())}) switch protocol { case header.IPv4ProtocolNumber: - hdr, ok := vv.PullUp(header.IPv4MinimumSize) - if !ok { + if ok := parse.IPv4(pkt); !ok { return } - ipv4 := header.IPv4(hdr) + + ipv4 := header.IPv4(pkt.NetworkHeader().View()) fragmentOffset = ipv4.FragmentOffset() moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments src = ipv4.SourceAddress() dst = ipv4.DestinationAddress() transProto = ipv4.Protocol() size = ipv4.TotalLength() - uint16(ipv4.HeaderLength()) - vv.TrimFront(int(ipv4.HeaderLength())) - id = int(ipv4.ID()) + id = uint32(ipv4.ID()) case header.IPv6ProtocolNumber: - hdr, ok := vv.PullUp(header.IPv6MinimumSize) + proto, fragID, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return } - ipv6 := header.IPv6(hdr) + + ipv6 := header.IPv6(pkt.NetworkHeader().View()) src = ipv6.SourceAddress() dst = ipv6.DestinationAddress() - transProto = ipv6.NextHeader() + transProto = uint8(proto) size = ipv6.PayloadLength() - vv.TrimFront(header.IPv6MinimumSize) + id = fragID + moreFragments = fragMore + fragmentOffset = fragOffset case header.ARPProtocolNumber: - hdr, ok := vv.PullUp(header.ARPSize) - if !ok { + if !parse.ARP(pkt) { return } - vv.TrimFront(header.ARPSize) - arp := header.ARP(hdr) + + arp := header.ARP(pkt.NetworkHeader().View()) log.Infof( "%s arp %s (%s) -> %s (%s) valid:%t", prefix, @@ -259,7 +263,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P switch tcpip.TransportProtocolNumber(transProto) { case header.ICMPv4ProtocolNumber: transName = "icmp" - hdr, ok := vv.PullUp(header.ICMPv4MinimumSize) + hdr, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize) if !ok { break } @@ -296,7 +300,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.ICMPv6ProtocolNumber: transName = "icmp" - hdr, ok := 
vv.PullUp(header.ICMPv6MinimumSize) + hdr, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize) if !ok { break } @@ -331,11 +335,11 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.UDPProtocolNumber: transName = "udp" - hdr, ok := vv.PullUp(header.UDPMinimumSize) - if !ok { + if ok := parse.UDP(pkt); !ok { break } - udp := header.UDP(hdr) + + udp := header.UDP(pkt.TransportHeader().View()) if fragmentOffset == 0 { srcPort = udp.SourcePort() dstPort = udp.DestinationPort() @@ -345,19 +349,19 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.TCPProtocolNumber: transName = "tcp" - hdr, ok := vv.PullUp(header.TCPMinimumSize) - if !ok { + if ok := parse.TCP(pkt); !ok { break } - tcp := header.TCP(hdr) + + tcp := header.TCP(pkt.TransportHeader().View()) if fragmentOffset == 0 { offset := int(tcp.DataOffset()) if offset < header.TCPMinimumSize { details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset) break } - if offset > vv.Size() && !moreFragments { - details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size()) + if size := pkt.Data.Size() + len(tcp); offset > size && !moreFragments { + details += fmt.Sprintf("invalid packet: tcp data offset %d larger than tcp packet length %d", offset, size) break } diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index 82c073e32..b40dde96b 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -10,6 +10,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 7aaee08c4..cb9225bd7 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" 
"gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -234,11 +235,7 @@ func (*protocol) Wait() {} // Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - _, ok = pkt.NetworkHeader().Consume(header.ARPSize) - if !ok { - return 0, false, false - } - return 0, false, true + return 0, false, parse.ARP(pkt) } // NewProtocol returns an ARP network protocol. diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index c82593e71..f9c2aa980 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -13,6 +13,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index f4394749d..59c3101b5 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/network/hash" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -238,11 +239,13 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw return nil } - // If the packet is manipulated as per NAT Ouput rules, handle packet - // based on destination address and do not send the packet to link layer. - // TODO(gvisor.dev/issue/170): We should do this for every packet, rather than - // only NATted packets, but removing this check short circuits broadcasts - // before they are sent out to other hosts. 
+ // If the packet is manipulated as per NAT Output rules, handle packet + // based on destination address and do not send the packet to link + // layer. + // + // TODO(gvisor.dev/issue/170): We should do this for every + // packet, rather than only NATted packets, but removing this check + // short circuits broadcasts before they are sent out to other hosts. if pkt.NatDone { netHeader := header.IPv4(pkt.NetworkHeader().View()) ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()) @@ -298,7 +301,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe return n, err } - // Slow Path as we are dropping some packets in the batch degrade to + // Slow path as we are dropping some packets in the batch degrade to // emitting one packet at a time. n := 0 for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { @@ -527,37 +530,14 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} -// Parse implements stack.TransportProtocol.Parse. +// Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { - return 0, false, false - } - ipHdr := header.IPv4(hdr) - - // Header may have options, determine the true header length. - headerLen := int(ipHdr.HeaderLength()) - if headerLen < header.IPv4MinimumSize { - // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in - // order for the packet to be valid. Figure out if we want to reject this - // case. - headerLen = header.IPv4MinimumSize - } - hdr, ok = pkt.NetworkHeader().Consume(headerLen) - if !ok { + if ok := parse.IPv4(pkt); !ok { return 0, false, false } - ipHdr = header.IPv4(hdr) - - // If this is a fragment, don't bother parsing the transport header. 
- parseTransportHeader := true - if ipHdr.More() || ipHdr.FragmentOffset() != 0 { - parseTransportHeader = false - } - pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber - pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr)) - return ipHdr.TransportProtocol(), parseTransportHeader, true + ipHdr := header.IPv4(pkt.NetworkHeader().View()) + return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true } // calculateMTU calculates the network-layer payload MTU based on the link-layer diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 5e50558e8..c1a560914 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -1046,3 +1046,211 @@ func TestReceiveFragments(t *testing.T) { }) } } + +func TestWritePacketsStats(t *testing.T) { + const nPackets = 3 + tests := []struct { + name string + setup func(*testing.T, *stack.Stack) + linkEP stack.LinkEndpoint + expectSent int + }{ + { + name: "Accept all", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: &limitedEP{nPackets}, + expectSent: nPackets, + }, { + name: "Accept all with error", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: &limitedEP{nPackets - 1}, + expectSent: nPackets - 1, + }, { + name: "Drop all", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule. 
+ t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: &limitedEP{nPackets}, + expectSent: 0, + }, { + name: "Drop some", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule that matches only 1 + // of the 3 packets. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, false /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + // We'll match and DROP the last packet. + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}} + // Make sure the next rule is ACCEPT. 
+ filter.Rules[ruleIdx+1].Target = stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, false /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: &limitedEP{nPackets}, + expectSent: nPackets - 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + rt := buildRoute(t, nil, test.linkEP) + + var pbl stack.PacketBufferList + for i := 0; i < nPackets; i++ { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()), + Data: buffer.NewView(1).ToVectorisedView(), + }) + pkt.TransportHeader().Push(header.UDPMinimumSize) + pbl.PushBack(pkt) + } + + test.setup(t, rt.Stack()) + + nWritten, err := rt.WritePackets(nil, pbl, stack.NetworkHeaderParams{}) + if err != nil { + t.Fatal(err) + } + + got := int(rt.Stats().IP.PacketsSent.Value()) + if got != test.expectSent { + t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) + } + if got != nWritten { + t.Errorf("sent %d packets, WritePackets returned %d", got, nWritten) + } + }) + } +} + +func buildRoute(t *testing.T, packetCollectorErrors []*tcpip.Error, linkEP stack.LinkEndpoint) stack.Route { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, + }) + s.CreateNIC(1, linkEP) + const ( + src = "\x10\x00\x00\x01" + dst = "\x10\x00\x00\x02" + ) + s.AddAddress(1, ipv4.ProtocolNumber, src) + { + subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask(header.IPv4Broadcast)) + if err != nil { + t.Fatal(err) + } + s.SetRouteTable([]tcpip.Route{{ + Destination: subnet, + NIC: 1, + }}) + } + rt, err := s.FindRoute(0, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("s.FindRoute got %v, want %v", err, nil) + } + return rt +} + +// limitedEP is a link endpoint that writes up to a certain number of packets +// before returning errors. 
+type limitedEP struct { + limit int +} + +// MTU implements LinkEndpoint.MTU. +func (*limitedEP) MTU() uint32 { return 0 } + +// Capabilities implements LinkEndpoint.Capabilities. +func (*limitedEP) Capabilities() stack.LinkEndpointCapabilities { return 0 } + +// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength. +func (*limitedEP) MaxHeaderLength() uint16 { return 0 } + +// LinkAddress implements LinkEndpoint.LinkAddress. +func (*limitedEP) LinkAddress() tcpip.LinkAddress { return "" } + +// WritePacket implements LinkEndpoint.WritePacket. +func (ep *limitedEP) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// WritePackets implements LinkEndpoint.WritePackets. +func (ep *limitedEP) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + if ep.limit == 0 { + return 0, tcpip.ErrInvalidEndpointState + } + nWritten := ep.limit + if nWritten > pkts.Len() { + nWritten = pkts.Len() + } + ep.limit -= nWritten + return nWritten, nil +} + +// WriteRawPacket implements LinkEndpoint.WriteRawPacket. +func (ep *limitedEP) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// Attach implements LinkEndpoint.Attach. +func (*limitedEP) Attach(_ stack.NetworkDispatcher) {} + +// IsAttached implements LinkEndpoint.IsAttached. +func (*limitedEP) IsAttached() bool { return false } + +// Wait implements LinkEndpoint.Wait. +func (*limitedEP) Wait() {} + +// ARPHardwareType implements LinkEndpoint.ARPHardwareType. +func (*limitedEP) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther } + +// AddHeader implements LinkEndpoint.AddHeader. 
+func (*limitedEP) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) { +} + +// limitedMatcher is an iptables matcher that matches after a certain number of +// packets are checked against it. +type limitedMatcher struct { + limit int +} + +// Name implements Matcher.Name. +func (*limitedMatcher) Name() string { + return "limitedMatcher" +} + +// Match implements Matcher.Match. +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { + if lm.limit == 0 { + return true, false + } + lm.limit-- + return false, false +} diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index bcc64994e..cd5fe3ea8 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -13,6 +13,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/network/fragmentation", "//pkg/tcpip/stack", ], diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index e821a8bff..a4a4d6a21 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -107,6 +108,31 @@ func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params s func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { e.addIPHeader(r, pkt, params) + // iptables filtering. All packets that reach here are locally + // generated. + nicName := e.stack.FindNICNameFromID(e.NICID()) + ipt := e.stack.IPTables() + if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok { + // iptables is telling us to drop the packet. 
+ return nil + } + + // If the packet is manipulated as per NAT Output rules, handle packet + // based on destination address and do not send the packet to link + // layer. + // + // TODO(gvisor.dev/issue/170): We should do this for every + // packet, rather than only NATted packets, but removing this check + // short circuits broadcasts before they are sent out to other hosts. + if pkt.NatDone { + netHeader := header.IPv6(pkt.NetworkHeader().View()) + if ep, err := e.stack.FindNetworkEndpoint(header.IPv6ProtocolNumber, netHeader.DestinationAddress()); err == nil { + route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress()) + ep.HandlePacket(&route, pkt) + return nil + } + } + if r.Loop&stack.PacketLoop != 0 { loopedR := r.MakeLoopedRoute() @@ -138,9 +164,46 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe e.addIPHeader(r, pb, params) } - n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) + // iptables filtering. All packets that reach here are locally + // generated. + nicName := e.stack.FindNICNameFromID(e.NICID()) + ipt := e.stack.IPTables() + dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName) + if len(dropped) == 0 && len(natPkts) == 0 { + // Fast path: If no packets are to be dropped then we can just invoke the + // faster WritePackets API directly. + n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + return n, err + } + + // Slow path as we are dropping some packets in the batch degrade to + // emitting one packet at a time. 
+ n := 0 + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + if _, ok := dropped[pkt]; ok { + continue + } + if _, ok := natPkts[pkt]; ok { + netHeader := header.IPv6(pkt.NetworkHeader().View()) + if ep, err := e.stack.FindNetworkEndpoint(header.IPv6ProtocolNumber, netHeader.DestinationAddress()); err == nil { + src := netHeader.SourceAddress() + dst := netHeader.DestinationAddress() + route := r.ReverseRoute(src, dst) + ep.HandlePacket(&route, pkt) + n++ + continue + } + } + if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil { + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) + return n, err + } + n++ + } + r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) - return n, err + return n, nil } // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet @@ -169,6 +232,14 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv) hasFragmentHeader := false + // iptables filtering. All packets that reach here are intended for + // this machine and will not be forwarded. + ipt := e.stack.IPTables() + if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok { + // iptables is telling us to drop the packet. + return + } + for firstHeader := true; ; firstHeader = false { extHdr, done, err := it.Next() if err != nil { @@ -504,75 +575,14 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} -// Parse implements stack.TransportProtocol.Parse. +// Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { - hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize) + proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return 0, false, false } - ipHdr := header.IPv6(hdr) - - // dataClone consists of: - // - Any IPv6 header bytes after the first 40 (i.e. 
extensions). - // - The transport header, if present. - // - Any other payload data. - views := [8]buffer.View{} - dataClone := pkt.Data.Clone(views[:]) - dataClone.TrimFront(header.IPv6MinimumSize) - it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone) - - // Iterate over the IPv6 extensions to find their length. - // - // Parsing occurs again in HandlePacket because we don't track the - // extensions in PacketBuffer. Unfortunately, that means HandlePacket - // has to do the parsing work again. - var nextHdr tcpip.TransportProtocolNumber - foundNext := true - extensionsSize := 0 -traverseExtensions: - for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() { - if err != nil { - break - } - // If we exhaust the extension list, the entire packet is the IPv6 header - // and (possibly) extensions. - if done { - extensionsSize = dataClone.Size() - foundNext = false - break - } - - switch extHdr := extHdr.(type) { - case header.IPv6FragmentExtHdr: - // If this is an atomic fragment, we don't have to treat it specially. - if !extHdr.More() && extHdr.FragmentOffset() == 0 { - continue - } - // This is a non-atomic fragment and has to be re-assembled before we can - // examine the payload for a transport header. - foundNext = false - - case header.IPv6RawPayloadHeader: - // We've found the payload after any extensions. - extensionsSize = dataClone.Size() - extHdr.Buf.Size() - nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier) - break traverseExtensions - - default: - // Any other extension is a no-op, keep looping until we find the payload. - } - } - - // Put the IPv6 header with extensions in pkt.NetworkHeader(). 
- hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize) - if !ok { - panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size())) - } - ipHdr = header.IPv6(hdr) - pkt.Data.CapLength(int(ipHdr.PayloadLength())) - pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber - return nextHdr, foundNext, true + return proto, !fragMore && fragOffset == 0, true } // calculateMTU calculates the network-layer payload MTU based on the link-layer diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index 5f9822c49..354d3b60d 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -1709,3 +1709,211 @@ func TestInvalidIPv6Fragments(t *testing.T) { }) } } + +func TestWritePacketsStats(t *testing.T) { + const nPackets = 3 + tests := []struct { + name string + setup func(*testing.T, *stack.Stack) + linkEP stack.LinkEndpoint + expectSent int + }{ + { + name: "Accept all", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: &limitedEP{nPackets}, + expectSent: nPackets, + }, { + name: "Accept all with error", + // No setup needed, tables accept everything by default. + setup: func(*testing.T, *stack.Stack) {}, + linkEP: &limitedEP{nPackets - 1}, + expectSent: nPackets - 1, + }, { + name: "Drop all", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule. 
+ t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: &limitedEP{nPackets}, + expectSent: 0, + }, { + name: "Drop some", + setup: func(t *testing.T, stk *stack.Stack) { + // Install Output DROP rule that matches only 1 + // of the 3 packets. + t.Helper() + ipt := stk.IPTables() + filter, ok := ipt.GetTable(stack.FilterTable, true /* ipv6 */) + if !ok { + t.Fatalf("failed to find filter table") + } + // We'll match and DROP the last packet. + ruleIdx := filter.BuiltinChains[stack.Output] + filter.Rules[ruleIdx].Target = stack.DropTarget{} + filter.Rules[ruleIdx].Matchers = []stack.Matcher{&limitedMatcher{nPackets - 1}} + // Make sure the next rule is ACCEPT. 
+ filter.Rules[ruleIdx+1].Target = stack.AcceptTarget{} + if err := ipt.ReplaceTable(stack.FilterTable, filter, true /* ipv6 */); err != nil { + t.Fatalf("failed to replace table: %v", err) + } + }, + linkEP: &limitedEP{nPackets}, + expectSent: nPackets - 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + rt := buildRoute(t, nil, test.linkEP) + + var pbl stack.PacketBufferList + for i := 0; i < nPackets; i++ { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(rt.MaxHeaderLength()), + Data: buffer.NewView(1).ToVectorisedView(), + }) + pkt.TransportHeader().Push(header.UDPMinimumSize) + pbl.PushBack(pkt) + } + + test.setup(t, rt.Stack()) + + nWritten, err := rt.WritePackets(nil, pbl, stack.NetworkHeaderParams{}) + if err != nil { + t.Fatal(err) + } + + got := int(rt.Stats().IP.PacketsSent.Value()) + if got != test.expectSent { + t.Errorf("sent %d packets, but expected to send %d", got, test.expectSent) + } + if got != nWritten { + t.Errorf("sent %d packets, WritePackets returned %d", got, nWritten) + } + }) + } +} + +func buildRoute(t *testing.T, packetCollectorErrors []*tcpip.Error, linkEP stack.LinkEndpoint) stack.Route { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{NewProtocol()}, + }) + s.CreateNIC(1, linkEP) + const ( + src = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + dst = "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" + ) + s.AddAddress(1, ProtocolNumber, src) + { + subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask("\xfc\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff")) + if err != nil { + t.Fatal(err) + } + s.SetRouteTable([]tcpip.Route{{ + Destination: subnet, + NIC: 1, + }}) + } + rt, err := s.FindRoute(0, src, dst, ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("s.FindRoute got %v, want %v", err, nil) + } + return rt +} + +// limitedEP is a link 
endpoint that writes up to a certain number of packets +// before returning errors. +type limitedEP struct { + limit int +} + +// MTU implements LinkEndpoint.MTU. +func (*limitedEP) MTU() uint32 { return 0 } + +// Capabilities implements LinkEndpoint.Capabilities. +func (*limitedEP) Capabilities() stack.LinkEndpointCapabilities { return 0 } + +// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength. +func (*limitedEP) MaxHeaderLength() uint16 { return 0 } + +// LinkAddress implements LinkEndpoint.LinkAddress. +func (*limitedEP) LinkAddress() tcpip.LinkAddress { return "" } + +// WritePacket implements LinkEndpoint.WritePacket. +func (ep *limitedEP) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// WritePackets implements LinkEndpoint.WritePackets. +func (ep *limitedEP) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + if ep.limit == 0 { + return 0, tcpip.ErrInvalidEndpointState + } + nWritten := ep.limit + if nWritten > pkts.Len() { + nWritten = pkts.Len() + } + ep.limit -= nWritten + return nWritten, nil +} + +// WriteRawPacket implements LinkEndpoint.WriteRawPacket. +func (ep *limitedEP) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { + if ep.limit == 0 { + return tcpip.ErrInvalidEndpointState + } + ep.limit-- + return nil +} + +// Attach implements LinkEndpoint.Attach. +func (*limitedEP) Attach(_ stack.NetworkDispatcher) {} + +// IsAttached implements LinkEndpoint.IsAttached. +func (*limitedEP) IsAttached() bool { return false } + +// Wait implements LinkEndpoint.Wait. +func (*limitedEP) Wait() {} + +// ARPHardwareType implements LinkEndpoint.ARPHardwareType. +func (*limitedEP) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther } + +// AddHeader implements LinkEndpoint.AddHeader. 
+func (*limitedEP) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *stack.PacketBuffer) { +} + +// limitedMatcher is an iptables matcher that matches after a certain number of +// packets are checked against it. +type limitedMatcher struct { + limit int +} + +// Name implements Matcher.Name. +func (*limitedMatcher) Name() string { + return "limitedMatcher" +} + +// Match implements Matcher.Match. +func (lm *limitedMatcher) Match(stack.Hook, *stack.PacketBuffer, string) (bool, bool) { + if lm.limit == 0 { + return true, false + } + lm.limit-- + return false, false +} diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 0e33cbe92..b6ef04d32 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -57,7 +57,72 @@ const reaperDelay = 5 * time.Second // all packets. func DefaultTables() *IPTables { return &IPTables{ - tables: [numTables]Table{ + v4Tables: [numTables]Table{ + natID: Table{ + Rules: []Rule{ + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: ErrorTarget{}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: 0, + Input: 1, + Forward: HookUnset, + Output: 2, + Postrouting: 3, + }, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: 1, + Forward: HookUnset, + Output: 2, + Postrouting: 3, + }, + }, + mangleID: Table{ + Rules: []Rule{ + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: ErrorTarget{}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: 0, + Output: 1, + }, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: HookUnset, + Forward: HookUnset, + Output: 1, + Postrouting: HookUnset, + }, + }, + filterID: Table{ + Rules: []Rule{ + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: AcceptTarget{}}, + Rule{Target: ErrorTarget{}}, + }, + BuiltinChains: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + 
Postrouting: HookUnset, + }, + Underflows: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, + }, + }, + }, + v6Tables: [numTables]Table{ natID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, @@ -166,25 +231,20 @@ func EmptyNATTable() Table { // GetTable returns a table by name. func (it *IPTables) GetTable(name string, ipv6 bool) (Table, bool) { - // TODO(gvisor.dev/issue/3549): Enable IPv6. - if ipv6 { - return Table{}, false - } id, ok := nameToID[name] if !ok { return Table{}, false } it.mu.RLock() defer it.mu.RUnlock() - return it.tables[id], true + if ipv6 { + return it.v6Tables[id], true + } + return it.v4Tables[id], true } // ReplaceTable replaces or inserts table by name. func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Error { - // TODO(gvisor.dev/issue/3549): Enable IPv6. - if ipv6 { - return tcpip.ErrInvalidOptionValue - } id, ok := nameToID[name] if !ok { return tcpip.ErrInvalidOptionValue @@ -198,7 +258,11 @@ func (it *IPTables) ReplaceTable(name string, table Table, ipv6 bool) *tcpip.Err it.startReaper(reaperDelay) } it.modified = true - it.tables[id] = table + if ipv6 { + it.v6Tables[id] = table + } else { + it.v4Tables[id] = table + } return nil } @@ -221,8 +285,17 @@ const ( // should continue traversing the network stack and false when it should be // dropped. // +// TODO(gvisor.dev/issue/170): PacketBuffer should hold the GSO and route, from +// which address and nicName can be gathered. Currently, address is only +// needed for prerouting and nicName is only needed for output. +// +// TODO(gvisor.dev/issue/170): Dropped packets should be counted. +// // Precondition: pkt.NetworkHeader is set. 
-func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool { +func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) bool { + if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber { + return true + } // Many users never configure iptables. Spare them the cost of rule // traversal if rules have never been set. it.mu.RLock() @@ -243,9 +316,14 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr if tableID == natID && pkt.NatDone { continue } - table := it.tables[tableID] + var table Table + if pkt.NetworkProtocolNumber == header.IPv6ProtocolNumber { + table = it.v6Tables[tableID] + } else { + table = it.v4Tables[tableID] + } ruleIdx := table.BuiltinChains[hook] - switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { // If the table returns Accept, move on to the next table. case chainAccept: continue @@ -256,7 +334,7 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr // Any Return from a built-in chain means we have to // call the underflow. underflow := table.Rules[table.Underflows[hook]] - switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, address); v { + switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr); v { case RuleAccept: continue case RuleDrop: @@ -351,11 +429,11 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r * // Preconditions: // * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. 
-func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict { +func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. for ruleIdx < len(table.Rules) { - switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { + switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, preroutingAddr, nicName); verdict { case RuleAccept: return chainAccept @@ -372,7 +450,7 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId ruleIdx++ continue } - switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict { + switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, preroutingAddr, nicName); verdict { case chainAccept: return chainAccept case chainDrop: @@ -398,11 +476,11 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId // Preconditions: // * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. -func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) { +func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, preroutingAddr tcpip.Address, nicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] // Check whether the packet matches the IP header filter. - if !rule.Filter.match(header.IPv4(pkt.NetworkHeader().View()), hook, nicName) { + if !rule.Filter.match(pkt, hook, nicName) { // Continue on to the next rule. 
return RuleJump, ruleIdx + 1 } @@ -421,7 +499,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx } // All the matchers matched, so run the target. - return rule.Target.Action(pkt, &it.connections, hook, gso, r, address) + return rule.Target.Action(pkt, &it.connections, hook, gso, r, preroutingAddr) } // OriginalDst returns the original destination of redirected connections. It diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index fbbd2f50f..093ee6881 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -15,6 +15,7 @@ package stack import ( + "fmt" "strings" "sync" @@ -81,26 +82,25 @@ const ( // // +stateify savable type IPTables struct { - // mu protects tables, priorities, and modified. + // mu protects v4Tables, v6Tables, and modified. mu sync.RWMutex - - // tables maps tableIDs to tables. Holds builtin tables only, not user - // tables. mu must be locked for accessing. - tables [numTables]Table - - // priorities maps each hook to a list of table names. The order of the - // list is the order in which each table should be visited for that - // hook. mu needs to be locked for accessing. - priorities [NumHooks][]tableID - + // v4Tables and v6tables map tableIDs to tables. They hold builtin + // tables only, not user tables. mu must be locked for accessing. + v4Tables [numTables]Table + v6Tables [numTables]Table // modified is whether tables have been modified at least once. It is // used to elide the iptables performance overhead for workloads that // don't utilize iptables. modified bool + // priorities maps each hook to a list of table names. The order of the + // list is the order in which each table should be visited for that + // hook. It is immutable. + priorities [NumHooks][]tableID + connections ConnTrack - // reaperDone can be signalled to stop the reaper goroutine. + // reaperDone can be signaled to stop the reaper goroutine. 
reaperDone chan struct{} } @@ -148,7 +148,7 @@ type Rule struct { Target Target } -// IPHeaderFilter holds basic IP filtering data common to every rule. +// IPHeaderFilter performs basic IP header matching common to every rule. // // +stateify savable type IPHeaderFilter struct { @@ -196,16 +196,43 @@ type IPHeaderFilter struct { OutputInterfaceInvert bool } -// match returns whether hdr matches the filter. -func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool { - // TODO(gvisor.dev/issue/170): Support other fields of the filter. +// match returns whether pkt matches the filter. +// +// Preconditions: pkt.NetworkHeader is set and is at least of the minimal IPv4 +// or IPv6 header length. +func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, nicName string) bool { + // Extract header fields. + var ( + // TODO(gvisor.dev/issue/170): Support other filter fields. + transProto tcpip.TransportProtocolNumber + dstAddr tcpip.Address + srcAddr tcpip.Address + ) + switch proto := pkt.NetworkProtocolNumber; proto { + case header.IPv4ProtocolNumber: + hdr := header.IPv4(pkt.NetworkHeader().View()) + transProto = hdr.TransportProtocol() + dstAddr = hdr.DestinationAddress() + srcAddr = hdr.SourceAddress() + + case header.IPv6ProtocolNumber: + hdr := header.IPv6(pkt.NetworkHeader().View()) + transProto = hdr.TransportProtocol() + dstAddr = hdr.DestinationAddress() + srcAddr = hdr.SourceAddress() + + default: + panic(fmt.Sprintf("unknown network protocol with EtherType: %d", proto)) + } + // Check the transport protocol. - if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() { + if fl.CheckProtocol && fl.Protocol != transProto { return false } - // Check the source and destination IPs. - if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) { + // Check the addresses. 
+ if !filterAddress(dstAddr, fl.DstMask, fl.Dst, fl.DstInvert) || + !filterAddress(srcAddr, fl.SrcMask, fl.Src, fl.SrcInvert) { return false } diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 1f1a1426b..821d3feb9 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -1282,9 +1282,8 @@ func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcp return } - // TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet. // Loopback traffic skips the prerouting chain. - if protocol == header.IPv4ProtocolNumber && !n.isLoopback() { + if !n.isLoopback() { // iptables filtering. ipt := n.stack.IPTables() address := n.primaryAddress(protocol) diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index 17b8beebb..1932aaeb7 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -80,7 +80,7 @@ type PacketBuffer struct { // data are held in the same underlying buffer storage. header buffer.Prependable - // NetworkProtocol is only valid when NetworkHeader is set. + // NetworkProtocolNumber is only valid when NetworkHeader is set. // TODO(gvisor.dev/issue/3574): Remove the separately passed protocol // numbers in registration APIs that take a PacketBuffer. NetworkProtocolNumber tcpip.NetworkProtocolNumber diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index ad71ff3b6..31116309e 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -74,6 +74,8 @@ type endpoint struct { route stack.Route `state:"manual"` ttl uint8 stats tcpip.TransportEndpointStats `state:"nosave"` + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner @@ -344,9 +346,14 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { // SetSockOpt sets a socket option. 
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch v := opt.(type) { case *tcpip.SocketDetachFilterOption: return nil + + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() } return nil } @@ -415,8 +422,17 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + e.mu.Lock() + *o = e.linger + e.mu.Unlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error { diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 8bd4e5e37..072601d2d 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -83,6 +83,8 @@ type endpoint struct { stats tcpip.TransportEndpointStats `state:"nosave"` bound bool boundNIC tcpip.NICID + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // lastErrorMu protects lastError. lastErrorMu sync.Mutex `state:"nosave"` @@ -298,10 +300,16 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // used with SetSockOpt, and this function always returns // tcpip.ErrNotSupported. 
func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch v := opt.(type) { case *tcpip.SocketDetachFilterOption: return nil + case *tcpip.LingerOption: + ep.mu.Lock() + ep.linger = *v + ep.mu.Unlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -366,8 +374,17 @@ func (ep *endpoint) LastError() *tcpip.Error { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error { - return tcpip.ErrNotSupported +func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + ep.mu.Lock() + *o = ep.linger + ep.mu.Unlock() + return nil + + default: + return tcpip.ErrNotSupported + } } // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index fb03e6047..e37c00523 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -84,6 +84,8 @@ type endpoint struct { // Connect(), and is valid only when conneted is true. route stack.Route `state:"manual"` stats tcpip.TransportEndpointStats `state:"nosave"` + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner @@ -511,10 +513,16 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { - switch opt.(type) { + switch v := opt.(type) { case *tcpip.SocketDetachFilterOption: return nil + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() + return nil + default: return tcpip.ErrUnknownProtocolOption } @@ -577,8 +585,17 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
-func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption +func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + e.mu.Lock() + *o = e.linger + e.mu.Unlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 234fb95ce..4778e7b1c 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -69,6 +69,7 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/hash/jenkins", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/ports", "//pkg/tcpip/seqnum", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index faea7f2bb..120483838 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -1317,14 +1317,17 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { // indicating the reason why it's not writable. // Caller must hold e.mu and e.sndBufMu func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { - // The endpoint cannot be written to if it's not connected. - if !e.EndpointState().connected() { - switch e.EndpointState() { - case StateError: - return 0, e.HardError - default: - return 0, tcpip.ErrClosedForSend - } + switch s := e.EndpointState(); { + case s == StateError: + return 0, e.HardError + case !s.connecting() && !s.connected(): + return 0, tcpip.ErrClosedForSend + case s.connecting(): + // As per RFC793, page 56, a send request arriving when in connecting + // state, can be queued to be completed after the state becomes + // connected. Return an error code for the caller of endpoint Write to + // try again, until the connection handshake is complete. 
+ return 0, tcpip.ErrWouldBlock } // Check if the connection has already been closed for sends. diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 63ec12be8..74a17af79 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" @@ -506,22 +507,7 @@ func (p *protocol) SynRcvdCounter() *synRcvdCounter { // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { - // TCP header is variable length, peek at it first. - hdrLen := header.TCPMinimumSize - hdr, ok := pkt.Data.PullUp(hdrLen) - if !ok { - return false - } - - // If the header has options, pull those up as well. - if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() { - // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of - // packets. - hdrLen = offset - } - - _, ok = pkt.TransportHeader().Consume(hdrLen) - return ok + return parse.TCP(pkt) } // NewProtocol returns a TCP transport protocol. 
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index b5d2d0ba6..c78549424 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/header/parse", "//pkg/tcpip/ports", "//pkg/tcpip/stack", "//pkg/tcpip/transport/raw", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index b572c39db..518f636f0 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -154,6 +154,9 @@ type endpoint struct { // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner + + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption } // +stateify savable @@ -810,6 +813,11 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { case *tcpip.SocketDetachFilterOption: return nil + + case *tcpip.LingerOption: + e.mu.Lock() + e.linger = *v + e.mu.Unlock() } return nil } @@ -966,6 +974,11 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { *o = tcpip.BindToDeviceOption(e.bindToDevice) e.mu.RUnlock() + case *tcpip.LingerOption: + e.mu.RLock() + *o = e.linger + e.mu.RUnlock() + default: return tcpip.ErrUnknownProtocolOption } diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 3f87e8057..7d6b91a75 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" @@ -219,8 +220,7 @@ func (*protocol) Wait() {} // Parse implements stack.TransportProtocol.Parse. 
func (*protocol) Parse(pkt *stack.PacketBuffer) bool { - _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) - return ok + return parse.UDP(pkt) } // NewProtocol returns a UDP transport protocol. diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go index b7f873392..06fb823f6 100644 --- a/pkg/test/testutil/testutil.go +++ b/pkg/test/testutil/testutil.go @@ -332,13 +332,13 @@ func PollContext(ctx context.Context, cb func() error) error { } // WaitForHTTP tries GET requests on a port until the call succeeds or timeout. -func WaitForHTTP(port int, timeout time.Duration) error { +func WaitForHTTP(ip string, port int, timeout time.Duration) error { cb := func() error { c := &http.Client{ // Calculate timeout to be able to do minimum 5 attempts. Timeout: timeout / 5, } - url := fmt.Sprintf("http://localhost:%d/", port) + url := fmt.Sprintf("http://%s:%d/", ip, port) resp, err := c.Get(url) if err != nil { log.Printf("Waiting %s: %v", url, err) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 01f62d50a..2d9517f4a 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/abi", "//pkg/abi/linux", "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", "//pkg/control/server", "//pkg/cpuid", diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go index 9b1799416..24e13565e 100644 --- a/runsc/boot/filter/config_amd64.go +++ b/runsc/boot/filter/config_amd64.go @@ -25,7 +25,6 @@ import ( func init() { allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], - seccomp.Rule{seccomp.EqualTo(linux.ARCH_GET_FS)}, seccomp.Rule{seccomp.EqualTo(linux.ARCH_SET_FS)}, ) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 7844ea28c..e36664938 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -21,6 +21,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" 
"gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" @@ -168,26 +169,30 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create } rootProcArgs.MountNamespaceVFS2 = mns + root := mns.Root() + defer root.DecRef(rootCtx) + if root.Mount().ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil { + return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { + panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err)) + } + }() + } + // Mount submounts. if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts vfs2: %w", err) } - if c.root.Readonly || conf.Overlay { - // Switch to ReadOnly after all submounts were setup. - root := mns.Root() - defer root.DecRef(rootCtx) - if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { - return nil, fmt.Errorf(`failed to set mount at "/" readonly: %v`, err) - } - } - return mns, nil } // createMountNamespaceVFS2 creates the container's root mount and namespace. -// The mount is created ReadWrite to allow mount point for submounts to be -// created. ** The caller is responsible to switch to ReadOnly if needed ** func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() data := p9MountData(fd, conf.FileAccess, true /* vfs2 */) @@ -201,21 +206,71 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *c log.Infof("Mounting root over 9P, ioFD: %d", fd) opts := &vfs.MountOptions{ - // Always mount as ReadWrite to allow other mounts on top of it. It'll be - // made ReadOnly in the caller (if needed). 
- ReadOnly: false, + ReadOnly: c.root.Readonly, GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: strings.Join(data, ","), }, InternalMount: true, } - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, opts) + + fsName := gofer.Name + if conf.Overlay && !c.root.Readonly { + log.Infof("Adding overlay on top of root") + var err error + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting root with overlay: %w", err) + } + defer cleanup() + fsName = overlay.Name + } + + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } return mns, nil } +// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper +// layer using tmpfs, and return overlay mount options. "cleanup" must be called +// after the options have been used to mount the overlay, to release refs on +// lower and upper mounts. +func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) { + // First copy options from lower layer to upper layer and overlay. Clear + // filesystem specific options. + upperOpts := *lowerOpts + upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + overlayOpts := *lowerOpts + overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + // Next mount upper and lower. Upper is a tmpfs mount to keep all + // modifications inside the sandbox. + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu := cleanup.Make(func() { upper.DecRef(ctx) }) + defer cu.Clean() + + // All writes go to the upper layer, be paranoid and make lower readonly. 
+ lowerOpts.ReadOnly = true + lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) + if err != nil { + return nil, nil, err + } + cu.Add(func() { lower.DecRef(ctx) }) + + // Configure overlay with both layers. + overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{ + UpperRoot: vfs.MakeVirtualDentry(upper, upper.Root()), + LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())}, + } + return &overlayOpts, cu.Release(), nil +} + func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { mounts, err := c.prepareMountsVFS2() if err != nil { @@ -245,7 +300,7 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. if mnt != nil && mnt.ReadOnly() { // Switch to ReadWrite while we setup submounts. if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { - return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err) + return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err) } // Restore back to ReadOnly at the end. 
defer func() { @@ -297,14 +352,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { } func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) { - root := mns.Root() - defer root.DecRef(ctx) - target := &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(submount.Destination), - } - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount) if err != nil { return nil, fmt.Errorf("mountOptions failed: %w", err) } @@ -313,8 +361,27 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C return nil, nil } - if err := c.k.VFS().MakeSyntheticMountpoint(ctx, submount.Destination, root, creds); err != nil { - return nil, err + if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err) + } + + if useOverlay { + log.Infof("Adding overlay on top of mount %q", submount.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + + root := mns.Root() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), } mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) if err != nil { @@ -326,8 +393,9 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. 
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, error) { +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) { fsName := m.Type + useOverlay := false var data []string // Find filesystem name and FS specific data field. @@ -342,7 +410,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { - return "", nil, err + return "", nil, false, err } case bind: @@ -350,13 +418,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo if m.fd == 0 { // Check that an FD was provided to fails fast. Technically FD=0 is valid, // but unlikely to be correct in this context. - return "", nil, fmt.Errorf("9P mount requires a connection FD") + return "", nil, false, fmt.Errorf("9P mount requires a connection FD") } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + default: log.Warningf("ignoring unknown filesystem type %q", m.Type) - return "", nil, nil + return "", nil, false, nil } opts := &vfs.MountOptions{ @@ -381,11 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo } } - if conf.Overlay { - // All writes go to upper, be paranoid and make lower readonly. - opts.ReadOnly = true - } - return fsName, opts, nil + return fsName, opts, useOverlay, nil } // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. @@ -488,13 +555,25 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. 
mntFD := &mountAndFD{Mount: hint.mount} - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD) if err != nil { return nil, err } if len(fsName) == 0 { return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) } + + if useOverlay { + log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) } @@ -505,7 +584,9 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co return nil, err } - _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + // Ignore data and useOverlay because these were already applied to + // the master mount. 
+ _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) if err != nil { return nil, err } @@ -517,18 +598,39 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co root := mns.Root() defer root.DecRef(ctx) - if err := c.k.VFS().MakeSyntheticMountpoint(ctx, mount.Destination, root, creds); err != nil { - return nil, err - } - target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(mount.Destination), } + + if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err) + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { return nil, err } log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) return newMnt, nil } + +func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error { + root := mns.Root() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dest), + } + // First check if mount point exists. When overlay is enabled, gofer doesn't + // allow changes to the FS, making MakeSytheticMountpoint() ineffective + // because MkdirAt fails with EROFS even if file exists. + vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{}) + if err == nil { + // File exists, we're done. + vd.DecRef(ctx) + return nil + } + return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) +} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index bba00d551..371fcc0ae 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -62,9 +62,8 @@ type Gofer struct { applyCaps bool setUpRoot bool - panicOnWrite bool - specFD int - mountsFD int + specFD int + mountsFD int } // Name implements subcommands.Command. 
@@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") - f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") @@ -168,8 +166,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Start with root mount, then add any other additional mount as needed. 
ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{ - ROMount: spec.Root.Readonly || conf.Overlay, - PanicOnWrite: g.panicOnWrite, + ROMount: spec.Root.Readonly || conf.Overlay, }) if err != nil { Fatalf("creating attach point: %v", err) @@ -181,9 +178,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, m := range spec.Mounts { if specutils.Is9PMount(m) { cfg := fsgofer.Config{ - ROMount: isReadonlyMount(m.Options) || conf.Overlay, - PanicOnWrite: g.panicOnWrite, - HostUDS: conf.FSGoferHostUDS, + ROMount: isReadonlyMount(m.Options) || conf.Overlay, + HostUDS: conf.FSGoferHostUDS, } ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) if err != nil { @@ -316,6 +312,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { if err != nil { return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) } + log.Infof("Create working directory %q if needed", spec.Process.Cwd) if err := os.MkdirAll(dst, 0755); err != nil { return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err) } diff --git a/runsc/container/container.go b/runsc/container/container.go index 6e1d6a568..63478ba8c 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -902,9 +902,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu } args = append(args, "gofer", "--bundle", bundleDir) - if conf.Overlay { - args = append(args, "--panic-on-write=true") - } // Open the spec file to donate to the sandbox. 
specFile, err := specutils.OpenSpec(bundleDir) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index ad49f8b16..ff0e60283 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -317,22 +317,12 @@ func configs(t *testing.T, opts ...configOption) map[string]*config.Config { } func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config { - vfs1 := configs(t, opts...) - - var optsVFS2 []configOption - for _, opt := range opts { - // TODO(gvisor.dev/issue/1487): Enable overlay tests. - if opt != overlay { - optsVFS2 = append(optsVFS2, opt) - } - } - - for key, value := range configs(t, optsVFS2...) { + all := configs(t, opts...) + for key, value := range configs(t, opts...) { value.VFS2 = true - vfs1[key+"VFS2"] = value + all[key+"VFS2"] = value } - - return vfs1 + return all } // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go index 53506b5e1..39f9851a8 100644 --- a/runsc/fsgofer/filter/config_amd64.go +++ b/runsc/fsgofer/filter/config_amd64.go @@ -25,7 +25,6 @@ import ( func init() { allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ - {seccomp.EqualTo(linux.ARCH_GET_FS)}, {seccomp.EqualTo(linux.ARCH_SET_FS)}, } diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 4268d97a1..0b628c8ce 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -1181,9 +1181,6 @@ func extractErrno(err error) unix.Errno { func (l *localFile) checkROMount() error { if conf := l.attachPoint.conf; conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } return unix.EROFS } return nil diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 0e4945b3d..a84206686 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -553,29 +553,6 @@ func TestROMountChecks(t *testing.T) 
{ }) } -func TestROMountPanics(t *testing.T) { - conf := Config{ROMount: true, PanicOnWrite: true} - uid := p9.UID(os.Getuid()) - gid := p9.GID(os.Getgid()) - - runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) { - if s.fileType != unix.S_IFLNK { - assertPanic(t, func() { s.file.Open(p9.WriteOnly) }) - } - assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid) }) - assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, uid, gid) }) - assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") }) - assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", uid, gid) }) - assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) }) - assertPanic(t, func() { s.file.Link(s.file, "some_link") }) - assertPanic(t, func() { s.file.Mknod("some-nod", 0777, 1, 2, uid, gid) }) - - valid := p9.SetAttrMask{Size: true} - attr := p9.SetAttr{Size: 0} - assertPanic(t, func() { s.file.SetAttr(valid, attr) }) - }) -} - func TestWalkNotFound(t *testing.T) { runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT { diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index 809244bab..8425abecb 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -64,9 +64,10 @@ func TestLifeCycle(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 80 if err := d.Create(ctx, dockerutil.RunOpts{ Image: "basic/nginx", - Ports: []int{80}, + Ports: []int{port}, }); err != nil { t.Fatalf("docker create failed: %v", err) } @@ -74,16 +75,15 @@ func TestLifeCycle(t *testing.T) { t.Fatalf("docker start failed: %v", err) } - // Test that container is working. 
- port, err := d.FindPort(ctx, 80) + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("docker.FindPort(80) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } client := http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Errorf("http request failed: %v", err) } @@ -105,27 +105,28 @@ func TestPauseResume(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 8080 if err := d.Spawn(ctx, dockerutil.RunOpts{ Image: "basic/python", - Ports: []int{8080}, // See Dockerfile. + Ports: []int{port}, // See Dockerfile. }); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check that container is working. client := http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Error("http request failed:", err) } @@ -135,7 +136,7 @@ func TestPauseResume(t *testing.T) { // Check if container is paused. client = http.Client{Timeout: 10 * time.Millisecond} // Don't wait a minute. 
- switch _, err := client.Get(fmt.Sprintf("http://localhost:%d", port)); v := err.(type) { + switch _, err := client.Get(fmt.Sprintf("http://%s:%d", ip.String(), port)); v := err.(type) { case nil: t.Errorf("http req expected to fail but it succeeded") case net.Error: @@ -151,13 +152,13 @@ func TestPauseResume(t *testing.T) { } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. client = http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Error("http request failed:", err) } } @@ -179,9 +180,10 @@ func TestCheckpointRestore(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 8080 if err := d.Spawn(ctx, dockerutil.RunOpts{ Image: "basic/python", - Ports: []int{8080}, // See Dockerfile. + Ports: []int{port}, // See Dockerfile. }); err != nil { t.Fatalf("docker run failed: %v", err) } @@ -199,20 +201,20 @@ func TestCheckpointRestore(t *testing.T) { t.Fatalf("docker restore failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. 
client := http.Client{Timeout: defaultWait} - if err := httpRequestSucceeds(client, "localhost", port); err != nil { + if err := httpRequestSucceeds(client, ip.String(), port); err != nil { t.Error("http request failed:", err) } } diff --git a/test/fuse/linux/rmdir_test.cc b/test/fuse/linux/rmdir_test.cc index 913d3f910..e3200e446 100644 --- a/test/fuse/linux/rmdir_test.cc +++ b/test/fuse/linux/rmdir_test.cc @@ -38,6 +38,7 @@ namespace { class RmDirTest : public FuseTest { protected: const std::string test_dir_name_ = "test_dir"; + const std::string test_subdir_ = "test_subdir"; const mode_t test_dir_mode_ = S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO; }; @@ -67,6 +68,32 @@ TEST_F(RmDirTest, NormalRmDir) { EXPECT_EQ(std::string(actual_dirname.data()), test_dir_name_); } +TEST_F(RmDirTest, NormalRmDirSubdir) { + SetServerInodeLookup(test_subdir_, S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO); + const std::string test_dir_path_ = + JoinPath(mount_point_.path().c_str(), test_subdir_, test_dir_name_); + SetServerInodeLookup(test_dir_name_, test_dir_mode_); + + // RmDir code. 
+ struct fuse_out_header rmdir_header = { + .len = sizeof(struct fuse_out_header), + }; + + auto iov_out = FuseGenerateIovecs(rmdir_header); + SetServerResponse(FUSE_RMDIR, iov_out); + + ASSERT_THAT(rmdir(test_dir_path_.c_str()), SyscallSucceeds()); + + struct fuse_in_header in_header; + std::vector<char> actual_dirname(test_dir_name_.length() + 1); + auto iov_in = FuseGenerateIovecs(in_header, actual_dirname); + GetServerActualRequest(iov_in); + + EXPECT_EQ(in_header.len, sizeof(in_header) + test_dir_name_.length() + 1); + EXPECT_EQ(in_header.opcode, FUSE_RMDIR); + EXPECT_EQ(std::string(actual_dirname.data()), test_dir_name_); +} + } // namespace } // namespace testing diff --git a/test/fuse/linux/unlink_test.cc b/test/fuse/linux/unlink_test.cc index 5702e9b32..13efbf7c7 100644 --- a/test/fuse/linux/unlink_test.cc +++ b/test/fuse/linux/unlink_test.cc @@ -37,6 +37,7 @@ namespace { class UnlinkTest : public FuseTest { protected: const std::string test_file_ = "test_file"; + const std::string test_subdir_ = "test_subdir"; }; TEST_F(UnlinkTest, RegularFile) { @@ -61,6 +62,29 @@ TEST_F(UnlinkTest, RegularFile) { EXPECT_EQ(std::string(unlinked_file.data()), test_file_); } +TEST_F(UnlinkTest, RegularFileSubDir) { + SetServerInodeLookup(test_subdir_, S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO); + const std::string test_file_path = + JoinPath(mount_point_.path().c_str(), test_subdir_, test_file_); + SetServerInodeLookup(test_file_, S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO); + + struct fuse_out_header out_header = { + .len = sizeof(struct fuse_out_header), + }; + auto iov_out = FuseGenerateIovecs(out_header); + SetServerResponse(FUSE_UNLINK, iov_out); + + ASSERT_THAT(unlink(test_file_path.c_str()), SyscallSucceeds()); + struct fuse_in_header in_header; + std::vector<char> unlinked_file(test_file_.length() + 1); + auto iov_in = FuseGenerateIovecs(in_header, unlinked_file); + GetServerActualRequest(iov_in); + + EXPECT_EQ(in_header.len, sizeof(in_header) + test_file_.length() + 1); + 
EXPECT_EQ(in_header.opcode, FUSE_UNLINK); + EXPECT_EQ(std::string(unlinked_file.data()), test_file_); +} + TEST_F(UnlinkTest, NoFile) { const std::string test_file_path = JoinPath(mount_point_.path().c_str(), test_file_); diff --git a/test/image/image_test.go b/test/image/image_test.go index ac6186688..968e62f63 100644 --- a/test/image/image_test.go +++ b/test/image/image_test.go @@ -63,8 +63,8 @@ func TestHelloWorld(t *testing.T) { } } -func runHTTPRequest(port int) error { - url := fmt.Sprintf("http://localhost:%d/not-found", port) +func runHTTPRequest(ip string, port int) error { + url := fmt.Sprintf("http://%s:%d/not-found", ip, port) resp, err := http.Get(url) if err != nil { return fmt.Errorf("error reaching http server: %v", err) @@ -73,7 +73,7 @@ func runHTTPRequest(port int) error { return fmt.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) } - url = fmt.Sprintf("http://localhost:%d/latin10k.txt", port) + url = fmt.Sprintf("http://%s:%d/latin10k.txt", ip, port) resp, err = http.Get(url) if err != nil { return fmt.Errorf("Error reaching http server: %v", err) @@ -95,13 +95,13 @@ func runHTTPRequest(port int) error { return nil } -func testHTTPServer(t *testing.T, port int) { +func testHTTPServer(t *testing.T, ip string, port int) { const requests = 10 ch := make(chan error, requests) for i := 0; i < requests; i++ { go func() { start := time.Now() - err := runHTTPRequest(port) + err := runHTTPRequest(ip, port) log.Printf("Response time %v: %v", time.Since(start).String(), err) ch <- err }() @@ -110,7 +110,7 @@ func testHTTPServer(t *testing.T, port int) { for i := 0; i < requests; i++ { err := <-ch if err != nil { - t.Errorf("testHTTPServer(%d) failed: %v", port, err) + t.Errorf("testHTTPServer(%s, %d) failed: %v", ip, port, err) } } } @@ -121,27 +121,28 @@ func TestHttpd(t *testing.T) { defer d.CleanUp(ctx) // Start the container. 
+ port := 80 opts := dockerutil.RunOpts{ Image: "basic/httpd", - Ports: []int{80}, + Ports: []int{port}, } d.CopyFiles(&opts, "/usr/local/apache2/htdocs", "test/image/latin10k.txt") if err := d.Spawn(ctx, opts); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 80 is mapped to. - port, err := d.FindPort(ctx, 80) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(80) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Errorf("WaitForHTTP() timeout: %v", err) } - testHTTPServer(t, port) + testHTTPServer(t, ip.String(), port) } func TestNginx(t *testing.T) { @@ -150,27 +151,28 @@ func TestNginx(t *testing.T) { defer d.CleanUp(ctx) // Start the container. + port := 80 opts := dockerutil.RunOpts{ Image: "basic/nginx", - Ports: []int{80}, + Ports: []int{port}, } d.CopyFiles(&opts, "/usr/share/nginx/html", "test/image/latin10k.txt") if err := d.Spawn(ctx, opts); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 80 is mapped to. - port, err := d.FindPort(ctx, 80) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(80) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Errorf("WaitForHTTP() timeout: %v", err) } - testHTTPServer(t, port) + testHTTPServer(t, ip.String(), port) } func TestMysql(t *testing.T) { @@ -218,26 +220,27 @@ func TestTomcat(t *testing.T) { defer d.CleanUp(ctx) // Start the server. 
+ port := 8080 if err := d.Spawn(ctx, dockerutil.RunOpts{ Image: "basic/tomcat", - Ports: []int{8080}, + Ports: []int{port}, }); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, defaultWait); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, defaultWait); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Ensure that content is being served. - url := fmt.Sprintf("http://localhost:%d", port) + url := fmt.Sprintf("http://%s:%d", ip.String(), port) resp, err := http.Get(url) if err != nil { t.Errorf("Error reaching http server: %v", err) @@ -253,28 +256,29 @@ func TestRuby(t *testing.T) { defer d.CleanUp(ctx) // Execute the ruby workload. + port := 8080 opts := dockerutil.RunOpts{ Image: "basic/ruby", - Ports: []int{8080}, + Ports: []int{port}, } d.CopyFiles(&opts, "/src", "test/image/ruby.rb", "test/image/ruby.sh") if err := d.Spawn(ctx, opts, "/src/ruby.sh"); err != nil { t.Fatalf("docker run failed: %v", err) } - // Find where port 8080 is mapped to. - port, err := d.FindPort(ctx, 8080) + // Find container IP address. + ip, err := d.FindIP(ctx, false) if err != nil { - t.Fatalf("FindPort(8080) failed: %v", err) + t.Fatalf("docker.FindIP failed: %v", err) } // Wait until it's up and running, 'gem install' can take some time. - if err := testutil.WaitForHTTP(port, time.Minute); err != nil { + if err := testutil.WaitForHTTP(ip.String(), port, time.Minute); err != nil { t.Fatalf("WaitForHTTP() timeout: %v", err) } // Ensure that content is being served. 
- url := fmt.Sprintf("http://localhost:%d", port) + url := fmt.Sprintf("http://%s:%d", ip.String(), port) resp, err := http.Get(url) if err != nil { t.Errorf("error reaching http server: %v", err) diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index e2beb30d5..398f70ecd 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -48,6 +48,13 @@ func singleTest(t *testing.T, test TestCase) { } } +// TODO(gvisor.dev/issue/3549): IPv6 NAT support. +func ipv4Test(t *testing.T, test TestCase) { + t.Run("IPv4", func(t *testing.T) { + iptablesTest(t, test, false) + }) +} + func iptablesTest(t *testing.T, test TestCase, ipv6 bool) { if _, ok := Tests[test.Name()]; !ok { t.Fatalf("no test found with name %q. Has it been registered?", test.Name()) @@ -72,11 +79,6 @@ func iptablesTest(t *testing.T, test TestCase, ipv6 bool) { d.CleanUp(context.Background()) }() - // TODO(gvisor.dev/issue/170): Skipping IPv6 gVisor tests. - if ipv6 && dockerutil.Runtime() != "runc" { - t.Skip("gVisor ip6tables not yet implemented") - } - // Create and start the container. 
opts := dockerutil.RunOpts{ Image: "iptables", @@ -314,75 +316,75 @@ func TestInputInvertDestination(t *testing.T) { singleTest(t, FilterInputInvertDestination{}) } -func TestOutputDestination(t *testing.T) { +func TestFilterOutputDestination(t *testing.T) { singleTest(t, FilterOutputDestination{}) } -func TestOutputInvertDestination(t *testing.T) { +func TestFilterOutputInvertDestination(t *testing.T) { singleTest(t, FilterOutputInvertDestination{}) } func TestNATPreRedirectUDPPort(t *testing.T) { - singleTest(t, NATPreRedirectUDPPort{}) + ipv4Test(t, NATPreRedirectUDPPort{}) } func TestNATPreRedirectTCPPort(t *testing.T) { - singleTest(t, NATPreRedirectTCPPort{}) + ipv4Test(t, NATPreRedirectTCPPort{}) } func TestNATPreRedirectTCPOutgoing(t *testing.T) { - singleTest(t, NATPreRedirectTCPOutgoing{}) + ipv4Test(t, NATPreRedirectTCPOutgoing{}) } func TestNATOutRedirectTCPIncoming(t *testing.T) { - singleTest(t, NATOutRedirectTCPIncoming{}) + ipv4Test(t, NATOutRedirectTCPIncoming{}) } func TestNATOutRedirectUDPPort(t *testing.T) { - singleTest(t, NATOutRedirectUDPPort{}) + ipv4Test(t, NATOutRedirectUDPPort{}) } func TestNATOutRedirectTCPPort(t *testing.T) { - singleTest(t, NATOutRedirectTCPPort{}) + ipv4Test(t, NATOutRedirectTCPPort{}) } func TestNATDropUDP(t *testing.T) { - singleTest(t, NATDropUDP{}) + ipv4Test(t, NATDropUDP{}) } func TestNATAcceptAll(t *testing.T) { - singleTest(t, NATAcceptAll{}) + ipv4Test(t, NATAcceptAll{}) } func TestNATOutRedirectIP(t *testing.T) { - singleTest(t, NATOutRedirectIP{}) + ipv4Test(t, NATOutRedirectIP{}) } func TestNATOutDontRedirectIP(t *testing.T) { - singleTest(t, NATOutDontRedirectIP{}) + ipv4Test(t, NATOutDontRedirectIP{}) } func TestNATOutRedirectInvert(t *testing.T) { - singleTest(t, NATOutRedirectInvert{}) + ipv4Test(t, NATOutRedirectInvert{}) } func TestNATPreRedirectIP(t *testing.T) { - singleTest(t, NATPreRedirectIP{}) + ipv4Test(t, NATPreRedirectIP{}) } func TestNATPreDontRedirectIP(t *testing.T) { - singleTest(t, 
NATPreDontRedirectIP{}) + ipv4Test(t, NATPreDontRedirectIP{}) } func TestNATPreRedirectInvert(t *testing.T) { - singleTest(t, NATPreRedirectInvert{}) + ipv4Test(t, NATPreRedirectInvert{}) } func TestNATRedirectRequiresProtocol(t *testing.T) { - singleTest(t, NATRedirectRequiresProtocol{}) + ipv4Test(t, NATRedirectRequiresProtocol{}) } func TestNATLoopbackSkipsPrerouting(t *testing.T) { - singleTest(t, NATLoopbackSkipsPrerouting{}) + ipv4Test(t, NATLoopbackSkipsPrerouting{}) } func TestInputSource(t *testing.T) { @@ -419,9 +421,9 @@ func TestFilterAddrs(t *testing.T) { } func TestNATPreOriginalDst(t *testing.T) { - singleTest(t, NATPreOriginalDst{}) + ipv4Test(t, NATPreOriginalDst{}) } func TestNATOutOriginalDst(t *testing.T) { - singleTest(t, NATOutOriginalDst{}) + ipv4Test(t, NATOutOriginalDst{}) } diff --git a/test/packetimpact/tests/BUILD b/test/packetimpact/tests/BUILD index f850dfcd8..fbfea61e1 100644 --- a/test/packetimpact/tests/BUILD +++ b/test/packetimpact/tests/BUILD @@ -272,6 +272,16 @@ packetimpact_go_test( ) packetimpact_go_test( + name = "tcp_queue_send_in_syn_sent", + srcs = ["tcp_queue_send_in_syn_sent_test.go"], + deps = [ + "//pkg/tcpip/header", + "//test/packetimpact/testbench", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +packetimpact_go_test( name = "icmpv6_param_problem", srcs = ["icmpv6_param_problem_test.go"], # TODO(b/153485026): Fix netstack then remove the line below. diff --git a/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go new file mode 100644 index 000000000..0ec8fd748 --- /dev/null +++ b/test/packetimpact/tests/tcp_queue_send_in_syn_sent_test.go @@ -0,0 +1,133 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_queue_send_in_syn_sent_test + +import ( + "context" + "errors" + "flag" + "net" + "sync" + "syscall" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/test/packetimpact/testbench" +) + +func init() { + testbench.RegisterFlags(flag.CommandLine) +} + +// TestQueueSendInSynSent tests send behavior when the TCP state +// is SYN-SENT. +// It tests for 2 variants when in SYN_SENT state and: +// (1) DUT blocks on send and complete handshake +// (2) DUT blocks on send and receive a TCP RST. +func TestQueueSendInSynSent(t *testing.T) { + for _, tt := range []struct { + description string + reset bool + }{ + {description: "Complete handshake", reset: false}, + {description: "Send RST", reset: true}, + } { + t.Run(tt.description, func(t *testing.T) { + dut := testbench.NewDUT(t) + defer dut.TearDown() + + socket, remotePort := dut.CreateBoundSocket(t, unix.SOCK_STREAM, unix.IPPROTO_TCP, net.ParseIP(testbench.RemoteIPv4)) + conn := testbench.NewTCPIPv4(t, testbench.TCP{DstPort: &remotePort}, testbench.TCP{SrcPort: &remotePort}) + defer conn.Close(t) + + sampleData := []byte("Sample Data") + samplePayload := &testbench.Payload{Bytes: sampleData} + dut.SetNonBlocking(t, socket, true) + if _, err := dut.ConnectWithErrno(context.Background(), t, socket, conn.LocalAddr(t)); !errors.Is(err, syscall.EINPROGRESS) { + t.Fatalf("failed to bring DUT to SYN-SENT, got: %s, want EINPROGRESS", err) + } + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn)}, time.Second); 
err != nil { + t.Fatalf("expected a SYN from DUT, but got none: %s", err) + } + if _, err := dut.SendWithErrno(context.Background(), t, socket, sampleData, 0); err != syscall.Errno(unix.EWOULDBLOCK) { + t.Fatalf("expected error %s, got %s", syscall.Errno(unix.EWOULDBLOCK), err) + } + + // Test blocking write. + dut.SetNonBlocking(t, socket, false) + + var wg sync.WaitGroup + defer wg.Wait() + wg.Add(1) + var block sync.WaitGroup + block.Add(1) + go func() { + defer wg.Done() + ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) + defer cancel() + + block.Done() + // Issue SEND call in SYN-SENT, this should be queued for + // process until the connection is established. + n, err := dut.SendWithErrno(ctx, t, socket, sampleData, 0) + if tt.reset { + if err != syscall.Errno(unix.ECONNREFUSED) { + t.Errorf("expected error %s, got %s", syscall.Errno(unix.ECONNREFUSED), err) + } + if n != -1 { + t.Errorf("expected return value %d, got %d", -1, n) + } + return + } + if n != int32(len(sampleData)) { + t.Errorf("failed to send on DUT: %s", err) + } + }() + + // Wait for the goroutine to be scheduled and before it + // blocks on endpoint send. + block.Wait() + // The following sleep is used to prevent the connection + // from being established before we are blocked on send. + time.Sleep(100 * time.Millisecond) + + if tt.reset { + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagRst | header.TCPFlagAck)}) + return + } + + // Bring the connection to Established. + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagSyn | header.TCPFlagAck)}) + + // Expect the data from the DUT's enqueued send request. + // + // On Linux, this can be piggybacked with the ACK completing the + // handshake. On gVisor, getting such a piggyback is a bit more + // complicated because the actual data enqueuing occurs in the + // callers of endpoint Write. 
+ if _, err := conn.ExpectData(t, &testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, samplePayload, time.Second); err != nil { + t.Fatalf("expected payload was not received: %s", err) + } + + // Send sample payload and expect an ACK to ensure connection is still ESTABLISHED. + conn.Send(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagPsh | header.TCPFlagAck)}, &testbench.Payload{Bytes: sampleData}) + if _, err := conn.Expect(t, testbench.TCP{Flags: testbench.Uint8(header.TCPFlagAck)}, time.Second); err != nil { + t.Fatalf("expected an ACK from DUT, but got none: %s", err) + } + }) + } +} diff --git a/test/runtimes/exclude/nodejs12.4.0.csv b/test/runtimes/exclude/nodejs12.4.0.csv index 1740dbb76..749fb9482 100644 --- a/test/runtimes/exclude/nodejs12.4.0.csv +++ b/test/runtimes/exclude/nodejs12.4.0.csv @@ -1,4 +1,5 @@ test name,bug id,comment +async-hooks/test-statwatcher.js,https://github.com/nodejs/node/issues/21425,Check for fix inclusion in nodejs releases after 2020-03-29 benchmark/test-benchmark-fs.js,, benchmark/test-benchmark-napi.js,, doctool/test-make-doc.js,b/68848110,Expected to fail. 
diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index cabc2b751..edd23e063 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -179,6 +179,12 @@ TEST_F(AllocateTest, FallocateOtherFDs) { auto sock0 = FileDescriptor(socks[0]); auto sock1 = FileDescriptor(socks[1]); EXPECT_THAT(fallocate(sock0.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); + + int pipefds[2]; + ASSERT_THAT(pipe(pipefds), SyscallSucceeds()); + EXPECT_THAT(fallocate(pipefds[1], 0, 0, 10), SyscallFailsWithErrno(ESPIPE)); + close(pipefds[0]); + close(pipefds[1]); } } // namespace diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc index f3c1d6bc9..b558e3a01 100644 --- a/test/syscalls/linux/packet_socket_raw.cc +++ b/test/syscalls/linux/packet_socket_raw.cc @@ -643,6 +643,27 @@ TEST_P(RawPacketTest, GetSocketDetachFilter) { SyscallFailsWithErrno(ENOPROTOOPT)); } +TEST_P(RawPacketTest, SetAndGetSocketLinger) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int level = SOL_SOCKET; + int type = SO_LINGER; + + struct linger sl; + sl.l_onoff = 1; + sl.l_linger = 5; + ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)), + SyscallSucceedsWithValue(0)); + + struct linger got_linger = {}; + socklen_t length = sizeof(sl); + ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length), + SyscallSucceedsWithValue(0)); + + ASSERT_EQ(length, sizeof(got_linger)); + EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); +} + INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest, ::testing::Values(ETH_P_IP, ETH_P_ALL)); diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc index 3de898df7..1b9dbc584 100644 --- a/test/syscalls/linux/raw_socket_icmp.cc +++ b/test/syscalls/linux/raw_socket_icmp.cc @@ -416,6 +416,28 @@ TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) { ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp)); } +// Set and get SO_LINGER. 
+TEST_F(RawSocketICMPTest, SetAndGetSocketLinger) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))); + + int level = SOL_SOCKET; + int type = SO_LINGER; + + struct linger sl; + sl.l_onoff = 1; + sl.l_linger = 5; + ASSERT_THAT(setsockopt(s_, level, type, &sl, sizeof(sl)), + SyscallSucceedsWithValue(0)); + + struct linger got_linger = {}; + socklen_t length = sizeof(sl); + ASSERT_THAT(getsockopt(s_, level, type, &got_linger, &length), + SyscallSucceedsWithValue(0)); + + ASSERT_EQ(length, sizeof(got_linger)); + EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); +} + void RawSocketICMPTest::ExpectICMPSuccess(const struct icmphdr& icmp) { // We're going to receive both the echo request and reply, but the order is // indeterminate. diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc index 6e4ecd680..3f2c0fdf2 100644 --- a/test/syscalls/linux/socket_ip_udp_generic.cc +++ b/test/syscalls/linux/socket_ip_udp_generic.cc @@ -451,7 +451,7 @@ TEST_P(UDPSocketPairTest, TClassRecvMismatch) { } // Test the SO_LINGER option can be set/get on udp socket. -TEST_P(UDPSocketPairTest, SoLingerFail) { +TEST_P(UDPSocketPairTest, SetAndGetSocketLinger) { auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); int level = SOL_SOCKET; int type = SO_LINGER; @@ -469,15 +469,7 @@ TEST_P(UDPSocketPairTest, SoLingerFail) { SyscallSucceedsWithValue(0)); ASSERT_EQ(length, sizeof(got_linger)); - // Linux returns the values which are set in the SetSockOpt for SO_LINGER. - // In gVisor, we do not store the linger values for UDP as SO_LINGER for UDP - // is a no-op. 
- if (IsRunningOnGvisor()) { - struct linger want_linger = {}; - EXPECT_EQ(0, memcmp(&want_linger, &got_linger, length)); - } else { - EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); - } + EXPECT_EQ(0, memcmp(&sl, &got_linger, length)); } } // namespace testing diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc index 99e77b89e..1edcb15a7 100644 --- a/test/syscalls/linux/socket_unix_stream.cc +++ b/test/syscalls/linux/socket_unix_stream.cc @@ -103,6 +103,24 @@ TEST_P(StreamUnixSocketPairTest, Sendto) { SyscallFailsWithErrno(EISCONN)); } +TEST_P(StreamUnixSocketPairTest, SetAndGetSocketLinger) { + auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); + + struct linger sl = {1, 5}; + EXPECT_THAT( + setsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)), + SyscallSucceedsWithValue(0)); + + struct linger got_linger = {}; + socklen_t length = sizeof(sl); + EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER, + &got_linger, &length), + SyscallSucceedsWithValue(0)); + + ASSERT_EQ(length, sizeof(got_linger)); + EXPECT_EQ(0, memcmp(&got_linger, &sl, length)); +} + INSTANTIATE_TEST_SUITE_P( AllUnixDomainSockets, StreamUnixSocketPairTest, ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>( diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc index ce1899f45..52b67249d 100644 --- a/test/syscalls/linux/vdso_clock_gettime.cc +++ b/test/syscalls/linux/vdso_clock_gettime.cc @@ -38,8 +38,6 @@ std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) { switch (info.param) { case CLOCK_MONOTONIC: return "CLOCK_MONOTONIC"; - case CLOCK_REALTIME: - return "CLOCK_REALTIME"; case CLOCK_BOOTTIME: return "CLOCK_BOOTTIME"; default: @@ -47,59 +45,31 @@ std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) { } } -class CorrectVDSOClockTest : public ::testing::TestWithParam<clockid_t> {}; +class MonotonicVDSOClockTest : public 
::testing::TestWithParam<clockid_t> {}; -TEST_P(CorrectVDSOClockTest, IsCorrect) { +TEST_P(MonotonicVDSOClockTest, IsCorrect) { + // Check that when we alternate readings from the clock_gettime syscall and + // the VDSO's implementation, we observe the combined sequence as being + // monotonic. struct timespec tvdso, tsys; absl::Time vdso_time, sys_time; - uint64_t total_calls = 0; - - // It is expected that 82.5% of clock_gettime calls will be less than 100us - // skewed from the system time. - // Unfortunately this is not only influenced by the VDSO clock skew, but also - // by arbitrary scheduling delays and the like. The test is therefore - // regularly disabled. - std::map<absl::Duration, std::tuple<double, uint64_t, uint64_t>> confidence = - { - {absl::Microseconds(100), std::make_tuple(0.825, 0, 0)}, - {absl::Microseconds(250), std::make_tuple(0.94, 0, 0)}, - {absl::Milliseconds(1), std::make_tuple(0.999, 0, 0)}, - }; - - absl::Time start = absl::Now(); - while (absl::Now() < start + absl::Seconds(30)) { - EXPECT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds()); - EXPECT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys), - SyscallSucceeds()); - + ASSERT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys), + SyscallSucceeds()); + sys_time = absl::TimeFromTimespec(tsys); + auto end = absl::Now() + absl::Seconds(10); + while (absl::Now() < end) { + ASSERT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds()); vdso_time = absl::TimeFromTimespec(tvdso); - - for (auto const& conf : confidence) { - std::get<1>(confidence[conf.first]) += - (sys_time - vdso_time) < conf.first; - } - + EXPECT_LE(sys_time, vdso_time); + ASSERT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys), + SyscallSucceeds()); sys_time = absl::TimeFromTimespec(tsys); - - for (auto const& conf : confidence) { - std::get<2>(confidence[conf.first]) += - (vdso_time - sys_time) < conf.first; - } - - ++total_calls; - } - - for (auto const& conf : confidence) { - 
EXPECT_GE(std::get<1>(conf.second) / static_cast<double>(total_calls), - std::get<0>(conf.second)); - EXPECT_GE(std::get<2>(conf.second) / static_cast<double>(total_calls), - std::get<0>(conf.second)); + EXPECT_LE(vdso_time, sys_time); } } -INSTANTIATE_TEST_SUITE_P(ClockGettime, CorrectVDSOClockTest, - ::testing::Values(CLOCK_MONOTONIC, CLOCK_REALTIME, - CLOCK_BOOTTIME), +INSTANTIATE_TEST_SUITE_P(ClockGettime, MonotonicVDSOClockTest, + ::testing::Values(CLOCK_MONOTONIC, CLOCK_BOOTTIME), PrintClockId); } // namespace diff --git a/tools/bazel.mk b/tools/bazel.mk index 3e27af7d1..5cc1cdea2 100644 --- a/tools/bazel.mk +++ b/tools/bazel.mk @@ -31,6 +31,7 @@ DOCKER_PRIVILEGED ?= --privileged BAZEL_CACHE := $(shell readlink -m ~/.cache/bazel/) GCLOUD_CONFIG := $(shell readlink -m ~/.config/gcloud/) DOCKER_SOCKET := /var/run/docker.sock +DOCKER_CONFIG := /etc/docker/daemon.json # Bazel flags. BAZEL := bazel $(STARTUP_OPTIONS) @@ -56,6 +57,9 @@ endif # Add docker passthrough options. ifneq ($(DOCKER_PRIVILEGED),) FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_SOCKET):$(DOCKER_SOCKET)" +# TODO(gvisor.dev/issue/1624): Remove docker config volume. This is required +# temporarily for checking VFS1 vs VFS2 by some tests. 
+FULL_DOCKER_RUN_OPTIONS += -v "$(DOCKER_CONFIG):$(DOCKER_CONFIG)" FULL_DOCKER_RUN_OPTIONS += $(DOCKER_PRIVILEGED) FULL_DOCKER_EXEC_OPTIONS += $(DOCKER_PRIVILEGED) DOCKER_GROUP := $(shell stat -c '%g' $(DOCKER_SOCKET)) diff --git a/website/_config.yml b/website/_config.yml index b08602970..20fbb3d2d 100644 --- a/website/_config.yml +++ b/website/_config.yml @@ -34,3 +34,6 @@ authors: igudger: name: Ian Gudger email: igudger@google.com + fvoznika: + name: Fabricio Voznika + email: fvoznika@google.com diff --git a/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png Binary files differnew file mode 100644 index 000000000..c750f0851 --- /dev/null +++ b/website/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png diff --git a/website/blog/2019-11-18-security-basics.md b/website/blog/2019-11-18-security-basics.md index 2256ee9d5..b6cf57a77 100644 --- a/website/blog/2019-11-18-security-basics.md +++ b/website/blog/2019-11-18-security-basics.md @@ -279,8 +279,10 @@ weaknesses of each gVisor component. We will also use it to introduce Google's Vulnerability Reward Program[^14], and other ways the community can contribute to help make gVisor safe, fast and stable. 
+<br> +<br> -## Notes +-------------------------------------------------------------------------------- [^1]: [https://en.wikipedia.org/wiki/Secure_by_design](https://en.wikipedia.org/wiki/Secure_by_design) [^2]: [https://gvisor.dev/docs/architecture_guide](https://gvisor.dev/docs/architecture_guide/) diff --git a/website/blog/2020-09-18-containing-a-real-vulnerability.md b/website/blog/2020-09-18-containing-a-real-vulnerability.md new file mode 100644 index 000000000..b71ef63d9 --- /dev/null +++ b/website/blog/2020-09-18-containing-a-real-vulnerability.md @@ -0,0 +1,224 @@ +# Containing a Real Vulnerability + +In the previous two posts we talked about gVisor's +[security design principles](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/) +as well as how those are applied in the +[context of networking](https://gvisor.dev/blog/2020/04/02/gvisor-networking-security/). +Recently, a new container escape vulnerability +([CVE-2020-14386](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-14386)) +was announced that ties these topics well together. gVisor is +[not vulnerable](https://seclists.org/oss-sec/2020/q3/168) to this specific +issue, but it provides an interesting case study to continue our exploration of +gVisor's security. While gVisor is not immune to vulnerabilities, +[we take several steps](https://gvisor.dev/security/) to minimize the impact and +remediate if a vulnerability is found. + +## Escaping the Container + +First, let’s describe how the discovered vulnerability works. There are numerous +ways one can send and receive bytes over the network with Linux. One of the most +performant ways is to use a ring buffer, which is a memory region shared by the +application and the kernel. 
These rings are created by calling +[setsockopt(2)](https://man7.org/linux/man-pages/man2/setsockopt.2.html) with +[`PACKET_RX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for +receiving and +[`PACKET_TX_RING`](https://man7.org/linux/man-pages/man7/packet.7.html) for +sending packets. + +The vulnerability is in the code that reads packets when `PACKET_RX_RING` is +enabled. There is another option +([`PACKET_RESERVE`](https://man7.org/linux/man-pages/man7/packet.7.html)) that +asks the kernel to leave some space in the ring buffer before each packet for +anything the application needs, e.g. control structures. When a packet is +received, the kernel calculates where to copy the packet to, taking the amount +reserved before each packet into consideration. If the amount reserved is large, +the kernel performed an incorrect calculation which could cause an overflow +leading to an out-of-bounds write of up to 10 bytes, controlled by the attacker. +The data in the write is easily controlled using the loopback to send a crafted +packet and receiving it using a `PACKET_RX_RING` with a carefully selected +`PACKET_RESERVE` size. + +```c +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ +// ... + if (sk->sk_type == SOCK_DGRAM) { + macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + + po->tp_reserve; + } else { + unsigned int maclen = skb_network_offset(skb); + // tp_reserve is unsigned int, netoff is unsigned short. Addition can overflow netoff + netoff = TPACKET_ALIGN(po->tp_hdrlen + + (maclen < 16 ? 16 : maclen)) + + po->tp_reserve; + if (po->has_vnet_hdr) { + netoff += sizeof(struct virtio_net_hdr); + do_vnet = true; + } + // Attacker controls netoff and can make macoff be smaller than sizeof(struct virtio_net_hdr) + macoff = netoff - maclen; + } +// ... 
+ // "macoff - sizeof(struct virtio_net_hdr)" can be negative, resulting in a pointer before h.raw + if (do_vnet && + virtio_net_hdr_from_skb(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr), + vio_le(), true, 0)) { +// ... +``` + +The [`CAP_NET_RAW`](https://man7.org/linux/man-pages/man7/capabilities.7.html) +capability is required to create the socket above. However, in order to support +common debugging tools like `ping` and `tcpdump`, Docker containers, including +those created for Kubernetes, are given `CAP_NET_RAW` by default and thus may be +able to trigger this vulnerability to elevate privileges and escape the +container. + +Next, we are going to explore why this vulnerability doesn’t work in gVisor, and +how gVisor could prevent the escape even if a similar vulnerability existed +inside gVisor’s kernel. + +## Default Protections + +gVisor does not implement `PACKET_RX_RING`, but **does** support raw sockets +which are required for `PACKET_RX_RING`. Raw sockets are a controversial feature +to support in a sandbox environment. While it allows great customizations for +essential tools like `ping`, it may allow packets to be written to the network +without any validation. In general, allowing an untrusted application to write +crafted packets to the network is a questionable idea and a historical source of +vulnerabilities. With that in mind, if `CAP_NET_RAW` is enabled by default, it +would not be _secure by default_ to run untrusted applications. + +After multiple discussions when raw sockets were first implemented, we decided +to disable raw sockets by default, **even if `CAP_NET_RAW` is given to the +application**. Instead, enabling raw sockets in gVisor requires the admin to set +`--net-raw` flag to runsc when configuring the runtime, in addition to requiring +the `CAP_NET_RAW` capability in the application. 
It comes at the expense that +some tools may not work out of the box, but as part of our +[secure-by-default](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#secure-by-default) +principle, we felt that it was important for the “less secure” configuration to +be explicit. + +Since this bug was due to an overflow in the specific Linux implementation of +the packet ring, gVisor's raw socket implementation is not affected. However, if +there were a vulnerability in gVisor, containers would not be allowed to exploit +it by default. + +As an alternative way to implement this same constraint, Kubernetes allows +[admission controllers](https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/) +to be configured to customize requests. Cloud providers can use this to +implement more stringent policies. For example, GKE implements an admission +controller for gVisor that +[removes `CAP_NET_RAW` from gVisor pods](https://cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods#capabilities) +unless it has been explicitly set in the pod spec. + +## Isolated Kernel + +gVisor has its own application kernel, called the Sentry, that is distinct from +the host kernel. Just like what you would expect from a kernel, gVisor has a +memory management subsystem, virtual file system, and a full network stack. The +host network is only used as a transport to carry packets in and out of the +sandbox[^1]. The loopback interface which is used in the exploit stays +completely inside the sandbox, never reaching the host. + +Therefore, even if the Sentry was vulnerable to the attack, there would be two +factors that would prevent a container escape from happening. First, the +vulnerability would be limited to the Sentry, and the attacker would compromise +only the application kernel, bound by a restricted set of +[seccomp](https://en.wikipedia.org/wiki/Seccomp) filters, discussed more in +depth below. 
Second, the Sentry is a distinct implementation of the API, written +in Go, which provides bounds checking that would have likely prevented access +past the bounds of the shared region (e.g. see +[aio](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/syscalls/linux/vfs2/aio.go;l=210;drc=a11061d78a58ed75b10606d1a770b035ed944b66?q=file:aio&ss=gvisor%2Fgvisor) +or +[kcov](https://cs.opensource.google/gvisor/gvisor/+/master:pkg/sentry/kernel/kcov.go;l=272?q=file:kcov&ss=gvisor%2Fgvisor), +which have similar shared regions). + +Here, Kubernetes warrants slightly more explanation. gVisor makes pods the unit +of isolation and a pod can run multiple containers. In other words, each pod is +a gVisor instance, and each container is a set of processes running inside +gVisor, isolated via Sentry-internal namespaces like regular containers inside a +pod. If there were a vulnerability in gVisor, the privilege escalation would +allow a container inside the pod to break out to other **containers inside the +same pod**, but the container still **cannot break out of the pod**. + +## Defense in Depth + +gVisor follows a +[common security principle used at Google](https://cloud.google.com/security/infrastructure/design/resources/google_infrastructure_whitepaper_fa.pdf) +that the system should have two layers of protection, and those layers should +require different compromises to be broken. We apply this principle by assuming +that the Sentry (first layer of defense) +[will be compromised and should not be trusted](https://gvisor.dev/blog/2019/11/18/gvisor-security-basics-part-1/#defense-in-depth). +In order to protect the host kernel from a compromised Sentry, we wrap it in +many security and isolation features to ensure only the minimal set of +functionality from the host kernel is exposed. 
+ +![Figure 1](/assets/images/2020-09-18-containing-a-real-vulnerability-figure1.png "Protection layers.") + +First, the sandbox runs inside a cgroup that can limit and throttle host +resources being used. Second, the sandbox joins empty namespaces, including user +and mount, to further isolate from the host. Next, it changes the process root +to a read-only directory that contains only `/proc` and nothing else. Then, it +executes with the unprivileged user/group +[`nobody`](https://en.wikipedia.org/wiki/Nobody_\(username\)) with all +capabilities stripped. Last and most importantly, a seccomp filter is added to +tightly restrict which parts of the Linux syscall surface gVisor is allowed +to access. The allowed host surface is a far smaller set of syscalls than the +Sentry implements for applications to use. The filter not only restricts which +syscalls can be called, but also checks that arguments to these syscalls are within the +expected set. Dangerous syscalls like <code>execve(2)</code>, +<code>open(2)</code>, and <code>socket(2)</code> are prohibited, thus an +attacker isn’t able to execute binaries or acquire new resources on the host. + +If there were a vulnerability in gVisor that allowed an attacker to execute code +inside the Sentry, the attacker still has extremely limited privileges on the +host. In fact, a compromised Sentry is much more restricted than a +non-compromised regular container. For CVE-2020-14386 in particular, the attack +would be blocked by more than one security layer: non-privileged user, no +capability, and seccomp filters. + +Although the surface is drastically reduced, there is still a chance that there +is a vulnerability in one of the allowed syscalls. That’s why it’s important to +keep the surface small and carefully consider what syscalls are allowed. You can +find the full set of allowed syscalls +[here](https://cs.opensource.google/gvisor/gvisor/+/master:runsc/boot/filter/). 
+ +Another possible attack vector is resources that are present in the Sentry, like +open file descriptors. The Sentry has file descriptors that an attacker could +potentially use, such as log files, platform files (e.g. `/dev/kvm`), an RPC +endpoint that allows external communication with the Sentry, and a Netstack +endpoint that connects the sandbox to the network. The Netstack endpoint in +particular is a concern because it gives direct access to the network. It’s an +`AF_PACKET` socket that allows arbitrary L2 packets to be written to the +network. In the normal case, Netstack assembles packets that go out to the network, +giving the container control over only the payload. But if the Sentry is +compromised, an attacker can craft arbitrary packets and send them to the network. In many ways this is +similar to anyone sending random packets over the internet, but still this is a +place where the host kernel surface exposed is larger than we would like it to +be. + +## Conclusion + +Security comes with many tradeoffs that are often hard to make, such as the +decision to disable raw sockets by default. However, these tradeoffs have served +us well, and we've found them to have paid off over time. CVE-2020-14386 offers +great insight into how multiple layers of protection can be effective against +such an attack. + +We cannot guarantee that a container escape will never happen in gVisor, but we +do our best to make it as hard as we possibly can. + +If you have not tried gVisor yet, it’s easier than you think. Just follow the +steps in the +[Quick Start](https://gvisor.dev/docs/user_guide/quick_start/docker/) guide. +<br> +<br> + +-------------------------------------------------------------------------------- + +[^1]: Those packets are eventually handled by the host, as it needs to route + them to local containers or send them out the NIC. The packet will be + handled by many switches, routers, proxies, servers, etc. along the way, + which may be subject to their own vulnerabilities. 
diff --git a/website/blog/BUILD b/website/blog/BUILD index 01c1f5a6e..865e403da 100644 --- a/website/blog/BUILD +++ b/website/blog/BUILD @@ -28,6 +28,16 @@ doc( permalink = "/blog/2020/04/02/gvisor-networking-security/", ) +doc( + name = "containing_a_real_vulnerability", + src = "2020-09-18-containing-a-real-vulnerability.md", + authors = [ + "fvoznika", + ], + layout = "post", + permalink = "/blog/2020/09/18/containing-a-real-vulnerability/", +) + docs( name = "posts", deps = [ |