Merge release-20210201.0-67-g18e993eb4 (automated)

author: gVisor bot <gvisor-bot@google.com> 2021-02-09 19:57:43 +0000
committer: gVisor bot <gvisor-bot@google.com> 2021-02-09 19:57:43 +0000
commit: d337c01a6615b03a6524b8416bffd6f655fcff5e (patch)
tree: 197eacee59b2458cd3d24bd9eaeeb2814838c39c /pkg/tcpip/network/internal
parent: 7976b340af1a8a9ad4a806c9284f09407c108684 (diff)
parent: 18e993eb4f2e6db829acfb5e8725f7d12f73ab67 (diff)
6 files changed, 1602 insertions, 0 deletions
diff --git a/pkg/tcpip/network/internal/fragmentation/fragmentation.go b/pkg/tcpip/network/internal/fragmentation/fragmentation.go
new file mode 100644
index 000000000..243738951
--- /dev/null
+++ b/pkg/tcpip/network/internal/fragmentation/fragmentation.go
@@ -0,0 +1,339 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fragmentation contains the implementation of IP fragmentation.
+// It is based on RFC 791, RFC 815 and RFC 8200.
+package fragmentation
+
+import (
+	"errors"
+	"fmt"
+	"log"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	// HighFragThreshold is the threshold at which we start trimming old
+	// fragmented packets. Linux uses a default value of 4 MB. See
+	// net.ipv4.ipfrag_high_thresh for more information.
+	HighFragThreshold = 4 << 20 // 4MB
+
+	// LowFragThreshold is the threshold we reach to when we start dropping
+	// older fragmented packets. It's important that we keep enough room for newer
+	// packets to be re-assembled. Hence, this needs to be lower than
+	// HighFragThreshold enough. Linux uses a default value of 3 MB. See
+	// net.ipv4.ipfrag_low_thresh for more information.
+	LowFragThreshold = 3 << 20 // 3MB
+
+	// minBlockSize is the minimum block size for fragments.
+	minBlockSize = 1
+)
+
+var (
+	// ErrInvalidArgs indicates to the caller that an invalid argument was
+	// provided.
+	ErrInvalidArgs = errors.New("invalid args")
+
+	// ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps
+	// with another one.
+	ErrFragmentOverlap = errors.New("overlapping fragments")
+
+	// ErrFragmentConflict indicates that, during reassembly, some fragments are
+	// in conflict with one another.
+	ErrFragmentConflict = errors.New("conflicting fragments")
+)
+
+// FragmentID is the identifier for a fragment.
+type FragmentID struct {
+	// Source is the source address of the fragment.
+	Source tcpip.Address
+
+	// Destination is the destination address of the fragment.
+	Destination tcpip.Address
+
+	// ID is the identification value of the fragment.
+	//
+	// This is a uint32 because IPv6 uses a 32-bit identification value.
+	ID uint32
+
+	// The protocol for the packet.
+	Protocol uint8
+}
+
+// Fragmentation is the main structure that other modules
+// of the stack should use to implement IP Fragmentation.
+type Fragmentation struct {
+	mu             sync.Mutex
+	highLimit      int
+	lowLimit       int
+	reassemblers   map[FragmentID]*reassembler
+	rList          reassemblerList
+	memSize        int
+	timeout        time.Duration
+	blockSize      uint16
+	clock          tcpip.Clock
+	releaseJob     *tcpip.Job
+	timeoutHandler TimeoutHandler
+}
+
+// TimeoutHandler is consulted if a packet reassembly has timed out.
+type TimeoutHandler interface {
+	// OnReassemblyTimeout will be called with the first fragment (or nil, if the
+	// first fragment has not been received) of a packet whose reassembly has
+	// timed out.
+	OnReassemblyTimeout(pkt *stack.PacketBuffer)
+}
+
+// NewFragmentation creates a new Fragmentation.
+//
+// blockSize specifies the fragment block size, in bytes.
+//
+// highMemoryLimit specifies the limit on the memory consumed
+// by the fragments stored by Fragmentation (overhead of internal data-structures
+// is not accounted). Fragments are dropped when the limit is reached.
+//
+// lowMemoryLimit specifies the limit on which we will reach by dropping
+// fragments after reaching highMemoryLimit.
+//
+// reassemblingTimeout specifies the maximum time allowed to reassemble a packet.
+// Fragments are lazily evicted only when a new a packet with an
+// already existing fragmentation-id arrives after the timeout.
+func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration, clock tcpip.Clock, timeoutHandler TimeoutHandler) *Fragmentation {
+	if lowMemoryLimit >= highMemoryLimit {
+		lowMemoryLimit = highMemoryLimit
+	}
+
+	if lowMemoryLimit < 0 {
+		lowMemoryLimit = 0
+	}
+
+	if blockSize < minBlockSize {
+		blockSize = minBlockSize
+	}
+
+	f := &Fragmentation{
+		reassemblers:   make(map[FragmentID]*reassembler),
+		highLimit:      highMemoryLimit,
+		lowLimit:       lowMemoryLimit,
+		timeout:        reassemblingTimeout,
+		blockSize:      blockSize,
+		clock:          clock,
+		timeoutHandler: timeoutHandler,
+	}
+	f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked)
+
+	return f
+}
+
+// Process processes an incoming fragment belonging to an ID and returns a
+// complete packet and its protocol number when all the packets belonging to
+// that ID have been received.
+//
+// [first, last] is the range of the fragment bytes.
+//
+// first must be a multiple of the block size f is configured with. The size
+// of the fragment data must be a multiple of the block size, unless there are
+// no fragments following this fragment (more set to false).
+//
+// proto is the protocol number marked in the fragment being processed. It has
+// to be given here outside of the FragmentID struct because IPv6 should not use
+// the protocol to identify a fragment.
+func (f *Fragmentation) Process(
+	id FragmentID, first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (
+	*stack.PacketBuffer, uint8, bool, error) {
+	if first > last {
+		return nil, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs)
+	}
+
+	if first%f.blockSize != 0 {
+		return nil, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs)
+	}
+
+	fragmentSize := last - first + 1
+	if more && fragmentSize%f.blockSize != 0 {
+		return nil, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs)
+	}
+
+	if l := pkt.Data.Size(); l != int(fragmentSize) {
+		return nil, 0, false, fmt.Errorf("got fragment size=%d bytes not equal to the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs)
+	}
+
+	f.mu.Lock()
+	r, ok := f.reassemblers[id]
+	if !ok {
+		r = newReassembler(id, f.clock)
+		f.reassemblers[id] = r
+		wasEmpty := f.rList.Empty()
+		f.rList.PushFront(r)
+		if wasEmpty {
+			// If we have just pushed a first reassembler into an empty list, we
+			// should kickstart the release job. The release job will keep
+			// rescheduling itself until the list becomes empty.
+			f.releaseReassemblersLocked()
+		}
+	}
+	f.mu.Unlock()
+
+	resPkt, firstFragmentProto, done, memConsumed, err := r.process(first, last, more, proto, pkt)
+	if err != nil {
+		// We probably got an invalid sequence of fragments. Just
+		// discard the reassembler and move on.
+		f.mu.Lock()
+		f.release(r, false /* timedOut */)
+		f.mu.Unlock()
+		return nil, 0, false, fmt.Errorf("fragmentation processing error: %w", err)
+	}
+	f.mu.Lock()
+	f.memSize += memConsumed
+	if done {
+		f.release(r, false /* timedOut */)
+	}
+	// Evict reassemblers if we are consuming more memory than highLimit until
+	// we reach lowLimit.
+	if f.memSize > f.highLimit {
+		for f.memSize > f.lowLimit {
+			tail := f.rList.Back()
+			if tail == nil {
+				break
+			}
+			f.release(tail, false /* timedOut */)
+		}
+	}
+	f.mu.Unlock()
+	return resPkt, firstFragmentProto, done, nil
+}
+
+func (f *Fragmentation) release(r *reassembler, timedOut bool) {
+	// Before releasing a fragment we need to check if r is already marked as done.
+	// Otherwise, we would delete it twice.
+	if r.checkDoneOrMark() {
+		return
+	}
+
+	delete(f.reassemblers, r.id)
+	f.rList.Remove(r)
+	f.memSize -= r.memSize
+	if f.memSize < 0 {
+		log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.memSize)
+		f.memSize = 0
+	}
+
+	if h := f.timeoutHandler; timedOut && h != nil {
+		h.OnReassemblyTimeout(r.pkt)
+	}
+}
+
+// releaseReassemblersLocked releases already-expired reassemblers, then
+// schedules the job to call back itself for the remaining reassemblers if
+// any. This function must be called with f.mu locked.
+func (f *Fragmentation) releaseReassemblersLocked() {
+	now := f.clock.NowMonotonic()
+	for {
+		// The reassembler at the end of the list is the oldest.
+		r := f.rList.Back()
+		if r == nil {
+			// The list is empty.
+			break
+		}
+		elapsed := time.Duration(now-r.creationTime) * time.Nanosecond
+		if f.timeout > elapsed {
+			// If the oldest reassembler has not expired, schedule the release
+			// job so that this function is called back when it has expired.
+			f.releaseJob.Schedule(f.timeout - elapsed)
+			break
+		}
+		// If the oldest reassembler has already expired, release it.
+		f.release(r, true /* timedOut*/)
+	}
+}
+
+// PacketFragmenter is the book-keeping struct for packet fragmentation.
+type PacketFragmenter struct {
+	transportHeader    buffer.View
+	data               buffer.VectorisedView
+	reserve            int
+	fragmentPayloadLen int
+	fragmentCount      int
+	currentFragment    int
+	fragmentOffset     int
+}
+
+// MakePacketFragmenter prepares the struct needed for packet fragmentation.
+//
+// pkt is the packet to be fragmented.
+//
+// fragmentPayloadLen is the maximum number of bytes of fragmentable data a fragment can
+// have.
+//
+// reserve is the number of bytes that should be reserved for the headers in
+// each generated fragment.
+func MakePacketFragmenter(pkt *stack.PacketBuffer, fragmentPayloadLen uint32, reserve int) PacketFragmenter {
+	// As per RFC 8200 Section 4.5, some IPv6 extension headers should not be
+	// repeated in each fragment. However we do not currently support any header
+	// of that kind yet, so the following computation is valid for both IPv4 and
+	// IPv6.
+	// TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are
+	// supported for outbound packets, the fragmentable data should not include
+	// these headers.
+	var fragmentableData buffer.VectorisedView
+	fragmentableData.AppendView(pkt.TransportHeader().View())
+	fragmentableData.Append(pkt.Data)
+	fragmentCount := (uint32(fragmentableData.Size()) + fragmentPayloadLen - 1) / fragmentPayloadLen
+
+	return PacketFragmenter{
+		data:               fragmentableData,
+		reserve:            reserve,
+		fragmentPayloadLen: int(fragmentPayloadLen),
+		fragmentCount:      int(fragmentCount),
+	}
+}
+
+// BuildNextFragment returns a packet with the payload of the next fragment,
+// along with the fragment's offset, the number of bytes copied and a boolean
+// indicating if there are more fragments left or not. If this function is
+// called again after it indicated that no more fragments were left, it will
+// panic.
+//
+// Note that the returned packet will not have its network and link headers
+// populated, but space for them will be reserved. The transport header will be
+// stored in the packet's data.
+func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) {
+	if pf.currentFragment >= pf.fragmentCount {
+		panic("BuildNextFragment should not be called again after the last fragment was returned")
+	}
+
+	fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		ReserveHeaderBytes: pf.reserve,
+	})
+
+	// Copy data for the fragment.
+	copied := pf.data.ReadToVV(&fragPkt.Data, pf.fragmentPayloadLen)
+
+	offset := pf.fragmentOffset
+	pf.fragmentOffset += copied
+	pf.currentFragment++
+	more := pf.currentFragment != pf.fragmentCount
+
+	return fragPkt, offset, copied, more
+}
+
+// RemainingFragmentCount returns the number of fragments left to be built.
+func (pf *PacketFragmenter) RemainingFragmentCount() int {
+	return pf.fragmentCount - pf.currentFragment
+}
diff --git a/pkg/tcpip/network/internal/fragmentation/fragmentation_state_autogen.go b/pkg/tcpip/network/internal/fragmentation/fragmentation_state_autogen.go
new file mode 100644
index 000000000..3f82c184a
--- /dev/null
+++ b/pkg/tcpip/network/internal/fragmentation/fragmentation_state_autogen.go
@@ -0,0 +1,64 @@
+// automatically generated by stateify.
+
+package fragmentation
+
+import (
+	"gvisor.dev/gvisor/pkg/state"
+)
+
+func (l *reassemblerList) StateTypeName() string {
+	return "pkg/tcpip/network/internal/fragmentation.reassemblerList"
+}
+
+func (l *reassemblerList) StateFields() []string {
+	return []string{
+		"head",
+		"tail",
+	}
+}
+
+func (l *reassemblerList) beforeSave() {}
+
+func (l *reassemblerList) StateSave(stateSinkObject state.Sink) {
+	l.beforeSave()
+	stateSinkObject.Save(0, &l.head)
+	stateSinkObject.Save(1, &l.tail)
+}
+
+func (l *reassemblerList) afterLoad() {}
+
+func (l *reassemblerList) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &l.head)
+	stateSourceObject.Load(1, &l.tail)
+}
+
+func (e *reassemblerEntry) StateTypeName() string {
+	return "pkg/tcpip/network/internal/fragmentation.reassemblerEntry"
+}
+
+func (e *reassemblerEntry) StateFields() []string {
+	return []string{
+		"next",
+		"prev",
+	}
+}
+
+func (e *reassemblerEntry) beforeSave() {}
+
+func (e *reassemblerEntry) StateSave(stateSinkObject state.Sink) {
+	e.beforeSave()
+	stateSinkObject.Save(0, &e.next)
+	stateSinkObject.Save(1, &e.prev)
+}
+
+func (e *reassemblerEntry) afterLoad() {}
+
+func (e *reassemblerEntry) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &e.next)
+	stateSourceObject.Load(1, &e.prev)
+}
+
+func init() {
+	state.Register((*reassemblerList)(nil))
+	state.Register((*reassemblerEntry)(nil))
+}
diff --git a/pkg/tcpip/network/internal/fragmentation/reassembler.go b/pkg/tcpip/network/internal/fragmentation/reassembler.go
new file mode 100644
index 000000000..933d63d32
--- /dev/null
+++ b/pkg/tcpip/network/internal/fragmentation/reassembler.go
@@ -0,0 +1,182 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fragmentation
+
+import (
+	"math"
+	"sort"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type hole struct {
+	first  uint16
+	last   uint16
+	filled bool
+	final  bool
+	// pkt is the fragment packet if hole is filled. We keep the whole pkt rather
+	// than the fragmented payload to prevent binding to specific buffer types.
+	pkt *stack.PacketBuffer
+}
+
+type reassembler struct {
+	reassemblerEntry
+	id           FragmentID
+	memSize      int
+	proto        uint8
+	mu           sync.Mutex
+	holes        []hole
+	filled       int
+	done         bool
+	creationTime int64
+	pkt          *stack.PacketBuffer
+}
+
+func newReassembler(id FragmentID, clock tcpip.Clock) *reassembler {
+	r := &reassembler{
+		id:           id,
+		creationTime: clock.NowMonotonic(),
+	}
+	r.holes = append(r.holes, hole{
+		first:  0,
+		last:   math.MaxUint16,
+		filled: false,
+		final:  true,
+	})
+	return r
+}
+
+func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (*stack.PacketBuffer, uint8, bool, int, error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.done {
+		// A concurrent goroutine might have already reassembled
+		// the packet and emptied the heap while this goroutine
+		// was waiting on the mutex. We don't have to do anything in this case.
+		return nil, 0, false, 0, nil
+	}
+
+	var holeFound bool
+	var memConsumed int
+	for i := range r.holes {
+		currentHole := &r.holes[i]
+
+		if last < currentHole.first || currentHole.last < first {
+			continue
+		}
+		// For IPv6, overlaps with an existing fragment are explicitly forbidden by
+		// RFC 8200 section 4.5:
+		//   If any of the fragments being reassembled overlap with any other
+		//   fragments being reassembled for the same packet, reassembly of that
+		//   packet must be abandoned and all the fragments that have been received
+		//   for that packet must be discarded, and no ICMP error messages should be
+		//   sent.
+		//
+		// It is not explicitly forbidden for IPv4, but to keep parity with Linux we
+		// disallow it as well:
+		// https://github.com/torvalds/linux/blob/38525c6/net/ipv4/inet_fragment.c#L349
+		if first < currentHole.first || currentHole.last < last {
+			// Incoming fragment only partially fits in the free hole.
+			return nil, 0, false, 0, ErrFragmentOverlap
+		}
+		if !more {
+			if !currentHole.final || currentHole.filled && currentHole.last != last {
+				// We have another final fragment, which does not perfectly overlap.
+				return nil, 0, false, 0, ErrFragmentConflict
+			}
+		}
+
+		holeFound = true
+		if currentHole.filled {
+			// Incoming fragment is a duplicate.
+			continue
+		}
+
+		// We are populating the current hole with the payload and creating a new
+		// hole for any unfilled ranges on either end.
+		if first > currentHole.first {
+			r.holes = append(r.holes, hole{
+				first:  currentHole.first,
+				last:   first - 1,
+				filled: false,
+				final:  false,
+			})
+		}
+		if last < currentHole.last && more {
+			r.holes = append(r.holes, hole{
+				first:  last + 1,
+				last:   currentHole.last,
+				filled: false,
+				final:  currentHole.final,
+			})
+			currentHole.final = false
+		}
+		memConsumed = pkt.MemSize()
+		r.memSize += memConsumed
+		// Update the current hole to precisely match the incoming fragment.
+		r.holes[i] = hole{
+			first:  first,
+			last:   last,
+			filled: true,
+			final:  currentHole.final,
+			pkt:    pkt,
+		}
+		r.filled++
+		// For IPv6, it is possible to have different Protocol values between
+		// fragments of a packet (because, unlike IPv4, the Protocol is not used to
+		// identify a fragment). In this case, only the Protocol of the first
+		// fragment must be used as per RFC 8200 Section 4.5.
+		//
+		// TODO(gvisor.dev/issue/3648): During reassembly of an IPv6 packet, IP
+		// options received in the first fragment should be used - and they should
+		// override options from following fragments.
+		if first == 0 {
+			r.pkt = pkt
+			r.proto = proto
+		}
+
+		break
+	}
+	if !holeFound {
+		// Incoming fragment is beyond end.
+		return nil, 0, false, 0, ErrFragmentConflict
+	}
+
+	// Check if all the holes have been filled and we are ready to reassemble.
+	if r.filled < len(r.holes) {
+		return nil, 0, false, memConsumed, nil
+	}
+
+	sort.Slice(r.holes, func(i, j int) bool {
+		return r.holes[i].first < r.holes[j].first
+	})
+
+	resPkt := r.holes[0].pkt
+	for i := 1; i < len(r.holes); i++ {
+		fragPkt := r.holes[i].pkt
+		fragPkt.Data.ReadToVV(&resPkt.Data, fragPkt.Data.Size())
+	}
+	return resPkt, r.proto, true, memConsumed, nil
+}
+
+func (r *reassembler) checkDoneOrMark() bool {
+	r.mu.Lock()
+	prev := r.done
+	r.done = true
+	r.mu.Unlock()
+	return prev
+}
diff --git a/pkg/tcpip/network/internal/fragmentation/reassembler_list.go b/pkg/tcpip/network/internal/fragmentation/reassembler_list.go
new file mode 100644
index 000000000..673bb11b0
--- /dev/null
+++ b/pkg/tcpip/network/internal/fragmentation/reassembler_list.go
@@ -0,0 +1,221 @@
+package fragmentation
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type reassemblerElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (reassemblerElementMapper) linkerFor(elem *reassembler) *reassembler { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+//      for e := l.Front(); e != nil; e = e.Next() {
+// 		// do something with e.
+//      }
+//
+// +stateify savable
+type reassemblerList struct {
+	head *reassembler
+	tail *reassembler
+}
+
+// Reset resets list l to the empty state.
+func (l *reassemblerList) Reset() {
+	l.head = nil
+	l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+//
+//go:nosplit
+func (l *reassemblerList) Empty() bool {
+	return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+//
+//go:nosplit
+func (l *reassemblerList) Front() *reassembler {
+	return l.head
+}
+
+// Back returns the last element of list l or nil.
+//
+//go:nosplit
+func (l *reassemblerList) Back() *reassembler {
+	return l.tail
+}
+
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+//
+//go:nosplit
+func (l *reassemblerList) Len() (count int) {
+	for e := l.Front(); e != nil; e = (reassemblerElementMapper{}.linkerFor(e)).Next() {
+		count++
+	}
+	return count
+}
+
+// PushFront inserts the element e at the front of list l.
+//
+//go:nosplit
+func (l *reassemblerList) PushFront(e *reassembler) {
+	linker := reassemblerElementMapper{}.linkerFor(e)
+	linker.SetNext(l.head)
+	linker.SetPrev(nil)
+	if l.head != nil {
+		reassemblerElementMapper{}.linkerFor(l.head).SetPrev(e)
+	} else {
+		l.tail = e
+	}
+
+	l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+//
+//go:nosplit
+func (l *reassemblerList) PushBack(e *reassembler) {
+	linker := reassemblerElementMapper{}.linkerFor(e)
+	linker.SetNext(nil)
+	linker.SetPrev(l.tail)
+	if l.tail != nil {
+		reassemblerElementMapper{}.linkerFor(l.tail).SetNext(e)
+	} else {
+		l.head = e
+	}
+
+	l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+//
+//go:nosplit
+func (l *reassemblerList) PushBackList(m *reassemblerList) {
+	if l.head == nil {
+		l.head = m.head
+		l.tail = m.tail
+	} else if m.head != nil {
+		reassemblerElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+		reassemblerElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+		l.tail = m.tail
+	}
+	m.head = nil
+	m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+//
+//go:nosplit
+func (l *reassemblerList) InsertAfter(b, e *reassembler) {
+	bLinker := reassemblerElementMapper{}.linkerFor(b)
+	eLinker := reassemblerElementMapper{}.linkerFor(e)
+
+	a := bLinker.Next()
+
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	bLinker.SetNext(e)
+
+	if a != nil {
+		reassemblerElementMapper{}.linkerFor(a).SetPrev(e)
+	} else {
+		l.tail = e
+	}
+}
+
+// InsertBefore inserts e before a.
+//
+//go:nosplit
+func (l *reassemblerList) InsertBefore(a, e *reassembler) {
+	aLinker := reassemblerElementMapper{}.linkerFor(a)
+	eLinker := reassemblerElementMapper{}.linkerFor(e)
+
+	b := aLinker.Prev()
+	eLinker.SetNext(a)
+	eLinker.SetPrev(b)
+	aLinker.SetPrev(e)
+
+	if b != nil {
+		reassemblerElementMapper{}.linkerFor(b).SetNext(e)
+	} else {
+		l.head = e
+	}
+}
+
+// Remove removes e from l.
+//
+//go:nosplit
+func (l *reassemblerList) Remove(e *reassembler) {
+	linker := reassemblerElementMapper{}.linkerFor(e)
+	prev := linker.Prev()
+	next := linker.Next()
+
+	if prev != nil {
+		reassemblerElementMapper{}.linkerFor(prev).SetNext(next)
+	} else if l.head == e {
+		l.head = next
+	}
+
+	if next != nil {
+		reassemblerElementMapper{}.linkerFor(next).SetPrev(prev)
+	} else if l.tail == e {
+		l.tail = prev
+	}
+
+	linker.SetNext(nil)
+	linker.SetPrev(nil)
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type reassemblerEntry struct {
+	next *reassembler
+	prev *reassembler
+}
+
+// Next returns the entry that follows e in the list.
+//
+//go:nosplit
+func (e *reassemblerEntry) Next() *reassembler {
+	return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *reassemblerEntry) Prev() *reassembler {
+	return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+//
+//go:nosplit
+func (e *reassemblerEntry) SetNext(elem *reassembler) {
+	e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *reassemblerEntry) SetPrev(elem *reassembler) {
+	e.prev = elem
+}
diff --git a/pkg/tcpip/network/internal/ip/generic_multicast_protocol.go b/pkg/tcpip/network/internal/ip/generic_multicast_protocol.go
new file mode 100644
index 000000000..b9f129728
--- /dev/null
+++ b/pkg/tcpip/network/internal/ip/generic_multicast_protocol.go
@@ -0,0 +1,696 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ip holds IPv4/IPv6 common utilities.
+package ip
+
+import (
+	"fmt"
+	"math/rand"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// hostState is the state a host may be in for a multicast group.
+type hostState int
+
+// The states below are generic across IGMPv2 (RFC 2236 section 6) and MLDv1
+// (RFC 2710 section 5). Even though the states are generic across both IGMPv2
+// and MLDv1, IGMPv2 terminology will be used.
+//
+//                                  ______________receive query______________
+//                                 |                                         |
+//                                 |   _____send or receive report_____      |
+//                                 |  |                                |     |
+//                                 V  |                                V     |
+//  +-------+ +-----------+ +------------+ +-------------------+ +--------+  |
+//  | Non-M | | Pending-M | | Delaying-M | | Queued Delaying-M | | Idle-M | -
+//  +-------+ +-----------+ +------------+ +-------------------+ +--------+
+//    |          ^      |       ^      |          ^       |             ^
+//    |          |      |       |      |          |       |             |
+//     ----------        -------        ----------         -------------
+//   initialize new    send inital     fail to send       send or receive
+//  group membership     report       delayed report          report
+//
+// Not shown in the diagram above, but any state may transition into the non
+// member state when a group is left.
+const (
+	// nonMember is the "'Non-Member' state, when the host does not belong to the
+	// group on the interface. This is the initial state for all memberships on
+	// all network interfaces; it requires no storage in the host."
+	//
+	// 'Non-Listener' is the MLDv1 term used to describe this state.
+	//
+	// This state is used to keep track of groups that have been joined locally,
+	// but without advertising the membership to the network.
+	nonMember hostState = iota
+
+	// pendingMember is a newly joined member that is waiting to successfully send
+	// the initial set of reports.
+	//
+	// This is not an RFC defined state; it is an implementation specific state to
+	// track that the initial report needs to be sent.
+	//
+	// MAY NOT transition to the idle member state from this state.
+	pendingMember
+
+	// delayingMember is the "'Delaying Member' state, when the host belongs to
+	// the group on the interface and has a report delay timer running for that
+	// membership."
+	//
+	// 'Delaying Listener' is the MLDv1 term used to describe this state.
+	delayingMember
+
+	// queuedDelayingMember is a delayingMember that failed to send a report after
+	// its delayed report timer fired. Hosts in this state are waiting to attempt
+	// retransmission of the delayed report.
+	//
+	// This is not an RFC defined state; it is an implementation specific state to
+	// track that the delayed report needs to be sent.
+	//
+	// May transition to idle member if a report is received for a group.
+	queuedDelayingMember
+
+	// idleMember is the "Idle Member" state, when the host belongs to the group
+	// on the interface and does not have a report delay timer running for that
+	// membership.
+	//
+	// 'Idle Listener' is the MLDv1 term used to describe this state.
+	idleMember
+)
+
+func (s hostState) isDelayingMember() bool {
+	switch s {
+	case nonMember, pendingMember, idleMember:
+		return false
+	case delayingMember, queuedDelayingMember:
+		return true
+	default:
+		panic(fmt.Sprintf("unrecognized host state = %d", s))
+	}
+}
+
+// multicastGroupState holds the Generic Multicast Protocol state for a
+// multicast group.
+type multicastGroupState struct {
+	// joins is the number of times the group has been joined.
+	joins uint64
+
+	// state holds the host's state for the group.
+	state hostState
+
+	// lastToSendReport is true if we sent the last report for the group. It is
+	// used to track whether there are other hosts on the subnet that are also
+	// members of the group.
+	//
+	// Defined in RFC 2236 section 6 page 9 for IGMPv2 and RFC 2710 section 5 page
+	// 8 for MLDv1.
+	lastToSendReport bool
+
+	// delayedReportJob is used to delay sending responses to membership report
+	// messages in order to reduce duplicate reports from multiple hosts on the
+	// interface.
+	//
+	// Must not be nil.
+	delayedReportJob *tcpip.Job
+
+	// delyedReportJobFiresAt is the time when the delayed report job will fire.
+	//
+	// A zero value indicates that the job is not scheduled.
+	delayedReportJobFiresAt time.Time
+}
+
+func (m *multicastGroupState) cancelDelayedReportJob() {
+	m.delayedReportJob.Cancel()
+	m.delayedReportJobFiresAt = time.Time{}
+}
+
+// GenericMulticastProtocolOptions holds options for the generic multicast
+// protocol.
+type GenericMulticastProtocolOptions struct {
+	// Rand is the source of random numbers.
+	Rand *rand.Rand
+
+	// Clock is the clock used to create timers.
+	Clock tcpip.Clock
+
+	// Protocol is the implementation of the variant of multicast group protocol
+	// in use.
+	Protocol MulticastGroupProtocol
+
+	// MaxUnsolicitedReportDelay is the maximum amount of time to wait between
+	// transmitting unsolicited reports.
+	//
+	// Unsolicited reports are transmitted when a group is newly joined.
+	MaxUnsolicitedReportDelay time.Duration
+
+	// AllNodesAddress is a multicast address that all nodes on a network should
+	// be a member of.
+	//
+	// This address will not have the generic multicast protocol performed on it;
+	// it will be left in the non member/listener state, and packets will never
+	// be sent for it.
+	AllNodesAddress tcpip.Address
+}
+
+// MulticastGroupProtocol is a multicast group protocol whose core state machine
+// can be represented by GenericMulticastProtocolState.
+type MulticastGroupProtocol interface {
+	// Enabled indicates whether the generic multicast protocol will be
+	// performed.
+	//
+	// When enabled, the protocol may transmit report and leave messages when
+	// joining and leaving multicast groups respectively, and handle incoming
+	// packets.
+	//
+	// When disabled, the protocol will still keep track of locally joined groups,
+	// it just won't transmit and handle packets, or update groups' state.
+	Enabled() bool
+
+	// SendReport sends a multicast report for the specified group address.
+	//
+	// Returns false if the caller should queue the report to be sent later. Note,
+	// returning false does not mean that the receiver hit an error.
+	SendReport(groupAddress tcpip.Address) (sent bool, err tcpip.Error)
+
+	// SendLeave sends a multicast leave for the specified group address.
+	SendLeave(groupAddress tcpip.Address) tcpip.Error
+}
+
+// GenericMulticastProtocolState is the per interface generic multicast protocol
+// state.
+//
+// There is actually no protocol named "Generic Multicast Protocol". Instead,
+// the term used to refer to a generic multicast protocol that applies to both
+// IPv4 and IPv6. Specifically, Generic Multicast Protocol is the core state
+// machine of IGMPv2 as defined by RFC 2236 and MLDv1 as defined by RFC 2710.
+//
+// Callers must synchronize accesses to the generic multicast protocol state;
+// GenericMulticastProtocolState obtains no locks in any of its methods. The
+// only exception to this is GenericMulticastProtocolState's timer/job callbacks
+// which will obtain the lock provided to the GenericMulticastProtocolState when
+// it is initialized.
+//
+// GenericMulticastProtocolState.Init MUST be called before calling any of
+// the methods on GenericMulticastProtocolState.
+//
+// GenericMulticastProtocolState.MakeAllNonMemberLocked MUST be called when the
+// multicast group protocol is disabled so that leave messages may be sent.
+type GenericMulticastProtocolState struct {
+	// Do not allow overwriting this state.
+	_ sync.NoCopy
+
+	opts GenericMulticastProtocolOptions
+
+	// memberships holds group addresses and their associated state.
+	memberships map[tcpip.Address]multicastGroupState
+
+	// protocolMU is the mutex used to protect the protocol.
+	protocolMU *sync.RWMutex
+}
+
+// Init initializes the Generic Multicast Protocol state.
+//
+// Must only be called once for the lifetime of g; Init will panic if it is
+// called twice.
+//
+// The GenericMulticastProtocolState will only grab the lock when timers/jobs
+// fire.
+//
+// Note: the methods on opts.Protocol will always be called while protocolMU is
+// held.
+func (g *GenericMulticastProtocolState) Init(protocolMU *sync.RWMutex, opts GenericMulticastProtocolOptions) {
+	if g.memberships != nil {
+		panic("attempted to initialize generic membership protocol state twice")
+	}
+
+	*g = GenericMulticastProtocolState{
+		opts:        opts,
+		memberships: make(map[tcpip.Address]multicastGroupState),
+		protocolMU:  protocolMU,
+	}
+}
+
+// MakeAllNonMemberLocked transitions all groups to the non-member state.
+//
+// The groups will still be considered joined locally.
+//
+// MUST be called when the multicast group protocol is disabled.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) MakeAllNonMemberLocked() {
+	if !g.opts.Protocol.Enabled() {
+		return
+	}
+
+	for groupAddress, info := range g.memberships {
+		g.transitionToNonMemberLocked(groupAddress, &info)
+		g.memberships[groupAddress] = info
+	}
+}
+
+// InitializeGroupsLocked initializes each group, as if they were newly joined
+// but without affecting the groups' join count.
+//
+// Must only be called after calling MakeAllNonMember as a group should not be
+// initialized while it is not in the non-member state.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) InitializeGroupsLocked() {
+	if !g.opts.Protocol.Enabled() {
+		return
+	}
+
+	for groupAddress, info := range g.memberships {
+		g.initializeNewMemberLocked(groupAddress, &info)
+		g.memberships[groupAddress] = info
+	}
+}
+
+// SendQueuedReportsLocked attempts to send reports for groups that failed to
+// send reports during their last attempt.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) SendQueuedReportsLocked() {
+	for groupAddress, info := range g.memberships {
+		switch info.state {
+		case nonMember, delayingMember, idleMember:
+		case pendingMember:
+			// pendingMembers failed to send their initial unsolicited report so try
+			// to send the report and queue the extra unsolicited reports.
+			g.maybeSendInitialReportLocked(groupAddress, &info)
+		case queuedDelayingMember:
+			// queuedDelayingMembers failed to send their delayed reports so try to
+			// send the report and transition them to the idle state.
+			g.maybeSendDelayedReportLocked(groupAddress, &info)
+		default:
+			panic(fmt.Sprintf("unrecognized host state = %d", info.state))
+		}
+		g.memberships[groupAddress] = info
+	}
+}
+
+// JoinGroupLocked handles joining a new group.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) JoinGroupLocked(groupAddress tcpip.Address) {
+	if info, ok := g.memberships[groupAddress]; ok {
+		// The group has already been joined.
+		info.joins++
+		g.memberships[groupAddress] = info
+		return
+	}
+
+	info := multicastGroupState{
+		// Since we just joined the group, its count is 1.
+		joins: 1,
+		// The state will be updated below, if required.
+		state:            nonMember,
+		lastToSendReport: false,
+		delayedReportJob: tcpip.NewJob(g.opts.Clock, g.protocolMU, func() {
+			if !g.opts.Protocol.Enabled() {
+				panic(fmt.Sprintf("delayed report job fired for group %s while the multicast group protocol is disabled", groupAddress))
+			}
+
+			info, ok := g.memberships[groupAddress]
+			if !ok {
+				panic(fmt.Sprintf("expected to find group state for group = %s", groupAddress))
+			}
+
+			g.maybeSendDelayedReportLocked(groupAddress, &info)
+			g.memberships[groupAddress] = info
+		}),
+	}
+
+	if g.opts.Protocol.Enabled() {
+		g.initializeNewMemberLocked(groupAddress, &info)
+	}
+
+	g.memberships[groupAddress] = info
+}
+
+// IsLocallyJoinedRLocked returns true if the group is locally joined.
+//
+// Precondition: g.protocolMU must be read locked.
+func (g *GenericMulticastProtocolState) IsLocallyJoinedRLocked(groupAddress tcpip.Address) bool {
+	_, ok := g.memberships[groupAddress]
+	return ok
+}
+
+// LeaveGroupLocked handles leaving the group.
+//
+// Returns false if the group is not currently joined.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) LeaveGroupLocked(groupAddress tcpip.Address) bool {
+	info, ok := g.memberships[groupAddress]
+	if !ok {
+		return false
+	}
+
+	if info.joins == 0 {
+		panic(fmt.Sprintf("tried to leave group %s with a join count of 0", groupAddress))
+	}
+	info.joins--
+	if info.joins != 0 {
+		// If we still have outstanding joins, then do nothing further.
+		g.memberships[groupAddress] = info
+		return true
+	}
+
+	g.transitionToNonMemberLocked(groupAddress, &info)
+	delete(g.memberships, groupAddress)
+	return true
+}
+
+// HandleQueryLocked handles a query message with the specified maximum response
+// time.
+//
+// If the group address is unspecified, then reports will be scheduled for all
+// joined groups.
+//
+// Report(s) will be scheduled to be sent after a random duration between 0 and
+// the maximum response time.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) HandleQueryLocked(groupAddress tcpip.Address, maxResponseTime time.Duration) {
+	if !g.opts.Protocol.Enabled() {
+		return
+	}
+
+	// As per RFC 2236 section 2.4 (for IGMPv2),
+	//
+	//   In a Membership Query message, the group address field is set to zero
+	//   when sending a General Query, and set to the group address being
+	//   queried when sending a Group-Specific Query.
+	//
+	// As per RFC 2710 section 3.6 (for MLDv1),
+	//
+	//   In a Query message, the Multicast Address field is set to zero when
+	//   sending a General Query, and set to a specific IPv6 multicast address
+	//   when sending a Multicast-Address-Specific Query.
+	if groupAddress.Unspecified() {
+		// This is a general query as the group address is unspecified.
+		for groupAddress, info := range g.memberships {
+			g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime)
+			g.memberships[groupAddress] = info
+		}
+	} else if info, ok := g.memberships[groupAddress]; ok {
+		g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime)
+		g.memberships[groupAddress] = info
+	}
+}
+
+// HandleReportLocked handles a report message.
+//
+// If the report is for a joined group, any active delayed report will be
+// cancelled and the host state for the group transitions to idle.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) HandleReportLocked(groupAddress tcpip.Address) {
+	if !g.opts.Protocol.Enabled() {
+		return
+	}
+
+	// As per RFC 2236 section 3 pages 3-4 (for IGMPv2),
+	//
+	//   If the host receives another host's Report (version 1 or 2) while it has
+	//   a timer running, it stops its timer for the specified group and does not
+	//   send a Report
+	//
+	// As per RFC 2710 section 4 page 6 (for MLDv1),
+	//
+	//   If a node receives another node's Report from an interface for a
+	//   multicast address while it has a timer running for that same address
+	//   on that interface, it stops its timer and does not send a Report for
+	//   that address, thus suppressing duplicate reports on the link.
+	if info, ok := g.memberships[groupAddress]; ok && info.state.isDelayingMember() {
+		info.cancelDelayedReportJob()
+		info.lastToSendReport = false
+		info.state = idleMember
+		g.memberships[groupAddress] = info
+	}
+}
+
+// initializeNewMemberLocked initializes a new group membership.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) initializeNewMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) {
+	if info.state != nonMember {
+		panic(fmt.Sprintf("host must be in non-member state to be initialized; group = %s, state = %d", groupAddress, info.state))
+	}
+
+	info.lastToSendReport = false
+
+	if groupAddress == g.opts.AllNodesAddress {
+		// As per RFC 2236 section 6 page 10 (for IGMPv2),
+		//
+		//   The all-systems group (address 224.0.0.1) is handled as a special
+		//   case. The host starts in Idle Member state for that group on every
+		//   interface, never transitions to another state, and never sends a
+		//   report for that group.
+		//
+		// As per RFC 2710 section 5 page 10 (for MLDv1),
+		//
+		//   The link-scope all-nodes address (FF02::1) is handled as a special
+		//   case. The node starts in Idle Listener state for that address on
+		//   every interface, never transitions to another state, and never sends
+		//   a Report or Done for that address.
+		info.state = idleMember
+		return
+	}
+
+	info.state = pendingMember
+	g.maybeSendInitialReportLocked(groupAddress, info)
+}
+
+// maybeSendInitialReportLocked attempts to start transmission of the initial
+// set of reports after newly joining a group.
+//
+// Host must be in pending member state.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) maybeSendInitialReportLocked(groupAddress tcpip.Address, info *multicastGroupState) {
+	if info.state != pendingMember {
+		panic(fmt.Sprintf("host must be in pending member state to send initial reports; group = %s, state = %d", groupAddress, info.state))
+	}
+
+	// As per RFC 2236 section 3 page 5 (for IGMPv2),
+	//
+	//   When a host joins a multicast group, it should immediately transmit an
+	//   unsolicited Version 2 Membership Report for that group" ... "it is
+	//   recommended that it be repeated".
+	//
+	// As per RFC 2710 section 4 page 6 (for MLDv1),
+	//
+	//   When a node starts listening to a multicast address on an interface,
+	//   it should immediately transmit an unsolicited Report for that address
+	//   on that interface, in case it is the first listener on the link. To
+	//   cover the possibility of the initial Report being lost or damaged, it
+	//   is recommended that it be repeated once or twice after short delays
+	//   [Unsolicited Report Interval].
+	//
+	// TODO(gvisor.dev/issue/4901): Support a configurable number of initial
+	// unsolicited reports.
+	sent, err := g.opts.Protocol.SendReport(groupAddress)
+	if err == nil && sent {
+		info.lastToSendReport = true
+		g.setDelayTimerForAddressRLocked(groupAddress, info, g.opts.MaxUnsolicitedReportDelay)
+	}
+}
+
+// maybeSendDelayedReportLocked attempts to send the delayed report.
+//
+// Host must be in pending, delaying or queued delaying member state.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) maybeSendDelayedReportLocked(groupAddress tcpip.Address, info *multicastGroupState) {
+	if !info.state.isDelayingMember() {
+		panic(fmt.Sprintf("host must be in delaying or queued delaying member state to send delayed reports; group = %s, state = %d", groupAddress, info.state))
+	}
+
+	sent, err := g.opts.Protocol.SendReport(groupAddress)
+	if err == nil && sent {
+		info.lastToSendReport = true
+		info.state = idleMember
+	} else {
+		info.state = queuedDelayingMember
+	}
+}
+
+// maybeSendLeave attempts to send a leave message.
+func (g *GenericMulticastProtocolState) maybeSendLeave(groupAddress tcpip.Address, lastToSendReport bool) {
+	if !g.opts.Protocol.Enabled() || !lastToSendReport {
+		return
+	}
+
+	if groupAddress == g.opts.AllNodesAddress {
+		// As per RFC 2236 section 6 page 10 (for IGMPv2),
+		//
+		//   The all-systems group (address 224.0.0.1) is handled as a special
+		//   case. The host starts in Idle Member state for that group on every
+		//   interface, never transitions to another state, and never sends a
+		//   report for that group.
+		//
+		// As per RFC 2710 section 5 page 10 (for MLDv1),
+		//
+		//   The link-scope all-nodes address (FF02::1) is handled as a special
+		//   case. The node starts in Idle Listener state for that address on
+		//   every interface, never transitions to another state, and never sends
+		//   a Report or Done for that address.
+		return
+	}
+
+	// Okay to ignore the error here as if packet write failed, the multicast
+	// routers will eventually drop our membership anyways. If the interface is
+	// being disabled or removed, the generic multicast protocol's should be
+	// cleared eventually.
+	//
+	// As per RFC 2236 section 3 page 5 (for IGMPv2),
+	//
+	//   When a router receives a Report, it adds the group being reported to
+	//   the list of multicast group memberships on the network on which it
+	//   received the Report and sets the timer for the membership to the
+	//   [Group Membership Interval]. Repeated Reports refresh the timer. If
+	//   no Reports are received for a particular group before this timer has
+	//   expired, the router assumes that the group has no local members and
+	//   that it need not forward remotely-originated multicasts for that
+	//   group onto the attached network.
+	//
+	// As per RFC 2710 section 4 page 5 (for MLDv1),
+	//
+	//   When a router receives a Report from a link, if the reported address
+	//   is not already present in the router's list of multicast address
+	//   having listeners on that link, the reported address is added to the
+	//   list, its timer is set to [Multicast Listener Interval], and its
+	//   appearance is made known to the router's multicast routing component.
+	//   If a Report is received for a multicast address that is already
+	//   present in the router's list, the timer for that address is reset to
+	//   [Multicast Listener Interval]. If an address's timer expires, it is
+	//   assumed that there are no longer any listeners for that address
+	//   present on the link, so it is deleted from the list and its
+	//   disappearance is made known to the multicast routing component.
+	//
+	// The requirement to send a leave message is also optional (it MAY be
+	// skipped):
+	//
+	// As per RFC 2236 section 6 page 8 (for IGMPv2),
+	//
+	//  "send leave" for the group on the interface. If the interface
+	//   state says the Querier is running IGMPv1, this action SHOULD be
+	//   skipped. If the flag saying we were the last host to report is
+	//   cleared, this action MAY be skipped. The Leave Message is sent to
+	//   the ALL-ROUTERS group (224.0.0.2).
+	//
+	// As per RFC 2710 section 5 page 8 (for MLDv1),
+	//
+	//   "send done" for the address on the interface. If the flag saying
+	//   we were the last node to report is cleared, this action MAY be
+	//   skipped. The Done message is sent to the link-scope all-routers
+	//   address (FF02::2).
+	_ = g.opts.Protocol.SendLeave(groupAddress)
+}
+
+// transitionToNonMemberLocked transitions the given multicast group the the
+// non-member/listener state.
+//
+// Precondition: g.protocolMU must be locked.
+func (g *GenericMulticastProtocolState) transitionToNonMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) {
+	if info.state == nonMember {
+		return
+	}
+
+	info.cancelDelayedReportJob()
+	g.maybeSendLeave(groupAddress, info.lastToSendReport)
+	info.lastToSendReport = false
+	info.state = nonMember
+}
+
+// setDelayTimerForAddressRLocked sets timer to send a delay report.
+//
+// Precondition: g.protocolMU MUST be read locked.
+func (g *GenericMulticastProtocolState) setDelayTimerForAddressRLocked(groupAddress tcpip.Address, info *multicastGroupState, maxResponseTime time.Duration) {
+	if info.state == nonMember {
+		return
+	}
+
+	if groupAddress == g.opts.AllNodesAddress {
+		// As per RFC 2236 section 6 page 10 (for IGMPv2),
+		//
+		//   The all-systems group (address 224.0.0.1) is handled as a special
+		//   case. The host starts in Idle Member state for that group on every
+		//   interface, never transitions to another state, and never sends a
+		//   report for that group.
+		//
+		// As per RFC 2710 section 5 page 10 (for MLDv1),
+		//
+		//   The link-scope all-nodes address (FF02::1) is handled as a special
+		//   case. The node starts in Idle Listener state for that address on
+		//   every interface, never transitions to another state, and never sends
+		//   a Report or Done for that address.
+		return
+	}
+
+	// As per RFC 2236 section 3 page 3 (for IGMPv2),
+	//
+	//   If a timer for the group is already unning, it is reset to the random
+	//   value only if the requested Max Response Time is less than the remaining
+	//   value of the running timer.
+	//
+	// As per RFC 2710 section 4 page 5 (for MLDv1),
+	//
+	//   If a timer for any address is already running, it is reset to the new
+	//   random value only if the requested Maximum Response Delay is less than
+	//   the remaining value of the running timer.
+	now := time.Unix(0 /* seconds */, g.opts.Clock.NowNanoseconds())
+	if info.state == delayingMember {
+		if info.delayedReportJobFiresAt.IsZero() {
+			panic(fmt.Sprintf("delayed report unscheduled while in the delaying member state; group = %s", groupAddress))
+		}
+
+		if info.delayedReportJobFiresAt.Sub(now) <= maxResponseTime {
+			// The timer is scheduled to fire before the maximum response time so we
+			// leave our timer as is.
+			return
+		}
+	}
+
+	info.state = delayingMember
+	info.cancelDelayedReportJob()
+	maxResponseTime = g.calculateDelayTimerDuration(maxResponseTime)
+	info.delayedReportJob.Schedule(maxResponseTime)
+	info.delayedReportJobFiresAt = now.Add(maxResponseTime)
+}
+
+// calculateDelayTimerDuration returns a random time between (0, maxRespTime].
+func (g *GenericMulticastProtocolState) calculateDelayTimerDuration(maxRespTime time.Duration) time.Duration {
+	// As per RFC 2236 section 3 page 3 (for IGMPv2),
+	//
+	//   When a host receives a Group-Specific Query, it sets a delay timer to a
+	//   random value selected from the range (0, Max Response Time]...
+	//
+	// As per RFC 2710 section 4 page 6 (for MLDv1),
+	//
+	//   When a node receives a Multicast-Address-Specific Query, if it is
+	//   listening to the queried Multicast Address on the interface from
+	//   which the Query was received, it sets a delay timer for that address
+	//   to a random value selected from the range [0, Maximum Response Delay],
+	//   as above.
+	if maxRespTime == 0 {
+		return 0
+	}
+	return time.Duration(g.opts.Rand.Int63n(int64(maxRespTime)))
+}
diff --git a/pkg/tcpip/network/internal/ip/stats.go b/pkg/tcpip/network/internal/ip/stats.go
new file mode 100644
index 000000000..898f8b356
--- /dev/null
+++ b/pkg/tcpip/network/internal/ip/stats.go
@@ -0,0 +1,100 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ip
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+// LINT.IfChange(MultiCounterIPStats)
+
+// MultiCounterIPStats holds IP statistics, each counter may have several
+// versions.
+type MultiCounterIPStats struct {
+	// PacketsReceived is the total number of IP packets received from the link
+	// layer.
+	PacketsReceived tcpip.MultiCounterStat
+
+	// DisabledPacketsReceived is the total number of IP packets received from the
+	// link layer when the IP layer is disabled.
+	DisabledPacketsReceived tcpip.MultiCounterStat
+
+	// InvalidDestinationAddressesReceived is the total number of IP packets
+	// received with an unknown or invalid destination address.
+	InvalidDestinationAddressesReceived tcpip.MultiCounterStat
+
+	// InvalidSourceAddressesReceived is the total number of IP packets received
+	// with a source address that should never have been received on the wire.
+	InvalidSourceAddressesReceived tcpip.MultiCounterStat
+
+	// PacketsDelivered is the total number of incoming IP packets that are
+	// successfully delivered to the transport layer.
+	PacketsDelivered tcpip.MultiCounterStat
+
+	// PacketsSent is the total number of IP packets sent via WritePacket.
+	PacketsSent tcpip.MultiCounterStat
+
+	// OutgoingPacketErrors is the total number of IP packets which failed to
+	// write to a link-layer endpoint.
+	OutgoingPacketErrors tcpip.MultiCounterStat
+
+	// MalformedPacketsReceived is the total number of IP Packets that were
+	// dropped due to the IP packet header failing validation checks.
+	MalformedPacketsReceived tcpip.MultiCounterStat
+
+	// MalformedFragmentsReceived is the total number of IP Fragments that were
+	// dropped due to the fragment failing validation checks.
+	MalformedFragmentsReceived tcpip.MultiCounterStat
+
+	// IPTablesPreroutingDropped is the total number of IP packets dropped in the
+	// Prerouting chain.
+	IPTablesPreroutingDropped tcpip.MultiCounterStat
+
+	// IPTablesInputDropped is the total number of IP packets dropped in the Input
+	// chain.
+	IPTablesInputDropped tcpip.MultiCounterStat
+
+	// IPTablesOutputDropped is the total number of IP packets dropped in the
+	// Output chain.
+	IPTablesOutputDropped tcpip.MultiCounterStat
+
+	// OptionTSReceived is the number of Timestamp options seen.
+	OptionTSReceived tcpip.MultiCounterStat
+
+	// OptionRRReceived is the number of Record Route options seen.
+	OptionRRReceived tcpip.MultiCounterStat
+
+	// OptionUnknownReceived is the number of unknown IP options seen.
+	OptionUnknownReceived tcpip.MultiCounterStat
+}
+
+// Init sets internal counters to track a and b counters.
+func (m *MultiCounterIPStats) Init(a, b *tcpip.IPStats) {
+	m.PacketsReceived.Init(a.PacketsReceived, b.PacketsReceived)
+	m.DisabledPacketsReceived.Init(a.DisabledPacketsReceived, b.DisabledPacketsReceived)
+	m.InvalidDestinationAddressesReceived.Init(a.InvalidDestinationAddressesReceived, b.InvalidDestinationAddressesReceived)
+	m.InvalidSourceAddressesReceived.Init(a.InvalidSourceAddressesReceived, b.InvalidSourceAddressesReceived)
+	m.PacketsDelivered.Init(a.PacketsDelivered, b.PacketsDelivered)
+	m.PacketsSent.Init(a.PacketsSent, b.PacketsSent)
+	m.OutgoingPacketErrors.Init(a.OutgoingPacketErrors, b.OutgoingPacketErrors)
+	m.MalformedPacketsReceived.Init(a.MalformedPacketsReceived, b.MalformedPacketsReceived)
+	m.MalformedFragmentsReceived.Init(a.MalformedFragmentsReceived, b.MalformedFragmentsReceived)
+	m.IPTablesPreroutingDropped.Init(a.IPTablesPreroutingDropped, b.IPTablesPreroutingDropped)
+	m.IPTablesInputDropped.Init(a.IPTablesInputDropped, b.IPTablesInputDropped)
+	m.IPTablesOutputDropped.Init(a.IPTablesOutputDropped, b.IPTablesOutputDropped)
+	m.OptionTSReceived.Init(a.OptionTSReceived, b.OptionTSReceived)
+	m.OptionRRReceived.Init(a.OptionRRReceived, b.OptionRRReceived)
+	m.OptionUnknownReceived.Init(a.OptionUnknownReceived, b.OptionUnknownReceived)
+}
+
+// LINT.ThenChange(:MultiCounterIPStats, ../../tcpip.go:IPStats)
author	gVisor bot <gvisor-bot@google.com>	2021-02-09 19:57:43 +0000
committer	gVisor bot <gvisor-bot@google.com>	2021-02-09 19:57:43 +0000
commit	d337c01a6615b03a6524b8416bffd6f655fcff5e (patch)
tree	197eacee59b2458cd3d24bd9eaeeb2814838c39c /pkg/tcpip/network/internal
parent	7976b340af1a8a9ad4a806c9284f09407c108684 (diff)
parent	18e993eb4f2e6db829acfb5e8725f7d12f73ab67 (diff)