author     gVisor bot <gvisor-bot@google.com>  2020-07-13 19:03:18 +0000
committer  gVisor bot <gvisor-bot@google.com>  2020-07-13 19:03:18 +0000
commit     0a49098e2b456f23af79341530d79f9128b58bb5 (patch)
tree       dde9d942d1dae331e62bdde25f067286c3b8a390
parent     4e931be12f70e320c7d750fba1c3a3c6b008ddb6 (diff)
parent     43c209f48e0aa9024705583cc6f0fafa7d6380ca (diff)
Merge release-20200622.1-97-g43c209f48 (automated)
-rw-r--r--  pkg/tcpip/stack/conntrack.go  277
-rw-r--r--  pkg/tcpip/stack/iptables.go  42
-rw-r--r--  pkg/tcpip/stack/iptables_state.go  40
-rw-r--r--  pkg/tcpip/stack/iptables_types.go  11
-rw-r--r--  pkg/tcpip/stack/stack.go  1
-rw-r--r--  pkg/tcpip/stack/stack_state_autogen.go  392
-rw-r--r--  pkg/tcpip/stack/tuple_list.go  193
-rw-r--r--  pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go  5
8 files changed, 918 insertions(+), 43 deletions(-)
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
index af9c325ca..d39baf620 100644
--- a/pkg/tcpip/stack/conntrack.go
+++ b/pkg/tcpip/stack/conntrack.go
@@ -15,9 +15,12 @@
package stack
import (
+ "encoding/binary"
"sync"
+ "time"
"gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
)
@@ -30,6 +33,10 @@ import (
//
// Currently, only TCP tracking is supported.
+// Our hash table has 16K buckets.
+// TODO(gvisor.dev/issue/170): These should be tunable.
+const numBuckets = 1 << 14
+
// Direction of the tuple.
type direction int
@@ -48,7 +55,12 @@ const (
// tuple holds a connection's identifying and manipulating data in one
// direction. It is immutable.
+//
+// +stateify savable
type tuple struct {
+ // tupleEntry is used to build an intrusive list of tuples.
+ tupleEntry
+
tupleID
// conn is the connection tracking entry this tuple belongs to.
@@ -61,6 +73,8 @@ type tuple struct {
// tupleID uniquely identifies a connection in one direction. It currently
// contains enough information to distinguish between any TCP or UDP
// connection, and will need to be extended to support other protocols.
+//
+// +stateify savable
type tupleID struct {
srcAddr tcpip.Address
srcPort uint16
@@ -83,6 +97,8 @@ func (ti tupleID) reply() tupleID {
}
// conn is a tracked connection.
+//
+// +stateify savable
type conn struct {
// original is the tuple in original direction. It is immutable.
original tuple
@@ -98,22 +114,67 @@ type conn struct {
tcbHook Hook
// mu protects tcb.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// tcb is TCB control block. It is used to keep track of states
// of tcp connection and is protected by mu.
tcb tcpconntrack.TCB
+
+ // lastUsed is the last time the connection saw a relevant packet, and
+ // is updated by each packet on the connection. It is protected by mu.
+ lastUsed time.Time `state:".(unixTime)"`
+}
+
+// timedOut returns whether the connection timed out based on its state.
+func (cn *conn) timedOut(now time.Time) bool {
+ const establishedTimeout = 5 * 24 * time.Hour
+ const defaultTimeout = 120 * time.Second
+ cn.mu.Lock()
+ defer cn.mu.Unlock()
+ if cn.tcb.State() == tcpconntrack.ResultAlive {
+ // Use the same default as Linux, which doesn't delete
+ // established connections for 5(!) days.
+ return now.Sub(cn.lastUsed) > establishedTimeout
+ }
+ // Use the same default as Linux, which lets connections in most states
+ // other than established remain for <= 120 seconds.
+ return now.Sub(cn.lastUsed) > defaultTimeout
}
// ConnTrack tracks all connections created for NAT rules. Most users are
// expected to only call handlePacket and createConnFor.
+//
+// ConnTrack keeps all connections in a slice of buckets, each of which holds a
+// linked list of tuples. This gives us some desirable properties:
+// - Each bucket has its own lock, lessening lock contention.
+// - The slice is large enough that lists stay short (<10 elements on average).
+// Thus traversal is fast.
+// - During linked list traversal we reap expired connections. This amortizes
+// the cost of reaping them and makes reapUnused faster.
+//
+// Locks are ordered by their location in the buckets slice. That is, a
+// goroutine that locks buckets[i] can only lock buckets[j] s.t. i < j.
+//
+// +stateify savable
type ConnTrack struct {
- // mu protects conns.
- mu sync.RWMutex
+ // seed is a one-time random value initialized at stack startup
+ // and is used in the calculation of hash keys for the list of buckets.
+ // It is immutable.
+ seed uint32
- // conns maintains a map of tuples needed for connection tracking for
- // iptables NAT rules. It is protected by mu.
- conns map[tupleID]tuple
+ // mu protects the buckets slice, but not buckets' contents. Only take
+ // the write lock if you are modifying the slice or saving for S/R.
+ mu sync.RWMutex `state:"nosave"`
+
+ // buckets is protected by mu.
+ buckets []bucket
+}
+
+// +stateify savable
+type bucket struct {
+ // mu protects tuples.
+ mu sync.Mutex `state:"nosave"`
+ tuples tupleList
}
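
The design described in the ConnTrack comment above (a fixed slice of buckets, each with its own mutex and list, plus a per-connection idle timeout) can be illustrated with a minimal standalone sketch. The types, field names, and timeouts below are simplified stand-ins for illustration, not the gVisor code itself:

package main

import (
	"fmt"
	"sync"
	"time"
)

// connEntry is a simplified stand-in for a tracked connection: just enough
// state to decide whether the entry has gone idle.
type connEntry struct {
	key         string
	established bool
	lastUsed    time.Time
}

// timedOut mirrors the idea of conn.timedOut above: established connections
// get a long timeout (Linux keeps them for 5 days), everything else a short one.
func (c *connEntry) timedOut(now time.Time) bool {
	const establishedTimeout = 5 * 24 * time.Hour
	const defaultTimeout = 120 * time.Second
	if c.established {
		return now.Sub(c.lastUsed) > establishedTimeout
	}
	return now.Sub(c.lastUsed) > defaultTimeout
}

// shardedTable keeps entries in independently locked buckets so lookups on
// different buckets never contend on one global lock.
type shardedTable struct {
	buckets []tableBucket
}

type tableBucket struct {
	mu      sync.Mutex
	entries []*connEntry
}

func main() {
	t := shardedTable{buckets: make([]tableBucket, 1<<4)} // 16 buckets here; the diff uses 1 << 14.
	b := &t.buckets[3]
	b.mu.Lock()
	b.entries = append(b.entries, &connEntry{key: "a", lastUsed: time.Now()})
	b.mu.Unlock()
	fmt.Println(t.buckets[3].entries[0].timedOut(time.Now())) // false
}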
// packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid
@@ -143,8 +204,9 @@ func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) {
// newConn creates new connection.
func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn {
conn := conn{
- manip: manip,
- tcbHook: hook,
+ manip: manip,
+ tcbHook: hook,
+ lastUsed: time.Now(),
}
conn.original = tuple{conn: &conn, tupleID: orig}
conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply}
@@ -162,14 +224,28 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) {
return nil, dirOriginal
}
- ct.mu.Lock()
- defer ct.mu.Unlock()
-
- tuple, ok := ct.conns[tid]
- if !ok {
- return nil, dirOriginal
+ bucket := ct.bucket(tid)
+ now := time.Now()
+
+ ct.mu.RLock()
+ defer ct.mu.RUnlock()
+ ct.buckets[bucket].mu.Lock()
+ defer ct.buckets[bucket].mu.Unlock()
+
+ // Iterate over the tuples in a bucket, cleaning up any unused
+ // connections we find.
+ for other := ct.buckets[bucket].tuples.Front(); other != nil; other = other.Next() {
+ // Clean up any timed-out connections we happen to find.
+ if ct.reapTupleLocked(other, bucket, now) {
+ // The tuple expired.
+ continue
+ }
+ if tid == other.tupleID {
+ return other.conn, other.direction
+ }
}
- return tuple.conn, tuple.direction
+
+ return nil, dirOriginal
}
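
The amortized cleanup that connFor performs, dropping expired entries while scanning a bucket for a match, is an instance of the usual "save the next pointer before removing" pattern for linked lists. Below is a standalone sketch of that pattern using the standard library's container/list; gVisor instead uses its own generated intrusive tupleList, so this is illustrative only:

package main

import (
	"container/list"
	"fmt"
	"time"
)

type entry struct {
	key      string
	lastUsed time.Time
}

// lookup scans one bucket's list for key, reaping expired entries it passes
// over. The next pointer is captured before Remove so iteration can continue.
func lookup(bucket *list.List, key string, ttl time.Duration, now time.Time) *entry {
	for e := bucket.Front(); e != nil; {
		next := e.Next()
		ent := e.Value.(*entry)
		switch {
		case now.Sub(ent.lastUsed) > ttl:
			bucket.Remove(e) // Expired: drop it and keep scanning.
		case ent.key == key:
			return ent
		}
		e = next
	}
	return nil
}

func main() {
	b := list.New()
	b.PushFront(&entry{key: "fresh", lastUsed: time.Now()})
	b.PushFront(&entry{key: "stale", lastUsed: time.Now().Add(-time.Hour)}) // Now at the front.
	fmt.Println(lookup(b, "fresh", time.Minute, time.Now()) != nil)         // true
	fmt.Println(b.Len())                                                    // 1: "stale" was reaped during the scan.
}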
// createConnFor creates a new conn for pkt.
@@ -197,13 +273,31 @@ func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarg
}
conn := newConn(tid, replyTID, manip, hook)
- // Add the changed tuple to the map.
- // TODO(gvisor.dev/issue/170): Need to support collisions using linked
- // list.
- ct.mu.Lock()
- defer ct.mu.Unlock()
- ct.conns[tid] = conn.original
- ct.conns[replyTID] = conn.reply
+ // Lock the buckets in the correct order.
+ tupleBucket := ct.bucket(tid)
+ replyBucket := ct.bucket(replyTID)
+ ct.mu.RLock()
+ defer ct.mu.RUnlock()
+ if tupleBucket < replyBucket {
+ ct.buckets[tupleBucket].mu.Lock()
+ ct.buckets[replyBucket].mu.Lock()
+ } else if tupleBucket > replyBucket {
+ ct.buckets[replyBucket].mu.Lock()
+ ct.buckets[tupleBucket].mu.Lock()
+ } else {
+ // Both tuples are in the same bucket.
+ ct.buckets[tupleBucket].mu.Lock()
+ }
+
+ // Add the tuple to the map.
+ ct.buckets[tupleBucket].tuples.PushFront(&conn.original)
+ ct.buckets[replyBucket].tuples.PushFront(&conn.reply)
+
+ // Unlocking can happen in any order.
+ ct.buckets[tupleBucket].mu.Unlock()
+ if tupleBucket != replyBucket {
+ ct.buckets[replyBucket].mu.Unlock()
+ }
return conn
}
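
The lock-ordering rule stated in the ConnTrack comment (a goroutine holding buckets[i] may only take buckets[j] with i < j) is what createConnFor enforces above by comparing bucket indices before locking. A generic standalone illustration of the technique, with hypothetical names rather than the gVisor code:

package main

import (
	"fmt"
	"sync"
)

// lockPair locks the mutexes for two bucket indices in ascending index order,
// which makes deadlock between concurrent insertions impossible. It returns a
// function that releases whatever was taken.
func lockPair(buckets []sync.Mutex, i, j int) (unlock func()) {
	switch {
	case i < j:
		buckets[i].Lock()
		buckets[j].Lock()
	case j < i:
		buckets[j].Lock()
		buckets[i].Lock()
	default: // Same bucket: lock it only once.
		buckets[i].Lock()
	}
	return func() {
		buckets[i].Unlock()
		if i != j {
			buckets[j].Unlock()
		}
	}
}

func main() {
	buckets := make([]sync.Mutex, 8)
	unlock := lockPair(buckets, 5, 2) // Locks 2 then 5, regardless of argument order.
	// ... insert the original tuple into bucket 5 and the reply tuple into bucket 2 ...
	unlock()
	fmt.Println("done")
}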
@@ -297,35 +391,134 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou
// other tcp states.
conn.mu.Lock()
defer conn.mu.Unlock()
- var st tcpconntrack.Result
- tcpHeader := header.TCP(pkt.TransportHeader)
- if conn.tcb.IsEmpty() {
+
+ // Mark the connection as having been used recently so it isn't reaped.
+ conn.lastUsed = time.Now()
+ // Update connection state.
+ if tcpHeader := header.TCP(pkt.TransportHeader); conn.tcb.IsEmpty() {
conn.tcb.Init(tcpHeader)
conn.tcbHook = hook
+ } else if hook == conn.tcbHook {
+ conn.tcb.UpdateStateOutbound(tcpHeader)
} else {
- switch hook {
- case conn.tcbHook:
- st = conn.tcb.UpdateStateOutbound(tcpHeader)
- default:
- st = conn.tcb.UpdateStateInbound(tcpHeader)
- }
+ conn.tcb.UpdateStateInbound(tcpHeader)
}
+}
+
+// bucket gets the conntrack bucket for a tupleID.
+func (ct *ConnTrack) bucket(id tupleID) int {
+ h := jenkins.Sum32(ct.seed)
+ h.Write([]byte(id.srcAddr))
+ h.Write([]byte(id.dstAddr))
+ shortBuf := make([]byte, 2)
+ binary.LittleEndian.PutUint16(shortBuf, id.srcPort)
+ h.Write([]byte(shortBuf))
+ binary.LittleEndian.PutUint16(shortBuf, id.dstPort)
+ h.Write([]byte(shortBuf))
+ binary.LittleEndian.PutUint16(shortBuf, uint16(id.transProto))
+ h.Write([]byte(shortBuf))
+ binary.LittleEndian.PutUint16(shortBuf, uint16(id.netProto))
+ h.Write([]byte(shortBuf))
+ ct.mu.RLock()
+ defer ct.mu.RUnlock()
+ return int(h.Sum32()) % len(ct.buckets)
+}
- // Delete conn if tcp connection is closed.
- if st == tcpconntrack.ResultClosedByPeer || st == tcpconntrack.ResultClosedBySelf || st == tcpconntrack.ResultReset {
- ct.deleteConn(conn)
+// reapUnused deletes timed out entries from the conntrack map. The rules for
+// reaping are:
+// - Most reaping occurs in connFor, which is called on each packet. connFor
+// cleans up the bucket the packet's connection maps to. Thus calls to
+// reapUnused should be fast.
+// - Each call to reapUnused traverses a fraction of the conntrack table.
+// Specifically, it traverses len(ct.buckets)/fractionPerReaping.
+// - After reaping, reapUnused decides when it should next run based on the
+// ratio of expired connections to examined connections. If the ratio is
+// greater than maxExpiredPct, it schedules the next run quickly. Otherwise it
+// slightly increases the interval between runs.
+// - maxFullTraversal caps the time it takes to traverse the entire table.
+//
+// reapUnused returns the next bucket that should be checked and the time after
+// which it should be called again.
+func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) {
+ // TODO(gvisor.dev/issue/170): This can be more finely controlled, as
+ // it is in Linux via sysctl.
+ const fractionPerReaping = 128
+ const maxExpiredPct = 50
+ const maxFullTraversal = 60 * time.Second
+ const minInterval = 10 * time.Millisecond
+ const maxInterval = maxFullTraversal / fractionPerReaping
+
+ now := time.Now()
+ checked := 0
+ expired := 0
+ var idx int
+ ct.mu.RLock()
+ defer ct.mu.RUnlock()
+ for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ {
+ idx = (i + start) % len(ct.buckets)
+ ct.buckets[idx].mu.Lock()
+ for tuple := ct.buckets[idx].tuples.Front(); tuple != nil; tuple = tuple.Next() {
+ checked++
+ if ct.reapTupleLocked(tuple, idx, now) {
+ expired++
+ }
+ }
+ ct.buckets[idx].mu.Unlock()
+ }
+ // We already checked buckets[idx].
+ idx++
+
+ // If half or more of the connections are expired, the table has gotten
+ // stale. Reschedule quickly.
+ expiredPct := 0
+ if checked != 0 {
+ expiredPct = expired * 100 / checked
+ }
+ if expiredPct > maxExpiredPct {
+ return idx, minInterval
+ }
+ if interval := prevInterval + minInterval; interval <= maxInterval {
+ // Increment the interval between runs.
+ return idx, interval
}
+ // We've hit the maximum interval.
+ return idx, maxInterval
}
-// deleteConn deletes the connection.
-func (ct *ConnTrack) deleteConn(conn *conn) {
- if conn == nil {
- return
+// reapTupleLocked tries to remove tuple and its reply from the table. It
+// returns whether the tuple's connection has timed out.
+//
+// Preconditions: ct.mu is locked for reading and bucket is locked.
+func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool {
+ if !tuple.conn.timedOut(now) {
+ return false
}
- ct.mu.Lock()
- defer ct.mu.Unlock()
+ // To maintain lock order, we can only reap these tuples if the reply
+ // appears later in the table.
+ replyBucket := ct.bucket(tuple.reply())
+ if bucket > replyBucket {
+ return true
+ }
+
+ // Don't re-lock if both tuples are in the same bucket.
+ differentBuckets := bucket != replyBucket
+ if differentBuckets {
+ ct.buckets[replyBucket].mu.Lock()
+ }
+
+ // We have the buckets locked and can remove both tuples.
+ if tuple.direction == dirOriginal {
+ ct.buckets[replyBucket].tuples.Remove(&tuple.conn.reply)
+ } else {
+ ct.buckets[replyBucket].tuples.Remove(&tuple.conn.original)
+ }
+ ct.buckets[bucket].tuples.Remove(tuple)
+
+ // Don't re-unlock if both tuples are in the same bucket.
+ if differentBuckets {
+ ct.buckets[replyBucket].mu.Unlock()
+ }
- delete(ct.conns, conn.original.tupleID)
- delete(ct.conns, conn.reply.tupleID)
+ return true
}
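
The bucket() helper in the hunk above hashes the whole connection tuple together with a per-stack random seed, so bucket placement is not predictable across restarts or by peers. A standalone sketch of the same idea, using the standard library's FNV hash as a stand-in for gVisor's jenkins package:

package main

import (
	"encoding/binary"
	"fmt"
	"hash/fnv"
)

// bucketFor hashes a connection tuple plus a random seed into a bucket index.
// Mixing in the seed makes the distribution unpredictable to remote peers.
func bucketFor(seed uint32, srcAddr, dstAddr []byte, srcPort, dstPort, proto uint16, numBuckets int) int {
	h := fnv.New32a()
	var buf [4]byte
	binary.LittleEndian.PutUint32(buf[:], seed)
	h.Write(buf[:4])
	h.Write(srcAddr)
	h.Write(dstAddr)
	binary.LittleEndian.PutUint16(buf[:2], srcPort)
	h.Write(buf[:2])
	binary.LittleEndian.PutUint16(buf[:2], dstPort)
	h.Write(buf[:2])
	binary.LittleEndian.PutUint16(buf[:2], proto)
	h.Write(buf[:2])
	return int(h.Sum32() % uint32(numBuckets))
}

func main() {
	src := []byte{192, 168, 1, 2}
	dst := []byte{10, 0, 0, 1}
	fmt.Println(bucketFor(0xdeadbeef, src, dst, 49152, 80, 6, 1<<14))
}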
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
index 974d77c36..f846ea2e5 100644
--- a/pkg/tcpip/stack/iptables.go
+++ b/pkg/tcpip/stack/iptables.go
@@ -16,6 +16,7 @@ package stack
import (
"fmt"
+ "time"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -41,6 +42,9 @@ const (
// underflow.
const HookUnset = -1
+// reaperDelay is how long to wait before starting to reap connections.
+const reaperDelay = 5 * time.Second
+
// DefaultTables returns a default set of tables. Each chain is set to accept
// all packets.
func DefaultTables() *IPTables {
@@ -112,8 +116,9 @@ func DefaultTables() *IPTables {
Output: []string{TablenameMangle, TablenameNat, TablenameFilter},
},
connections: ConnTrack{
- conns: make(map[tupleID]tuple),
+ seed: generateRandUint32(),
},
+ reaperDone: make(chan struct{}, 1),
}
}
@@ -169,6 +174,12 @@ func (it *IPTables) GetTable(name string) (Table, bool) {
func (it *IPTables) ReplaceTable(name string, table Table) {
it.mu.Lock()
defer it.mu.Unlock()
+ // If iptables is being enabled, initialize the conntrack table and
+ // reaper.
+ if !it.modified {
+ it.connections.buckets = make([]bucket, numBuckets)
+ it.startReaper(reaperDelay)
+ }
it.modified = true
it.tables[name] = table
}
@@ -249,6 +260,35 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr
return true
}
+// beforeSave is invoked by stateify.
+func (it *IPTables) beforeSave() {
+ // Ensure the reaper exits cleanly.
+ it.reaperDone <- struct{}{}
+ // Prevent others from modifying the connection table.
+ it.connections.mu.Lock()
+}
+
+// afterLoad is invoked by stateify.
+func (it *IPTables) afterLoad() {
+ it.startReaper(reaperDelay)
+}
+
+// startReaper starts a goroutine that wakes up periodically to reap timed out
+// connections.
+func (it *IPTables) startReaper(interval time.Duration) {
+ go func() { // S/R-SAFE: reaperDone is signalled when iptables is saved.
+ bucket := 0
+ for {
+ select {
+ case <-it.reaperDone:
+ return
+ case <-time.After(interval):
+ bucket, interval = it.connections.reapUnused(bucket, interval)
+ }
+ }
+ }()
+}
+
// CheckPackets runs pkts through the rules for hook and returns a map of packets that
// should not go forward.
//
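
Taken together, startReaper above and reapUnused in conntrack.go form a background loop that wakes up, scans a slice of the table, and then chooses its own next wake-up based on how stale the table looked. A minimal standalone sketch of that control loop follows; the scan itself is stubbed out, and the names and interval constants are illustrative, not gVisor's:

package main

import (
	"fmt"
	"time"
)

const (
	minInterval   = 10 * time.Millisecond
	maxInterval   = 500 * time.Millisecond
	maxExpiredPct = 50
)

// nextInterval implements the back-off used by reapUnused: rescan quickly if
// the table looked stale, otherwise stretch the interval a little, up to a cap.
func nextInterval(prev time.Duration, checked, expired int) time.Duration {
	if checked != 0 && expired*100/checked > maxExpiredPct {
		return minInterval
	}
	if next := prev + minInterval; next <= maxInterval {
		return next
	}
	return maxInterval
}

// runReaper wakes up on a self-adjusting timer until done is signalled.
func runReaper(done <-chan struct{}, scan func() (checked, expired int)) {
	interval := minInterval
	for {
		select {
		case <-done:
			return
		case <-time.After(interval):
			checked, expired := scan()
			interval = nextInterval(interval, checked, expired)
		}
	}
}

func main() {
	done := make(chan struct{}, 1)
	go runReaper(done, func() (int, int) { return 100, 3 }) // Pretend 3% of scanned entries were stale.
	time.Sleep(100 * time.Millisecond)
	done <- struct{}{}
	fmt.Println("reaper stopped")
}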
diff --git a/pkg/tcpip/stack/iptables_state.go b/pkg/tcpip/stack/iptables_state.go
new file mode 100644
index 000000000..529e02a07
--- /dev/null
+++ b/pkg/tcpip/stack/iptables_state.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+ "time"
+)
+
+// +stateify savable
+type unixTime struct {
+ second int64
+ nano int64
+}
+
+// saveLastUsed is invoked by stateify.
+func (cn *conn) saveLastUsed() unixTime {
+ return unixTime{cn.lastUsed.Unix(), cn.lastUsed.UnixNano()}
+}
+
+// loadLastUsed is invoked by stateify.
+func (cn *conn) loadLastUsed(unix unixTime) {
+ cn.lastUsed = time.Unix(unix.second, unix.nano)
+}
+
+// beforeSave is invoked by stateify.
+func (ct *ConnTrack) beforeSave() {
+ ct.mu.Lock()
+}
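
Because time.Time carries unexported state (including a monotonic clock reading) that cannot be serialized directly, the save/restore hooks above flatten lastUsed into plain integers. The general round-trip looks like the standalone sketch below; for brevity it stores a single nanosecond count rather than the two fields of the unixTime struct above, so treat it as an illustration of the technique only:

package main

import (
	"fmt"
	"time"
)

// savedTime is a serializable stand-in for time.Time: just nanoseconds since
// the Unix epoch, with no location or monotonic-clock component.
type savedTime struct {
	nanos int64
}

func save(t time.Time) savedTime { return savedTime{nanos: t.UnixNano()} }

func load(s savedTime) time.Time { return time.Unix(0, s.nanos) }

func main() {
	orig := time.Now()
	restored := load(save(orig))
	// Equal up to nanosecond precision; the monotonic reading is dropped.
	fmt.Println(restored.Equal(orig))
}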
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
index c528ec381..eb70e3104 100644
--- a/pkg/tcpip/stack/iptables_types.go
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -78,6 +78,8 @@ const (
)
// IPTables holds all the tables for a netstack.
+//
+// +stateify savable
type IPTables struct {
// mu protects tables, priorities, and modified.
mu sync.RWMutex
@@ -97,10 +99,15 @@ type IPTables struct {
modified bool
connections ConnTrack
+
+ // reaperDone can be signalled to stop the reaper goroutine.
+ reaperDone chan struct{}
}
// A Table defines a set of chains and hooks into the network stack. It is
// really just a list of rules.
+//
+// +stateify savable
type Table struct {
// Rules holds the rules that make up the table.
Rules []Rule
@@ -130,6 +137,8 @@ func (table *Table) ValidHooks() uint32 {
// contains zero or more matchers, each of which is a specification of which
// packets this rule applies to. If there are no matchers in the rule, it
// applies to any packet.
+//
+// +stateify savable
type Rule struct {
// Filter holds basic IP filtering fields common to every rule.
Filter IPHeaderFilter
@@ -142,6 +151,8 @@ type Rule struct {
}
// IPHeaderFilter holds basic IP filtering data common to every rule.
+//
+// +stateify savable
type IPHeaderFilter struct {
// Protocol matches the transport protocol.
Protocol tcpip.TransportProtocolNumber
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index cdcfb8321..0aa815447 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -425,6 +425,7 @@ type Stack struct {
handleLocal bool
// tables are the iptables packet filtering and manipulation rules.
+ // TODO(gvisor.dev/issue/170): S/R this field.
tables *IPTables
// resumableEndpoints is a list of endpoints that need to be resumed if the
diff --git a/pkg/tcpip/stack/stack_state_autogen.go b/pkg/tcpip/stack/stack_state_autogen.go
index 6efa9a773..3bf5a8660 100644
--- a/pkg/tcpip/stack/stack_state_autogen.go
+++ b/pkg/tcpip/stack/stack_state_autogen.go
@@ -6,6 +6,334 @@ import (
"gvisor.dev/gvisor/pkg/state"
)
+func (x *tuple) StateTypeName() string {
+ return "pkg/tcpip/stack.tuple"
+}
+
+func (x *tuple) StateFields() []string {
+ return []string{
+ "tupleEntry",
+ "tupleID",
+ "conn",
+ "direction",
+ }
+}
+
+func (x *tuple) beforeSave() {}
+
+func (x *tuple) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.tupleEntry)
+ m.Save(1, &x.tupleID)
+ m.Save(2, &x.conn)
+ m.Save(3, &x.direction)
+}
+
+func (x *tuple) afterLoad() {}
+
+func (x *tuple) StateLoad(m state.Source) {
+ m.Load(0, &x.tupleEntry)
+ m.Load(1, &x.tupleID)
+ m.Load(2, &x.conn)
+ m.Load(3, &x.direction)
+}
+
+func (x *tupleID) StateTypeName() string {
+ return "pkg/tcpip/stack.tupleID"
+}
+
+func (x *tupleID) StateFields() []string {
+ return []string{
+ "srcAddr",
+ "srcPort",
+ "dstAddr",
+ "dstPort",
+ "transProto",
+ "netProto",
+ }
+}
+
+func (x *tupleID) beforeSave() {}
+
+func (x *tupleID) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.srcAddr)
+ m.Save(1, &x.srcPort)
+ m.Save(2, &x.dstAddr)
+ m.Save(3, &x.dstPort)
+ m.Save(4, &x.transProto)
+ m.Save(5, &x.netProto)
+}
+
+func (x *tupleID) afterLoad() {}
+
+func (x *tupleID) StateLoad(m state.Source) {
+ m.Load(0, &x.srcAddr)
+ m.Load(1, &x.srcPort)
+ m.Load(2, &x.dstAddr)
+ m.Load(3, &x.dstPort)
+ m.Load(4, &x.transProto)
+ m.Load(5, &x.netProto)
+}
+
+func (x *conn) StateTypeName() string {
+ return "pkg/tcpip/stack.conn"
+}
+
+func (x *conn) StateFields() []string {
+ return []string{
+ "original",
+ "reply",
+ "manip",
+ "tcbHook",
+ "tcb",
+ "lastUsed",
+ }
+}
+
+func (x *conn) beforeSave() {}
+
+func (x *conn) StateSave(m state.Sink) {
+ x.beforeSave()
+ var lastUsed unixTime = x.saveLastUsed()
+ m.SaveValue(5, lastUsed)
+ m.Save(0, &x.original)
+ m.Save(1, &x.reply)
+ m.Save(2, &x.manip)
+ m.Save(3, &x.tcbHook)
+ m.Save(4, &x.tcb)
+}
+
+func (x *conn) afterLoad() {}
+
+func (x *conn) StateLoad(m state.Source) {
+ m.Load(0, &x.original)
+ m.Load(1, &x.reply)
+ m.Load(2, &x.manip)
+ m.Load(3, &x.tcbHook)
+ m.Load(4, &x.tcb)
+ m.LoadValue(5, new(unixTime), func(y interface{}) { x.loadLastUsed(y.(unixTime)) })
+}
+
+func (x *ConnTrack) StateTypeName() string {
+ return "pkg/tcpip/stack.ConnTrack"
+}
+
+func (x *ConnTrack) StateFields() []string {
+ return []string{
+ "seed",
+ "buckets",
+ }
+}
+
+func (x *ConnTrack) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.seed)
+ m.Save(1, &x.buckets)
+}
+
+func (x *ConnTrack) afterLoad() {}
+
+func (x *ConnTrack) StateLoad(m state.Source) {
+ m.Load(0, &x.seed)
+ m.Load(1, &x.buckets)
+}
+
+func (x *bucket) StateTypeName() string {
+ return "pkg/tcpip/stack.bucket"
+}
+
+func (x *bucket) StateFields() []string {
+ return []string{
+ "tuples",
+ }
+}
+
+func (x *bucket) beforeSave() {}
+
+func (x *bucket) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.tuples)
+}
+
+func (x *bucket) afterLoad() {}
+
+func (x *bucket) StateLoad(m state.Source) {
+ m.Load(0, &x.tuples)
+}
+
+func (x *unixTime) StateTypeName() string {
+ return "pkg/tcpip/stack.unixTime"
+}
+
+func (x *unixTime) StateFields() []string {
+ return []string{
+ "second",
+ "nano",
+ }
+}
+
+func (x *unixTime) beforeSave() {}
+
+func (x *unixTime) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.second)
+ m.Save(1, &x.nano)
+}
+
+func (x *unixTime) afterLoad() {}
+
+func (x *unixTime) StateLoad(m state.Source) {
+ m.Load(0, &x.second)
+ m.Load(1, &x.nano)
+}
+
+func (x *IPTables) StateTypeName() string {
+ return "pkg/tcpip/stack.IPTables"
+}
+
+func (x *IPTables) StateFields() []string {
+ return []string{
+ "mu",
+ "tables",
+ "priorities",
+ "modified",
+ "connections",
+ "reaperDone",
+ }
+}
+
+func (x *IPTables) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.mu)
+ m.Save(1, &x.tables)
+ m.Save(2, &x.priorities)
+ m.Save(3, &x.modified)
+ m.Save(4, &x.connections)
+ m.Save(5, &x.reaperDone)
+}
+
+func (x *IPTables) StateLoad(m state.Source) {
+ m.Load(0, &x.mu)
+ m.Load(1, &x.tables)
+ m.Load(2, &x.priorities)
+ m.Load(3, &x.modified)
+ m.Load(4, &x.connections)
+ m.Load(5, &x.reaperDone)
+ m.AfterLoad(x.afterLoad)
+}
+
+func (x *Table) StateTypeName() string {
+ return "pkg/tcpip/stack.Table"
+}
+
+func (x *Table) StateFields() []string {
+ return []string{
+ "Rules",
+ "BuiltinChains",
+ "Underflows",
+ "UserChains",
+ }
+}
+
+func (x *Table) beforeSave() {}
+
+func (x *Table) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.Rules)
+ m.Save(1, &x.BuiltinChains)
+ m.Save(2, &x.Underflows)
+ m.Save(3, &x.UserChains)
+}
+
+func (x *Table) afterLoad() {}
+
+func (x *Table) StateLoad(m state.Source) {
+ m.Load(0, &x.Rules)
+ m.Load(1, &x.BuiltinChains)
+ m.Load(2, &x.Underflows)
+ m.Load(3, &x.UserChains)
+}
+
+func (x *Rule) StateTypeName() string {
+ return "pkg/tcpip/stack.Rule"
+}
+
+func (x *Rule) StateFields() []string {
+ return []string{
+ "Filter",
+ "Matchers",
+ "Target",
+ }
+}
+
+func (x *Rule) beforeSave() {}
+
+func (x *Rule) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.Filter)
+ m.Save(1, &x.Matchers)
+ m.Save(2, &x.Target)
+}
+
+func (x *Rule) afterLoad() {}
+
+func (x *Rule) StateLoad(m state.Source) {
+ m.Load(0, &x.Filter)
+ m.Load(1, &x.Matchers)
+ m.Load(2, &x.Target)
+}
+
+func (x *IPHeaderFilter) StateTypeName() string {
+ return "pkg/tcpip/stack.IPHeaderFilter"
+}
+
+func (x *IPHeaderFilter) StateFields() []string {
+ return []string{
+ "Protocol",
+ "Dst",
+ "DstMask",
+ "DstInvert",
+ "Src",
+ "SrcMask",
+ "SrcInvert",
+ "OutputInterface",
+ "OutputInterfaceMask",
+ "OutputInterfaceInvert",
+ }
+}
+
+func (x *IPHeaderFilter) beforeSave() {}
+
+func (x *IPHeaderFilter) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.Protocol)
+ m.Save(1, &x.Dst)
+ m.Save(2, &x.DstMask)
+ m.Save(3, &x.DstInvert)
+ m.Save(4, &x.Src)
+ m.Save(5, &x.SrcMask)
+ m.Save(6, &x.SrcInvert)
+ m.Save(7, &x.OutputInterface)
+ m.Save(8, &x.OutputInterfaceMask)
+ m.Save(9, &x.OutputInterfaceInvert)
+}
+
+func (x *IPHeaderFilter) afterLoad() {}
+
+func (x *IPHeaderFilter) StateLoad(m state.Source) {
+ m.Load(0, &x.Protocol)
+ m.Load(1, &x.Dst)
+ m.Load(2, &x.DstMask)
+ m.Load(3, &x.DstInvert)
+ m.Load(4, &x.Src)
+ m.Load(5, &x.SrcMask)
+ m.Load(6, &x.SrcInvert)
+ m.Load(7, &x.OutputInterface)
+ m.Load(8, &x.OutputInterfaceMask)
+ m.Load(9, &x.OutputInterfaceInvert)
+}
+
func (x *linkAddrEntryList) StateTypeName() string {
return "pkg/tcpip/stack.linkAddrEntryList"
}
@@ -261,7 +589,69 @@ func (x *multiPortEndpoint) StateLoad(m state.Source) {
m.Load(4, &x.flags)
}
+func (x *tupleList) StateTypeName() string {
+ return "pkg/tcpip/stack.tupleList"
+}
+
+func (x *tupleList) StateFields() []string {
+ return []string{
+ "head",
+ "tail",
+ }
+}
+
+func (x *tupleList) beforeSave() {}
+
+func (x *tupleList) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.head)
+ m.Save(1, &x.tail)
+}
+
+func (x *tupleList) afterLoad() {}
+
+func (x *tupleList) StateLoad(m state.Source) {
+ m.Load(0, &x.head)
+ m.Load(1, &x.tail)
+}
+
+func (x *tupleEntry) StateTypeName() string {
+ return "pkg/tcpip/stack.tupleEntry"
+}
+
+func (x *tupleEntry) StateFields() []string {
+ return []string{
+ "next",
+ "prev",
+ }
+}
+
+func (x *tupleEntry) beforeSave() {}
+
+func (x *tupleEntry) StateSave(m state.Sink) {
+ x.beforeSave()
+ m.Save(0, &x.next)
+ m.Save(1, &x.prev)
+}
+
+func (x *tupleEntry) afterLoad() {}
+
+func (x *tupleEntry) StateLoad(m state.Source) {
+ m.Load(0, &x.next)
+ m.Load(1, &x.prev)
+}
+
func init() {
+ state.Register((*tuple)(nil))
+ state.Register((*tupleID)(nil))
+ state.Register((*conn)(nil))
+ state.Register((*ConnTrack)(nil))
+ state.Register((*bucket)(nil))
+ state.Register((*unixTime)(nil))
+ state.Register((*IPTables)(nil))
+ state.Register((*Table)(nil))
+ state.Register((*Rule)(nil))
+ state.Register((*IPHeaderFilter)(nil))
state.Register((*linkAddrEntryList)(nil))
state.Register((*linkAddrEntryEntry)(nil))
state.Register((*PacketBufferList)(nil))
@@ -271,4 +661,6 @@ func init() {
state.Register((*GSO)(nil))
state.Register((*TransportEndpointInfo)(nil))
state.Register((*multiPortEndpoint)(nil))
+ state.Register((*tupleList)(nil))
+ state.Register((*tupleEntry)(nil))
}
diff --git a/pkg/tcpip/stack/tuple_list.go b/pkg/tcpip/stack/tuple_list.go
new file mode 100644
index 000000000..0d1b98874
--- /dev/null
+++ b/pkg/tcpip/stack/tuple_list.go
@@ -0,0 +1,193 @@
+package stack
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type tupleElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (tupleElementMapper) linkerFor(elem *tuple) *tuple { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type tupleList struct {
+ head *tuple
+ tail *tuple
+}
+
+// Reset resets list l to the empty state.
+func (l *tupleList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *tupleList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *tupleList) Front() *tuple {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *tupleList) Back() *tuple {
+ return l.tail
+}
+
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+func (l *tupleList) Len() (count int) {
+ for e := l.Front(); e != nil; e = (tupleElementMapper{}.linkerFor(e)).Next() {
+ count++
+ }
+ return count
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *tupleList) PushFront(e *tuple) {
+ linker := tupleElementMapper{}.linkerFor(e)
+ linker.SetNext(l.head)
+ linker.SetPrev(nil)
+ if l.head != nil {
+ tupleElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *tupleList) PushBack(e *tuple) {
+ linker := tupleElementMapper{}.linkerFor(e)
+ linker.SetNext(nil)
+ linker.SetPrev(l.tail)
+ if l.tail != nil {
+ tupleElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *tupleList) PushBackList(m *tupleList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ tupleElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ tupleElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *tupleList) InsertAfter(b, e *tuple) {
+ bLinker := tupleElementMapper{}.linkerFor(b)
+ eLinker := tupleElementMapper{}.linkerFor(e)
+
+ a := bLinker.Next()
+
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ bLinker.SetNext(e)
+
+ if a != nil {
+ tupleElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *tupleList) InsertBefore(a, e *tuple) {
+ aLinker := tupleElementMapper{}.linkerFor(a)
+ eLinker := tupleElementMapper{}.linkerFor(e)
+
+ b := aLinker.Prev()
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ aLinker.SetPrev(e)
+
+ if b != nil {
+ tupleElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *tupleList) Remove(e *tuple) {
+ linker := tupleElementMapper{}.linkerFor(e)
+ prev := linker.Prev()
+ next := linker.Next()
+
+ if prev != nil {
+ tupleElementMapper{}.linkerFor(prev).SetNext(next)
+ } else if l.head == e {
+ l.head = next
+ }
+
+ if next != nil {
+ tupleElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else if l.tail == e {
+ l.tail = prev
+ }
+
+ linker.SetNext(nil)
+ linker.SetPrev(nil)
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type tupleEntry struct {
+ next *tuple
+ prev *tuple
+}
+
+// Next returns the entry that follows e in the list.
+func (e *tupleEntry) Next() *tuple {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *tupleEntry) Prev() *tuple {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+func (e *tupleEntry) SetNext(elem *tuple) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+func (e *tupleEntry) SetPrev(elem *tuple) {
+ e.prev = elem
+}
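
tuple_list.go is generated from gVisor's generic list template. The point of an intrusive list is that each element embeds its own link fields, so pushing and removing entries never allocates. A tiny handwritten sketch of the same idea, not the generated code:

package main

import "fmt"

// node embeds its own links, so list operations on it allocate nothing.
type node struct {
	next, prev *node
	value      int
}

// intrusiveList is the minimal doubly linked list those links support.
type intrusiveList struct {
	head, tail *node
}

func (l *intrusiveList) pushFront(n *node) {
	n.next, n.prev = l.head, nil
	if l.head != nil {
		l.head.prev = n
	} else {
		l.tail = n
	}
	l.head = n
}

func (l *intrusiveList) remove(n *node) {
	if n.prev != nil {
		n.prev.next = n.next
	} else if l.head == n {
		l.head = n.next
	}
	if n.next != nil {
		n.next.prev = n.prev
	} else if l.tail == n {
		l.tail = n.prev
	}
	n.next, n.prev = nil, nil
}

func main() {
	var l intrusiveList
	a, b := &node{value: 1}, &node{value: 2}
	l.pushFront(a)
	l.pushFront(b) // List is now: 2, 1.
	l.remove(a)
	for n := l.head; n != nil; n = n.next {
		fmt.Println(n.value) // Prints 2.
	}
}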
diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
index 12bc1b5b5..558b06df0 100644
--- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
+++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
@@ -106,6 +106,11 @@ func (t *TCB) UpdateStateOutbound(tcp header.TCP) Result {
return st
}
+// State returns the current state of the TCB.
+func (t *TCB) State() Result {
+ return t.state
+}
+
// IsAlive returns true as long as the connection is established(Alive)
// or connecting state.
func (t *TCB) IsAlive() bool {