diff options
author | gVisor bot <gvisor-bot@google.com> | 2020-07-13 19:03:18 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2020-07-13 19:03:18 +0000 |
commit | 0a49098e2b456f23af79341530d79f9128b58bb5 (patch) | |
tree | dde9d942d1dae331e62bdde25f067286c3b8a390 /pkg | |
parent | 4e931be12f70e320c7d750fba1c3a3c6b008ddb6 (diff) | |
parent | 43c209f48e0aa9024705583cc6f0fafa7d6380ca (diff) |
Merge release-20200622.1-97-g43c209f48 (automated)
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/tcpip/stack/conntrack.go | 277 | ||||
-rw-r--r-- | pkg/tcpip/stack/iptables.go | 42 | ||||
-rw-r--r-- | pkg/tcpip/stack/iptables_state.go | 40 | ||||
-rw-r--r-- | pkg/tcpip/stack/iptables_types.go | 11 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack.go | 1 | ||||
-rw-r--r-- | pkg/tcpip/stack/stack_state_autogen.go | 392 | ||||
-rw-r--r-- | pkg/tcpip/stack/tuple_list.go | 193 | ||||
-rw-r--r-- | pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go | 5 |
8 files changed, 918 insertions, 43 deletions
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index af9c325ca..d39baf620 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -15,9 +15,12 @@ package stack import ( + "encoding/binary" "sync" + "time" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack" ) @@ -30,6 +33,10 @@ import ( // // Currently, only TCP tracking is supported. +// Our hash table has 16K buckets. +// TODO(gvisor.dev/issue/170): These should be tunable. +const numBuckets = 1 << 14 + // Direction of the tuple. type direction int @@ -48,7 +55,12 @@ const ( // tuple holds a connection's identifying and manipulating data in one // direction. It is immutable. +// +// +stateify savable type tuple struct { + // tupleEntry is used to build an intrusive list of tuples. + tupleEntry + tupleID // conn is the connection tracking entry this tuple belongs to. @@ -61,6 +73,8 @@ type tuple struct { // tupleID uniquely identifies a connection in one direction. It currently // contains enough information to distinguish between any TCP or UDP // connection, and will need to be extended to support other protocols. +// +// +stateify savable type tupleID struct { srcAddr tcpip.Address srcPort uint16 @@ -83,6 +97,8 @@ func (ti tupleID) reply() tupleID { } // conn is a tracked connection. +// +// +stateify savable type conn struct { // original is the tuple in original direction. It is immutable. original tuple @@ -98,22 +114,67 @@ type conn struct { tcbHook Hook // mu protects tcb. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // tcb is TCB control block. It is used to keep track of states // of tcp connection and is protected by mu. tcb tcpconntrack.TCB + + // lastUsed is the last time the connection saw a relevant packet, and + // is updated by each packet on the connection. It is protected by mu. + lastUsed time.Time `state:".(unixTime)"` +} + +// timedOut returns whether the connection timed out based on its state. +func (cn *conn) timedOut(now time.Time) bool { + const establishedTimeout = 5 * 24 * time.Hour + const defaultTimeout = 120 * time.Second + cn.mu.Lock() + defer cn.mu.Unlock() + if cn.tcb.State() == tcpconntrack.ResultAlive { + // Use the same default as Linux, which doesn't delete + // established connections for 5(!) days. + return now.Sub(cn.lastUsed) > establishedTimeout + } + // Use the same default as Linux, which lets connections in most states + // other than established remain for <= 120 seconds. + return now.Sub(cn.lastUsed) > defaultTimeout } // ConnTrack tracks all connections created for NAT rules. Most users are // expected to only call handlePacket and createConnFor. +// +// ConnTrack keeps all connections in a slice of buckets, each of which holds a +// linked list of tuples. This gives us some desirable properties: +// - Each bucket has its own lock, lessening lock contention. +// - The slice is large enough that lists stay short (<10 elements on average). +// Thus traversal is fast. +// - During linked list traversal we reap expired connections. This amortizes +// the cost of reaping them and makes reapUnused faster. +// +// Locks are ordered by their location in the buckets slice. That is, a +// goroutine that locks buckets[i] can only lock buckets[j] s.t. i < j. +// +// +stateify savable type ConnTrack struct { - // mu protects conns. - mu sync.RWMutex + // seed is a one-time random value initialized at stack startup + // and is used in the calculation of hash keys for the list of buckets. + // It is immutable. + seed uint32 - // conns maintains a map of tuples needed for connection tracking for - // iptables NAT rules. It is protected by mu. - conns map[tupleID]tuple + // mu protects the buckets slice, but not buckets' contents. Only take + // the write lock if you are modifying the slice or saving for S/R. + mu sync.RWMutex `state:"nosave"` + + // buckets is protected by mu. + buckets []bucket +} + +// +stateify savable +type bucket struct { + // mu protects tuples. + mu sync.Mutex `state:"nosave"` + tuples tupleList } // packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid @@ -143,8 +204,9 @@ func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) { // newConn creates new connection. func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn { conn := conn{ - manip: manip, - tcbHook: hook, + manip: manip, + tcbHook: hook, + lastUsed: time.Now(), } conn.original = tuple{conn: &conn, tupleID: orig} conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply} @@ -162,14 +224,28 @@ func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { return nil, dirOriginal } - ct.mu.Lock() - defer ct.mu.Unlock() - - tuple, ok := ct.conns[tid] - if !ok { - return nil, dirOriginal + bucket := ct.bucket(tid) + now := time.Now() + + ct.mu.RLock() + defer ct.mu.RUnlock() + ct.buckets[bucket].mu.Lock() + defer ct.buckets[bucket].mu.Unlock() + + // Iterate over the tuples in a bucket, cleaning up any unused + // connections we find. + for other := ct.buckets[bucket].tuples.Front(); other != nil; other = other.Next() { + // Clean up any timed-out connections we happen to find. + if ct.reapTupleLocked(other, bucket, now) { + // The tuple expired. + continue + } + if tid == other.tupleID { + return other.conn, other.direction + } } - return tuple.conn, tuple.direction + + return nil, dirOriginal } // createConnFor creates a new conn for pkt. @@ -197,13 +273,31 @@ func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarg } conn := newConn(tid, replyTID, manip, hook) - // Add the changed tuple to the map. - // TODO(gvisor.dev/issue/170): Need to support collisions using linked - // list. - ct.mu.Lock() - defer ct.mu.Unlock() - ct.conns[tid] = conn.original - ct.conns[replyTID] = conn.reply + // Lock the buckets in the correct order. + tupleBucket := ct.bucket(tid) + replyBucket := ct.bucket(replyTID) + ct.mu.RLock() + defer ct.mu.RUnlock() + if tupleBucket < replyBucket { + ct.buckets[tupleBucket].mu.Lock() + ct.buckets[replyBucket].mu.Lock() + } else if tupleBucket > replyBucket { + ct.buckets[replyBucket].mu.Lock() + ct.buckets[tupleBucket].mu.Lock() + } else { + // Both tuples are in the same bucket. + ct.buckets[tupleBucket].mu.Lock() + } + + // Add the tuple to the map. + ct.buckets[tupleBucket].tuples.PushFront(&conn.original) + ct.buckets[replyBucket].tuples.PushFront(&conn.reply) + + // Unlocking can happen in any order. + ct.buckets[tupleBucket].mu.Unlock() + if tupleBucket != replyBucket { + ct.buckets[replyBucket].mu.Unlock() + } return conn } @@ -297,35 +391,134 @@ func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Rou // other tcp states. conn.mu.Lock() defer conn.mu.Unlock() - var st tcpconntrack.Result - tcpHeader := header.TCP(pkt.TransportHeader) - if conn.tcb.IsEmpty() { + + // Mark the connection as having been used recently so it isn't reaped. + conn.lastUsed = time.Now() + // Update connection state. + if tcpHeader := header.TCP(pkt.TransportHeader); conn.tcb.IsEmpty() { conn.tcb.Init(tcpHeader) conn.tcbHook = hook + } else if hook == conn.tcbHook { + conn.tcb.UpdateStateOutbound(tcpHeader) } else { - switch hook { - case conn.tcbHook: - st = conn.tcb.UpdateStateOutbound(tcpHeader) - default: - st = conn.tcb.UpdateStateInbound(tcpHeader) - } + conn.tcb.UpdateStateInbound(tcpHeader) } +} + +// bucket gets the conntrack bucket for a tupleID. +func (ct *ConnTrack) bucket(id tupleID) int { + h := jenkins.Sum32(ct.seed) + h.Write([]byte(id.srcAddr)) + h.Write([]byte(id.dstAddr)) + shortBuf := make([]byte, 2) + binary.LittleEndian.PutUint16(shortBuf, id.srcPort) + h.Write([]byte(shortBuf)) + binary.LittleEndian.PutUint16(shortBuf, id.dstPort) + h.Write([]byte(shortBuf)) + binary.LittleEndian.PutUint16(shortBuf, uint16(id.transProto)) + h.Write([]byte(shortBuf)) + binary.LittleEndian.PutUint16(shortBuf, uint16(id.netProto)) + h.Write([]byte(shortBuf)) + ct.mu.RLock() + defer ct.mu.RUnlock() + return int(h.Sum32()) % len(ct.buckets) +} - // Delete conn if tcp connection is closed. - if st == tcpconntrack.ResultClosedByPeer || st == tcpconntrack.ResultClosedBySelf || st == tcpconntrack.ResultReset { - ct.deleteConn(conn) +// reapUnused deletes timed out entries from the conntrack map. The rules for +// reaping are: +// - Most reaping occurs in connFor, which is called on each packet. connFor +// cleans up the bucket the packet's connection maps to. Thus calls to +// reapUnused should be fast. +// - Each call to reapUnused traverses a fraction of the conntrack table. +// Specifically, it traverses len(ct.buckets)/fractionPerReaping. +// - After reaping, reapUnused decides when it should next run based on the +// ratio of expired connections to examined connections. If the ratio is +// greater than maxExpiredPct, it schedules the next run quickly. Otherwise it +// slightly increases the interval between runs. +// - maxFullTraversal caps the time it takes to traverse the entire table. +// +// reapUnused returns the next bucket that should be checked and the time after +// which it should be called again. +func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) { + // TODO(gvisor.dev/issue/170): This can be more finely controlled, as + // it is in Linux via sysctl. + const fractionPerReaping = 128 + const maxExpiredPct = 50 + const maxFullTraversal = 60 * time.Second + const minInterval = 10 * time.Millisecond + const maxInterval = maxFullTraversal / fractionPerReaping + + now := time.Now() + checked := 0 + expired := 0 + var idx int + ct.mu.RLock() + defer ct.mu.RUnlock() + for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ { + idx = (i + start) % len(ct.buckets) + ct.buckets[idx].mu.Lock() + for tuple := ct.buckets[idx].tuples.Front(); tuple != nil; tuple = tuple.Next() { + checked++ + if ct.reapTupleLocked(tuple, idx, now) { + expired++ + } + } + ct.buckets[idx].mu.Unlock() + } + // We already checked buckets[idx]. + idx++ + + // If half or more of the connections are expired, the table has gotten + // stale. Reschedule quickly. + expiredPct := 0 + if checked != 0 { + expiredPct = expired * 100 / checked + } + if expiredPct > maxExpiredPct { + return idx, minInterval + } + if interval := prevInterval + minInterval; interval <= maxInterval { + // Increment the interval between runs. + return idx, interval } + // We've hit the maximum interval. + return idx, maxInterval } -// deleteConn deletes the connection. -func (ct *ConnTrack) deleteConn(conn *conn) { - if conn == nil { - return +// reapTupleLocked tries to remove tuple and its reply from the table. It +// returns whether the tuple's connection has timed out. +// +// Preconditions: ct.mu is locked for reading and bucket is locked. +func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool { + if !tuple.conn.timedOut(now) { + return false } - ct.mu.Lock() - defer ct.mu.Unlock() + // To maintain lock order, we can only reap these tuples if the reply + // appears later in the table. + replyBucket := ct.bucket(tuple.reply()) + if bucket > replyBucket { + return true + } + + // Don't re-lock if both tuples are in the same bucket. + differentBuckets := bucket != replyBucket + if differentBuckets { + ct.buckets[replyBucket].mu.Lock() + } + + // We have the buckets locked and can remove both tuples. + if tuple.direction == dirOriginal { + ct.buckets[replyBucket].tuples.Remove(&tuple.conn.reply) + } else { + ct.buckets[replyBucket].tuples.Remove(&tuple.conn.original) + } + ct.buckets[bucket].tuples.Remove(tuple) + + // Don't re-unlock if both tuples are in the same bucket. + if differentBuckets { + ct.buckets[replyBucket].mu.Unlock() + } - delete(ct.conns, conn.original.tupleID) - delete(ct.conns, conn.reply.tupleID) + return true } diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 974d77c36..f846ea2e5 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -16,6 +16,7 @@ package stack import ( "fmt" + "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -41,6 +42,9 @@ const ( // underflow. const HookUnset = -1 +// reaperDelay is how long to wait before starting to reap connections. +const reaperDelay = 5 * time.Second + // DefaultTables returns a default set of tables. Each chain is set to accept // all packets. func DefaultTables() *IPTables { @@ -112,8 +116,9 @@ func DefaultTables() *IPTables { Output: []string{TablenameMangle, TablenameNat, TablenameFilter}, }, connections: ConnTrack{ - conns: make(map[tupleID]tuple), + seed: generateRandUint32(), }, + reaperDone: make(chan struct{}, 1), } } @@ -169,6 +174,12 @@ func (it *IPTables) GetTable(name string) (Table, bool) { func (it *IPTables) ReplaceTable(name string, table Table) { it.mu.Lock() defer it.mu.Unlock() + // If iptables is being enabled, initialize the conntrack table and + // reaper. + if !it.modified { + it.connections.buckets = make([]bucket, numBuckets) + it.startReaper(reaperDelay) + } it.modified = true it.tables[name] = table } @@ -249,6 +260,35 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr return true } +// beforeSave is invoked by stateify. +func (it *IPTables) beforeSave() { + // Ensure the reaper exits cleanly. + it.reaperDone <- struct{}{} + // Prevent others from modifying the connection table. + it.connections.mu.Lock() +} + +// afterLoad is invoked by stateify. +func (it *IPTables) afterLoad() { + it.startReaper(reaperDelay) +} + +// startReaper starts a goroutine that wakes up periodically to reap timed out +// connections. +func (it *IPTables) startReaper(interval time.Duration) { + go func() { // S/R-SAFE: reaperDone is signalled when iptables is saved. + bucket := 0 + for { + select { + case <-it.reaperDone: + return + case <-time.After(interval): + bucket, interval = it.connections.reapUnused(bucket, interval) + } + } + }() +} + // CheckPackets runs pkts through the rules for hook and returns a map of packets that // should not go forward. // diff --git a/pkg/tcpip/stack/iptables_state.go b/pkg/tcpip/stack/iptables_state.go new file mode 100644 index 000000000..529e02a07 --- /dev/null +++ b/pkg/tcpip/stack/iptables_state.go @@ -0,0 +1,40 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "time" +) + +// +stateify savable +type unixTime struct { + second int64 + nano int64 +} + +// saveLastUsed is invoked by stateify. +func (cn *conn) saveLastUsed() unixTime { + return unixTime{cn.lastUsed.Unix(), cn.lastUsed.UnixNano()} +} + +// loadLastUsed is invoked by stateify. +func (cn *conn) loadLastUsed(unix unixTime) { + cn.lastUsed = time.Unix(unix.second, unix.nano) +} + +// beforeSave is invoked by stateify. +func (ct *ConnTrack) beforeSave() { + ct.mu.Lock() +} diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index c528ec381..eb70e3104 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -78,6 +78,8 @@ const ( ) // IPTables holds all the tables for a netstack. +// +// +stateify savable type IPTables struct { // mu protects tables, priorities, and modified. mu sync.RWMutex @@ -97,10 +99,15 @@ type IPTables struct { modified bool connections ConnTrack + + // reaperDone can be signalled to stop the reaper goroutine. + reaperDone chan struct{} } // A Table defines a set of chains and hooks into the network stack. It is // really just a list of rules. +// +// +stateify savable type Table struct { // Rules holds the rules that make up the table. Rules []Rule @@ -130,6 +137,8 @@ func (table *Table) ValidHooks() uint32 { // contains zero or more matchers, each of which is a specification of which // packets this rule applies to. If there are no matchers in the rule, it // applies to any packet. +// +// +stateify savable type Rule struct { // Filter holds basic IP filtering fields common to every rule. Filter IPHeaderFilter @@ -142,6 +151,8 @@ type Rule struct { } // IPHeaderFilter holds basic IP filtering data common to every rule. +// +// +stateify savable type IPHeaderFilter struct { // Protocol matches the transport protocol. Protocol tcpip.TransportProtocolNumber diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index cdcfb8321..0aa815447 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -425,6 +425,7 @@ type Stack struct { handleLocal bool // tables are the iptables packet filtering and manipulation rules. + // TODO(gvisor.dev/issue/170): S/R this field. tables *IPTables // resumableEndpoints is a list of endpoints that need to be resumed if the diff --git a/pkg/tcpip/stack/stack_state_autogen.go b/pkg/tcpip/stack/stack_state_autogen.go index 6efa9a773..3bf5a8660 100644 --- a/pkg/tcpip/stack/stack_state_autogen.go +++ b/pkg/tcpip/stack/stack_state_autogen.go @@ -6,6 +6,334 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *tuple) StateTypeName() string { + return "pkg/tcpip/stack.tuple" +} + +func (x *tuple) StateFields() []string { + return []string{ + "tupleEntry", + "tupleID", + "conn", + "direction", + } +} + +func (x *tuple) beforeSave() {} + +func (x *tuple) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.tupleEntry) + m.Save(1, &x.tupleID) + m.Save(2, &x.conn) + m.Save(3, &x.direction) +} + +func (x *tuple) afterLoad() {} + +func (x *tuple) StateLoad(m state.Source) { + m.Load(0, &x.tupleEntry) + m.Load(1, &x.tupleID) + m.Load(2, &x.conn) + m.Load(3, &x.direction) +} + +func (x *tupleID) StateTypeName() string { + return "pkg/tcpip/stack.tupleID" +} + +func (x *tupleID) StateFields() []string { + return []string{ + "srcAddr", + "srcPort", + "dstAddr", + "dstPort", + "transProto", + "netProto", + } +} + +func (x *tupleID) beforeSave() {} + +func (x *tupleID) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.srcAddr) + m.Save(1, &x.srcPort) + m.Save(2, &x.dstAddr) + m.Save(3, &x.dstPort) + m.Save(4, &x.transProto) + m.Save(5, &x.netProto) +} + +func (x *tupleID) afterLoad() {} + +func (x *tupleID) StateLoad(m state.Source) { + m.Load(0, &x.srcAddr) + m.Load(1, &x.srcPort) + m.Load(2, &x.dstAddr) + m.Load(3, &x.dstPort) + m.Load(4, &x.transProto) + m.Load(5, &x.netProto) +} + +func (x *conn) StateTypeName() string { + return "pkg/tcpip/stack.conn" +} + +func (x *conn) StateFields() []string { + return []string{ + "original", + "reply", + "manip", + "tcbHook", + "tcb", + "lastUsed", + } +} + +func (x *conn) beforeSave() {} + +func (x *conn) StateSave(m state.Sink) { + x.beforeSave() + var lastUsed unixTime = x.saveLastUsed() + m.SaveValue(5, lastUsed) + m.Save(0, &x.original) + m.Save(1, &x.reply) + m.Save(2, &x.manip) + m.Save(3, &x.tcbHook) + m.Save(4, &x.tcb) +} + +func (x *conn) afterLoad() {} + +func (x *conn) StateLoad(m state.Source) { + m.Load(0, &x.original) + m.Load(1, &x.reply) + m.Load(2, &x.manip) + m.Load(3, &x.tcbHook) + m.Load(4, &x.tcb) + m.LoadValue(5, new(unixTime), func(y interface{}) { x.loadLastUsed(y.(unixTime)) }) +} + +func (x *ConnTrack) StateTypeName() string { + return "pkg/tcpip/stack.ConnTrack" +} + +func (x *ConnTrack) StateFields() []string { + return []string{ + "seed", + "buckets", + } +} + +func (x *ConnTrack) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.seed) + m.Save(1, &x.buckets) +} + +func (x *ConnTrack) afterLoad() {} + +func (x *ConnTrack) StateLoad(m state.Source) { + m.Load(0, &x.seed) + m.Load(1, &x.buckets) +} + +func (x *bucket) StateTypeName() string { + return "pkg/tcpip/stack.bucket" +} + +func (x *bucket) StateFields() []string { + return []string{ + "tuples", + } +} + +func (x *bucket) beforeSave() {} + +func (x *bucket) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.tuples) +} + +func (x *bucket) afterLoad() {} + +func (x *bucket) StateLoad(m state.Source) { + m.Load(0, &x.tuples) +} + +func (x *unixTime) StateTypeName() string { + return "pkg/tcpip/stack.unixTime" +} + +func (x *unixTime) StateFields() []string { + return []string{ + "second", + "nano", + } +} + +func (x *unixTime) beforeSave() {} + +func (x *unixTime) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.second) + m.Save(1, &x.nano) +} + +func (x *unixTime) afterLoad() {} + +func (x *unixTime) StateLoad(m state.Source) { + m.Load(0, &x.second) + m.Load(1, &x.nano) +} + +func (x *IPTables) StateTypeName() string { + return "pkg/tcpip/stack.IPTables" +} + +func (x *IPTables) StateFields() []string { + return []string{ + "mu", + "tables", + "priorities", + "modified", + "connections", + "reaperDone", + } +} + +func (x *IPTables) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.mu) + m.Save(1, &x.tables) + m.Save(2, &x.priorities) + m.Save(3, &x.modified) + m.Save(4, &x.connections) + m.Save(5, &x.reaperDone) +} + +func (x *IPTables) StateLoad(m state.Source) { + m.Load(0, &x.mu) + m.Load(1, &x.tables) + m.Load(2, &x.priorities) + m.Load(3, &x.modified) + m.Load(4, &x.connections) + m.Load(5, &x.reaperDone) + m.AfterLoad(x.afterLoad) +} + +func (x *Table) StateTypeName() string { + return "pkg/tcpip/stack.Table" +} + +func (x *Table) StateFields() []string { + return []string{ + "Rules", + "BuiltinChains", + "Underflows", + "UserChains", + } +} + +func (x *Table) beforeSave() {} + +func (x *Table) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.Rules) + m.Save(1, &x.BuiltinChains) + m.Save(2, &x.Underflows) + m.Save(3, &x.UserChains) +} + +func (x *Table) afterLoad() {} + +func (x *Table) StateLoad(m state.Source) { + m.Load(0, &x.Rules) + m.Load(1, &x.BuiltinChains) + m.Load(2, &x.Underflows) + m.Load(3, &x.UserChains) +} + +func (x *Rule) StateTypeName() string { + return "pkg/tcpip/stack.Rule" +} + +func (x *Rule) StateFields() []string { + return []string{ + "Filter", + "Matchers", + "Target", + } +} + +func (x *Rule) beforeSave() {} + +func (x *Rule) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.Filter) + m.Save(1, &x.Matchers) + m.Save(2, &x.Target) +} + +func (x *Rule) afterLoad() {} + +func (x *Rule) StateLoad(m state.Source) { + m.Load(0, &x.Filter) + m.Load(1, &x.Matchers) + m.Load(2, &x.Target) +} + +func (x *IPHeaderFilter) StateTypeName() string { + return "pkg/tcpip/stack.IPHeaderFilter" +} + +func (x *IPHeaderFilter) StateFields() []string { + return []string{ + "Protocol", + "Dst", + "DstMask", + "DstInvert", + "Src", + "SrcMask", + "SrcInvert", + "OutputInterface", + "OutputInterfaceMask", + "OutputInterfaceInvert", + } +} + +func (x *IPHeaderFilter) beforeSave() {} + +func (x *IPHeaderFilter) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.Protocol) + m.Save(1, &x.Dst) + m.Save(2, &x.DstMask) + m.Save(3, &x.DstInvert) + m.Save(4, &x.Src) + m.Save(5, &x.SrcMask) + m.Save(6, &x.SrcInvert) + m.Save(7, &x.OutputInterface) + m.Save(8, &x.OutputInterfaceMask) + m.Save(9, &x.OutputInterfaceInvert) +} + +func (x *IPHeaderFilter) afterLoad() {} + +func (x *IPHeaderFilter) StateLoad(m state.Source) { + m.Load(0, &x.Protocol) + m.Load(1, &x.Dst) + m.Load(2, &x.DstMask) + m.Load(3, &x.DstInvert) + m.Load(4, &x.Src) + m.Load(5, &x.SrcMask) + m.Load(6, &x.SrcInvert) + m.Load(7, &x.OutputInterface) + m.Load(8, &x.OutputInterfaceMask) + m.Load(9, &x.OutputInterfaceInvert) +} + func (x *linkAddrEntryList) StateTypeName() string { return "pkg/tcpip/stack.linkAddrEntryList" } @@ -261,7 +589,69 @@ func (x *multiPortEndpoint) StateLoad(m state.Source) { m.Load(4, &x.flags) } +func (x *tupleList) StateTypeName() string { + return "pkg/tcpip/stack.tupleList" +} + +func (x *tupleList) StateFields() []string { + return []string{ + "head", + "tail", + } +} + +func (x *tupleList) beforeSave() {} + +func (x *tupleList) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.head) + m.Save(1, &x.tail) +} + +func (x *tupleList) afterLoad() {} + +func (x *tupleList) StateLoad(m state.Source) { + m.Load(0, &x.head) + m.Load(1, &x.tail) +} + +func (x *tupleEntry) StateTypeName() string { + return "pkg/tcpip/stack.tupleEntry" +} + +func (x *tupleEntry) StateFields() []string { + return []string{ + "next", + "prev", + } +} + +func (x *tupleEntry) beforeSave() {} + +func (x *tupleEntry) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.next) + m.Save(1, &x.prev) +} + +func (x *tupleEntry) afterLoad() {} + +func (x *tupleEntry) StateLoad(m state.Source) { + m.Load(0, &x.next) + m.Load(1, &x.prev) +} + func init() { + state.Register((*tuple)(nil)) + state.Register((*tupleID)(nil)) + state.Register((*conn)(nil)) + state.Register((*ConnTrack)(nil)) + state.Register((*bucket)(nil)) + state.Register((*unixTime)(nil)) + state.Register((*IPTables)(nil)) + state.Register((*Table)(nil)) + state.Register((*Rule)(nil)) + state.Register((*IPHeaderFilter)(nil)) state.Register((*linkAddrEntryList)(nil)) state.Register((*linkAddrEntryEntry)(nil)) state.Register((*PacketBufferList)(nil)) @@ -271,4 +661,6 @@ func init() { state.Register((*GSO)(nil)) state.Register((*TransportEndpointInfo)(nil)) state.Register((*multiPortEndpoint)(nil)) + state.Register((*tupleList)(nil)) + state.Register((*tupleEntry)(nil)) } diff --git a/pkg/tcpip/stack/tuple_list.go b/pkg/tcpip/stack/tuple_list.go new file mode 100644 index 000000000..0d1b98874 --- /dev/null +++ b/pkg/tcpip/stack/tuple_list.go @@ -0,0 +1,193 @@ +package stack + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type tupleElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (tupleElementMapper) linkerFor(elem *tuple) *tuple { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type tupleList struct { + head *tuple + tail *tuple +} + +// Reset resets list l to the empty state. +func (l *tupleList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *tupleList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *tupleList) Front() *tuple { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *tupleList) Back() *tuple { + return l.tail +} + +// Len returns the number of elements in the list. +// +// NOTE: This is an O(n) operation. +func (l *tupleList) Len() (count int) { + for e := l.Front(); e != nil; e = (tupleElementMapper{}.linkerFor(e)).Next() { + count++ + } + return count +} + +// PushFront inserts the element e at the front of list l. +func (l *tupleList) PushFront(e *tuple) { + linker := tupleElementMapper{}.linkerFor(e) + linker.SetNext(l.head) + linker.SetPrev(nil) + if l.head != nil { + tupleElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *tupleList) PushBack(e *tuple) { + linker := tupleElementMapper{}.linkerFor(e) + linker.SetNext(nil) + linker.SetPrev(l.tail) + if l.tail != nil { + tupleElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *tupleList) PushBackList(m *tupleList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + tupleElementMapper{}.linkerFor(l.tail).SetNext(m.head) + tupleElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *tupleList) InsertAfter(b, e *tuple) { + bLinker := tupleElementMapper{}.linkerFor(b) + eLinker := tupleElementMapper{}.linkerFor(e) + + a := bLinker.Next() + + eLinker.SetNext(a) + eLinker.SetPrev(b) + bLinker.SetNext(e) + + if a != nil { + tupleElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *tupleList) InsertBefore(a, e *tuple) { + aLinker := tupleElementMapper{}.linkerFor(a) + eLinker := tupleElementMapper{}.linkerFor(e) + + b := aLinker.Prev() + eLinker.SetNext(a) + eLinker.SetPrev(b) + aLinker.SetPrev(e) + + if b != nil { + tupleElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *tupleList) Remove(e *tuple) { + linker := tupleElementMapper{}.linkerFor(e) + prev := linker.Prev() + next := linker.Next() + + if prev != nil { + tupleElementMapper{}.linkerFor(prev).SetNext(next) + } else if l.head == e { + l.head = next + } + + if next != nil { + tupleElementMapper{}.linkerFor(next).SetPrev(prev) + } else if l.tail == e { + l.tail = prev + } + + linker.SetNext(nil) + linker.SetPrev(nil) +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type tupleEntry struct { + next *tuple + prev *tuple +} + +// Next returns the entry that follows e in the list. +func (e *tupleEntry) Next() *tuple { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *tupleEntry) Prev() *tuple { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *tupleEntry) SetNext(elem *tuple) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *tupleEntry) SetPrev(elem *tuple) { + e.prev = elem +} diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go index 12bc1b5b5..558b06df0 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go @@ -106,6 +106,11 @@ func (t *TCB) UpdateStateOutbound(tcp header.TCP) Result { return st } +// State returns the current state of the TCB. +func (t *TCB) State() Result { + return t.state +} + // IsAlive returns true as long as the connection is established(Alive) // or connecting state. func (t *TCB) IsAlive() bool { |