diff options
Diffstat (limited to 'pkg/tcpip/stack/conntrack.go')
-rw-r--r-- | pkg/tcpip/stack/conntrack.go | 354 |
1 files changed, 172 insertions, 182 deletions
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 30545f634..16d295271 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -64,13 +64,21 @@ type tuple struct { // tupleEntry is used to build an intrusive list of tuples. tupleEntry - tupleID - // conn is the connection tracking entry this tuple belongs to. conn *conn // direction is the direction of the tuple. direction direction + + mu sync.RWMutex `state:"nosave"` + // +checklocks:mu + tupleID tupleID +} + +func (t *tuple) id() tupleID { + t.mu.RLock() + defer t.mu.RUnlock() + return t.tupleID } // tupleID uniquely identifies a connection in one direction. It currently @@ -103,17 +111,23 @@ func (ti tupleID) reply() tupleID { // // +stateify savable type conn struct { + ct *ConnTrack + // original is the tuple in original direction. It is immutable. original tuple - // reply is the tuple in reply direction. It is immutable. + // reply is the tuple in reply direction. reply tuple - // manip indicates if the packet should be manipulated. It is immutable. - // TODO(gvisor.dev/issue/5696): Support updating manipulation type. - manip manipType - mu sync.RWMutex `state:"nosave"` + // Indicates that the connection has been finalized and may handle replies. + // + // +checklocks:mu + finalized bool + // manip indicates if the packet should be manipulated. + // + // +checklocks:mu + manip manipType // tcb is TCB control block. It is used to keep track of states // of tcp connection. // @@ -128,17 +142,6 @@ type conn struct { lastUsed time.Time `state:".(unixTime)"` } -// newConn creates new connection. -func newConn(orig, reply tupleID, manip manipType) *conn { - conn := conn{ - manip: manip, - lastUsed: time.Now(), - } - conn.original = tuple{conn: &conn, tupleID: orig} - conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply} - return &conn -} - // timedOut returns whether the connection timed out based on its state. func (cn *conn) timedOut(now time.Time) bool { const establishedTimeout = 5 * 24 * time.Hour @@ -235,168 +238,180 @@ func getTransportHeader(pkt *PacketBuffer) (header.ChecksummableTransport, bool) return nil, false } -// packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid -// TCP header. -// -// Preconditions: pkt.NetworkHeader() is valid. -func packetToTupleID(pkt *PacketBuffer) (tupleID, tcpip.Error) { +func (ct *ConnTrack) init() { + ct.mu.Lock() + defer ct.mu.Unlock() + ct.buckets = make([]bucket, numBuckets) +} + +func (ct *ConnTrack) getConnOrMaybeInsertNoop(pkt *PacketBuffer) *tuple { netHeader := pkt.Network() transportHeader, ok := getTransportHeader(pkt) if !ok { - return tupleID{}, &tcpip.ErrUnknownProtocol{} + return nil } - return tupleID{ + tid := tupleID{ srcAddr: netHeader.SourceAddress(), srcPort: transportHeader.SourcePort(), dstAddr: netHeader.DestinationAddress(), dstPort: transportHeader.DestinationPort(), transProto: pkt.TransportProtocolNumber, netProto: pkt.NetworkProtocolNumber, - }, nil -} + } -func (ct *ConnTrack) init() { - ct.mu.Lock() - defer ct.mu.Unlock() - ct.buckets = make([]bucket, numBuckets) -} + bktID := ct.bucket(tid) + + ct.mu.RLock() + bkt := &ct.buckets[bktID] + ct.mu.RUnlock() + + now := time.Now() + if t := bkt.connForTID(tid, now); t != nil { + return t + } -// connFor gets the conn for pkt if it exists, or returns nil -// if it does not. It returns an error when pkt does not contain a valid TCP -// header. -// TODO(gvisor.dev/issue/6168): Support UDP. -func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { - tid, err := packetToTupleID(pkt) - if err != nil { - return nil, dirOriginal + bkt.mu.Lock() + defer bkt.mu.Unlock() + + // Make sure a connection wasn't added between when we last checked the + // bucket and acquired the bucket's write lock. + if t := bkt.connForTIDRLocked(tid, now); t != nil { + return t + } + + // This is the first packet we're seeing for the connection. Create an entry + // for this new connection. + conn := &conn{ + ct: ct, + original: tuple{tupleID: tid, direction: dirOriginal}, + reply: tuple{tupleID: tid.reply(), direction: dirReply}, + manip: manipNone, + lastUsed: now, } - return ct.connForTID(tid) + conn.original.conn = conn + conn.reply.conn = conn + + // For now, we only map an entry for the packet's original tuple as NAT may be + // performed on this connection. Until the packet goes through all the hooks + // and its final address/port is known, we cannot know what the response + // packet's addresses/ports will look like. + // + // This is okay because the destination cannot send its response until it + // receives the packet; the packet will only be received once all the hooks + // have been performed. + // + // See (*conn).finalize. + bkt.tuples.PushFront(&conn.original) + return &conn.original } -func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) { +func (ct *ConnTrack) connForTID(tid tupleID) *tuple { bktID := ct.bucket(tid) - now := time.Now() ct.mu.RLock() bkt := &ct.buckets[bktID] ct.mu.RUnlock() + return bkt.connForTID(tid, time.Now()) +} + +func (bkt *bucket) connForTID(tid tupleID, now time.Time) *tuple { bkt.mu.RLock() defer bkt.mu.RUnlock() + return bkt.connForTIDRLocked(tid, now) +} + +// +checklocks:bkt.mu +func (bkt *bucket) connForTIDRLocked(tid tupleID, now time.Time) *tuple { for other := bkt.tuples.Front(); other != nil; other = other.Next() { - if tid == other.tupleID && !other.conn.timedOut(now) { - return other.conn, other.direction + if tid == other.id() && !other.conn.timedOut(now) { + return other } } - - return nil, dirOriginal + return nil } -func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, port uint16, address tcpip.Address) *conn { - tid, err := packetToTupleID(pkt) - if err != nil { - return nil - } - if hook != Prerouting && hook != Output { - return nil - } +func (ct *ConnTrack) finalize(cn *conn) { + tid := cn.reply.id() + id := ct.bucket(tid) - replyTID := tid.reply() - replyTID.srcAddr = address - replyTID.srcPort = port + ct.mu.RLock() + bkt := &ct.buckets[id] + ct.mu.RUnlock() - conn, _ := ct.connForTID(tid) - if conn != nil { - // The connection is already tracked. - // TODO(gvisor.dev/issue/5696): Support updating an existing connection. - return nil + bkt.mu.Lock() + defer bkt.mu.Unlock() + + if t := bkt.connForTIDRLocked(tid, time.Now()); t != nil { + // Another connection for the reply already exists. We can't do much about + // this so we leave the connection cn represents in a state where it can + // send packets but its responses will be mapped to some other connection. + // This may be okay if the connection only expects to send packets without + // any responses. + return } - conn = newConn(tid, replyTID, manipDestination) - ct.insertConn(conn) - return conn + + bkt.tuples.PushFront(&cn.reply) } -func (ct *ConnTrack) insertSNATConn(pkt *PacketBuffer, hook Hook, port uint16, address tcpip.Address) *conn { - tid, err := packetToTupleID(pkt) - if err != nil { - return nil - } - if hook != Input && hook != Postrouting { - return nil +func (cn *conn) finalize() { + { + cn.mu.RLock() + finalized := cn.finalized + cn.mu.RUnlock() + if finalized { + return + } } - replyTID := tid.reply() - replyTID.dstAddr = address - replyTID.dstPort = port - - conn, _ := ct.connForTID(tid) - if conn != nil { - // The connection is already tracked. - // TODO(gvisor.dev/issue/5696): Support updating an existing connection. - return nil + cn.mu.Lock() + finalized := cn.finalized + cn.finalized = true + cn.mu.Unlock() + if finalized { + return } - conn = newConn(tid, replyTID, manipSource) - ct.insertConn(conn) - return conn + + cn.ct.finalize(cn) } -// insertConn inserts conn into the appropriate table bucket. -func (ct *ConnTrack) insertConn(conn *conn) { - tupleBktID := ct.bucket(conn.original.tupleID) - replyBktID := ct.bucket(conn.reply.tupleID) +// performNAT setups up the connection for the specified NAT. +// +// Generally, only the first packet of a connection reaches this method; other +// other packets will be manipulated without needing to modify the connection. +func (cn *conn) performNAT(pkt *PacketBuffer, hook Hook, r *Route, port uint16, address tcpip.Address, dnat bool) { + cn.performNATIfNoop(port, address, dnat) + cn.handlePacket(pkt, hook, r) +} - ct.mu.RLock() - defer ct.mu.RUnlock() +func (cn *conn) performNATIfNoop(port uint16, address tcpip.Address, dnat bool) { + cn.mu.Lock() + defer cn.mu.Unlock() - tupleBkt := &ct.buckets[tupleBktID] - if tupleBktID == replyBktID { - // Both tuples are in the same bucket. - tupleBkt.mu.Lock() - defer tupleBkt.mu.Unlock() - insertConn(tupleBkt, tupleBkt, conn) + if cn.finalized { return } - // Lock the buckets in the correct order. - replyBkt := &ct.buckets[replyBktID] - if tupleBktID < replyBktID { - tupleBkt.mu.Lock() - defer tupleBkt.mu.Unlock() - replyBkt.mu.Lock() - defer replyBkt.mu.Unlock() - } else { - replyBkt.mu.Lock() - defer replyBkt.mu.Unlock() - tupleBkt.mu.Lock() - defer tupleBkt.mu.Unlock() + if cn.manip != manipNone { + return } - insertConn(tupleBkt, replyBkt, conn) -} -// TODO(https://gvisor.dev/issue/6590): annotate r/w locking requirements. -// +checklocks:tupleBkt.mu -// +checklocks:replyBkt.mu -func insertConn(tupleBkt *bucket, replyBkt *bucket, conn *conn) { - // Now that we hold the locks, ensure the tuple hasn't been inserted by - // another thread. - // TODO(gvisor.dev/issue/5773): Should check conn.reply.tupleID, too? - alreadyInserted := false - for other := tupleBkt.tuples.Front(); other != nil; other = other.Next() { - if other.tupleID == conn.original.tupleID { - alreadyInserted = true - break - } - } + cn.reply.mu.Lock() + defer cn.reply.mu.Unlock() - if !alreadyInserted { - // Add the tuple to the map. - tupleBkt.tuples.PushFront(&conn.original) - replyBkt.tuples.PushFront(&conn.reply) + if dnat { + cn.reply.tupleID.srcAddr = address + cn.reply.tupleID.srcPort = port + cn.manip = manipDestination + } else { + cn.reply.tupleID.dstAddr = address + cn.reply.tupleID.dstPort = port + cn.manip = manipSource } } -func (cn *conn) handlePacket(pkt *PacketBuffer, hook Hook, dir direction, r *Route) { +func (cn *conn) handlePacket(pkt *PacketBuffer, hook Hook, r *Route) { if pkt.NatDone { return } @@ -417,26 +432,35 @@ func (cn *conn) handlePacket(pkt *PacketBuffer, hook Hook, dir direction, r *Rou updateSRCFields := false + dir := pkt.tuple.direction + + cn.mu.Lock() + defer cn.mu.Unlock() + switch hook { case Prerouting, Output: if cn.manip == manipDestination && dir == dirOriginal { - newPort = cn.reply.srcPort - newAddr = cn.reply.srcAddr + id := cn.reply.id() + newPort = id.srcPort + newAddr = id.srcAddr pkt.NatDone = true } else if cn.manip == manipSource && dir == dirReply { - newPort = cn.original.srcPort - newAddr = cn.original.srcAddr + id := cn.original.id() + newPort = id.srcPort + newAddr = id.srcAddr pkt.NatDone = true } case Input, Postrouting: if cn.manip == manipSource && dir == dirOriginal { - newPort = cn.reply.dstPort - newAddr = cn.reply.dstAddr + id := cn.reply.id() + newPort = id.dstPort + newAddr = id.dstAddr updateSRCFields = true pkt.NatDone = true } else if cn.manip == manipDestination && dir == dirReply { - newPort = cn.original.dstPort - newAddr = cn.original.dstAddr + id := cn.original.id() + newPort = id.dstPort + newAddr = id.dstAddr updateSRCFields = true pkt.NatDone = true } @@ -479,51 +503,12 @@ func (cn *conn) handlePacket(pkt *PacketBuffer, hook Hook, dir direction, r *Rou newAddr, ) - // Update the state of tcb. - cn.mu.Lock() - defer cn.mu.Unlock() - // Mark the connection as having been used recently so it isn't reaped. cn.lastUsed = time.Now() // Update connection state. cn.updateLocked(pkt, dir) } -// maybeInsertNoop tries to insert a no-op connection entry to keep connections -// from getting clobbered when replies arrive. It only inserts if there isn't -// already a connection for pkt. -// -// This should be called after traversing iptables rules only, to ensure that -// pkt.NatDone is set correctly. -func (ct *ConnTrack) maybeInsertNoop(pkt *PacketBuffer) { - // If there were a rule applying to this packet, it would be marked - // with NatDone. - if pkt.NatDone { - return - } - - switch pkt.TransportProtocolNumber { - case header.TCPProtocolNumber, header.UDPProtocolNumber: - default: - // TODO(https://gvisor.dev/issue/5915): Track ICMP and other trackable - // connections. - return - } - - // This is the first packet we're seeing for the TCP connection. Insert - // the noop entry (an identity mapping) so that the response doesn't - // get NATed, breaking the connection. - tid, err := packetToTupleID(pkt) - if err != nil { - return - } - conn := newConn(tid, tid.reply(), manipNone) - ct.insertConn(conn) - conn.mu.Lock() - defer conn.mu.Unlock() - conn.updateLocked(pkt, dirOriginal) -} - // bucket gets the conntrack bucket for a tupleID. func (ct *ConnTrack) bucket(id tupleID) int { h := jenkins.Sum32(ct.seed) @@ -617,7 +602,7 @@ func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bktID int, bkt *bucket, now t // To maintain lock order, we can only reap these tuples if the reply // appears later in the table. - replyBktID := ct.bucket(tuple.reply()) + replyBktID := ct.bucket(tuple.id().reply()) if bktID > replyBktID { return true } @@ -658,14 +643,19 @@ func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.Networ transProto: transProto, netProto: netProto, } - conn, _ := ct.connForTID(tid) - if conn == nil { + t := ct.connForTID(tid) + if t == nil { // Not a tracked connection. return "", 0, &tcpip.ErrNotConnected{} - } else if conn.manip != manipDestination { + } + + t.conn.mu.RLock() + defer t.conn.mu.RUnlock() + if t.conn.manip != manipDestination { // Unmanipulated destination. return "", 0, &tcpip.ErrInvalidOptionValue{} } - return conn.original.dstAddr, conn.original.dstPort, nil + id := t.conn.original.id() + return id.dstAddr, id.dstPort, nil } |