summaryrefslogtreecommitdiff
path: root/proto/bgp
diff options
context:
space:
mode:
authorMaria Matejka <mq@ucw.cz>2022-06-20 19:10:49 +0200
committerMaria Matejka <mq@ucw.cz>2022-07-11 16:07:09 +0200
commit6b0368cc2c317d1acc0881a96b32ded291d82741 (patch)
tree8e8eeb29b6bb2eb7edc6f701a71e2b0569799eb9 /proto/bgp
parentd5e3272f3d9b1bad7ceb6d0d5897a7269e28a537 (diff)
Export tables merged with BGP prefix hash
Until now, if export table was enabled, Nest was storing exactly the route before rt_notify() was called on it. This was quite sloppy and spooky and it also wasn't reflecting the changes BGP does before sending. And as BGP is storing the routes to be sent anyway, we are simply keeping the already-sent routes in there to better rule out unneeded reexports. Some of the route attributes (IGP metric, preference) make no sense in BGP, therefore these will be probably replaced by something sensible. Also the nexthop shown in the short output is the BGP nexthop.
Diffstat (limited to 'proto/bgp')
-rw-r--r--proto/bgp/attrs.c266
-rw-r--r--proto/bgp/bgp.c7
-rw-r--r--proto/bgp/bgp.h19
-rw-r--r--proto/bgp/packets.c32
4 files changed, 280 insertions, 44 deletions
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index 22a53dce..084c9b63 100644
--- a/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@ -1550,7 +1550,6 @@ bgp_free_bucket_table(struct bgp_channel *c)
static struct bgp_bucket *
bgp_get_bucket(struct bgp_channel *c, ea_list *new)
{
-
/* Hash and lookup */
u32 hash = ea_hash(new);
struct bgp_bucket *b = HASH_FIND(c->bucket_hash, RBH, new, hash);
@@ -1571,8 +1570,7 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new)
/* Copy the ea_list */
ea_list_copy(b->eattrs, new, ea_size);
- /* Insert the bucket to send queue and bucket hash */
- add_tail(&c->bucket_queue, &b->send_node);
+ /* Insert the bucket to bucket hash */
HASH_INSERT2(c->bucket_hash, RBH, c->pool, b);
return b;
@@ -1590,14 +1588,30 @@ bgp_get_withdraw_bucket(struct bgp_channel *c)
return c->withdraw_bucket;
}
-void
-bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b)
+static void
+bgp_free_bucket_xx(struct bgp_channel *c, struct bgp_bucket *b)
{
- rem_node(&b->send_node);
HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b);
mb_free(b);
}
+int
+bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b)
+{
+ /* Won't free the withdraw bucket */
+ if (b == c->withdraw_bucket)
+ return 0;
+
+ if (EMPTY_LIST(b->prefixes))
+ rem_node(&b->send_node);
+
+ if (b->px_uc || !EMPTY_LIST(b->prefixes))
+ return 0;
+
+ bgp_free_bucket_xx(c, b);
+ return 1;
+}
+
void
bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b)
{
@@ -1617,8 +1631,8 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b)
struct bgp_prefix *px = HEAD(b->prefixes);
log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net);
- rem_node(&px->buck_node);
- add_tail(&wb->prefixes, &px->buck_node);
+ rem_node(&px->buck_node_xx);
+ add_tail(&wb->prefixes, &px->buck_node_xx);
}
}
@@ -1629,7 +1643,7 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b)
#define PXH_KEY(px) px->net, px->path_id, px->hash
#define PXH_NEXT(px) px->next
-#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2)
+#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (c->add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2)
#define PXH_FN(n,i,h) h
#define PXH_REHASH bgp_pxh_rehash
@@ -1659,15 +1673,13 @@ bgp_free_prefix_table(struct bgp_channel *c)
static struct bgp_prefix *
bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id)
{
+ u32 path_id_hash = c->add_path_tx ? path_id : 0;
/* We must use a different hash function than the rtable */
- u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id));
- struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash);
+ u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash));
+ struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash);
if (px)
- {
- rem_node(&px->buck_node);
return px;
- }
if (c->prefix_slab)
px = sl_alloc(c->prefix_slab);
@@ -1684,10 +1696,64 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id)
return px;
}
-void
+static void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px);
+
+static inline int
+bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b)
+{
+#define BPX_TRACE(what) do { \
+ if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \
+ c->c.proto->name, c->c.name, what, \
+ px->net, px->path_id, (b == c->withdraw_bucket) ? "withdraw" : "update"); } while (0)
+ px->lastmod = current_time();
+
+ /* Already queued for the same bucket */
+ if (px->cur == b)
+ {
+ BPX_TRACE("already queued");
+ return 0;
+ }
+
+ /* Unqueue from the old bucket */
+ if (px->cur)
+ {
+ rem_node(&px->buck_node_xx);
+ bgp_done_bucket(c, px->cur);
+ }
+
+ /* The new bucket is the same as we sent before */
+ if ((px->last == b) || c->c.out_table && !px->last && (b == c->withdraw_bucket))
+ {
+ if (px->cur)
+ BPX_TRACE("reverted");
+ else
+ BPX_TRACE("already sent");
+
+ /* Well, we haven't sent anything yet */
+ if (!px->last)
+ bgp_free_prefix(c, px);
+
+ px->cur = NULL;
+ return 0;
+ }
+
+ /* Enqueue the bucket if it has been empty */
+ if ((b != c->withdraw_bucket) && EMPTY_LIST(b->prefixes))
+ add_tail(&c->bucket_queue, &b->send_node);
+
+ /* Enqueue to the new bucket and indicate the change */
+ add_tail(&b->prefixes, &px->buck_node_xx);
+ px->cur = b;
+
+ BPX_TRACE("queued");
+ return 1;
+
+#undef BPX_TRACE
+}
+
+static void
bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px)
{
- rem_node(&px->buck_node);
HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px);
if (c->prefix_slab)
@@ -1696,6 +1762,167 @@ bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px)
mb_free(px);
}
+void
+bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck)
+{
+ /* Cleanup: We're called from bucket senders. */
+ ASSERT_DIE(px->cur == buck);
+ rem_node(&px->buck_node_xx);
+
+ /* We may want to store the updates */
+ if (c->c.out_table)
+ {
+ /* Nothing to be sent right now */
+ px->cur = NULL;
+
+ /* Unref the previous sent version */
+ if (px->last)
+ px->last->px_uc--;
+
+ /* Ref the current sent version */
+ if (buck != c->withdraw_bucket)
+ {
+ px->last = buck;
+ px->last->px_uc++;
+ return;
+ }
+
+ /* Prefixes belonging to the withdraw bucket are freed always */
+ }
+
+ bgp_free_prefix(c, px);
+}
+
+
+/*
+ * Prefix hash table exporter
+ */
+
+static void
+bgp_out_table_feed(void *data)
+{
+ struct rt_export_hook *hook = data;
+ struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table);
+
+ int max = 512;
+
+ const net_addr *neq = (hook->req->addr_mode == TE_ADDR_EQUAL) ? hook->req->addr : NULL;
+ const net_addr *cand = NULL;
+
+ do {
+ HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter)
+ {
+ switch (hook->req->addr_mode)
+ {
+ case TE_ADDR_IN:
+ if (!net_in_netX(n->net, hook->req->addr))
+ continue;
+ /* fall through */
+ case TE_ADDR_NONE:
+ /* Splitting only for multi-net exports */
+ if (--max <= 0)
+ HASH_WALK_ITER_PUT;
+ break;
+
+ case TE_ADDR_FOR:
+ if (!neq)
+ {
+ if (net_in_netX(hook->req->addr, n->net) && (!cand || (n->net->length > cand->length)))
+ cand = n->net;
+ continue;
+ }
+ /* fall through */
+ case TE_ADDR_EQUAL:
+ if (!net_equal(n->net, neq))
+ continue;
+ break;
+ }
+
+ struct bgp_bucket *buck = n->cur ?: n->last;
+ ea_list *ea = NULL;
+ if (buck == c->withdraw_bucket)
+ ea_set_dest(&ea, 0, RTD_UNREACHABLE);
+ else
+ {
+ ea = buck->eattrs;
+ eattr *eanh = bgp_find_attr(ea, BA_NEXT_HOP);
+ ASSERT_DIE(eanh);
+ const ip_addr *nh = (const void *) eanh->u.ptr->data;
+
+ struct nexthop_adata nhad = {
+ .ad = { .length = sizeof (struct nexthop_adata) - sizeof (struct adata), },
+ .nh = { .gw = nh[0], },
+ };
+
+ ea_set_attr(&ea, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nhad.ad)));
+ }
+
+ struct rte_storage es = {
+ .rte = {
+ .attrs = ea,
+ .net = n->net,
+ .src = rt_find_source_global(n->path_id),
+ .sender = NULL,
+ .lastmod = n->lastmod,
+ .flags = n->cur ? REF_PENDING : 0,
+ },
+ };
+
+ struct rt_pending_export rpe = {
+ .new = &es, .new_best = &es,
+ };
+
+ if (hook->req->export_bulk)
+ {
+ rte *feed = &es.rte;
+ hook->req->export_bulk(hook->req, n->net, &rpe, &feed, 1);
+ }
+ else if (hook->req->export_one)
+ hook->req->export_one(hook->req, n->net, &rpe);
+ else
+ bug("No export method in export request");
+ }
+ HASH_WALK_ITER_END;
+
+ neq = cand;
+ cand = NULL;
+ } while (neq);
+
+ if (hook->hash_iter)
+ ev_schedule_work(hook->event);
+ else
+ rt_set_export_state(hook, TES_READY);
+}
+
+static struct rt_export_hook *
+bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req UNUSED)
+{
+ struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, re);
+ pool *p = rp_new(c->c.proto->pool, "Export hook");
+ struct rt_export_hook *hook = mb_allocz(p, sizeof(struct rt_export_hook));
+ hook->pool = p;
+ hook->lp = lp_new_default(p);
+ hook->event = ev_new_init(p, bgp_out_table_feed, hook);
+ hook->feed_type = TFT_HASH;
+
+ return hook;
+}
+
+void
+bgp_setup_out_table(struct bgp_channel *c)
+{
+ ASSERT_DIE(c->c.out_table == NULL);
+
+ c->prefix_exporter = (struct rt_exporter) {
+ .addr_type = c->c.table->addr_type,
+ .start = bgp_out_table_export_start,
+ };
+
+ init_list(&c->prefix_exporter.hooks);
+
+ c->c.out_table = &c->prefix_exporter;
+}
+
/*
* BGP protocol glue
@@ -1894,7 +2121,6 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c
struct bgp_proto *p = (void *) P;
struct bgp_channel *c = (void *) C;
struct bgp_bucket *buck;
- struct bgp_prefix *px;
u32 path;
if (new)
@@ -1915,10 +2141,8 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c
path = old->src->global_id;
}
- px = bgp_get_prefix(c, n, c->add_path_tx ? path : 0);
- add_tail(&buck->prefixes, &px->buck_node);
-
- bgp_schedule_packet(p->conn, c, PKT_UPDATE);
+ if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck))
+ bgp_schedule_packet(p->conn, c, PKT_UPDATE);
}
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index 8df99420..6ffe8824 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -1740,14 +1740,15 @@ bgp_channel_start(struct channel *C)
}
c->pool = p->p.pool; // XXXX
- bgp_init_bucket_table(c);
- bgp_init_prefix_table(c);
if (c->cf->import_table)
channel_setup_in_table(C);
if (c->cf->export_table)
- channel_setup_out_table(C);
+ bgp_setup_out_table(c);
+
+ bgp_init_bucket_table(c);
+ bgp_init_prefix_table(c);
c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0);
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index cca7873a..003893e0 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -156,7 +156,7 @@ struct bgp_channel_config {
u8 aigp_originate; /* AIGP is originated automatically */
u32 cost; /* IGP cost for direct next hops */
u8 import_table; /* Use c.in_table as Adj-RIB-In */
- u8 export_table; /* Use c.out_table as Adj-RIB-Out */
+ u8 export_table; /* Keep Adj-RIB-Out and export it */
struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */
struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */
@@ -357,6 +357,8 @@ struct bgp_channel {
HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */
slab *prefix_slab; /* Slab holding prefix nodes */
+ struct rt_exporter prefix_exporter; /* Table-like exporter for prefix_hash */
+
ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */
ip_addr link_addr; /* Link-local version of next_hop_addr */
@@ -378,8 +380,11 @@ struct bgp_channel {
};
struct bgp_prefix {
- node buck_node; /* Node in per-bucket list */
+ node buck_node_xx; /* Node in per-bucket list */
struct bgp_prefix *next; /* Node in prefix hash table */
+ struct bgp_bucket *last; /* Last bucket sent with this prefix */
+ struct bgp_bucket *cur; /* Current bucket (cur == last) if no update is required */
+ btime lastmod; /* Last modification of this prefix */
u32 hash;
u32 path_id;
net_addr net[0];
@@ -388,8 +393,9 @@ struct bgp_prefix {
struct bgp_bucket {
node send_node; /* Node in send queue */
struct bgp_bucket *next; /* Node in bucket hash table */
- list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */
+ list prefixes; /* Prefixes to send in this bucket (struct bgp_prefix) */
u32 hash; /* Hash over extended attributes */
+ u32 px_uc; /* How many prefixes are linking this bucket */
ea_list eattrs[0]; /* Per-bucket extended attributes */
};
@@ -556,15 +562,16 @@ int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte
ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len);
void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to);
+void bgp_setup_out_table(struct bgp_channel *c);
+
void bgp_init_bucket_table(struct bgp_channel *c);
void bgp_free_bucket_table(struct bgp_channel *c);
-void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b);
-void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b);
void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b);
+int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b);
void bgp_init_prefix_table(struct bgp_channel *c);
void bgp_free_prefix_table(struct bgp_channel *c);
-void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp);
+void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck);
int bgp_rte_better(struct rte *, struct rte *);
int bgp_rte_mergable(rte *pri, rte *sec);
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index c2e98340..4d4ae3eb 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -1488,7 +1488,7 @@ bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
memcpy(pos, &a, b);
ADVANCE(pos, size, b);
- bgp_free_prefix(s->channel, px);
+ bgp_done_prefix(s->channel, px, buck);
}
return pos - buf;
@@ -1573,7 +1573,7 @@ bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
memcpy(pos, &a, b);
ADVANCE(pos, size, b);
- bgp_free_prefix(s->channel, px);
+ bgp_done_prefix(s->channel, px, buck);
}
return pos - buf;
@@ -1661,7 +1661,7 @@ bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b
memcpy(pos, &a, b);
ADVANCE(pos, size, b);
- bgp_free_prefix(s->channel, px);
+ bgp_done_prefix(s->channel, px, buck);
}
return pos - buf;
@@ -1758,7 +1758,7 @@ bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b
memcpy(pos, &a, b);
ADVANCE(pos, size, b);
- bgp_free_prefix(s->channel, px);
+ bgp_done_prefix(s->channel, px, buck);
}
return pos - buf;
@@ -1845,7 +1845,7 @@ bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *
memcpy(pos, net->data, flen);
ADVANCE(pos, size, flen);
- bgp_free_prefix(s->channel, px);
+ bgp_done_prefix(s->channel, px, buck);
}
return pos - buf;
@@ -1933,7 +1933,7 @@ bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *
memcpy(pos, net->data, flen);
ADVANCE(pos, size, flen);
- bgp_free_prefix(s->channel, px);
+ bgp_done_prefix(s->channel, px, buck);
}
return pos - buf;
@@ -2167,6 +2167,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
* var IPv4 Network Layer Reachability Information
*/
+ ASSERT_DIE(s->channel->withdraw_bucket != buck);
+
int lr, la;
la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);
@@ -2188,6 +2190,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
static byte *
bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
+ ASSERT_DIE(s->channel->withdraw_bucket != buck);
+
/*
* 2 B IPv4 Withdrawn Routes Length (zero)
* --- IPv4 Withdrawn Routes NLRI (unused)
@@ -2341,9 +2345,8 @@ again: ;
buck = HEAD(c->bucket_queue);
/* Cleanup empty buckets */
- if (EMPTY_LIST(buck->prefixes))
+ if (bgp_done_bucket(c, buck))
{
- bgp_free_bucket(c, buck);
lp_restore(tmp_linpool, &tmpp);
goto again;
}
@@ -2352,10 +2355,7 @@ again: ;
bgp_create_ip_reach(&s, buck, buf, end):
bgp_create_mp_reach(&s, buck, buf, end);
- if (EMPTY_LIST(buck->prefixes))
- bgp_free_bucket(c, buck);
- else
- bgp_defer_bucket(c, buck);
+ bgp_done_bucket(c, buck);
if (!res)
{
@@ -2724,7 +2724,7 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
{
case BGP_RR_REQUEST:
BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
- rt_refeed_channel(&c->c);
+ channel_request_feeding(&c->c);
break;
case BGP_RR_BEGIN:
@@ -2903,7 +2903,11 @@ bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
{
ASSERT(conn->sk);
- DBG("BGP: Scheduling packet type %d\n", type);
+ struct bgp_proto *p = conn->bgp;
+ if (c)
+ BGP_TRACE(D_PACKETS, "Scheduling packet type %d for channel %s", type, c->c.name);
+ else
+ BGP_TRACE(D_PACKETS, "Scheduling packet type %d", type);
if (c)
{