diff options
author | Maria Matejka <mq@ucw.cz> | 2022-06-20 19:10:49 +0200 |
---|---|---|
committer | Maria Matejka <mq@ucw.cz> | 2022-07-11 16:07:09 +0200 |
commit | 6b0368cc2c317d1acc0881a96b32ded291d82741 (patch) | |
tree | 8e8eeb29b6bb2eb7edc6f701a71e2b0569799eb9 | |
parent | d5e3272f3d9b1bad7ceb6d0d5897a7269e28a537 (diff) |
Export tables merged with BGP prefix hash
Until now, if export table was enabled, Nest was storing exactly the
route before rt_notify() was called on it. This was quite sloppy and
spooky and it also wasn't reflecting the changes BGP does before
sending. And as BGP is storing the routes to be sent anyway, we are
simply keeping the already-sent routes in there to better rule out
unneeded reexports.
Some of the route attributes (IGP metric, preference) make no sense in
BGP, therefore these will be probably replaced by something sensible.
Also the nexthop shown in the short output is the BGP nexthop.
-rw-r--r-- | lib/route.h | 2 | ||||
-rw-r--r-- | nest/config.Y | 2 | ||||
-rw-r--r-- | nest/proto.c | 16 | ||||
-rw-r--r-- | nest/protocol.h | 3 | ||||
-rw-r--r-- | nest/rt-attr.c | 27 | ||||
-rw-r--r-- | nest/rt-table.c | 168 | ||||
-rw-r--r-- | nest/rt.h | 20 | ||||
-rw-r--r-- | proto/bgp/attrs.c | 266 | ||||
-rw-r--r-- | proto/bgp/bgp.c | 7 | ||||
-rw-r--r-- | proto/bgp/bgp.h | 19 | ||||
-rw-r--r-- | proto/bgp/packets.c | 32 |
11 files changed, 342 insertions, 220 deletions
diff --git a/lib/route.h b/lib/route.h index 30be7c4e..68596316 100644 --- a/lib/route.h +++ b/lib/route.h @@ -35,6 +35,7 @@ typedef struct rte { #define REF_STALE 4 /* Route is stale in a refresh cycle */ #define REF_DISCARD 8 /* Route is scheduled for discard */ #define REF_MODIFY 16 /* Route is scheduled for modify */ +#define REF_PENDING 32 /* Route has not propagated completely yet */ /* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); } @@ -53,6 +54,7 @@ struct rte_src { struct rte_src *rt_find_source(struct proto *p, u32 id); struct rte_src *rt_get_source(struct proto *p, u32 id); +struct rte_src *rt_find_source_global(u32 id); static inline void rt_lock_source(struct rte_src *src) { src->uc++; } static inline void rt_unlock_source(struct rte_src *src) { src->uc--; } void rt_prune_sources(void); diff --git a/nest/config.Y b/nest/config.Y index 7c163f74..edddfc2b 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -694,7 +694,7 @@ r_args: } | r_args EXPORT TABLE channel_arg { if (!$4->out_table) cf_error("No export table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_exporter($$, &$4->out_table->exporter, "export"); + rt_show_add_exporter($$, $4->out_table, "export"); $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args FILTER filter { diff --git a/nest/proto.c b/nest/proto.c index e7be8001..061205c1 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -559,10 +559,6 @@ channel_export_stopped(struct rt_export_request *req) return; } - /* Free the routes from out_table */ - if (c->out_table) - rt_prune_sync(c->out_table, 1); - mb_free(c->out_req.name); c->out_req.name = NULL; @@ -647,18 +643,6 @@ channel_setup_in_table(struct channel *c) c->in_keep |= RIK_PREFILTER; } -/* Called by protocol to activate out_table */ -void -channel_setup_out_table(struct channel *c) -{ - struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config)); - cf->name = "export"; - cf->addr_type = c->net_type; - cf->internal = 1; - - c->out_table = rt_setup(c->proto->pool, cf); -} - static void channel_do_start(struct channel *c) diff --git a/nest/protocol.h b/nest/protocol.h index a1701a7e..3ccd364a 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -537,7 +537,7 @@ struct channel { u8 refeed_pending; /* Refeeding and another refeed is scheduled */ u8 rpki_reload; /* RPKI changes trigger channel reload */ - struct rtable *out_table; /* Internal table for exported routes */ + struct rt_exporter *out_table; /* Internal table for exported routes */ list roa_subscriptions; /* List of active ROA table subscriptions based on filters roa_check() */ }; @@ -610,7 +610,6 @@ int proto_configure_channel(struct proto *p, struct channel **c, struct channel_ void channel_set_state(struct channel *c, uint state); void channel_setup_in_table(struct channel *c); -void channel_setup_out_table(struct channel *c); void channel_schedule_reload(struct channel *c); static inline void channel_init(struct channel *c) { channel_set_state(c, CS_START); } diff --git a/nest/rt-attr.c b/nest/rt-attr.c index a66d4c6e..b31bc5cc 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -197,11 +197,14 @@ static struct idm src_ids; #define RSH_INIT_ORDER 6 static HASH(struct rte_src) src_hash; +static struct rte_src **rte_src_global; +static uint rte_src_global_max = SRC_ID_INIT_SIZE; static void rte_src_init(void) { rte_src_slab = sl_new(rta_pool, sizeof(struct rte_src)); + rte_src_global = mb_allocz(rta_pool, sizeof(struct rte_src *) * rte_src_global_max); idm_init(&src_ids, rta_pool, SRC_ID_INIT_SIZE); @@ -232,10 +235,27 @@ rt_get_source(struct proto *p, u32 id) src->uc = 0; HASH_INSERT2(src_hash, RSH, rta_pool, src); + if (src->global_id >= rte_src_global_max) + { + rte_src_global = mb_realloc(rte_src_global, sizeof(struct rte_src *) * (rte_src_global_max *= 2)); + memset(&rte_src_global[rte_src_global_max / 2], 0, + sizeof(struct rte_src *) * (rte_src_global_max / 2)); + } + + rte_src_global[src->global_id] = src; return src; } +struct rte_src * +rt_find_source_global(u32 id) +{ + if (id >= rte_src_global_max) + return NULL; + else + return rte_src_global[id]; +} + void rt_prune_sources(void) { @@ -1081,8 +1101,11 @@ ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad) bsprintf(weight, " weight %d", nh->weight + 1); if (ipa_nonzero(nh->gw)) - cli_printf(c, -1007, "\tvia %I on %s%s%s%s", - nh->gw, nh->iface->name, mpls, onlink, weight); + if (nh->iface) + cli_printf(c, -1007, "\tvia %I on %s%s%s%s", + nh->gw, nh->iface->name, mpls, onlink, weight); + else + cli_printf(c, -1007, "\tvia %I", nh->gw); else cli_printf(c, -1007, "\tdev %s%s%s", nh->iface->name, mpls, onlink, weight); diff --git a/nest/rt-table.c b/nest/rt-table.c index 5adf6346..29690414 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -820,17 +820,6 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) if (!new && old) CHANNEL_LIMIT_POP(c, OUT); - /* Apply export table */ - struct rte_storage *old_exported = NULL; - if (c->out_table) - { - if (!rte_update_out(c, net, new, old, &old_exported)) - { - channel_rte_trace_out(D_ROUTES, c, new, "idempotent"); - return; - } - } - if (new) stats->updates_accepted++; else @@ -852,10 +841,7 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) channel_rte_trace_out(D_ROUTES, c, old, "removed"); } - p->rt_notify(p, c, net, new, old_exported ? &old_exported->rte : old); - - if (c->out_table && old_exported) - rte_free(old_exported); + p->rt_notify(p, c, net, new, old); } static void @@ -1747,7 +1733,7 @@ rt_export_stopped(void *data) hook->stopped(hook->req); /* Reporting the hook as finished. */ - tab->done(hook); + CALL(tab->done, hook); /* Freeing the hook together with its coroutine. */ rfree(hook->pool); @@ -1763,7 +1749,7 @@ rt_set_import_state(struct rt_import_hook *hook, u8 state) hook->req->log_state_change(hook->req, state); } -static inline void +void rt_set_export_state(struct rt_export_hook *hook, u8 state) { hook->last_state_change = current_time(); @@ -1874,16 +1860,22 @@ rt_table_export_stop(struct rt_export_hook *hook) if (hook->export_state != TES_FEEDING) return; - if (hook->walk_lock) + switch (hook->req->addr_mode) { - rt_unlock_trie(tab, hook->walk_lock); - hook->walk_lock = NULL; - - mb_free(hook->walk_state); - hook->walk_state = NULL; + case TE_ADDR_IN: + if (hook->walk_lock) + { + rt_unlock_trie(tab, hook->walk_lock); + hook->walk_lock = NULL; + mb_free(hook->walk_state); + hook->walk_state = NULL; + break; + } + /* fall through */ + case TE_ADDR_NONE: + fit_get(&tab->fib, &hook->feed_fit); + break; } - else - fit_get(&tab->fib, &hook->feed_fit); } void @@ -1896,7 +1888,7 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r ev_postpone(hook->event); /* Stop feeding from the exporter */ - hook->table->stop(hook); + CALL(hook->table->stop, hook); /* Reset the event as the stopped event */ hook->event->hook = rt_export_stopped; @@ -3397,130 +3389,6 @@ void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *n } } -void -rt_prune_sync(rtable *t, int all) -{ - struct fib_iterator fit; - - FIB_ITERATE_INIT(&fit, &t->fib); - -again: - FIB_ITERATE_START(&t->fib, &fit, net, n) - { - struct rte_storage *e, **ee = &n->routes; - - while (e = *ee) - { - if (all || (e->rte.flags & (REF_STALE | REF_DISCARD))) - { - *ee = e->next; - rte_free(e); - t->rt_count--; - } - else - ee = &e->next; - } - - if (all || !n->routes) - { - FIB_ITERATE_PUT(&fit); - fib_delete(&t->fib, n); - goto again; - } - } - FIB_ITERATE_END; -} - - -/* - * Export table - */ - -int -rte_update_out(struct channel *c, const net_addr *n, rte *new, const rte *old0, struct rte_storage **old_exported) -{ - struct rtable *tab = c->out_table; - struct rte_src *src; - net *net; - - if (new) - { - net = net_get(tab, n); - src = new->src; - } - else - { - net = net_find(tab, n); - src = old0->src; - - if (!net) - goto drop; - } - - /* Find the old rte */ - struct rte_storage **pos = (c->ra_mode == RA_ANY) ? rte_find(net, src) : &net->routes; - struct rte_storage *old = NULL; - - if (old = *pos) - { - if (new && rte_same(&(*pos)->rte, new)) - goto drop; - - /* Remove the old rte */ - *pos = old->next; - *old_exported = old; - tab->rt_count--; - } - - if (!new) - { - if (!old) - goto drop; - - if (!net->routes) - fib_delete(&tab->fib, net); - - return 1; - } - - /* Insert the new rte */ - struct rte_storage *e = rte_store(new, net, tab); - e->rte.lastmod = current_time(); - e->next = *pos; - *pos = e; - tab->rt_count++; - return 1; - -drop: - return 0; -} - -void -rt_refeed_channel(struct channel *c) -{ - if (!c->out_table) - { - channel_request_feeding(c); - return; - } - - ASSERT_DIE(c->ra_mode != RA_ANY); - - c->proto->feed_begin(c, 0); - - FIB_WALK(&c->out_table->fib, net, n) - { - if (!n->routes) - continue; - - rte e = n->routes->rte; - c->proto->rt_notify(c->proto, c, n->n.addr, &e, NULL); - } - FIB_WALK_END; - - c->proto->feed_end(c); -} - /* * Hostcache @@ -247,14 +247,20 @@ struct rt_export_hook { u32 withdraws_received; /* Number of route withdraws received */ } stats; - struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ - struct f_trie_walk_state *walk_state; /* Iterator over networks in trie */ - struct f_trie *walk_lock; /* Locked trie for walking */ + union { + struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ + struct { + struct f_trie_walk_state *walk_state; /* Iterator over networks in trie */ + struct f_trie *walk_lock; /* Locked trie for walking */ + }; + u32 hash_iter; /* Iterator over hash */ + }; btime last_state_change; /* Time of last state transition */ u8 refeed_pending; /* Refeeding and another refeed is scheduled */ u8 export_state; /* Route export state (TES_*, see below) */ + u8 feed_type; /* Which feeding method is used (TFT_*, see below) */ struct event *event; /* Event running all the export operations */ @@ -282,6 +288,10 @@ struct rt_export_hook { #define TE_ADDR_IN 3 /* Interval query - show route in <addr> */ +#define TFT_FIB 1 +#define TFT_TRIE 2 +#define TFT_HASH 3 + void rt_request_import(rtable *tab, struct rt_import_request *req); void rt_request_export(struct rt_exporter *tab, struct rt_export_request *req); @@ -296,6 +306,8 @@ const char *rt_export_state_name(u8 state); static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; } static inline u8 rt_export_get_state(struct rt_export_hook *eh) { return eh ? eh->export_state : TES_DOWN; } +void rt_set_export_state(struct rt_export_hook *hook, u8 state); + void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); /* Types of route announcement, also used as flags */ @@ -386,8 +398,6 @@ int rt_reload_channel(struct channel *c); void rt_reload_channel_abort(struct channel *c); void rt_refeed_channel(struct channel *c); void rt_prune_sync(rtable *t, int all); -int rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); -int rte_update_out(struct channel *c, const net_addr *n, rte *new, const rte *old, struct rte_storage **old_exported); struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); static inline int rt_is_ip(rtable *tab) diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 22a53dce..084c9b63 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1550,7 +1550,6 @@ bgp_free_bucket_table(struct bgp_channel *c) static struct bgp_bucket * bgp_get_bucket(struct bgp_channel *c, ea_list *new) { - /* Hash and lookup */ u32 hash = ea_hash(new); struct bgp_bucket *b = HASH_FIND(c->bucket_hash, RBH, new, hash); @@ -1571,8 +1570,7 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new) /* Copy the ea_list */ ea_list_copy(b->eattrs, new, ea_size); - /* Insert the bucket to send queue and bucket hash */ - add_tail(&c->bucket_queue, &b->send_node); + /* Insert the bucket to bucket hash */ HASH_INSERT2(c->bucket_hash, RBH, c->pool, b); return b; @@ -1590,14 +1588,30 @@ bgp_get_withdraw_bucket(struct bgp_channel *c) return c->withdraw_bucket; } -void -bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b) +static void +bgp_free_bucket_xx(struct bgp_channel *c, struct bgp_bucket *b) { - rem_node(&b->send_node); HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); mb_free(b); } +int +bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b) +{ + /* Won't free the withdraw bucket */ + if (b == c->withdraw_bucket) + return 0; + + if (EMPTY_LIST(b->prefixes)) + rem_node(&b->send_node); + + if (b->px_uc || !EMPTY_LIST(b->prefixes)) + return 0; + + bgp_free_bucket_xx(c, b); + return 1; +} + void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) { @@ -1617,8 +1631,8 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) struct bgp_prefix *px = HEAD(b->prefixes); log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net); - rem_node(&px->buck_node); - add_tail(&wb->prefixes, &px->buck_node); + rem_node(&px->buck_node_xx); + add_tail(&wb->prefixes, &px->buck_node_xx); } } @@ -1629,7 +1643,7 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) #define PXH_KEY(px) px->net, px->path_id, px->hash #define PXH_NEXT(px) px->next -#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2) +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (c->add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) #define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash @@ -1659,15 +1673,13 @@ bgp_free_prefix_table(struct bgp_channel *c) static struct bgp_prefix * bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) { + u32 path_id_hash = c->add_path_tx ? path_id : 0; /* We must use a different hash function than the rtable */ - u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id)); - struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash); + u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash)); + struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash); if (px) - { - rem_node(&px->buck_node); return px; - } if (c->prefix_slab) px = sl_alloc(c->prefix_slab); @@ -1684,10 +1696,64 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) return px; } -void +static void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px); + +static inline int +bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b) +{ +#define BPX_TRACE(what) do { \ + if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \ + c->c.proto->name, c->c.name, what, \ + px->net, px->path_id, (b == c->withdraw_bucket) ? "withdraw" : "update"); } while (0) + px->lastmod = current_time(); + + /* Already queued for the same bucket */ + if (px->cur == b) + { + BPX_TRACE("already queued"); + return 0; + } + + /* Unqueue from the old bucket */ + if (px->cur) + { + rem_node(&px->buck_node_xx); + bgp_done_bucket(c, px->cur); + } + + /* The new bucket is the same as we sent before */ + if ((px->last == b) || c->c.out_table && !px->last && (b == c->withdraw_bucket)) + { + if (px->cur) + BPX_TRACE("reverted"); + else + BPX_TRACE("already sent"); + + /* Well, we haven't sent anything yet */ + if (!px->last) + bgp_free_prefix(c, px); + + px->cur = NULL; + return 0; + } + + /* Enqueue the bucket if it has been empty */ + if ((b != c->withdraw_bucket) && EMPTY_LIST(b->prefixes)) + add_tail(&c->bucket_queue, &b->send_node); + + /* Enqueue to the new bucket and indicate the change */ + add_tail(&b->prefixes, &px->buck_node_xx); + px->cur = b; + + BPX_TRACE("queued"); + return 1; + +#undef BPX_TRACE +} + +static void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) { - rem_node(&px->buck_node); HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); if (c->prefix_slab) @@ -1696,6 +1762,167 @@ bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) mb_free(px); } +void +bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck) +{ + /* Cleanup: We're called from bucket senders. */ + ASSERT_DIE(px->cur == buck); + rem_node(&px->buck_node_xx); + + /* We may want to store the updates */ + if (c->c.out_table) + { + /* Nothing to be sent right now */ + px->cur = NULL; + + /* Unref the previous sent version */ + if (px->last) + px->last->px_uc--; + + /* Ref the current sent version */ + if (buck != c->withdraw_bucket) + { + px->last = buck; + px->last->px_uc++; + return; + } + + /* Prefixes belonging to the withdraw bucket are freed always */ + } + + bgp_free_prefix(c, px); +} + + +/* + * Prefix hash table exporter + */ + +static void +bgp_out_table_feed(void *data) +{ + struct rt_export_hook *hook = data; + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table); + + int max = 512; + + const net_addr *neq = (hook->req->addr_mode == TE_ADDR_EQUAL) ? hook->req->addr : NULL; + const net_addr *cand = NULL; + + do { + HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter) + { + switch (hook->req->addr_mode) + { + case TE_ADDR_IN: + if (!net_in_netX(n->net, hook->req->addr)) + continue; + /* fall through */ + case TE_ADDR_NONE: + /* Splitting only for multi-net exports */ + if (--max <= 0) + HASH_WALK_ITER_PUT; + break; + + case TE_ADDR_FOR: + if (!neq) + { + if (net_in_netX(hook->req->addr, n->net) && (!cand || (n->net->length > cand->length))) + cand = n->net; + continue; + } + /* fall through */ + case TE_ADDR_EQUAL: + if (!net_equal(n->net, neq)) + continue; + break; + } + + struct bgp_bucket *buck = n->cur ?: n->last; + ea_list *ea = NULL; + if (buck == c->withdraw_bucket) + ea_set_dest(&ea, 0, RTD_UNREACHABLE); + else + { + ea = buck->eattrs; + eattr *eanh = bgp_find_attr(ea, BA_NEXT_HOP); + ASSERT_DIE(eanh); + const ip_addr *nh = (const void *) eanh->u.ptr->data; + + struct nexthop_adata nhad = { + .ad = { .length = sizeof (struct nexthop_adata) - sizeof (struct adata), }, + .nh = { .gw = nh[0], }, + }; + + ea_set_attr(&ea, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nhad.ad))); + } + + struct rte_storage es = { + .rte = { + .attrs = ea, + .net = n->net, + .src = rt_find_source_global(n->path_id), + .sender = NULL, + .lastmod = n->lastmod, + .flags = n->cur ? REF_PENDING : 0, + }, + }; + + struct rt_pending_export rpe = { + .new = &es, .new_best = &es, + }; + + if (hook->req->export_bulk) + { + rte *feed = &es.rte; + hook->req->export_bulk(hook->req, n->net, &rpe, &feed, 1); + } + else if (hook->req->export_one) + hook->req->export_one(hook->req, n->net, &rpe); + else + bug("No export method in export request"); + } + HASH_WALK_ITER_END; + + neq = cand; + cand = NULL; + } while (neq); + + if (hook->hash_iter) + ev_schedule_work(hook->event); + else + rt_set_export_state(hook, TES_READY); +} + +static struct rt_export_hook * +bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req UNUSED) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, re); + pool *p = rp_new(c->c.proto->pool, "Export hook"); + struct rt_export_hook *hook = mb_allocz(p, sizeof(struct rt_export_hook)); + hook->pool = p; + hook->lp = lp_new_default(p); + hook->event = ev_new_init(p, bgp_out_table_feed, hook); + hook->feed_type = TFT_HASH; + + return hook; +} + +void +bgp_setup_out_table(struct bgp_channel *c) +{ + ASSERT_DIE(c->c.out_table == NULL); + + c->prefix_exporter = (struct rt_exporter) { + .addr_type = c->c.table->addr_type, + .start = bgp_out_table_export_start, + }; + + init_list(&c->prefix_exporter.hooks); + + c->c.out_table = &c->prefix_exporter; +} + /* * BGP protocol glue @@ -1894,7 +2121,6 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c struct bgp_proto *p = (void *) P; struct bgp_channel *c = (void *) C; struct bgp_bucket *buck; - struct bgp_prefix *px; u32 path; if (new) @@ -1915,10 +2141,8 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c path = old->src->global_id; } - px = bgp_get_prefix(c, n, c->add_path_tx ? path : 0); - add_tail(&buck->prefixes, &px->buck_node); - - bgp_schedule_packet(p->conn, c, PKT_UPDATE); + if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck)) + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 8df99420..6ffe8824 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -1740,14 +1740,15 @@ bgp_channel_start(struct channel *C) } c->pool = p->p.pool; // XXXX - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); if (c->cf->import_table) channel_setup_in_table(C); if (c->cf->export_table) - channel_setup_out_table(C); + bgp_setup_out_table(c); + + bgp_init_bucket_table(c); + bgp_init_prefix_table(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index cca7873a..003893e0 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -156,7 +156,7 @@ struct bgp_channel_config { u8 aigp_originate; /* AIGP is originated automatically */ u32 cost; /* IGP cost for direct next hops */ u8 import_table; /* Use c.in_table as Adj-RIB-In */ - u8 export_table; /* Use c.out_table as Adj-RIB-Out */ + u8 export_table; /* Keep Adj-RIB-Out and export it */ struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ @@ -357,6 +357,8 @@ struct bgp_channel { HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ slab *prefix_slab; /* Slab holding prefix nodes */ + struct rt_exporter prefix_exporter; /* Table-like exporter for prefix_hash */ + ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ ip_addr link_addr; /* Link-local version of next_hop_addr */ @@ -378,8 +380,11 @@ struct bgp_channel { }; struct bgp_prefix { - node buck_node; /* Node in per-bucket list */ + node buck_node_xx; /* Node in per-bucket list */ struct bgp_prefix *next; /* Node in prefix hash table */ + struct bgp_bucket *last; /* Last bucket sent with this prefix */ + struct bgp_bucket *cur; /* Current bucket (cur == last) if no update is required */ + btime lastmod; /* Last modification of this prefix */ u32 hash; u32 path_id; net_addr net[0]; @@ -388,8 +393,9 @@ struct bgp_prefix { struct bgp_bucket { node send_node; /* Node in send queue */ struct bgp_bucket *next; /* Node in bucket hash table */ - list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */ + list prefixes; /* Prefixes to send in this bucket (struct bgp_prefix) */ u32 hash; /* Hash over extended attributes */ + u32 px_uc; /* How many prefixes are linking this bucket */ ea_list eattrs[0]; /* Per-bucket extended attributes */ }; @@ -556,15 +562,16 @@ int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len); void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to); +void bgp_setup_out_table(struct bgp_channel *c); + void bgp_init_bucket_table(struct bgp_channel *c); void bgp_free_bucket_table(struct bgp_channel *c); -void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b); void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); +int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b); void bgp_init_prefix_table(struct bgp_channel *c); void bgp_free_prefix_table(struct bgp_channel *c); -void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp); +void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck); int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index c2e98340..4d4ae3eb 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -1488,7 +1488,7 @@ bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; @@ -1573,7 +1573,7 @@ bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; @@ -1661,7 +1661,7 @@ bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; @@ -1758,7 +1758,7 @@ bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; @@ -1845,7 +1845,7 @@ bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; @@ -1933,7 +1933,7 @@ bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; @@ -2167,6 +2167,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu * var IPv4 Network Layer Reachability Information */ + ASSERT_DIE(s->channel->withdraw_bucket != buck); + int lr, la; la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH); @@ -2188,6 +2190,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu static byte * bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { + ASSERT_DIE(s->channel->withdraw_bucket != buck); + /* * 2 B IPv4 Withdrawn Routes Length (zero) * --- IPv4 Withdrawn Routes NLRI (unused) @@ -2341,9 +2345,8 @@ again: ; buck = HEAD(c->bucket_queue); /* Cleanup empty buckets */ - if (EMPTY_LIST(buck->prefixes)) + if (bgp_done_bucket(c, buck)) { - bgp_free_bucket(c, buck); lp_restore(tmp_linpool, &tmpp); goto again; } @@ -2352,10 +2355,7 @@ again: ; bgp_create_ip_reach(&s, buck, buf, end): bgp_create_mp_reach(&s, buck, buf, end); - if (EMPTY_LIST(buck->prefixes)) - bgp_free_bucket(c, buck); - else - bgp_defer_bucket(c, buck); + bgp_done_bucket(c, buck); if (!res) { @@ -2724,7 +2724,7 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) { case BGP_RR_REQUEST: BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); - rt_refeed_channel(&c->c); + channel_request_feeding(&c->c); break; case BGP_RR_BEGIN: @@ -2903,7 +2903,11 @@ bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) { ASSERT(conn->sk); - DBG("BGP: Scheduling packet type %d\n", type); + struct bgp_proto *p = conn->bgp; + if (c) + BGP_TRACE(D_PACKETS, "Scheduling packet type %d for channel %s", type, c->c.name); + else + BGP_TRACE(D_PACKETS, "Scheduling packet type %d", type); if (c) { |