From 080cbd1219ba86dd44712d0d24ceae884b34ec4b Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Tue, 12 Jul 2022 10:36:10 +0200 Subject: Route refresh in tables uses a stale counter. Until now, we were marking routes as REF_STALE and REF_DISCARD to cleanup old routes after route refresh. This needed a synchronous route table walk at both beginning and the end of route refresh routine, marking the routes by the flags. We avoid these walks by using a stale counter. Every route contains: u8 stale_cycle; Every import hook contains: u8 stale_set; u8 stale_valid; u8 stale_pruned; u8 stale_pruning; In base_state, stale_set == stale_valid == stale_pruned == stale_pruning and all routes' stale_cycle also have the same value. The route refresh looks like follows: + ----------- + --------- + ----------- + ------------- + ------------ + | | stale_set | stale_valid | stale_pruning | stale_pruned | | Base | x | x | x | x | | Begin | x+1 | x | x | x | ... now routes are being inserted with stale_cycle == (x+1) | End | x+1 | x+1 | x | x | ... now table pruning routine is scheduled | Prune begin | x+1 | x+1 | x+1 | x | ... now routes with stale_cycle not between stale_set and stale_valid are deleted | Prune end | x+1 | x+1 | x+1 | x+1 | + ----------- + --------- + ----------- + ------------- + ------------ + The pruning routine is asynchronous and may have high latency in high-load environments. Therefore, multiple route refresh requests may happen before the pruning routine starts, leading to this situation: | Prune begin | x+k | x+k | x -> x+k | x | ... or even | Prune begin | x+k+1 | x+k | x -> x+k | x | ... if the prune event starts while another route refresh is running. In such a case, the pruning routine still deletes routes not fitting between stale_set and and stale_valid, effectively pruning the remnants of all unpruned route refreshes from before: | Prune end | x+k | x+k | x+k | x+k | In extremely rare cases, there may happen too many route refreshes before any route prune routine finishes. If the difference between stale_valid and stale_pruned becomes more than 128 when requesting for another route refresh, the routine walks the table synchronously and resets all the stale values to a base state, while logging a warning. --- lib/route.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib/route.h') diff --git a/lib/route.h b/lib/route.h index 68596316..9f78ed00 100644 --- a/lib/route.h +++ b/lib/route.h @@ -29,11 +29,10 @@ typedef struct rte { u8 generation; /* If this route import is based on other previously exported route, this value should be 1 + MAX(generation of the parent routes). Otherwise the route is independent and this value is zero. */ + u8 stale_cycle; /* Auxiliary value for route refresh */ } rte; #define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_STALE 4 /* Route is stale in a refresh cycle */ -#define REF_DISCARD 8 /* Route is scheduled for discard */ #define REF_MODIFY 16 /* Route is scheduled for modify */ #define REF_PENDING 32 /* Route has not propagated completely yet */ -- cgit v1.2.3 From bc2ce4aaa8d1e4d56776ee35352c5e2caa09a0e5 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Tue, 12 Jul 2022 12:40:18 +0200 Subject: Removing the rte_modify API For BGP LLGR purposes, there was an API allowing a protocol to directly modify their stale routes in table before flushing them. This API was called by the table prune routine which violates the future locking requirements. Instead of this, BGP now requests a special route export and reimports these routes into the table, allowing for asynchronous execution without locking the table on export. --- lib/route.h | 1 - nest/proto.c | 1 - nest/protocol.h | 1 - nest/rt-table.c | 51 --------------------------------------------- nest/rt.h | 1 - proto/bgp/attrs.c | 62 +++++++++++++++++++++++++++++++++++++++++-------------- proto/bgp/bgp.c | 54 +++++++++++++++++++++++++++++++++++++++++++++--- proto/bgp/bgp.h | 3 ++- 8 files changed, 100 insertions(+), 74 deletions(-) (limited to 'lib/route.h') diff --git a/lib/route.h b/lib/route.h index 9f78ed00..88a4373d 100644 --- a/lib/route.h +++ b/lib/route.h @@ -33,7 +33,6 @@ typedef struct rte { } rte; #define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_MODIFY 16 /* Route is scheduled for modify */ #define REF_PENDING 32 /* Route has not propagated completely yet */ /* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ diff --git a/nest/proto.c b/nest/proto.c index 061205c1..72e479d7 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -438,7 +438,6 @@ channel_start_import(struct channel *c) .dump_req = channel_dump_import_req, .log_state_change = channel_import_log_state_change, .preimport = channel_preimport, - .rte_modify = c->proto->rte_modify, }; ASSERT(c->channel_state == CS_UP); diff --git a/nest/protocol.h b/nest/protocol.h index 3ccd364a..026d42ab 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -189,7 +189,6 @@ struct proto { int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_better)(struct rte *, struct rte *); int (*rte_mergable)(struct rte *, struct rte *); - struct rte *(*rte_modify)(struct rte *, struct linpool *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); u32 (*rte_igp_metric)(const struct rte *); diff --git a/nest/rt-table.c b/nest/rt-table.c index 2ba28e33..50ddc141 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -1322,8 +1322,6 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr if (new && rte_same(old, &new_stored->rte)) { /* No changes, ignore the new route and refresh the old one */ - - old->flags &= ~REF_MODIFY; old->stale_cycle = new->stale_cycle; if (!rte_is_filtered(new)) @@ -1673,24 +1671,6 @@ rte_discard(net *net, rte *old) /* Non-filtered route deletion, used during garb rte_update_unlock(); } -/* Modify existing route by protocol hook, used for long-lived graceful restart */ -static inline void -rte_modify(net *net, rte *old) -{ - rte_update_lock(); - - rte *new = old->sender->req->rte_modify(old, rte_update_pool); - if (new != old) - { - if (new) - new->flags = old->flags & ~REF_MODIFY; - - rte_recalculate(old->sender, net, new, old->src); - } - - rte_update_unlock(); -} - /* Check rtable for best route to given net whether it would be exported do p */ int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter) @@ -1977,29 +1957,6 @@ rt_refresh_end(struct rt_import_request *req) log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); } -void -rt_modify_stale(rtable *t, struct rt_import_request *req) -{ - int prune = 0; - struct rt_import_hook *s = req->hook; - - FIB_WALK(&t->fib, net, n) - { - for (struct rte_storage *e = n->routes; e; e = e->next) - if ((e->rte.sender == s) && - !(e->rte.flags & REF_FILTERED) && - (e->rte.stale_cycle + 1 == s->stale_set)) - { - e->rte.flags |= REF_MODIFY; - prune = 1; - } - } - FIB_WALK_END; - - if (prune) - rt_schedule_prune(t); -} - /** * rte_dump - dump a route * @e: &rte to be dumped @@ -2499,14 +2456,6 @@ again: rte_discard(n, &e->rte); limit--; - goto rescan; - } - - if (e->rte.flags & REF_MODIFY) - { - rte_modify(n, &e->rte); - limit--; - goto rescan; } } diff --git a/nest/rt.h b/nest/rt.h index b9ae7d10..4a7a087f 100644 --- a/nest/rt.h +++ b/nest/rt.h @@ -181,7 +181,6 @@ struct rt_import_request { /* Preimport is called when the @new route is just-to-be inserted, replacing @old. * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ int (*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); - struct rte *(*rte_modify)(struct rte *, struct linpool *); }; struct rt_import_hook { diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 084c9b63..28eb6fee 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -2546,27 +2546,59 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) return !old_suppressed; } -rte * -bgp_rte_modify_stale(struct rte *r, struct linpool *pool) +void +bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { - eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); - const struct adata *ad = ea ? ea->u.ptr : NULL; - uint flags = ea ? ea->flags : BAF_PARTIAL; + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct rt_import_hook *irh = c->c.in_req.hook; - if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) - return NULL; + /* Find our routes among others */ + for (uint i=0; isender != irh) + continue; - _Thread_local static rte e0; - e0 = *r; + /* A new route, do not mark as stale */ + if (r->stale_cycle == irh->stale_set) + continue; - bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, - int_set_add(pool, ad, BGP_COMM_LLGR_STALE)); - e0.pflags |= BGP_REF_STALE; + eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); + const struct adata *ad = ea ? ea->u.ptr : NULL; + uint flags = ea ? ea->flags : BAF_PARTIAL; + + /* LLGR not allowed, withdraw the route */ + if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) + { + rte_import(&c->c.in_req, n, NULL, r->src); + continue; + } - return &e0; + /* Route already marked as LLGR, do nothing */ + if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) + continue; + + /* Store the tmp_linpool state to aggresively save memory */ + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); + + /* Mark the route as LLGR */ + rte e0 = *r; + bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, int_set_add(tmp_linpool, ad, BGP_COMM_LLGR_STALE)); + e0.pflags &= ~BGP_REF_NOT_STALE; + e0.pflags |= BGP_REF_STALE; + + /* We need to update the route but keep it stale. */ + ASSERT_DIE(irh->stale_set == irh->stale_valid + 1); + irh->stale_set--; + rte_import(&c->c.in_req, n, &e0, r->src); + irh->stale_set++; + + /* Restore the memory state */ + lp_restore(tmp_linpool, &tmpp); + } } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index d9008b9a..fb8fa529 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -139,6 +139,9 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd); static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); +static void bgp_graceful_restart_feed(struct bgp_channel *c); + + /** * bgp_open - open a BGP instance * @p: BGP instance @@ -770,7 +773,7 @@ bgp_handle_graceful_restart(struct bgp_proto *p) case BGP_GRS_LLGR: rt_refresh_begin(&c->c.in_req); - rt_modify_stale(c->c.table, &c->c.in_req); + bgp_graceful_restart_feed(c); break; } } @@ -796,6 +799,52 @@ bgp_handle_graceful_restart(struct bgp_proto *p) tm_start(p->gr_timer, p->conn->remote_caps->gr_time S); } +static void +bgp_graceful_restart_feed_done(struct rt_export_request *req) +{ + req->hook = NULL; +} + +static void +bgp_graceful_restart_feed_dump_req(struct rt_export_request *req) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + debug(" BGP-GR %s.%s export request %p\n", c->c.proto->name, c->c.name, req); +} + +static void +bgp_graceful_restart_feed_log_state_change(struct rt_export_request *req, u8 state) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct bgp_proto *p = (void *) c->c.proto; + BGP_TRACE(D_EVENTS, "Long-lived graceful restart export state changed to %s", rt_export_state_name(state)); + + if (state == TES_READY) + rt_stop_export(req, bgp_graceful_restart_feed_done); +} + +static void +bgp_graceful_restart_drop_export(struct rt_export_request *req UNUSED, const net_addr *n UNUSED, struct rt_pending_export *rpe UNUSED) +{ /* Nothing to do */ } + +static void +bgp_graceful_restart_feed(struct bgp_channel *c) +{ + c->stale_feed = (struct rt_export_request) { + .name = "BGP-GR", + .trace_routes = c->c.debug | c->c.proto->debug, + .dump_req = bgp_graceful_restart_feed_dump_req, + .log_state_change = bgp_graceful_restart_feed_log_state_change, + .export_bulk = bgp_rte_modify_stale, + .export_one = bgp_graceful_restart_drop_export, + }; + + rt_request_export(&c->c.table->exporter, &c->stale_feed); +} + + + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -861,7 +910,7 @@ bgp_graceful_restart_timeout(timer *t) /* Channel is in GR, and supports LLGR -> start LLGR */ c->gr_active = BGP_GRS_LLGR; tm_start(c->stale_timer, c->stale_time S); - rt_modify_stale(c->c.table, &c->c.in_req); + bgp_graceful_restart_feed(c); } } else @@ -1672,7 +1721,6 @@ bgp_init(struct proto_config *CF) P->rte_better = bgp_rte_better; P->rte_mergable = bgp_rte_mergable; P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; - P->rte_modify = bgp_rte_modify_stale; P->rte_igp_metric = bgp_rte_igp_metric; p->cf = cf; diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 003893e0..2e7615ea 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -371,6 +371,7 @@ struct bgp_channel { timer *stale_timer; /* Long-lived stale timer for LLGR */ u32 stale_time; /* Stored LLGR stale time from last session */ + struct rt_export_request stale_feed; /* Feeder request for stale route modification */ u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ @@ -576,7 +577,7 @@ void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bu int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); -struct rte *bgp_rte_modify_stale(struct rte *r, struct linpool *pool); +void bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count); u32 bgp_rte_igp_metric(const rte *); void bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old); int bgp_preexport(struct channel *, struct rte *); -- cgit v1.2.3 From 9901ca6fb3683091c7eb424cbba8c7bc94e41cbb Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 18 Jul 2022 10:56:20 +0200 Subject: Fixed an annoying warning in ea_get_storage() --- lib/route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/route.h') diff --git a/lib/route.h b/lib/route.h index 88a4373d..2d691215 100644 --- a/lib/route.h +++ b/lib/route.h @@ -423,7 +423,7 @@ static inline int ea_is_cached(const ea_list *r) { return r->flags & EALF_CACHED static inline struct ea_storage *ea_get_storage(ea_list *r) { ASSERT_DIE(ea_is_cached(r)); - return SKIP_BACK(struct ea_storage, l, r); + return SKIP_BACK(struct ea_storage, l[0], r); } static inline ea_list *ea_clone(ea_list *r) { ea_get_storage(r)->uc++; return r; } -- cgit v1.2.3