Diffstat (limited to 'nest/rt-table.c')
-rw-r--r-- | nest/rt-table.c | 3625 |
1 file changed, 2514 insertions, 1111 deletions
diff --git a/nest/rt-table.c b/nest/rt-table.c
index 5f1e1679..b18727b1 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -21,17 +21,79 @@
  * on the list being the best one (i.e., the one we currently use
  * for routing), the order of the other ones is undetermined.
  *
- * The &rte contains information specific to the route (preference, protocol
- * metrics, time of last modification etc.) and a pointer to a &rta structure
- * (see the route attribute module for a precise explanation) holding the
- * remaining route attributes which are expected to be shared by multiple
- * routes in order to conserve memory.
+ * The &rte contains information about the route. There are net and src, which
+ * together form a key identifying the route in a routing table. There is a
+ * pointer to a &rta structure (see the route attribute module for a precise
+ * explanation) holding the route attributes, which are primary data about the
+ * route. There are several technical fields used by routing table code (route
+ * id, REF_* flags). There is also the pflags field, holding protocol-specific
+ * flags. They are not used by routing table code, but by protocol-specific
+ * hooks. In contrast to route attributes, they are not primary data and their
+ * validity is also limited to the routing table.
+ *
+ * There are several mechanisms that allow automatic update of routes in one
+ * routing table (dst) as a result of changes in another routing table (src).
+ * They handle issues of recursive next hop resolving, flowspec validation and
+ * RPKI validation.
+ *
+ * The first such mechanism is handling of recursive next hops. A route in the
+ * dst table has an indirect next hop address, which is resolved through a route
+ * in the src table (which may also be the same table) to get an immediate next
+ * hop. This is implemented using structure &hostcache attached to the src
+ * table, which contains &hostentry structures for each tracked next hop
+ * address. These structures are linked from recursive routes in dst tables,
+ * possibly multiple routes sharing one hostentry (as many routes may have the
+ * same indirect next hop). There is also a trie in the hostcache, which matches
+ * all prefixes that may influence resolving of tracked next hops.
+ *
+ * When a best route changes in the src table, the hostcache is notified using
+ * an auxiliary export request, which checks using the trie whether the
+ * change is relevant and if it is, then it schedules an asynchronous hostcache
+ * recomputation. The recomputation is done by rt_update_hostcache() (called
+ * as an event of the src table); it walks through all hostentries and resolves
+ * them (by rt_update_hostentry()). It also updates the trie. If a change in
+ * hostentry resolution was found, then it schedules an asynchronous nexthop
+ * recomputation of the associated dst table. That is done by rt_next_hop_update()
+ * (called from rt_event() of the dst table); it iterates over all routes in the
+ * dst table and re-examines their hostentries for changes. Note that in contrast
+ * to the hostcache update, the next hop update can be interrupted by the main
+ * loop. These two full-table walks (over hostcache and dst table) are necessary
+ * due to the absence of direct lookups (route -> affected nexthop, nexthop -> its route).
+ *
+ * The second mechanism is for flowspec validation, where validity of flowspec
+ * routes depends on resolving their network prefixes in IP routing tables.
This + * is similar to the recursive next hop mechanism, but simpler as there are no + * intermediate hostcache and hostentries (because flows are less likely to + * share common net prefix than routes sharing a common next hop). Every dst + * table has its own export request in every src table. Each dst table has its + * own trie of prefixes that may influence validation of flowspec routes in it + * (flowspec_trie). + * + * When a best route changes in the src table, the notification mechanism is + * invoked by the export request which checks its dst table's trie to see + * whether the change is relevant, and if so, an asynchronous re-validation of + * flowspec routes in the dst table is scheduled. That is also done by function + * rt_next_hop_update(), like nexthop recomputation above. It iterates over all + * flowspec routes and re-validates them. It also recalculates the trie. + * + * Note that in contrast to the hostcache update, here the trie is recalculated + * during the rt_next_hop_update(), which may be interleaved with IP route + * updates. The trie is flushed at the beginning of recalculation, which means + * that such updates may use partial trie to see if they are relevant. But it + * works anyway! Either affected flowspec was already re-validated and added to + * the trie, then IP route change would match the trie and trigger a next round + * of re-validation, or it was not yet re-validated and added to the trie, but + * will be re-validated later in this round anyway. + * + * The third mechanism is used for RPKI re-validation of IP routes and it is the + * simplest. It is also an auxiliary export request belonging to the + * appropriate channel, triggering its reload/refeed timer after a settle time. */ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/resource.h" @@ -44,12 +106,21 @@ #include "lib/hash.h" #include "lib/string.h" #include "lib/alloca.h" +#include "lib/flowspec.h" +#include "lib/idm.h" + +#ifdef CONFIG_BGP +#include "proto/bgp/bgp.h" +#endif #include <stdatomic.h> pool *rt_table_pool; list routing_tables; +list deleted_routing_tables; + +struct rt_cork rt_cork; /* Data structures for export journal */ #define RT_PENDING_EXPORT_ITEMS (page_size - sizeof(struct rt_export_block)) / sizeof(struct rt_pending_export) @@ -61,17 +132,28 @@ struct rt_export_block { struct rt_pending_export export[]; }; -static void rt_free_hostcache(rtable_private *tab); -static void rt_notify_hostcache(rtable_private *tab, net *net); +static void rt_free_hostcache(struct rtable_private *tab); static void rt_update_hostcache(void *tab); -static void rt_next_hop_update(void *tab); -static inline void rt_prune_table(void *tab); -static void rt_fast_prune_check(rtable_private *tab); -static inline void rt_schedule_notify(rtable_private *tab); -static void rt_feed_channel(void *); - -static inline void rt_export_used(rtable_private *tab); -static void rt_export_cleanup(void *tab); +static void rt_next_hop_update(struct rtable_private *tab); +static void rt_nhu_uncork(void *_tab); +static inline void rt_next_hop_resolve_rte(rte *r); +static inline void rt_flowspec_resolve_rte(rte *r, struct channel *c); +static inline void rt_prune_table(struct rtable_private *tab); +static void rt_kick_prune_timer(struct rtable_private *tab); +static void rt_feed_by_fib(void *); +static void rt_feed_by_trie(void *); +static void rt_feed_equal(void *); +static void rt_feed_for(void *); +static void 
rt_check_cork_low(struct rtable_private *tab); +static void rt_check_cork_high(struct rtable_private *tab); +static void rt_cork_release_hook(void *); +static void rt_shutdown(void *); +static void rt_delete(void *); + +static void rt_export_used(struct rt_table_exporter *, const char *, const char *); +static void rt_export_cleanup(struct rtable_private *tab); + +static int rte_same(rte *x, rte *y); const char *rt_import_state_name_array[TIS_MAX] = { [TIS_DOWN] = "DOWN", @@ -106,54 +188,193 @@ const char *rt_export_state_name(u8 state) return rt_export_state_name_array[state]; } -struct event_cork rt_cork; +static struct hostentry *rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep); -static inline void -rte_update_lock(struct channel *c) +static inline rtable *rt_priv_to_pub(struct rtable_private *tab) { return RT_PUB(tab); } +static inline rtable *rt_pub_to_pub(rtable *tab) { return tab; } +#define RT_ANY_TO_PUB(tab) _Generic((tab),rtable*:rt_pub_to_pub,struct rtable_private*:rt_priv_to_pub)((tab)) + +#define rt_trace(tab, level, fmt, args...) do {\ + rtable *t = RT_ANY_TO_PUB((tab)); \ + if (t->config->debug & (level)) \ + log(L_TRACE "%s: " fmt, t->name, ##args); \ +} while (0) + +static void +net_init_with_trie(struct fib *f, void *N) { - c->rte_update_nest_cnt++; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, fib, f); + net *n = N; + + if (tab->trie) + trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); + + if (tab->trie_new) + trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); } -static inline void -rte_update_unlock(struct channel *c) +static inline net * +net_route_ip4_trie(struct rtable_private *t, const net_addr_ip4 *n0) +{ + TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n) + { + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_vpn4_trie(struct rtable_private *t, const net_addr_vpn4 *n0) { - if (!--c->rte_update_nest_cnt) - lp_flush(c->rte_update_pool); + TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px) + { + net_addr_vpn4 n = NET_ADDR_VPN4(px.prefix, px.pxlen, n0->rd); + + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_ip6_trie(struct rtable_private *t, const net_addr_ip6 *n0) +{ + TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n) + { + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_vpn6_trie(struct rtable_private *t, const net_addr_vpn6 *n0) +{ + TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) + { + net_addr_vpn6 n = NET_ADDR_VPN6(px.prefix, px.pxlen, n0->rd); + + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; } -/* Like fib_route(), but skips empty net entries */ static inline void * -net_route_ip4(rtable_private *t, net_addr_ip4 *n) +net_route_ip6_sadr_trie(struct rtable_private *t, const net_addr_ip6_sadr *n0) { + TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) + { + net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen); + net *best = NULL; + int best_pxlen = 0; + + /* We need to do dst first matching. Since sadr addresses are hashed on dst + prefix only, find the hash table chain and go through it to find the + match with the longest matching src prefix. 
*/ + for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next) + { + net_addr_ip6_sadr *a = (void *) fn->addr; + + if (net_equal_dst_ip6_sadr(&n, a) && + net_in_net_src_ip6_sadr(&n, a) && + (a->src_pxlen >= best_pxlen)) + { + best = fib_node_to_user(&t->fib, fn); + best_pxlen = a->src_pxlen; + } + } + + if (best) + return best; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_ip4_fib(struct rtable_private *t, const net_addr_ip4 *n0) +{ + net_addr_ip4 n; + net_copy_ip4(&n, n0); + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) + { + n.pxlen--; + ip4_clrbit(&n.prefix, n.pxlen); + } - while (r = net_find_valid(t, (net_addr *) n), (!r) && (n->pxlen > 0)) + return r; +} + +static inline net * +net_route_vpn4_fib(struct rtable_private *t, const net_addr_vpn4 *n0) +{ + net_addr_vpn4 n; + net_copy_vpn4(&n, n0); + + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) { - n->pxlen--; - ip4_clrbit(&n->prefix, n->pxlen); + n.pxlen--; + ip4_clrbit(&n.prefix, n.pxlen); } return r; } -static inline void * -net_route_ip6(rtable_private *t, net_addr_ip6 *n) +static inline net * +net_route_ip6_fib(struct rtable_private *t, const net_addr_ip6 *n0) { + net_addr_ip6 n; + net_copy_ip6(&n, n0); + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) + { + n.pxlen--; + ip6_clrbit(&n.prefix, n.pxlen); + } - while (r = net_find_valid(t, (net_addr *) n), (!r) && (n->pxlen > 0)) + return r; +} + +static inline net * +net_route_vpn6_fib(struct rtable_private *t, const net_addr_vpn6 *n0) +{ + net_addr_vpn6 n; + net_copy_vpn6(&n, n0); + + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) { - n->pxlen--; - ip6_clrbit(&n->prefix, n->pxlen); + n.pxlen--; + ip6_clrbit(&n.prefix, n.pxlen); } return r; } static inline void * -net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) +net_route_ip6_sadr_fib(struct rtable_private *t, const net_addr_ip6_sadr *n0) { - struct fib_node *fn; + net_addr_ip6_sadr n; + net_copy_ip6_sadr(&n, n0); while (1) { @@ -162,13 +383,13 @@ net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) /* We need to do dst first matching. Since sadr addresses are hashed on dst prefix only, find the hash table chain and go through it to find the - match with the smallest matching src prefix. */ - for (fn = fib_get_chain(&t->fib, (net_addr *) n); fn; fn = fn->next) + match with the longest matching src prefix. 
*/ + for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next) { net_addr_ip6_sadr *a = (void *) fn->addr; - if (net_equal_dst_ip6_sadr(n, a) && - net_in_net_src_ip6_sadr(n, a) && + if (net_equal_dst_ip6_sadr(&n, a) && + net_in_net_src_ip6_sadr(&n, a) && (a->src_pxlen >= best_pxlen)) { best = fib_node_to_user(&t->fib, fn); @@ -179,38 +400,52 @@ net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) if (best) return best; - if (!n->dst_pxlen) + if (!n.dst_pxlen) break; - n->dst_pxlen--; - ip6_clrbit(&n->dst_prefix, n->dst_pxlen); + n.dst_pxlen--; + ip6_clrbit(&n.dst_prefix, n.dst_pxlen); } return NULL; } -void * -net_route(rtable_private *tab, const net_addr *n) +net * +net_route(struct rtable_private *tab, const net_addr *n) { ASSERT(tab->addr_type == n->type); - net_addr *n0 = alloca(n->length); - net_copy(n0, n); - switch (n->type) { case NET_IP4: + if (tab->trie) + return net_route_ip4_trie(tab, (net_addr_ip4 *) n); + else + return net_route_ip4_fib (tab, (net_addr_ip4 *) n); + case NET_VPN4: - case NET_ROA4: - return net_route_ip4(tab, (net_addr_ip4 *) n0); + if (tab->trie) + return net_route_vpn4_trie(tab, (net_addr_vpn4 *) n); + else + return net_route_vpn4_fib (tab, (net_addr_vpn4 *) n); case NET_IP6: + if (tab->trie) + return net_route_ip6_trie(tab, (net_addr_ip6 *) n); + else + return net_route_ip6_fib (tab, (net_addr_ip6 *) n); + case NET_VPN6: - case NET_ROA6: - return net_route_ip6(tab, (net_addr_ip6 *) n0); + if (tab->trie) + return net_route_vpn6_trie(tab, (net_addr_vpn6 *) n); + else + return net_route_vpn6_fib (tab, (net_addr_vpn6 *) n); case NET_IP6_SADR: - return net_route_ip6_sadr(tab, (net_addr_ip6_sadr *) n0); + if (tab->trie) + return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n); + else + return net_route_ip6_sadr_fib (tab, (net_addr_ip6_sadr *) n); default: return NULL; @@ -219,15 +454,40 @@ net_route(rtable_private *tab, const net_addr *n) static int -net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) +net_roa_check_ip4_trie(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) +{ + int anything = 0; + + TRIE_WALK_TO_ROOT_IP4(tab->trie, px, px0) + { + net_addr_roa4 roa0 = NET_ADDR_ROA4(px0.prefix, px0.pxlen, 0, 0); + + struct fib_node *fn; + for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) + { + net_addr_roa4 *roa = (void *) fn->addr; + net *r = fib_node_to_user(&tab->fib, fn); + + if (net_equal_prefix_roa4(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) + { + anything = 1; + if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) + return ROA_VALID; + } + } + } + TRIE_WALK_TO_ROOT_END; + + return anything ? ROA_INVALID : ROA_UNKNOWN; +} + +static int +net_roa_check_ip4_fib(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) { struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; int anything = 0; - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - while (1) { for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next) @@ -239,10 +499,7 @@ net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) - { - RT_UNLOCK(tab); return ROA_VALID; - } } } @@ -253,20 +510,44 @@ net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) ip4_clrbit(&n.prefix, n.pxlen); } - RT_UNLOCK(tab); return anything ? 
ROA_INVALID : ROA_UNKNOWN; } static int -net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) +net_roa_check_ip6_trie(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) +{ + int anything = 0; + + TRIE_WALK_TO_ROOT_IP6(tab->trie, px, px0) + { + net_addr_roa6 roa0 = NET_ADDR_ROA6(px0.prefix, px0.pxlen, 0, 0); + + struct fib_node *fn; + for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) + { + net_addr_roa6 *roa = (void *) fn->addr; + net *r = fib_node_to_user(&tab->fib, fn); + + if (net_equal_prefix_roa6(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) + { + anything = 1; + if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) + return ROA_VALID; + } + } + } + TRIE_WALK_TO_ROOT_END; + + return anything ? ROA_INVALID : ROA_UNKNOWN; +} + +static int +net_roa_check_ip6_fib(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) { struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; int anything = 0; - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - while (1) { for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next) @@ -278,10 +559,7 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) - { - RT_UNLOCK(tab); return ROA_VALID; - } } } @@ -292,7 +570,6 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) ip6_clrbit(&n.prefix, n.pxlen); } - RT_UNLOCK(tab); return anything ? ROA_INVALID : ROA_UNKNOWN; } @@ -312,14 +589,30 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) * must have type NET_IP4 or NET_IP6, respectively. */ int -net_roa_check(rtable *tab, const net_addr *n, u32 asn) +net_roa_check(rtable *tp, const net_addr *n, u32 asn) { - if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) - return net_roa_check_ip4(tab, (const net_addr_ip4 *) n, asn); - else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) - return net_roa_check_ip6(tab, (const net_addr_ip6 *) n, asn); - else - return ROA_UNKNOWN; /* Should not happen */ + int out = ROA_UNKNOWN; + + RT_LOCKED(tp, tab) + { + if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) + { + if (tab->trie) + out = net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn); + else + out = net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn); + } + else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) + { + if (tab->trie) + out = net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn); + else + out = net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn); + } + else + out = ROA_UNKNOWN; /* Should not happen */ + } + return out; } /** @@ -342,8 +635,8 @@ rte_find(net *net, struct rte_src *src) } -static struct rte_storage * -rte_store(const rte *r, net *net, rtable_private *tab) +struct rte_storage * +rte_store(const rte *r, net *net, struct rtable_private *tab) { struct rte_storage *e = sl_alloc(tab->rte_slab); @@ -352,6 +645,11 @@ rte_store(const rte *r, net *net, rtable_private *tab) rt_lock_source(e->rte.src); + if (ea_is_cached(e->rte.attrs)) + e->rte.attrs = rta_clone(e->rte.attrs); + else + e->rte.attrs = rta_lookup(e->rte.attrs, 1); + return e; } @@ -364,26 +662,29 @@ rte_store(const rte *r, net *net, rtable_private *tab) */ void -rte_free(struct rte_storage *e, rtable_private *tab) +rte_free(struct rte_storage *e) { rt_unlock_source(e->rte.src); rta_free(e->rte.attrs); - sl_free(tab->rte_slab, e); + sl_free(e); } static int /* Actually better or at least as good as */ 
-rte_better(rte *new, rte *old) +rte_better(const rte *new, const rte *old) { - int (*better)(rte *, rte *); + int (*better)(const rte *, const rte *); if (!rte_is_valid(old)) return 1; if (!rte_is_valid(new)) return 0; - if (new->attrs->pref > old->attrs->pref) + u32 np = rt_get_preference(new); + u32 op = rt_get_preference(old); + + if (np > op) return 1; - if (new->attrs->pref < old->attrs->pref) + if (np < op) return 0; if (new->src->owner->class != old->src->owner->class) { @@ -400,14 +701,14 @@ rte_better(rte *new, rte *old) } static int -rte_mergable(rte *pri, rte *sec) +rte_mergable(const rte *pri, const rte *sec) { - int (*mergable)(rte *, rte *); + int (*mergable)(const rte *, const rte *); if (!rte_is_valid(pri) || !rte_is_valid(sec)) return 0; - if (pri->attrs->pref != sec->attrs->pref) + if (rt_get_preference(pri) != rt_get_preference(sec)) return 0; if (pri->src->owner->class != sec->src->owner->class) @@ -422,11 +723,10 @@ rte_mergable(rte *pri, rte *sec) static void rte_trace(const char *name, const rte *e, int dir, const char *msg) { - log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s%s", + log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s", name, dir, msg, e->net, e->src->private_id, e->src->global_id, e->stale_cycle, e->id, - rta_dest_name(e->attrs->dest), - rte_is_filtered(e) ? " (filtered)" : ""); + rta_dest_name(rte_dest(e))); } static inline void @@ -465,26 +765,26 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) - count++; + count++; + return count; } static void -rte_feed_obtain(net *n, struct rte **feed, uint count) +rte_feed_obtain(net *n, const rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; } + ASSERT_DIE(i == count); } static rte * -export_filter_(struct channel *c, rte *rt, linpool *pool, int silent) +export_filter(struct channel *c, rte *rt, int silent) { struct proto *p = c->proto; const struct filter *filter = c->out_filter; @@ -514,7 +814,7 @@ export_filter_(struct channel *c, rte *rt, linpool *pool, int silent) } v = filter && ((filter == FILTER_REJECT) || - (f_run(filter, rt, pool, + (f_run(filter, rt, (silent ? FF_SILENT : 0)) > F_ACCEPT)); if (v) { @@ -540,17 +840,10 @@ reject_noset: return NULL; } -static inline rte * -export_filter(struct channel *c, rte *rt, int silent) -{ - return export_filter_(c, rt, c->rte_update_pool, silent); -} - -void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); - static void do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) { + struct proto *p = c->proto; struct channel_export_stats *stats = &c->export_stats; if (c->refeeding && new) @@ -567,30 +860,16 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) if (!new && old) CHANNEL_LIMIT_POP(c, OUT); - /* Store route export state */ - if (old) - bmap_clear(&c->export_map, old->id); - if (new) - bmap_set(&c->export_map, new->id); - - /* Apply export table */ - if (c->out_table) - rte_import(&c->out_table->push, net, new, old ? 
old->src : new->src); + stats->updates_accepted++; else - do_rt_notify_direct(c, net, new, old); -} + stats->withdraws_accepted++; -void -do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old) -{ - struct proto *p = c->proto; - struct channel_export_stats *stats = &c->export_stats; + if (old) + bmap_clear(&c->export_map, old->id); if (new) - stats->updates_accepted++; - else - stats->withdraws_accepted++; + bmap_set(&c->export_map, new->id); if (p->debug & D_ROUTES) { @@ -608,6 +887,16 @@ do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte static void rt_notify_basic(struct channel *c, const net_addr *net, rte *new, rte *old) { + if (new && old && rte_same(new, old)) + { + if ((new->id != old->id) && bmap_test(&c->export_map, old->id)) + { + bmap_set(&c->export_map, new->id); + bmap_clear(&c->export_map, old->id); + } + return; + } + if (new) new = export_filter(c, new, 0); @@ -631,13 +920,12 @@ channel_rpe_mark_seen(struct rt_export_request *req, struct rt_pending_export *r } void -rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, - struct rte **feed, uint count) +rt_notify_accepted(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - rte nb0, *new_best = NULL; const rte *old_best = NULL; @@ -677,7 +965,7 @@ rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_p done: /* Check obsolete routes for previously exported */ - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); if (rpe->old) @@ -688,7 +976,8 @@ done: old_best = &rpe->old->rte; } } - rpe = rpe_next(rpe, NULL); + if (rpe == last) + break; } /* Nothing to export */ @@ -696,25 +985,16 @@ done: do_rt_notify(c, n, new_best, old_best); else DBG("rt_notify_accepted: nothing to export\n"); - - rte_update_unlock(c); -} - - -static struct nexthop * -nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max) -{ - return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool); } rte * -rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool, int silent) +rt_export_merged(struct channel *c, const rte **feed, uint count, linpool *pool, int silent) { _Thread_local static rte rloc; // struct proto *p = c->proto; - struct nexthop *nhs = NULL; - rte *best0 = feed[0]; + struct nexthop_adata *nhs = NULL; + const rte *best0 = feed[0]; rte *best = NULL; if (!rte_is_valid(best0)) @@ -725,7 +1005,7 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool return NULL; rloc = *best0; - best = export_filter_(c, &rloc, pool, silent); + best = export_filter(c, &rloc, silent); if (!best) /* Best route doesn't pass the filter */ @@ -741,35 +1021,41 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool continue; rte tmp0 = *feed[i]; - rte *tmp = export_filter_(c, &tmp0, pool, 1); + rte *tmp = export_filter(c, &tmp0, 1); if (!tmp || !rte_is_reachable(tmp)) continue; - nhs = nexthop_merge_rta(nhs, tmp->attrs, pool, c->merge_limit); + eattr *nhea = ea_find(tmp->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); + + if (nhs) + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); + else + nhs = (struct nexthop_adata *) nhea->u.ptr; } if (nhs) { - nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit); + eattr 
*nhea = ea_find(best->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); - if (nhs->next) - { - best->attrs = rta_cow(best->attrs, pool); - nexthop_link(best->attrs, nhs); - } + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); + + ea_set_attr(&best->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhs->ad)); } return best; } void -rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, - struct rte **feed, uint count) +rt_notify_merged(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); // struct proto *p = c->proto; #if 0 /* TODO: Find whether this check is possible when processing multiple changes at once. */ @@ -781,7 +1067,7 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen return; #endif - rte *old_best = NULL; + const rte *old_best = NULL; /* Find old best route */ for (uint i = 0; i < count; i++) if (bmap_test(&c->export_map, feed[i]->id)) @@ -791,7 +1077,7 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen } /* Check obsolete routes for previously exported */ - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); if (rpe->old) @@ -802,80 +1088,83 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen old_best = &rpe->old->rte; } } - rpe = rpe_next(rpe, NULL); + if (rpe == last) + break; } /* Prepare new merged route */ - rte *new_merged = count ? rt_export_merged(c, feed, count, c->rte_update_pool, 0) : NULL; + rte *new_merged = count ? rt_export_merged(c, feed, count, tmp_linpool, 0) : NULL; if (new_merged || old_best) do_rt_notify(c, n, new_merged, old_best); - - rte_update_unlock(c); } void -rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - rte *old = RTES_OR_NULL(rpe->old_best); - struct rte_storage *new_best = rpe->new_best; + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); new_best = rpe->new_best; - rpe = rpe_next(rpe, NULL); } - if (&new_best->rte != old) - { - rte n0, *new = RTES_CLONE(new_best, &n0); - rt_notify_basic(c, net, new, old); - } - - rte_update_unlock(c); + rte n0 = RTE_COPY_VALID(new_best); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void -rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; - rte *old = RTES_OR_NULL(rpe->old); - struct rte_storage *new_any = rpe->new; - while (rpe) + rte *n = RTE_VALID_OR_NULL(first->new); + rte *o = RTE_VALID_OR_NULL(first->old); + + if (!n && !o) { - channel_rpe_mark_seen(req, rpe); - new_any = rpe->new; - rpe = rpe_next(rpe, src); + channel_rpe_mark_seen(req, first); + return; } - if (&new_any->rte != old) + struct rte_src *src = n ? 
n->src : o->src; + struct rte_storage *new_latest = first->new; + + RPE_WALK(first, rpe, src) { - rte n0, *new = RTES_CLONE(new_any, &n0); - rt_notify_basic(c, net, new, old); + channel_rpe_mark_seen(req, rpe); + new_latest = rpe->new; } - rte_update_unlock(c); + rte n0 = RTE_COPY_VALID(new_latest); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void -rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +rt_feed_any(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); for (uint i=0; i<count; i++) + if (rte_is_valid(feed[i])) + { + rte n0 = *feed[i]; + rt_notify_basic(c, net, &n0, NULL); + } + + RPE_WALK(first, rpe, NULL) { - rte n0 = *feed[i]; - rt_notify_basic(c, net, &n0, NULL); + channel_rpe_mark_seen(req, rpe); + if (rpe == last) + break; } - - rte_update_unlock(c); } void @@ -905,14 +1194,38 @@ rpe_next(struct rt_pending_export *rpe, struct rte_src *src) } static struct rt_pending_export * rt_next_export_fast(struct rt_pending_export *last); -static void -rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) +static int +rte_export(struct rt_table_export_hook *th, struct rt_pending_export *rpe) { + rtable *tab = RT_PUB(SKIP_BACK(struct rtable_private, exporter, th->table)); + struct rt_export_hook *hook = &th->h; if (bmap_test(&hook->seq_map, rpe->seq)) - goto seen; + goto ignore; /* Seen already */ const net_addr *n = rpe->new_best ? rpe->new_best->rte.net : rpe->old_best->rte.net; + switch (hook->req->addr_mode) + { + case TE_ADDR_NONE: + break; + + case TE_ADDR_IN: + if (!net_in_netX(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_EQUAL: + if (!net_equal(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_FOR: + bug("Continuos export of best prefix match not implemented yet."); + + default: + bug("Strange table export address mode: %d", hook->req->addr_mode); + } + if (rpe->new) hook->stats.updates_received++; else @@ -922,36 +1235,34 @@ rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) hook->req->export_one(hook->req, n, rpe); else if (hook->req->export_bulk) { - RT_LOCK(hook->table); net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + RT_LOCK(tab); + struct rt_pending_export *last = net->last; uint count = rte_feed_count(net); - rte **feed = NULL; + const rte **feed = NULL; if (count) { feed = alloca(count * sizeof(rte *)); rte_feed_obtain(net, feed, count); } - RT_UNLOCK(hook->table); - hook->req->export_bulk(hook->req, n, rpe, feed, count); + RT_UNLOCK(tab); + hook->req->export_bulk(hook->req, n, rpe, last, feed, count); } else bug("Export request must always provide an export method"); -seen: +ignore: /* Get the next export if exists */ - hook->rpe_next = rt_next_export_fast(rpe); + th->rpe_next = rt_next_export_fast(rpe); /* The last block may be available to free */ - if (PAGE_HEAD(hook->rpe_next) != PAGE_HEAD(rpe)) - { - RT_LOCK(hook->table); - rt_export_used(RT_PRIV(hook->table)); - RT_UNLOCK(hook->table); - } + int used = (PAGE_HEAD(th->rpe_next) != PAGE_HEAD(rpe)); /* Releasing this export for cleanup routine */ DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe->seq); - atomic_store_explicit(&hook->last_export, rpe, memory_order_release); + atomic_store_explicit(&th->last_export, rpe, memory_order_release); 
+ + return used; } /** @@ -986,47 +1297,28 @@ seen: * done outside of scope of rte_announce(). */ static void -rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, +rte_announce(struct rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, struct rte_storage *new_best, struct rte_storage *old_best) { - if (!new_best || !rte_is_valid(&new_best->rte)) - new_best = NULL; - - if (!old_best || !rte_is_valid(&old_best->rte)) - old_best = NULL; - - if (!new || !rte_is_valid(&new->rte)) - new = NULL; - - if (old && !rte_is_valid(&old->rte)) - { - /* Filtered old route isn't announced, should be freed immediately. */ - rte_free(old, tab); - old = NULL; - } + int new_best_valid = rte_is_valid(RTE_OR_NULL(new_best)); + int old_best_valid = rte_is_valid(RTE_OR_NULL(old_best)); if ((new == old) && (new_best == old_best)) return; - if (new_best != old_best) - { - if (new_best) - new_best->rte.sender->stats.pref++; - if (old_best) - old_best->rte.sender->stats.pref--; + if (new_best_valid) + new_best->rte.sender->stats.pref++; + if (old_best_valid) + old_best->rte.sender->stats.pref--; - if (tab->hostcache) - rt_notify_hostcache(tab, net); - } - - if (EMPTY_LIST(tab->exports) && EMPTY_LIST(tab->pending_exports)) + if (EMPTY_LIST(tab->exporter.e.hooks) && EMPTY_LIST(tab->exporter.pending)) { /* No export hook and no pending exports to cleanup. We may free the route immediately. */ if (!old) return; hmap_clear(&tab->id_map, old->rte.id); - rte_free(old, tab); + rte_free(old); return; } @@ -1034,9 +1326,9 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ struct rt_export_block *rpeb = NULL, *rpebsnl = NULL; u32 end = 0; - if (!EMPTY_LIST(tab->pending_exports)) + if (!EMPTY_LIST(tab->exporter.pending)) { - rpeb = TAIL(tab->pending_exports); + rpeb = TAIL(tab->exporter.pending); end = atomic_load_explicit(&rpeb->end, memory_order_relaxed); if (end >= RT_PENDING_EXPORT_ITEMS) { @@ -1052,7 +1344,7 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ { rpeb = alloc_page(); *rpeb = (struct rt_export_block) {}; - add_tail(&tab->pending_exports, &rpeb->n); + add_tail(&tab->exporter.pending, &rpeb->n); } /* Fill the pending export */ @@ -1062,10 +1354,14 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ .new_best = new_best, .old = old, .old_best = old_best, - .seq = tab->next_export_seq++, + .seq = tab->exporter.next_seq++, }; - DBG("rte_announce: table=%s net=%N new=%p from %p old=%p from %p new_best=%p old_best=%p seq=%lu\n", tab->name, net->n.addr, new, new ? new->sender : NULL, old, old ? old->sender : NULL, new_best, old_best, rpe->seq); + DBGL("rte_announce: table=%s net=%N new=%p id %u from %s old=%p id %u from %s new_best=%p id %u old_best=%p id %u seq=%lu", + tab->name, net->n.addr, + new, new ? new->rte.id : 0, new ? new->rte.sender->req->name : NULL, + old, old ? old->rte.id : 0, old ? 
old->rte.sender->req->name : NULL, + new_best, old_best, rpe->seq); ASSERT_DIE(atomic_fetch_add_explicit(&rpeb->end, 1, memory_order_release) == end); @@ -1084,7 +1380,7 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ &net->last->next, &rpenull, rpe, memory_order_relaxed, memory_order_relaxed)); - + } net->last = rpe; @@ -1092,19 +1388,10 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ if (!net->first) net->first = rpe; - if (tab->first_export == NULL) - tab->first_export = rpe; + if (tab->exporter.first == NULL) + tab->exporter.first = rpe; - if (!EMPTY_LIST(tab->exports) && - (tab->first_export->seq + tab->config->cork_limit <= tab->next_export_seq) && - !tab->cork_active) - { - if (config->table_debug) - log(L_TRACE "%s: cork activated", tab->name); - - ev_cork(&rt_cork); - tab->cork_active = 1; - } + rt_check_cork_high(tab); } static struct rt_pending_export * @@ -1133,8 +1420,10 @@ rt_next_export_fast(struct rt_pending_export *last) } static struct rt_pending_export * -rt_next_export(struct rt_export_hook *hook, rtable_private *tab) +rt_next_export(struct rt_table_export_hook *hook, struct rt_table_exporter *tab) { + ASSERT_DIE(RT_IS_LOCKED(SKIP_BACK(struct rtable_private, exporter, tab))); + /* As the table is locked, it is safe to reload the last export pointer */ struct rt_pending_export *last = atomic_load_explicit(&hook->last_export, memory_order_acquire); @@ -1144,62 +1433,75 @@ rt_next_export(struct rt_export_hook *hook, rtable_private *tab) /* No, therefore we must process the table's first pending export */ else - return tab->first_export; + return tab->first; } static inline void rt_send_export_event(struct rt_export_hook *hook) { - ev_send(hook->req->list, hook->event); + ev_send(hook->req->list, &hook->event); } static void -rt_announce_exports(void *data) +rt_announce_exports(struct settle *s) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); - - rt_schedule_notify(tab); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, export_settle, s)), tab) + if (!EMPTY_LIST(tab->exporter.pending)) + { + struct rt_export_hook *c; node *n; + WALK_LIST2(c, n, tab->exporter.e.hooks, n) + { + if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) + continue; - struct rt_export_hook *c; node *n; - WALK_LIST2(c, n, tab->exports, n) - { - if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) - continue; + rt_send_export_event(c); + } + } +} - rt_send_export_event(c); - } +static void +rt_kick_export_settle(struct rtable_private *tab) +{ + tab->export_settle.cf = tab->rr_counter ? 
tab->config->export_rr_settle : tab->config->export_settle; + settle_kick(&tab->export_settle, tab->loop); } static void -rt_import_announce_exports(void *data) +rt_import_announce_exports(void *_hook) { - struct rt_import_hook *hook = data; - RT_LOCKED(hook->table, tab) + struct rt_import_hook *hook = _hook; + if (hook->import_state == TIS_CLEARED) { - if (hook->import_state == TIS_CLEARED) + void (*stopped)(struct rt_import_request *) = hook->stopped; + struct rt_import_request *req = hook->req; + + RT_LOCKED(hook->table, tab) { - rfree(hook->export_announce_event); + req->hook = NULL; - ev_send(hook->stopped->list, hook->stopped); + rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name); rem_node(&hook->n); mb_free(hook); rt_unlock_table(tab); } - else - ev_send_loop(tab->loop, tab->announce_event); + + stopped(req); + return; } + + rt_trace(hook->table, D_EVENTS, "Announcing exports after imports from %s", hook->req->name); + birdloop_flag(hook->table->loop, RTF_EXPORT); } static struct rt_pending_export * -rt_last_export(rtable_private *tab) +rt_last_export(struct rt_table_exporter *tab) { struct rt_pending_export *rpe = NULL; - if (!EMPTY_LIST(tab->pending_exports)) + if (!EMPTY_LIST(tab->pending)) { /* We'll continue processing exports from this export on */ - struct rt_export_block *reb = TAIL(tab->pending_exports); + struct rt_export_block *reb = TAIL(tab->pending); ASSERT_DIE(reb->end); rpe = &reb->export[reb->end - 1]; } @@ -1212,35 +1514,47 @@ rt_last_export(rtable_private *tab) static void rt_export_hook(void *_data) { - struct rt_export_hook *c = _data; + struct rt_table_export_hook *c = _data; + rtable *tab = SKIP_BACK(rtable, priv.exporter, c->table); - ASSERT_DIE(atomic_load_explicit(&c->export_state, memory_order_relaxed) == TES_READY); + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_READY); if (!c->rpe_next) { - RT_LOCK(c->table); - c->rpe_next = rt_next_export(c, RT_PRIV(c->table)); + RT_LOCK(tab); + c->rpe_next = rt_next_export(c, c->table); if (!c->rpe_next) { - rt_export_used(RT_PRIV(c->table)); - RT_UNLOCK(c->table); + rt_export_used(c->table, c->h.req->name, "done exporting"); + RT_UNLOCK(tab); return; } - RT_UNLOCK(c->table); + RT_UNLOCK(tab); } + int used = 0; + int no_next = 0; + /* Process the export */ for (uint i=0; i<RT_EXPORT_BULK; i++) { - rte_export(c, c->rpe_next); + used += rte_export(c, c->rpe_next); if (!c->rpe_next) + { + no_next = 1; break; + } } - rt_send_export_event(c); + if (used) + RT_LOCKED(tab, t) + if (no_next || t->cork_active) + rt_export_used(c->table, c->h.req->name, no_next ? 
"finished export bulk" : "cork active"); + + rt_send_export_event(&c->h); } @@ -1267,16 +1581,29 @@ rte_validate(struct channel *ch, rte *e) return 0; } - if (net_type_match(n, NB_DEST) == !e->attrs->dest) + if (net_type_match(n, NB_DEST)) { - log(L_WARN "Ignoring route %N with invalid dest %d received via %s", - n, e->attrs->dest, ch->proto->name); - return 0; - } + eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop); + int dest = nhea_dest(nhea); - if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh))) + if (dest == RTD_NONE) + { + log(L_WARN "Ignoring route %N with no destination received via %s", + n, ch->proto->name); + return 0; + } + + if ((dest == RTD_UNICAST) && + !nexthop_is_sorted((struct nexthop_adata *) nhea->u.ptr)) + { + log(L_WARN "Ignoring unsorted multipath route %N received via %s", + n, ch->proto->name); + return 0; + } + } + else if (ea_find(e->attrs, &ea_gen_nexthop)) { - log(L_WARN "Ignoring unsorted multipath route %N received via %s", + log(L_WARN "Ignoring route %N having a nexthop attribute received via %s", n, ch->proto->name); return 0; } @@ -1287,9 +1614,7 @@ rte_validate(struct channel *ch, rte *e) static int rte_same(rte *x, rte *y) { - ASSERT_DIE(x->attrs->cached && y->attrs->cached); - - /* rte.flags are not checked, as they are mostly internal to rtable */ + /* rte.flags / rte.pflags are not checked, as they are internal to rtable */ return x->attrs == y->attrs && x->src == y->src && @@ -1298,8 +1623,8 @@ rte_same(rte *x, rte *y) static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } -static void -rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) +static int +rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) { struct rt_import_request *req = c->req; struct rt_import_stats *stats = &c->stats; @@ -1307,22 +1632,17 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * rte *old_best = old_best_stored ? &old_best_stored->rte : NULL; rte *old = NULL; - /* Set the stale cycle unless already set */ - if (new && !(new->flags & REF_USE_STALE)) - new->stale_cycle = c->stale_set; + /* If the new route is identical to the old one, we find the attributes in + * cache and clone these with no performance drop. OTOH, if we were to lookup + * the attributes, such a route definitely hasn't been anywhere yet, + * therefore it's definitely worth the time. */ + struct rte_storage *new_stored = NULL; + if (new) + new = &(new_stored = rte_store(new, net, table))->rte; /* Find and remove original route from the same protocol */ struct rte_storage **before_old = rte_find(net, src); - if (!*before_old && !new) - { - stats->withdraws_ignored++; - return; - } - - if (new) - new->attrs = rta_is_cached(new->attrs) ? 
rta_clone(new->attrs) : rta_lookup(new->attrs); - if (*before_old) { old = &(old_stored = (*before_old))->rte; @@ -1342,7 +1662,7 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * c->table->name, net->n.addr, old->src->owner->name, old->src->private_id, old->src->global_id); } - if (new && rte_same(old, new)) + if (new && rte_same(old, &new_stored->rte)) { /* No changes, ignore the new route and refresh the old one */ old->stale_cycle = new->stale_cycle; @@ -1353,16 +1673,28 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * rt_rte_trace_in(D_ROUTES, req, new, "ignored"); } - rta_free(new->attrs); - return; + /* We need to free the already stored route here before returning */ + rte_free(new_stored); + return 0; } *before_old = (*before_old)->next; table->rt_count--; } - if (req->preimport) - new = req->preimport(req, new, old); + if (!old && !new) + { + stats->withdraws_ignored++; + return 0; + } + + /* If rejected by import limit, we need to pretend there is no route */ + if (req->preimport && (req->preimport(req, new, old) == 0)) + { + rte_free(new_stored); + new_stored = NULL; + new = NULL; + } int new_ok = rte_is_ok(new); int old_ok = rte_is_ok(old); @@ -1377,8 +1709,6 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * if (old_ok || new_ok) table->last_rt_change = current_time(); - struct rte_storage *new_stored = new ? rte_store(new, net, table) : NULL; - if (table->config->sorted) { /* If routes are sorted, just insert new route to appropriate position */ @@ -1474,121 +1804,81 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * hmap_set(&table->id_map, new_stored->rte.id); } - _Bool nb = (new_stored == net->routes); - _Bool ob = (old_best == old); - /* Log the route change */ - if (new_ok && old_ok) + if (new_ok) + rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, new_stored == net->routes ? "added [best]" : "added"); + else if (old_ok) { - const char *best_indicator[2][2] = { { "updated", "updated [-best]" }, { "updated [+best]", "updated [best]" } }; - rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, best_indicator[nb][ob]); + if (old != old_best) + rt_rte_trace_in(D_ROUTES, req, old, "removed"); + else if (net->routes && rte_is_ok(&net->routes->rte)) + rt_rte_trace_in(D_ROUTES, req, old, "removed [replaced]"); + else + rt_rte_trace_in(D_ROUTES, req, old, "removed [sole]"); } - else if (new_ok) - rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, - (!net->routes->next || !rte_is_ok(&net->routes->next->rte)) ? "added [sole]" : - nb ? "added [best]" : "added"); - else if (old_ok) - rt_rte_trace_in(D_ROUTES, req, old, - (!net->routes || !rte_is_ok(&net->routes->rte)) ? "removed [sole]" : - ob ? "removed [best]" : "removed"); + else + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N %s->%s", req->name, net->n.addr, old ? "filtered" : "none", new ? 
"filtered" : "none"); /* Propagate the route change */ rte_announce(table, net, new_stored, old_stored, net->routes, old_best_stored); - if (!net->routes && - (table->gc_counter++ >= table->config->gc_max_ops) && - (table->gc_time + table->config->gc_min_time <= current_time())) - rt_schedule_prune(table); - -#if 0 - /* Enable and reimplement these callbacks if anybody wants to use them */ - if (old_ok && p->rte_remove) - p->rte_remove(net, old); - if (new_ok && p->rte_insert) - p->rte_insert(net, &new_stored->rte); -#endif - + return 1; } -rte * +int channel_preimport(struct rt_import_request *req, rte *new, rte *old) { struct channel *c = SKIP_BACK(struct channel, in_req, req); - if (!c->in_table) - { - if (new && !old) - if (CHANNEL_LIMIT_PUSH(c, RX)) - { - rta_free(new->attrs); - return NULL; - } + if (new && !old) + if (CHANNEL_LIMIT_PUSH(c, RX)) + return 0; - if (!new && old) - CHANNEL_LIMIT_POP(c, RX); - } + if (!new && old) + CHANNEL_LIMIT_POP(c, RX); int new_in = new && !rte_is_filtered(new); int old_in = old && !rte_is_filtered(old); if (new_in && !old_in) if (CHANNEL_LIMIT_PUSH(c, IN)) - if (c->in_keep_filtered) + if (c->in_keep & RIK_REJECTED) { new->flags |= REF_FILTERED; - return new; + return 1; } else - { - rta_free(new->attrs); - return NULL; - } + return 0; if (!new_in && old_in) CHANNEL_LIMIT_POP(c, IN); - return new; -} - -rte * -channel_in_preimport(struct rt_import_request *req, rte *new, rte *old) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); - - if (new && !old) - if (CHANNEL_LIMIT_PUSH(cat->c, RX)) - return NULL; - - if (!new && old) - CHANNEL_LIMIT_POP(cat->c, RX); - - return new; + return 1; } -void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); - void rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) { if (!c->in_req.hook) + { + log(L_WARN "%s.%s: Called rte_update without import hook", c->proto->name, c->name); return; + } ASSERT(c->channel_state == CS_UP); - if (c->in_table) - rte_import(&c->in_table->push, n, new, src); - else - rte_update_direct(c, n, new, src); -} + /* The import reloader requires prefilter routes to be the first layer */ + if (new && (c->in_keep & RIK_PREFILTER)) + if (ea_is_cached(new->attrs) && !new->attrs->next) + new->attrs = ea_clone(new->attrs); + else + new->attrs = ea_lookup(new->attrs, 0); -void -rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) -{ const struct filter *filter = c->in_filter; struct channel_import_stats *stats = &c->import_stats; - rte_update_lock(c); if (new) { new->net = n; @@ -1596,30 +1886,49 @@ rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src int fr; stats->updates_received++; - if (!rte_validate(c, new)) - { - channel_rte_trace_in(D_FILTERS, c, new, "invalid"); - stats->updates_invalid++; - new = NULL; - } - else if ((filter == FILTER_REJECT) || - ((fr = f_run(filter, new, c->rte_update_pool, 0)) > F_ACCEPT)) + if ((filter == FILTER_REJECT) || + ((fr = f_run(filter, new, 0)) > F_ACCEPT)) { stats->updates_filtered++; channel_rte_trace_in(D_FILTERS, c, new, "filtered out"); - if (c->in_keep_filtered) + if (c->in_keep & RIK_REJECTED) new->flags |= REF_FILTERED; else new = NULL; } + + if (new) + if (net_is_flow(n)) + rt_flowspec_resolve_rte(new, c); + else + rt_next_hop_resolve_rte(new); + + if (new && !rte_validate(c, new)) + { + channel_rte_trace_in(D_FILTERS, c, new, "invalid"); + stats->updates_invalid++; + new = NULL; + } + } else 
stats->withdraws_received++; rte_import(&c->in_req, n, new, src); - rte_update_unlock(c); + /* Now the route attributes are kept by the in-table cached version + * and we may drop the local handle */ + if (new && (c->in_keep & RIK_PREFILTER)) + { + /* There may be some updates on top of the original attribute block */ + ea_list *a = new->attrs; + while (a->next) + a = a->next; + + ea_free(a); + } + } void @@ -1627,91 +1936,99 @@ rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rt { struct rt_import_hook *hook = req->hook; if (!hook) + { + log(L_WARN "%s: Called rte_import without import hook", req->name); return; + } - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); - - net *nn; - if (new) + RT_LOCKED(hook->table, tab) + { + net *nn; + if (new) { /* Use the actual struct network, not the dummy one */ nn = net_get(tab, n); new->net = nn->n.addr; new->sender = hook; + + /* Set the stale cycle */ + new->stale_cycle = hook->stale_set; } - else if (!(nn = net_find(tab, n))) + else if (!(nn = net_find(tab, n))) { req->hook->stats.withdraws_ignored++; - RT_UNLOCK(tab); - return; + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N withdraw", req->name, n); + RT_RETURN(tab); } - /* And recalculate the best route */ - rte_recalculate(tab, hook, nn, new, src); - - /* Schedule export announcement */ - ev_send(req->list, hook->export_announce_event); - - /* Done! */ - RT_UNLOCK(tab); + /* Recalculate the best route */ + if (rte_recalculate(tab, hook, nn, new, src)) + ev_send(req->list, &hook->announce_event); + } } /* Check rtable for best route to given net whether it would be exported do p */ int -rt_examine(rtable_private *t, net_addr *a, struct channel *c, const struct filter *filter) +rt_examine(rtable *tp, net_addr *a, struct channel *c, const struct filter *filter) { - net *n = net_find(t, a); + rte rt = {}; - if (!n || !n->routes) - return 0; - - rte rt = n->routes->rte; + RT_LOCKED(tp, t) + { + net *n = net_find(t, a); + if (n) + rt = RTE_COPY_VALID(n->routes); + } - if (!rte_is_valid(&rt)) + if (!rt.src) return 0; - rte_update_lock(c); - - /* Rest is stripped down export_filter() */ int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0; if (v == RIC_PROCESS) - v = (f_run(filter, &rt, c->rte_update_pool, FF_SILENT) <= F_ACCEPT); - - rte_update_unlock(c); + v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT); return v > 0; } static void -rt_export_stopped(void *data) +rt_table_export_done(void *hh) { - struct rt_export_hook *hook = data; + struct rt_table_export_hook *hook = hh; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); - RT_LOCKED(hook->table, tab) + RT_LOCKED(t, tab) { + DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); + /* Drop pending exports */ - rt_export_used(tab); + rt_export_used(&tab->exporter, hook->h.req->name, "stopped"); - /* Unlist */ - rem_node(&hook->n); + /* Do the common code; this frees the hook */ + rt_export_stopped(&hook->h); } /* Report the channel as stopped. */ - hook->stopped(hook->req); + CALL(stopped, req); - RT_LOCKED(hook->table, tab) - { - /* Free the hook together with its coroutine. 
*/ - rp_free(hook->pool, tab->rp); - rt_unlock_table(tab); + /* Unlock the table; this may free it */ + rt_unlock_table(t); +} - rt_fast_prune_check(tab); +void +rt_export_stopped(struct rt_export_hook *hook) +{ + /* Unlink from the request */ + hook->req->hook = NULL; - DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); - } -} + /* Unlist */ + rem_node(&hook->n); + /* Free the hook itself together with its pool */ + rfree(hook->pool); +} static inline void rt_set_import_state(struct rt_import_hook *hook, u8 state) @@ -1719,128 +2036,292 @@ rt_set_import_state(struct rt_import_hook *hook, u8 state) hook->last_state_change = current_time(); hook->import_state = state; - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + CALL(hook->req->log_state_change, hook->req, state); } -static inline void +void rt_set_export_state(struct rt_export_hook *hook, u8 state) { hook->last_state_change = current_time(); - atomic_store_explicit(&hook->export_state, state, memory_order_release); + u8 old = atomic_exchange_explicit(&hook->export_state, state, memory_order_release); - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + if (old != state) + CALL(hook->req->log_state_change, hook->req, state); } void rt_request_import(rtable *t, struct rt_import_request *req) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - rt_lock_table(tab); - - ASSERT_DIE(!tab->delete); - - struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - - DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); - - hook->req = req; - hook->table = t; + RT_LOCKED(t, tab) + { + rt_lock_table(tab); - hook->export_announce_event = ev_new_init(tab->rp, rt_import_announce_exports, hook); + struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - if (!hook->stale_set) - hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 1; + hook->announce_event = (event) { .hook = rt_import_announce_exports, .data = hook }; - rt_set_import_state(hook, TIS_UP); + DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); - hook->n = (node) {}; - add_tail(&tab->imports, &hook->n); - tab->imports_up++; + hook->req = req; + hook->table = t; - RT_UNLOCK(t); + rt_set_import_state(hook, TIS_UP); + add_tail(&tab->imports, &hook->n); + } } void -rt_stop_import(struct rt_import_request *req, event *stopped) +rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_request *)) { ASSERT_DIE(req->hook); struct rt_import_hook *hook = req->hook; - rtable_private *tab = RT_LOCK(hook->table); + RT_LOCKED(hook->table, tab) + { + rt_schedule_prune(tab); + rt_set_import_state(hook, TIS_STOP); + hook->stopped = stopped; - rt_schedule_prune(tab); + /* Cancel table rr_counter */ + if (hook->stale_set != hook->stale_pruned) + tab->rr_counter -= (hook->stale_set - hook->stale_pruned); - tab->imports_up--; - rt_fast_prune_check(tab); + tab->rr_counter++; - rt_set_import_state(hook, TIS_STOP); + hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0; + } +} - hook->stopped = stopped; +static void rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook); +static void +rt_table_export_uncork(void *_hook) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); - if (hook->stale_set < hook->stale_valid) - if (!--tab->rr_count) - 
rt_schedule_notify(tab); + struct rt_table_export_hook *hook = _hook; + struct birdloop *loop = hook->h.req->list->loop; - RT_UNLOCK(tab); + if (loop != &main_birdloop) + birdloop_enter(loop); + + u8 state; + switch (state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, hook->table)), tab) + if ((state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) == TES_HUNGRY) + rt_table_export_start_feed(tab, hook); + if (state != TES_STOP) + break; + /* fall through */ + case TES_STOP: + rt_stop_export_common(&hook->h); + break; + default: + bug("Uncorking a table export in a strange state: %u", state); + } + + if (loop != &main_birdloop) + birdloop_leave(loop); } -void -rt_request_export(rtable *t, struct rt_export_request *req) +static void +rt_table_export_start_locked(struct rtable_private *tab, struct rt_export_request *req) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); + struct rt_exporter *re = &tab->exporter.e; rt_lock_table(tab); - pool *p = rp_new(tab->rp, tab->loop, "Export hook"); - struct rt_export_hook *hook = req->hook = mb_allocz(p, sizeof(struct rt_export_hook)); - hook->pool = p; - - hook->req = req; - hook->table = t; + req->hook = rt_alloc_export(re, sizeof(struct rt_table_export_hook)); + req->hook->req = req; + + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, req->hook); + hook->h.event = (event) { + .hook = rt_table_export_uncork, + .data = hook, + }; + + if (rt_cork_check(&hook->h.event)) + rt_set_export_state(&hook->h, TES_HUNGRY); + else + rt_table_export_start_feed(tab, hook); +} + +static void +rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook) +{ + struct rt_exporter *re = &tab->exporter.e; + struct rt_export_request *req = hook->h.req; /* stats zeroed by mb_allocz */ + switch (req->addr_mode) + { + case TE_ADDR_IN: + if (tab->trie && net_val_match(tab->addr_type, NB_IP)) + { + hook->walk_state = mb_allocz(hook->h.pool, sizeof (struct f_trie_walk_state)); + hook->walk_lock = rt_lock_trie(tab); + trie_walk_init(hook->walk_state, tab->trie, req->addr); + hook->h.event.hook = rt_feed_by_trie; + hook->walk_last.type = 0; + break; + } + /* fall through */ + case TE_ADDR_NONE: + FIB_ITERATE_INIT(&hook->feed_fit, &tab->fib); + hook->h.event.hook = rt_feed_by_fib; + break; - bmap_init(&hook->seq_map, p, 1024); + case TE_ADDR_EQUAL: + hook->h.event.hook = rt_feed_equal; + break; - rt_set_export_state(hook, TES_HUNGRY); + case TE_ADDR_FOR: + hook->h.event.hook = rt_feed_for; + break; - hook->n = (node) {}; - add_tail(&tab->exports, &hook->n); + default: + bug("Requested an unknown export address mode"); + } DBG("New export hook %p req %p in table %s uc=%u\n", hook, req, tab->name, tab->use_count); - hook->event = ev_new_init(p, rt_feed_channel, hook); - RT_UNLOCK(t); + struct rt_pending_export *rpe = rt_last_export(hook->table); + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe ? 
rpe->seq : 0); + atomic_store_explicit(&hook->last_export, rpe, memory_order_relaxed); + rt_init_export(re, req->hook); +} + +static void +rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + RT_LOCKED(SKIP_BACK(rtable, priv.exporter.e, re), tab) + rt_table_export_start_locked(tab, req); +} + +void rt_request_export(rtable *t, struct rt_export_request *req) +{ + RT_LOCKED(t, tab) + rt_table_export_start_locked(tab, req); /* Is locked inside */ +} + +void +rt_request_export_other(struct rt_exporter *re, struct rt_export_request *req) +{ + return re->class->start(re, req); +} + +struct rt_export_hook * +rt_alloc_export(struct rt_exporter *re, uint size) +{ + pool *p = rp_new(re->rp, "Export hook"); + struct rt_export_hook *hook = mb_allocz(p, size); + + hook->pool = p; + hook->table = re; + + hook->n = (node) {}; + add_tail(&re->hooks, &hook->n); + + return hook; +} + +void +rt_init_export(struct rt_exporter *re UNUSED, struct rt_export_hook *hook) +{ + hook->event.data = hook; + + bmap_init(&hook->seq_map, hook->pool, 16); + + /* Regular export */ + rt_set_export_state(hook, TES_FEEDING); rt_send_export_event(hook); } +static int +rt_table_export_stop_locked(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, hook->table); + + switch (atomic_load_explicit(&hh->export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + rt_trace(tab, D_EVENTS, "Stopping export hook %s must wait for uncorking", hook->h.req->name); + return 0; + case TES_FEEDING: + switch (hh->req->addr_mode) + { + case TE_ADDR_IN: + if (hook->walk_lock) + { + rt_unlock_trie(tab, hook->walk_lock); + hook->walk_lock = NULL; + mb_free(hook->walk_state); + hook->walk_state = NULL; + break; + } + /* fall through */ + case TE_ADDR_NONE: + fit_get(&tab->fib, &hook->feed_fit); + break; + } + break; + + case TES_STOP: + bug("Tried to repeatedly stop the same export hook %s", hook->h.req->name); + } + + rt_trace(tab, D_EVENTS, "Stopping export hook %s right now", hook->h.req->name); + return 1; +} + +static void +rt_table_export_stop(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + int ok = 0; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); + if (RT_IS_LOCKED(t)) + ok = rt_table_export_stop_locked(hh); + else + RT_LOCKED(t, tab) + ok = rt_table_export_stop_locked(hh); + + if (ok) + rt_stop_export_common(hh); + else + rt_set_export_state(&hook->h, TES_STOP); +} + void rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_request *)) { + ASSERT_DIE(birdloop_inside(req->list->loop)); ASSERT_DIE(req->hook); struct rt_export_hook *hook = req->hook; - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); + /* Set the stopped callback */ + hook->stopped = stopped; - /* Stop feeding */ - ev_postpone(hook->event); + /* Run the stop code */ + if (hook->table->class->stop) + hook->table->class->stop(hook); + else + rt_stop_export_common(hook); +} - if (atomic_load_explicit(&hook->export_state, memory_order_relaxed) == TES_FEEDING) - fit_get(&tab->fib, &hook->feed_fit); +void +rt_stop_export_common(struct rt_export_hook *hook) +{ + /* Update export state */ + rt_set_export_state(hook, TES_STOP); - hook->event->hook = rt_export_stopped; - hook->stopped = stopped; + /* Reset the event as the stopped event */ + hook->event.hook = hook->table->class->done; + /* Run the 
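/*
 * Illustrative sketch only (not the definitions from the headers): the export
 * hook states visible in this file and the order they are typically visited,
 * as inferred from rt_table_export_start_locked(), rt_init_export() and
 * rt_stop_export() above. The identifiers and numeric values are stand-ins.
 */
enum tes_sketch {
  TES_SKETCH_DOWN,      /* not attached yet */
  TES_SKETCH_HUNGRY,    /* waiting for the cork to be released before feeding */
  TES_SKETCH_FEEDING,   /* initial feed of existing routes in progress */
  TES_SKETCH_READY,     /* regular export of incremental updates */
  TES_SKETCH_STOP,      /* stop requested; the done callback will free the hook */
};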
stopped event */ rt_send_export_event(hook); - - RT_UNLOCK(hook->table); - - rt_set_export_state(hook, TES_STOP); } /** @@ -1853,48 +2334,48 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r * routes to the routing table (by rte_update()). After that, all protocol * routes (more precisely routes with @c as @sender) not sent during the * refresh cycle but still in the table from the past are pruned. This is - * implemented by setting rte->stale_cycle to req->stale_set in rte_update() - * and then dropping all routes with old stale_cycle values in table prune loop. */ + * implemented by marking all related routes as stale by REF_STALE flag in + * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD + * flag in rt_refresh_end() and then removing such routes in the prune loop. + */ void rt_refresh_begin(struct rt_import_request *req) { struct rt_import_hook *hook = req->hook; ASSERT_DIE(hook); - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); - - ASSERT_DIE(hook->stale_set == hook->stale_valid); + RT_LOCKED(hook->table, tab) + { /* If the pruning routine is too slow */ - if ((hook->stale_pruned < hook->stale_valid) && (hook->stale_pruned + 128 < hook->stale_valid) - || (hook->stale_pruned > hook->stale_valid) && (hook->stale_pruned > hook->stale_valid + 128)) + if (((hook->stale_set - hook->stale_pruned) & 0xff) >= 240) { - log(L_WARN "Route refresh flood in table %s", tab->name); + log(L_WARN "Route refresh flood in table %s (stale_set=%u, stale_pruned=%u)", hook->table->name, hook->stale_set, hook->stale_pruned); + + /* Forcibly set all old routes' stale cycle to zero. */ FIB_WALK(&tab->fib, net, n) { - for (struct rte_storage *e = n->routes; e; e = e->next) - if (e->rte.sender == req->hook) - e->rte.stale_cycle = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + if (e->rte.sender == req->hook) + e->rte.stale_cycle = 0; } FIB_WALK_END; - hook->stale_set = 1; - hook->stale_valid = 0; - hook->stale_pruned = 0; - } - else if (!++hook->stale_set) - { - /* Let's reserve the stale_cycle zero value for always-invalid routes */ - hook->stale_set = 1; - hook->stale_valid = 0; + + /* Smash the route refresh counter and zero everything. 
*/ + tab->rr_counter -= hook->stale_set - hook->stale_pruned; + hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 0; } - tab->rr_count++; + /* Now we can safely increase the stale_set modifier */ + hook->stale_set++; + + /* The table must know that we're route-refreshing */ + tab->rr_counter++; if (req->trace_routes & D_STATES) log(L_TRACE "%s: route refresh begin [%u]", req->name, hook->stale_set); - RT_UNLOCK(tab); + } } /** @@ -1911,19 +2392,21 @@ rt_refresh_end(struct rt_import_request *req) struct rt_import_hook *hook = req->hook; ASSERT_DIE(hook); - rtable_private *tab = RT_LOCK(hook->table); - hook->stale_valid++; - ASSERT_DIE(hook->stale_set == hook->stale_valid); - - rt_schedule_prune(tab); - - if (req->trace_routes & D_STATES) - log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + RT_LOCKED(hook->table, tab) + { + /* Now valid routes are only those one with the latest stale_set value */ + uint cnt = hook->stale_set - hook->stale_valid; + hook->stale_valid = hook->stale_set; - if (!--tab->rr_count) - rt_schedule_notify(tab); + /* Here we can't kick the timer as we aren't in the table service loop */ + rt_schedule_prune(tab); - RT_UNLOCK(tab); + if (req->trace_routes & D_STATES) + if (cnt > 1) + log(L_TRACE "%s: route refresh end (x%u) [%u]", req->name, cnt, hook->stale_valid); + else + log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + } } /** @@ -1937,7 +2420,7 @@ rte_dump(struct rte_storage *e) { debug("%-1N ", e->rte.net); debug("PF=%02x ", e->rte.pflags); - rta_dump(e->rte.attrs); + ea_dump(e->rte.attrs); debug("\n"); } @@ -1948,11 +2431,12 @@ rte_dump(struct rte_storage *e) * This function dumps contents of a given routing table to debug output. */ void -rt_dump(rtable *tab) +rt_dump(rtable *tp) { - RT_LOCK(tab); - rtable_private *t = RT_PRIV(tab); - debug("Dump of routing table <%s>%s\n", t->name, t->delete ? " (deleted)" : ""); + RT_LOCKED(tp, t) + { + + debug("Dump of routing table <%s>%s\n", t->name, t->deleted ? " (deleted)" : ""); #ifdef DEBUGGING fib_check(&t->fib); #endif @@ -1963,7 +2447,8 @@ rt_dump(rtable *tab) } FIB_WALK_END; debug("\n"); - RT_UNLOCK(tab); + + } } /** @@ -1979,16 +2464,20 @@ rt_dump_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump(t); } void -rt_dump_hooks(rtable *t) +rt_dump_hooks(rtable *tp) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->delete ? " (deleted)" : ""); - debug(" nhu_state=%u hcu_scheduled=%u use_count=%d rt_count=%u\n", - atomic_load(&tab->nhu_state), ev_active(tab->hcu_event), tab->use_count, tab->rt_count); + RT_LOCKED(tp, tab) + { + + debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->deleted ? 
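/*
 * Minimal standalone sketch (not BIRD API) of the refresh-flood test used in
 * rt_refresh_begin() above: treating the stale cycle counters as 8-bit
 * unsigned values, the difference (stale_set - stale_pruned) counts refresh
 * cycles begun but not yet pruned, and it stays correct across wraparound.
 * The threshold 240 matches the check above; the function and its types are
 * stand-ins.
 */
#include <stdint.h>

static int refresh_flood_sketch(uint8_t stale_set, uint8_t stale_pruned)
{
  uint8_t outstanding = (uint8_t) (stale_set - stale_pruned);
  return outstanding >= 240;
}

/* refresh_flood_sketch(5, 3)    == 0: only 2 cycles behind
 * refresh_flood_sketch(10, 250) == 0: 16 cycles behind, counted across wraparound
 * refresh_flood_sketch(245, 2)  == 1: 243 cycles behind, treated as a flood   */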
" (deleted)" : ""); + debug(" nhu_state=%u use_count=%d rt_count=%u\n", + tab->nhu_state, tab->use_count, tab->rt_count); debug(" last_rt_change=%t gc_time=%t gc_counter=%d prune_state=%u\n", tab->last_rt_change, tab->gc_time, tab->gc_counter, tab->prune_state); @@ -2002,16 +2491,18 @@ rt_dump_hooks(rtable *t) ih->last_state_change, ih->import_state, ih->stopped); } - struct rt_export_hook *eh; - WALK_LIST(eh, tab->exports) + struct rt_table_export_hook *eh; + WALK_LIST(eh, tab->exporter.e.hooks) { - eh->req->dump_req(eh->req); + eh->h.req->dump_req(eh->h.req); debug(" Export hook %p requested by %p:" " refeed_pending=%u last_state_change=%t export_state=%u\n", - eh, eh->req, eh->refeed_pending, eh->last_state_change, atomic_load_explicit(&eh->export_state, memory_order_relaxed)); + eh, eh->h.req, eh->refeed_pending, eh->h.last_state_change, + atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); } debug("\n"); - RT_UNLOCK(t); + + } } void @@ -2024,156 +2515,279 @@ rt_dump_hooks_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump_hooks(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump_hooks(t); } static inline void -rt_schedule_nhu(rtable *tab) +rt_schedule_nhu(struct rtable_private *tab) { - atomic_fetch_or_explicit(&tab->nhu_state, NHU_SCHEDULED, memory_order_acq_rel); - ev_send_loop(tab->loop, tab->nhu_event); - - /* state change: - * NHU_CLEAN -> NHU_SCHEDULED - * NHU_RUNNING -> NHU_DIRTY - */ + if (tab->nhu_corked) + { + if (!(tab->nhu_corked & NHU_SCHEDULED)) + tab->nhu_corked |= NHU_SCHEDULED; + } + else if (!(tab->nhu_state & NHU_SCHEDULED)) + { + rt_trace(tab, D_EVENTS, "Scheduling NHU"); + + /* state change: + * NHU_CLEAN -> NHU_SCHEDULED + * NHU_RUNNING -> NHU_DIRTY + */ + if ((tab->nhu_state |= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); + } } void -rt_schedule_prune(rtable_private *tab) +rt_schedule_prune(struct rtable_private *tab) { if (tab->prune_state == 0) - ev_send_loop(tab->loop, tab->prune_event); + birdloop_flag(tab->loop, RTF_CLEANUP); /* state change 0->1, 2->3 */ tab->prune_state |= 1; } -static int -rt_fast_prune_ready(rtable_private *tab) +static void +rt_export_used(struct rt_table_exporter *e, const char *who, const char *why) { - return EMPTY_LIST(tab->pending_exports) && EMPTY_LIST(tab->exports) && !tab->imports_up; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, e); + ASSERT_DIE(RT_IS_LOCKED(tab)); + + rt_trace(tab, D_EVENTS, "Export cleanup requested by %s %s", who, why); + + if (tab->export_used) + return; + + tab->export_used = 1; + birdloop_flag(tab->loop, RTF_CLEANUP); } static void -rt_fast_prune_check(rtable_private *tab) +rt_flag_handler(struct birdloop_flag_handler *fh, u32 flags) { - if (tab->delete && rt_fast_prune_ready(tab)) + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, fh, fh)), tab) { - tab->prune_state |= 1; - ev_send_loop(tab->loop, tab->prune_event); - } -} + ASSERT_DIE(birdloop_inside(tab->loop)); + rt_lock_table(tab); -void -rt_export_used(rtable_private *tab) -{ - if (config->table_debug) - log(L_TRACE "%s: Export cleanup requested", tab->name); + if (flags & RTF_NHU) + rt_next_hop_update(tab); - ev_send_loop(tab->loop, tab->ec_event); -} + if (flags & RTF_EXPORT) + rt_kick_export_settle(tab); -static inline btime -rt_settled_time(rtable_private *tab) -{ - ASSUME(tab->base_settle_time != 0); + if (flags & RTF_CLEANUP) + { + if (tab->export_used) + rt_export_cleanup(tab); + + if (tab->prune_state) + rt_prune_table(tab); + } - btime min_settle_time = 
tab->rr_count ? tab->config->min_rr_settle_time : tab->config->min_settle_time; - btime max_settle_time = tab->rr_count ? tab->config->max_rr_settle_time : tab->config->max_settle_time; + if (flags & RTF_DELETE) + { + if (tab->hostcache) + rt_stop_export(&tab->hostcache->req, NULL); - DBG("settled time computed from %t %t %t %t as %t / %t, now is %t\n", - tab->name, tab->last_rt_change, min_settle_time, - tab->base_settle_time, max_settle_time, - tab->last_rt_change + min_settle_time, - tab->base_settle_time + max_settle_time, current_time()); + rt_unlock_table(tab); + } - return MIN(tab->last_rt_change + min_settle_time, - tab->base_settle_time + max_settle_time); + rt_unlock_table(tab); + } } static void -rt_settle_timer(timer *t) +rt_prune_timer(timer *t) { - rtable_private *tab = t->data; - ASSERT_DIE(birdloop_inside(tab->loop)); + RT_LOCKED((rtable *) t->data, tab) + if (tab->gc_counter >= tab->config->gc_threshold) + rt_schedule_prune(tab); +} - if (!tab->base_settle_time) +static void +rt_kick_prune_timer(struct rtable_private *tab) +{ + /* Return if prune is already scheduled */ + if (tm_active(tab->prune_timer) || (tab->prune_state & 1)) return; - btime settled_time = rt_settled_time(tab); - if (current_time() < settled_time) + /* Randomize GC period to +/- 50% */ + btime gc_period = tab->config->gc_period; + gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period); + tm_start_in(tab->prune_timer, gc_period, tab->loop); +} + + +static void +rt_flowspec_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst_pub = ln->dst; + ASSUME(rt_is_flow(dst_pub)); + struct rtable_private *dst = RT_LOCK(dst_pub); + + /* No need to inspect it further if recalculation is already scheduled */ + if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY) + || !trie_match_net(dst->flowspec_trie, net)) { - tm_set_in(tab->settle_timer, settled_time, tab->loop); + RT_UNLOCK(dst_pub); + rpe_mark_seen_all(req->hook, first, NULL, NULL); return; } - /* Settled */ - tab->base_settle_time = 0; + /* This net may affect some flowspecs, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Schedule the update. 
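/*
 * Standalone sketch of the +/- 50% jitter applied by rt_kick_prune_timer()
 * above; rand() is a stand-in for BIRD's random_u32() and the result keeps
 * the same unit as the configured gc_period (assumed nonzero).
 */
#include <stdlib.h>

static unsigned jitter_gc_period_sketch(unsigned gc_period)
{
  /* Uniform in [gc_period/2, gc_period/2 + gc_period), i.e. 50% .. 150% */
  return gc_period / 2 + (unsigned) rand() % gc_period;
}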
*/ + if (o != RTE_VALID_OR_NULL(new_best)) + rt_schedule_nhu(dst); - struct rt_subscription *s; - WALK_LIST(s, tab->subscribers) - ev_send(s->event->list, s->event); + RT_UNLOCK(dst_pub); } static void -rt_kick_settle_timer(rtable_private *tab) +rt_flowspec_dump_req(struct rt_export_request *req) { - tab->base_settle_time = current_time(); - - if (!tab->settle_timer) - tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0); - - if (!tm_active(tab->settle_timer)) - tm_set_in(tab->settle_timer, rt_settled_time(tab), tab->loop); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + debug(" Flowspec link for table %s (%p)\n", ln->dst->name, req); } -static inline void -rt_schedule_notify(rtable_private *tab) +static void +rt_flowspec_log_state_change(struct rt_export_request *req, u8 state) { - if (EMPTY_LIST(tab->subscribers)) - return; + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rt_trace(ln->dst, D_STATES, "Flowspec link from %s export state changed to %s", + ln->src->name, rt_export_state_name(state)); +} - if (tab->base_settle_time) - return; +static struct rt_flowspec_link * +rt_flowspec_find_link(struct rtable_private *src, rtable *dst) +{ + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, src->exporter.e.hooks, h.n) + switch (atomic_load_explicit(&hook->h.export_state, memory_order_acquire)) + { + case TES_HUNGRY: + case TES_FEEDING: + case TES_READY: + if (hook->h.req->export_one == rt_flowspec_export_one) + { + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, hook->h.req); + if (ln->dst == dst) + return ln; + } + } - rt_kick_settle_timer(tab); + return NULL; } void -rt_subscribe(rtable *t, struct rt_subscription *s) +rt_flowspec_link(rtable *src_pub, rtable *dst_pub) { - s->tab = t; - RT_LOCKED(t, tab) + ASSERT(rt_is_ip(src_pub)); + ASSERT(rt_is_flow(dst_pub)); + + int lock_dst = 0; + + birdloop_enter(dst_pub->loop); + + RT_LOCKED(src_pub, src) { - rt_lock_table(tab); - DBG("rt_subscribe(%s)\n", tab->name); - add_tail(&tab->subscribers, &s->n); + struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst_pub); + + if (!ln) + { + pool *p = src->rp; + ln = mb_allocz(p, sizeof(struct rt_flowspec_link)); + ln->src = src_pub; + ln->dst = dst_pub; + ln->req = (struct rt_export_request) { + .name = mb_sprintf(p, "%s.flowspec.notifier", dst_pub->name), + .list = birdloop_event_list(dst_pub->loop), + .trace_routes = src->config->debug, + .dump_req = rt_flowspec_dump_req, + .log_state_change = rt_flowspec_log_state_change, + .export_one = rt_flowspec_export_one, + }; + + rt_table_export_start_locked(src, &ln->req); + + lock_dst = 1; + } + + ln->uc++; } + + if (lock_dst) + rt_lock_table(dst_pub); + + birdloop_leave(dst_pub->loop); +} + +static void +rt_flowspec_link_stopped(struct rt_export_request *req) +{ + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst = ln->dst; + + mb_free(ln); + rt_unlock_table(dst); } void -rt_unsubscribe(struct rt_subscription *s) +rt_flowspec_unlink(rtable *src, rtable *dst) { - RT_LOCKED(s->tab, tab) + birdloop_enter(dst->loop); + + struct rt_flowspec_link *ln; + RT_LOCKED(src, t) { - rem_node(&s->n); - if (EMPTY_LIST(tab->subscribers) && tm_active(tab->settle_timer)) - tm_stop(tab->settle_timer); - rt_unlock_table(tab); + ln = rt_flowspec_find_link(t, dst); + + ASSERT(ln && (ln->uc > 0)); + + if (!--ln->uc) + rt_stop_export(&ln->req, rt_flowspec_link_stopped); } + + birdloop_leave(dst->loop); +} + +static void 
+rt_flowspec_reset_trie(struct rtable_private *tab) +{ + linpool *lp = tab->flowspec_trie->lp; + int ipv4 = tab->flowspec_trie->ipv4; + + lp_flush(lp); + tab->flowspec_trie = f_new_trie(lp, 0); + tab->flowspec_trie->ipv4 = ipv4; } static void rt_free(resource *_r) { - rtable_private *r = (rtable_private *) _r; + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + + DOMAIN_FREE(rtable, r->lock); DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); - ASSERT_DIE(r->rt_count == 0); - ASSERT_DIE(!r->cork_active); - ASSERT_DIE(EMPTY_LIST(r->imports)); - ASSERT_DIE(EMPTY_LIST(r->exports)); + + r->config->table = NULL; + rem_node(&r->n); if (r->hostcache) rt_free_hostcache(r); @@ -2182,81 +2796,123 @@ rt_free(resource *_r) fib_free(&r->fib); hmap_free(&r->id_map); rfree(r->rt_event); - rfree(r->settle_timer); mb_free(r); */ } static void -rt_res_dump(resource *_r) +rt_res_dump(resource *_r, unsigned indent) { - rtable_private *r = RT_PRIV((rtable *) _r); + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n", r->name, net_label[r->addr_type], r->rt_count, r->use_count); + + char x[32]; + bsprintf(x, "%%%dspending export %%p\n", indent + 2); + + node *n; + WALK_LIST(n, r->exporter.pending) + debug(x, "", n); } static struct resclass rt_class = { .name = "Routing table", - .size = sizeof(rtable_private), + .size = sizeof(rtable), .free = rt_free, .dump = rt_res_dump, .lookup = NULL, .memsize = NULL, }; +static const struct rt_exporter_class rt_table_exporter_class = { + .start = rt_table_export_start, + .stop = rt_table_export_stop, + .done = rt_table_export_done, +}; + +void +rt_exporter_init(struct rt_exporter *e) +{ + init_list(&e->hooks); +} + +static struct idm rtable_idm; +uint rtable_max_id = 0; + rtable * rt_setup(pool *pp, struct rtable_config *cf) { - int ns = strlen("Routing table ") + strlen(cf->name) + 1; - void *nb = mb_alloc(pp, ns); - ASSERT_DIE(ns - 1 == bsnprintf(nb, ns, "Routing table %s", cf->name)); + ASSERT_DIE(birdloop_inside(&main_birdloop)); - struct birdloop *l = birdloop_new(pp, DOMAIN_ORDER(rtable), nb); - pool *p = birdloop_pool(l); + pool *p = rp_newf(pp, "Routing table %s", cf->name); - birdloop_enter(l); - - rtable_private *t = ralloc(p, &rt_class); + struct rtable_private *t = ralloc(p, &rt_class); t->rp = p; - t->loop = l; t->rte_slab = sl_new(p, sizeof(struct rte_storage)); t->name = cf->name; t->config = cf; t->addr_type = cf->addr_type; + t->id = idm_alloc(&rtable_idm); + if (t->id >= rtable_max_id) + rtable_max_id = t->id + 1; + + t->lock = DOMAIN_NEW(rtable, t->name); fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL); + if (cf->trie_used) + { + t->trie = f_new_trie(lp_new_default(p), 0); + t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4); + + t->fib.init = net_init_with_trie; + } + init_list(&t->imports); - init_list(&t->exports); hmap_init(&t->id_map, p, 1024); hmap_set(&t->id_map, 0); - init_list(&t->pending_exports); - init_list(&t->subscribers); + t->fh = (struct birdloop_flag_handler) { .hook = rt_flag_handler, }; + t->nhu_uncork_event = ev_new_init(p, rt_nhu_uncork, t); + t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0); + t->last_rt_change = t->gc_time = current_time(); + + t->export_settle = SETTLE_INIT(&cf->export_settle, rt_announce_exports, NULL); - t->announce_event = ev_new_init(p, rt_announce_exports, t); - t->ec_event = ev_new_init(p, rt_export_cleanup, t); - 
t->prune_event = ev_new_init(p, rt_prune_table, t); - t->hcu_event = ev_new_init(p, rt_update_hostcache, t); - t->nhu_event = ev_new_init(p, rt_next_hop_update, t); + t->exporter = (struct rt_table_exporter) { + .e = { + .class = &rt_table_exporter_class, + .addr_type = t->addr_type, + .rp = t->rp, + }, + .next_seq = 1, + }; - t->nhu_event->cork = &rt_cork; - t->prune_event->cork = &rt_cork; + rt_exporter_init(&t->exporter.e); - t->last_rt_change = t->gc_time = current_time(); - t->next_export_seq = 1; + init_list(&t->exporter.pending); + + t->cork_threshold = cf->cork_threshold; t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; - t->nhu_lp = lp_new_default(p); + if (rt_is_flow(RT_PUB(t))) + { + t->flowspec_trie = f_new_trie(lp_new_default(p), 0); + t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); + } - mb_move(nb, p); - birdloop_leave(l); + /* Start the service thread */ + t->loop = birdloop_new(p, DOMAIN_ORDER(service), mb_sprintf(p, "Routing table %s", t->name), 0); + birdloop_enter(t->loop); + birdloop_flag_set_handler(t->loop, &t->fh); + birdloop_leave(t->loop); - return (rtable *) t; + return RT_PUB(t); } /** @@ -2269,11 +2925,15 @@ void rt_init(void) { rta_init(); - rt_table_pool = rp_new(&root_pool, &main_birdloop, "Routing tables"); + rt_table_pool = rp_new(&root_pool, "Routing tables"); init_list(&routing_tables); - ev_init_cork(&rt_cork, "Route Table Cork"); + init_list(&deleted_routing_tables); + ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release"); + rt_cork.run = (event) { .hook = rt_cork_release_hook }; + idm_init(&rtable_idm, rt_table_pool, 256); } + /** * rt_prune_table - prune a routing table * @@ -2289,18 +2949,15 @@ rt_init(void) * iteration. */ static void -rt_prune_table(void *data) +rt_prune_table(struct rtable_private *tab) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); - struct fib_iterator *fit = &tab->prune_fit; - int limit = tab->delete ? 
16384 : 512; + int limit = 2000; struct rt_import_hook *ih; node *n, *x; - DBG("Pruning route table %s\n", tab->name); + rt_trace(tab, D_STATES, "Pruning"); #ifdef DEBUGGING fib_check(&tab->fib); #endif @@ -2308,11 +2965,6 @@ rt_prune_table(void *data) if (tab->prune_state == 0) return; - if (tab->delete && !rt_fast_prune_ready(tab)) - return; - - rt_lock_table(tab); - if (tab->prune_state == 1) { /* Mark channels to flush */ @@ -2329,50 +2981,36 @@ rt_prune_table(void *data) FIB_ITERATE_INIT(fit, &tab->fib); tab->prune_state = 2; + + tab->gc_counter = 0; + tab->gc_time = current_time(); + + if (tab->prune_trie) + { + /* Init prefix trie pruning */ + tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0); + tab->trie_new->ipv4 = tab->trie->ipv4; + } } again: FIB_ITERATE_START(&tab->fib, fit, net, n) { - if (tab->delete) + rescan: + if (limit <= 0) { - ASSERT_DIE(!n->first); - - for (struct rte_storage *e = n->routes, *next; e; e = next) - { - next = e->next; - - struct rt_import_request *req = e->rte.sender->req; - if (req->preimport) - req->preimport(req, NULL, &e->rte); - - tab->rt_count--; - hmap_clear(&tab->id_map, e->rte.id); - rte_free(e, tab); - limit--; - } - - n->routes = NULL; + FIB_ITERATE_PUT(fit); + birdloop_flag(tab->loop, RTF_CLEANUP); + return; } - else - rescan: + for (struct rte_storage *e=n->routes; e; e=e->next) { struct rt_import_hook *s = e->rte.sender; - if ((s->import_state == TIS_FLUSHING) || (e->rte.stale_cycle < s->stale_valid) || (e->rte.stale_cycle > s->stale_set)) { - if (limit <= 0) - { - FIB_ITERATE_PUT(fit); - ev_send_loop(tab->loop, tab->prune_event); - ev_send_loop(tab->loop, tab->announce_event); - rt_unlock_table(tab); - return; - } - rte_recalculate(tab, e->rte.sender, n, NULL, e->rte.src); limit--; @@ -2386,66 +3024,110 @@ again: fib_delete(&tab->fib, n); goto again; } + + if (tab->trie_new) + { + trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); + limit--; + } } FIB_ITERATE_END; + rt_trace(tab, D_EVENTS, "Prune done, scheduling export timer"); + rt_kick_export_settle(tab); + #ifdef DEBUGGING fib_check(&tab->fib); #endif - tab->gc_counter = 0; - tab->gc_time = current_time(); - /* state change 2->0, 3->1 */ if (tab->prune_state &= 1) - ev_send_loop(tab->loop, tab->prune_event); + birdloop_flag(tab->loop, RTF_CLEANUP); - uint flushed_channels = 0; + if (tab->trie_new) + { + /* Finish prefix trie pruning */ + + if (!tab->trie_lock_count) + { + rfree(tab->trie->lp); + } + else + { + ASSERT(!tab->trie_old); + tab->trie_old = tab->trie; + tab->trie_old_lock_count = tab->trie_lock_count; + tab->trie_lock_count = 0; + } + + tab->trie = tab->trie_new; + tab->trie_new = NULL; + tab->prune_trie = 0; + } + else + { + /* Schedule prefix trie pruning */ + if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries))) + { + /* state change 0->1, 2->3 */ + tab->prune_state |= 1; + tab->prune_trie = 1; + } + } /* Close flushed channels */ WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) if (ih->import_state == TIS_FLUSHING) { - ih->flush_seq = tab->next_export_seq; + DBG("flushing %s %s rr %u", ih->req->name, tab->name, tab->rr_counter); + ih->flush_seq = tab->exporter.next_seq; rt_set_import_state(ih, TIS_WAITING); - flushed_channels++; + tab->rr_counter--; + tab->wait_counter++; } else if (ih->stale_pruning != ih->stale_pruned) { + DBG("pruning %s %s rr %u set %u valid %u pruning %u pruned %u", ih->req->name, tab->name, tab->rr_counter, ih->stale_set, ih->stale_valid, ih->stale_pruning, ih->stale_pruned); + 
tab->rr_counter -= (ih->stale_pruning - ih->stale_pruned); ih->stale_pruned = ih->stale_pruning; - if (ih->req->trace_routes & D_STATES) log(L_TRACE "%s: table prune after refresh end [%u]", ih->req->name, ih->stale_pruned); } /* In some cases, we may want to directly proceed to export cleanup */ - if (EMPTY_LIST(tab->exports) && flushed_channels) + if (tab->wait_counter && (EMPTY_LIST(tab->exporter.e.hooks) || !tab->exporter.first)) rt_export_cleanup(tab); - - ev_send_loop(tab->loop, tab->announce_event); - rt_unlock_table(tab); } static void -rt_export_cleanup(void *data) +rt_export_cleanup(struct rtable_private *tab) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); + tab->export_used = 0; u64 min_seq = ~((u64) 0); struct rt_pending_export *last_export_to_free = NULL; - struct rt_pending_export *first_export = tab->first_export; + struct rt_pending_export *first = tab->exporter.first; + int want_prune = 0; - struct rt_export_hook *eh; + struct rt_table_export_hook *eh; node *n; - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - switch (atomic_load_explicit(&eh->export_state, memory_order_acquire)) + switch (atomic_load_explicit(&eh->h.export_state, memory_order_acquire)) { - case TES_DOWN: - case TES_HUNGRY: + /* Export cleanup while feeding isn't implemented */ + case TES_FEEDING: + goto done; + + /* States not interfering with export cleanup */ + case TES_DOWN: /* This should not happen at all */ + log(L_WARN "%s: Export cleanup found hook %s in explicit state TES_DOWN", tab->name, eh->h.req->name); + /* fall through */ + case TES_HUNGRY: /* Feeding waiting for uncork */ + case TES_STOP: /* No more export will happen on this hook */ continue; + /* Regular export */ case TES_READY: { struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); @@ -2462,23 +3144,20 @@ rt_export_cleanup(void *data) } default: - /* It's only safe to cleanup when the export state is idle or regular. No feeding or stopping allowed. */ - goto done; + bug("%s: Strange export state of hook %s: %d", tab->name, eh->h.req->name, atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); } } - tab->first_export = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; + tab->exporter.first = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; - if (config->table_debug) - log(L_TRACE "%s: Export cleanup, old first_export seq %lu, new %lu, min_seq %ld", - tab->name, - first_export ? first_export->seq : 0, - tab->first_export ? tab->first_export->seq : 0, + rt_trace(tab, D_STATES, "Export cleanup, old exporter.first seq %lu, new %lu, min_seq %ld", + first ? first->seq : 0, + tab->exporter.first ? tab->exporter.first->seq : 0, min_seq); - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) continue; struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); @@ -2494,45 +3173,47 @@ rt_export_cleanup(void *data) } } - while (first_export && (first_export->seq <= min_seq)) + while (first && (first->seq <= min_seq)) { - ASSERT_DIE(first_export->new || first_export->old); + ASSERT_DIE(first->new || first->old); - const net_addr *n = first_export->new ? - first_export->new->rte.net : - first_export->old->rte.net; + const net_addr *n = first->new ? 
+ first->new->rte.net : + first->old->rte.net; net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); - ASSERT_DIE(net->first == first_export); - - if (first_export == net->last) + ASSERT_DIE(net->first == first); + + if (first == net->last) /* The only export here */ net->last = net->first = NULL; else /* First is now the next one */ - net->first = atomic_load_explicit(&first_export->next, memory_order_relaxed); + net->first = atomic_load_explicit(&first->next, memory_order_relaxed); + + want_prune += !net->routes && !net->first; /* For now, the old route may be finally freed */ - if (first_export->old) + if (first->old) { - rt_rte_trace_in(D_ROUTES, first_export->old->rte.sender->req, &first_export->old->rte, "freed"); - hmap_clear(&tab->id_map, first_export->old->rte.id); - rte_free(first_export->old, tab); + rt_rte_trace_in(D_ROUTES, first->old->rte.sender->req, &first->old->rte, "freed"); + hmap_clear(&tab->id_map, first->old->rte.id); + rte_free(first->old); } #ifdef LOCAL_DEBUG - memset(first_export, 0xbd, sizeof(struct rt_pending_export)); + memset(first, 0xbd, sizeof(struct rt_pending_export)); #endif - struct rt_export_block *reb = HEAD(tab->pending_exports); - ASSERT_DIE(reb == PAGE_HEAD(first_export)); + struct rt_export_block *reb = HEAD(tab->exporter.pending); + ASSERT_DIE(reb == PAGE_HEAD(first)); - u32 pos = (first_export - &reb->export[0]); + u32 pos = (first - &reb->export[0]); u32 end = atomic_load_explicit(&reb->end, memory_order_relaxed); ASSERT_DIE(pos < end); struct rt_pending_export *next = NULL; - + if (++pos < end) next = &reb->export[pos]; else @@ -2545,66 +3226,160 @@ rt_export_cleanup(void *data) free_page(reb); - if (EMPTY_LIST(tab->pending_exports)) + if (EMPTY_LIST(tab->exporter.pending)) { - if (config->table_debug) - log(L_TRACE "%s: Resetting export seq", tab->name); + rt_trace(tab, D_EVENTS, "Resetting export seq"); node *n; - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) continue; ASSERT_DIE(atomic_load_explicit(&eh->last_export, memory_order_acquire) == NULL); - bmap_reset(&eh->seq_map, 1024); + bmap_reset(&eh->h.seq_map, 16); } - tab->next_export_seq = 1; + tab->exporter.next_seq = 1; } else { - reb = HEAD(tab->pending_exports); + reb = HEAD(tab->exporter.pending); next = &reb->export[0]; } } - first_export = next; + first = next; } + rt_check_cork_low(tab); + done:; struct rt_import_hook *ih; node *x; - WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) - if (ih->import_state == TIS_WAITING) - if (!first_export || (first_export->seq >= ih->flush_seq)) - { - ih->import_state = TIS_CLEARED; - ev_send(ih->req->list, ih->export_announce_event); - } + if (tab->wait_counter) + WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) + if (ih->import_state == TIS_WAITING) + if (!first || (first->seq >= ih->flush_seq)) + { + ih->import_state = TIS_CLEARED; + tab->wait_counter--; + ev_send(ih->req->list, &ih->announce_event); + } + + if ((tab->gc_counter += want_prune) >= tab->config->gc_threshold) + rt_kick_prune_timer(tab); + + if (tab->export_used) + birdloop_flag(tab->loop, RTF_CLEANUP); - if (EMPTY_LIST(tab->pending_exports) && ev_active(tab->announce_event)) - ev_postpone(tab->announce_event); + if (EMPTY_LIST(tab->exporter.pending)) + settle_cancel(&tab->export_settle); +} + +static void +rt_cork_release_hook(void *data UNUSED) +{ + do synchronize_rcu(); 
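/*
 * Simplified standalone sketch (not the BIRD structures) of the idea behind
 * rt_export_cleanup() above: a pending export may be freed once every active
 * reader has already processed it, i.e. once its sequence number is at or
 * below the minimum "last seen" sequence over all ready export hooks.
 */
#include <stdint.h>

struct reader_sketch { uint64_t last_seen_seq; };

static uint64_t min_seen_seq_sketch(const struct reader_sketch *r, unsigned n)
{
  uint64_t min = UINT64_MAX;
  for (unsigned i = 0; i < n; i++)
    if (r[i].last_seen_seq < min)
      min = r[i].last_seen_seq;
  return min;   /* journal entries with seq <= min can be freed */
}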
+ while ( + !atomic_load_explicit(&rt_cork.active, memory_order_acquire) && + ev_run_list(&rt_cork.queue) + ); +} - /* If reduced to at most one export block pending */ - if (tab->cork_active && - ((!tab->first_export) || (tab->first_export->seq + 128 > tab->next_export_seq))) +/** + * rt_lock_trie - lock a prefix trie of a routing table + * @tab: routing table with prefix trie to be locked + * + * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state + * structures. Therefore, asynchronous walks should lock the prefix trie using + * this function. That allows the prune loop to rebuild the trie, but postpones + * its freeing until all walks are done (unlocked by rt_unlock_trie()). + * + * Return a current trie that will be locked, the value should be passed back to + * rt_unlock_trie() for unlocking. + * + */ +struct f_trie * +rt_lock_trie(struct rtable_private *tab) +{ + ASSERT(tab->trie); + + tab->trie_lock_count++; + return tab->trie; +} + +/** + * rt_unlock_trie - unlock a prefix trie of a routing table + * @tab: routing table with prefix trie to be locked + * @trie: value returned by matching rt_lock_trie() + * + * Done for trie locked by rt_lock_trie() after walk over the trie is done. + * It may free the trie and schedule next trie pruning. + */ +void +rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie) +{ + ASSERT(trie); + + if (trie == tab->trie) { - tab->cork_active = 0; - ev_uncork(&rt_cork); - if (config->table_debug) - log(L_TRACE "%s: cork released", tab->name); + /* Unlock the current prefix trie */ + ASSERT(tab->trie_lock_count); + tab->trie_lock_count--; } + else if (trie == tab->trie_old) + { + /* Unlock the old prefix trie */ + ASSERT(tab->trie_old_lock_count); + tab->trie_old_lock_count--; + + /* Free old prefix trie that is no longer needed */ + if (!tab->trie_old_lock_count) + { + rfree(tab->trie_old->lp); + tab->trie_old = NULL; - rt_fast_prune_check(tab); + /* Kick prefix trie pruning that was postponed */ + if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries))) + { + tab->prune_trie = 1; + rt_kick_prune_timer(tab); + } + } + } + else + log(L_BUG "Invalid arg to rt_unlock_trie()"); } + void rt_preconfig(struct config *c) { init_list(&c->tables); - rt_new_table(cf_get_symbol("master4"), NET_IP4); - rt_new_table(cf_get_symbol("master6"), NET_IP6); + c->def_tables[NET_IP4] = cf_define_symbol(cf_get_symbol("master4"), SYM_TABLE, table, NULL); + c->def_tables[NET_IP6] = cf_define_symbol(cf_get_symbol("master6"), SYM_TABLE, table, NULL); +} + +void +rt_postconfig(struct config *c) +{ + uint num_tables = list_length(&c->tables); + btime def_gc_period = 400 MS * num_tables; + def_gc_period = MAX(def_gc_period, 10 S); + def_gc_period = MIN(def_gc_period, 600 S); + + struct rtable_config *rc; + WALK_LIST(rc, c->tables) + if (rc->gc_period == (uint) -1) + rc->gc_period = (uint) def_gc_period; + + for (uint net_type = 0; net_type < NET_MAX; net_type++) + if (c->def_tables[net_type] && !c->def_tables[net_type]->table) + { + c->def_tables[net_type]->class = SYM_VOID; + c->def_tables[net_type] = NULL; + } } @@ -2613,180 +3388,453 @@ rt_preconfig(struct config *c) * triggered by rt_schedule_nhu(). 
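/*
 * Usage sketch for the trie locking described above. The signatures are the
 * ones declared in this file; the caller and the walk body are hypothetical,
 * and the table is assumed to be held (RT_LOCKED) around both calls, as the
 * table exporters in this file do.
 */
static void walk_with_locked_trie_sketch(struct rtable_private *tab)
{
  struct f_trie *trie = rt_lock_trie(tab);   /* pin the current trie */

  /* ... walk prefixes in the pinned trie, possibly across several events ... */

  rt_unlock_trie(tab, trie);   /* may free a replaced trie and re-schedule pruning */
}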
*/ -static inline int -rta_next_hop_outdated(rta *a) +void +ea_set_hostentry(ea_list **to, rtable *dep, rtable *src, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]) { - struct hostentry *he = a->hostentry; - - if (!he) - return 0; - - if (!he->src) - return a->dest != RTD_UNREACHABLE; - - return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) || - (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh)); + struct { + struct adata ad; + struct hostentry *he; + u32 labels[0]; + } *head = (void *) tmp_alloc_adata(sizeof *head + sizeof(u32) * lnum - sizeof(struct adata)); + + RT_LOCKED(src, tab) + head->he = rt_get_hostentry(tab, gw, ll, dep); + memcpy(head->labels, labels, lnum * sizeof(u32)); + + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_hostentry, 0, &head->ad)); } -void -rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls, linpool *lp) + +static void +rta_apply_hostentry(ea_list **to, struct hostentry_adata *head) { - a->hostentry = he; - a->dest = he->dest; - a->igp_metric = he->igp_metric; + struct hostentry *he = head->he; + u32 *labels = head->labels; + u32 lnum = (u32 *) (head->ad.data + head->ad.length) - labels; + + ea_set_attr_u32(to, &ea_gen_igp_metric, 0, he->igp_metric); - if (a->dest != RTD_UNICAST) + if (!he->src) { - /* No nexthop */ -no_nexthop: - a->nh = (struct nexthop) {}; - if (mls) - { /* Store the label stack for later changes */ - a->nh.labels_orig = a->nh.labels = mls->len; - memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32)); - } + ea_set_dest(to, 0, RTD_UNREACHABLE); return; } - if (((!mls) || (!mls->len)) && he->nexthop_linkable) + eattr *he_nh_ea = ea_find(he->src, &ea_gen_nexthop); + ASSERT_DIE(he_nh_ea); + + struct nexthop_adata *nhad = (struct nexthop_adata *) he_nh_ea->u.ptr; + int idest = nhea_dest(he_nh_ea); + + if ((idest != RTD_UNICAST) || + !lnum && he->nexthop_linkable) { /* Just link the nexthop chain, no label append happens. */ - memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh))); + ea_copy_attr(to, he->src, &ea_gen_nexthop); return; } - struct nexthop *nhp = NULL, *nhr = NULL; - int skip_nexthop = 0; + uint total_size = OFFSETOF(struct nexthop_adata, nh); - for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next) + NEXTHOP_WALK(nh, nhad) { - if (skip_nexthop) - skip_nexthop--; - else + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) { - nhr = nhp; - nhp = (nhp ? 
(nhp->next = lp_alloc(lp, NEXTHOP_MAX_SIZE)) : &(a->nh)); + log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", + nh->labels, lnum, nh->labels + lnum, MPLS_MAX_LABEL_STACK); + continue; } - memset(nhp, 0, NEXTHOP_MAX_SIZE); - nhp->iface = nh->iface; - nhp->weight = nh->weight; + total_size += NEXTHOP_SIZE_CNT(nh->labels + lnum); + } - if (mls) - { - nhp->labels = nh->labels + mls->len; - nhp->labels_orig = mls->len; - if (nhp->labels <= MPLS_MAX_LABEL_STACK) - { - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */ - memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */ - } - else - { - log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", - nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK); - skip_nexthop++; - continue; - } - } - else if (nh->labels) + if (total_size == OFFSETOF(struct nexthop_adata, nh)) + { + log(L_WARN "No valid nexthop remaining, setting route unreachable"); + + struct nexthop_adata nha = { + .ad.length = NEXTHOP_DEST_SIZE, + .dest = RTD_UNREACHABLE, + }; + + ea_set_attr_data(to, &ea_gen_nexthop, 0, &nha.ad.data, nha.ad.length); + return; + } + + struct nexthop_adata *new = (struct nexthop_adata *) tmp_alloc_adata(total_size); + struct nexthop *dest = &new->nh; + + NEXTHOP_WALK(nh, nhad) + { + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) + continue; + + memcpy(dest, nh, NEXTHOP_SIZE(nh)); + if (lnum) { - nhp->labels = nh->labels; - nhp->labels_orig = 0; - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); + memcpy(&(dest->label[dest->labels]), labels, lnum * sizeof labels[0]); + dest->labels += lnum; } if (ipa_nonzero(nh->gw)) - { - nhp->gw = nh->gw; /* Router nexthop */ - nhp->flags |= (nh->flags & RNF_ONLINK); - } + /* Router nexthop */ + dest->flags = (dest->flags & RNF_ONLINK); else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK)) - nhp->gw = IPA_NONE; /* PtP link - no need for nexthop */ + dest->gw = IPA_NONE; /* PtP link - no need for nexthop */ else if (ipa_nonzero(he->link)) - nhp->gw = he->link; /* Device nexthop with link-local address known */ + dest->gw = he->link; /* Device nexthop with link-local address known */ else - nhp->gw = he->addr; /* Device nexthop with link-local address unknown */ + dest->gw = he->addr; /* Device nexthop with link-local address unknown */ + + dest = NEXTHOP_NEXT(dest); } - if (skip_nexthop) - if (nhr) - nhr->next = NULL; - else + /* Fix final length */ + new->ad.length = (void *) dest - (void *) new->ad.data; + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_nexthop, 0, &new->ad)); +} + +static inline struct hostentry_adata * +rta_next_hop_outdated(ea_list *a) +{ + /* First retrieve the hostentry */ + eattr *heea = ea_find(a, &ea_gen_hostentry); + if (!heea) + return NULL; + + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; + + /* If no nexthop is present, we have to create one */ + eattr *a_nh_ea = ea_find(a, &ea_gen_nexthop); + if (!a_nh_ea) + return head; + + struct nexthop_adata *nhad = (struct nexthop_adata *) a_nh_ea->u.ptr; + + /* Shortcut for unresolvable hostentry */ + if (!head->he->src) + return NEXTHOP_IS_REACHABLE(nhad) ? 
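/*
 * Standalone sketch of the label-append rule used by rta_apply_hostentry()
 * above: a next hop stays usable only if its own label stack plus the labels
 * appended from the hostentry fits into the maximum stack depth
 * (MPLS_MAX_LABEL_STACK in BIRD; the constant, types and names below are
 * stand-ins).
 */
#include <stdint.h>
#include <string.h>

#define MAX_LABELS_SKETCH 8

struct nh_sketch { uint32_t label[MAX_LABELS_SKETCH]; unsigned labels; };

static int append_labels_sketch(struct nh_sketch *nh, const uint32_t *add, unsigned lnum)
{
  if (nh->labels + lnum > MAX_LABELS_SKETCH)
    return 0;   /* too deep: this next hop is skipped, as in the code above */

  memcpy(&nh->label[nh->labels], add, lnum * sizeof add[0]);
  nh->labels += lnum;
  return 1;     /* if no next hop survives, the route is made unreachable */
}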
head : NULL; + + /* Comparing our nexthop with the hostentry nexthop */ + eattr *he_nh_ea = ea_find(head->he->src, &ea_gen_nexthop); + + return ( + (ea_get_int(a, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN) != head->he->igp_metric) || + (!head->he->nexthop_linkable) || + (!he_nh_ea != !a_nh_ea) || + (he_nh_ea && a_nh_ea && !adata_same(he_nh_ea->u.ptr, a_nh_ea->u.ptr))) + ? head : NULL; +} + +static inline int +rt_next_hop_update_rte(rte *old, rte *new) +{ + struct hostentry_adata *head = rta_next_hop_outdated(old->attrs); + if (!head) + return 0; + + *new = *old; + rta_apply_hostentry(&new->attrs, head); + return 1; +} + +static inline void +rt_next_hop_resolve_rte(rte *r) +{ + eattr *heea = ea_find(r->attrs, &ea_gen_hostentry); + if (!heea) + return; + + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; + + rta_apply_hostentry(&r->attrs, head); +} + +#ifdef CONFIG_BGP + +static inline int +net_flow_has_dst_prefix(const net_addr *n) +{ + ASSUME(net_is_flow(n)); + + if (n->pxlen) + return 1; + + if (n->type == NET_FLOW4) + { + const net_addr_flow4 *n4 = (void *) n; + return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX); + } + else + { + const net_addr_flow6 *n6 = (void *) n; + return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX); + } +} + +static inline int +rta_as_path_is_empty(ea_list *a) +{ + eattr *e = ea_find(a, "bgp_path"); + return !e || (as_path_getlen(e->u.ptr) == 0); +} + +static inline u32 +rta_get_first_asn(ea_list *a) +{ + eattr *e = ea_find(a, "bgp_path"); + u32 asn; + + return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0; +} + +static inline enum flowspec_valid +rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, ea_list *a, int interior) +{ + ASSERT(rt_is_ip(tab_ip)); + ASSERT(rt_is_flow(tab_flow)); + + /* RFC 8955 6. a) Flowspec has defined dst prefix */ + if (!net_flow_has_dst_prefix(n)) + return FLOWSPEC_INVALID; + + /* RFC 9117 4.1. Accept AS_PATH is empty (fr */ + if (interior && rta_as_path_is_empty(a)) + return FLOWSPEC_VALID; + + + /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */ + + /* Find flowspec dst prefix */ + net_addr dst; + if (n->type == NET_FLOW4) + net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n)); + else + net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n)); + + rte rb = {}; + net_addr_union nau; + RT_LOCKED(tab_ip, tip) + { + ASSERT(tip->trie); + /* Find best-match BGP unicast route for flowspec dst prefix */ + net *nb = net_route(tip, &dst); + if (nb) + { + rb = RTE_COPY_VALID(nb->routes); + rta_clone(rb.attrs); + net_copy(&nau.n, nb->n.addr); + rb.net = &nau.n; + } + } + + /* Register prefix to trie for tracking further changes */ + int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; + RT_LOCKED(tab_flow, tfl) + trie_add_prefix(tfl->flowspec_trie, &dst, (rb.net ? 
rb.net->pxlen : 0), max_pxlen); + + /* No best-match BGP route -> no flowspec */ + if (!rb.attrs || (rt_get_source_attr(&rb) != RTS_BGP)) + return FLOWSPEC_INVALID; + + /* Find ORIGINATOR_ID values */ + u32 orig_a = ea_get_int(a, "bgp_originator_id", 0); + u32 orig_b = ea_get_int(rb.attrs, "bgp_originator_id", 0); + + /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */ + if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal( + ea_get_ip(a, &ea_gen_from, IPA_NONE), + ea_get_ip(rb.attrs, &ea_gen_from, IPA_NONE) + ))) + return FLOWSPEC_INVALID; + + + /* Find ASN of the best-match route, for use in next checks */ + u32 asn_b = rta_get_first_asn(rb.attrs); + if (!asn_b) + return FLOWSPEC_INVALID; + + /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */ + if (!interior && (rta_get_first_asn(a) != asn_b)) + return FLOWSPEC_INVALID; + + /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */ + RT_LOCKED(tab_ip, tip) + { + TRIE_WALK(tip->trie, subnet, &dst) { - a->dest = RTD_UNREACHABLE; - log(L_WARN "No valid nexthop remaining, setting route unreachable"); - goto no_nexthop; + net *nc = net_find_valid(tip, &subnet); + if (!nc) + continue; + + const rte *rc = &nc->routes->rte; + if (rt_get_source_attr(rc) != RTS_BGP) + RT_RETURN(tip, FLOWSPEC_INVALID); + + if (rta_get_first_asn(rc->attrs) != asn_b) + RT_RETURN(tip, FLOWSPEC_INVALID); } + TRIE_WALK_END; + } + + return FLOWSPEC_VALID; } -static inline struct rte_storage * -rt_next_hop_update_rte(rtable_private *tab, net *n, rte *old) +#endif /* CONFIG_BGP */ + +static int +rt_flowspec_update_rte(rtable *tab, rte *r, rte *new) { - rta *a = alloca(RTA_MAX_SIZE); - memcpy(a, old->attrs, rta_size(old->attrs)); +#ifdef CONFIG_BGP + if (r->generation || (rt_get_source_attr(r) != RTS_BGP)) + return 0; - mpls_label_stack mls = { .len = a->nh.labels_orig }; - memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32)); + struct bgp_channel *bc = (struct bgp_channel *) SKIP_BACK(struct channel, in_req, r->sender->req); + if (!bc->base_table) + return 0; + + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); - rta_apply_hostentry(a, old->attrs->hostentry, &mls, tab->nhu_lp); - a->cached = 0; + enum flowspec_valid old = rt_get_flowspec_valid(r), + valid = rt_flowspec_check(bc->base_table, tab, r->net, r->attrs, p->is_interior); - rte e0 = *old; - e0.attrs = rta_lookup(a); + if (old == valid) + return 0; - return rte_store(&e0, n, tab); + *new = *r; + ea_set_attr_u32(&new->attrs, &ea_gen_flowspec_valid, 0, valid); + return 1; +#else + return 0; +#endif +} + +static inline void +rt_flowspec_resolve_rte(rte *r, struct channel *c) +{ +#ifdef CONFIG_BGP + enum flowspec_valid valid, old = rt_get_flowspec_valid(r); + struct bgp_channel *bc = (struct bgp_channel *) c; + + if ( (rt_get_source_attr(r) == RTS_BGP) + && (c->channel == &channel_bgp) + && (bc->base_table)) + { + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); + valid = rt_flowspec_check( + bc->base_table, + c->in_req.hook->table, + r->net, r->attrs, p->is_interior); + } + else + valid = FLOWSPEC_UNKNOWN; + + if (valid == old) + return; + + if (valid == FLOWSPEC_UNKNOWN) + ea_unset_attr(&r->attrs, 0, &ea_gen_flowspec_valid); + else + ea_set_attr_u32(&r->attrs, &ea_gen_flowspec_valid, 0, valid); +#endif } static inline int -rt_next_hop_update_net(rtable_private *tab, net *n) +rt_next_hop_update_net(struct rtable_private *tab, net *n) { - struct 
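/*
 * Standalone sketch of the originator comparison in rt_flowspec_check() above:
 * the flowspec and its best-match unicast route must share an originator,
 * which is the ORIGINATOR_ID when present and otherwise the address the route
 * was learnt from. Plain integers stand in for the attribute and address types.
 */
#include <stdint.h>

static int same_originator_sketch(uint32_t id_a, uint32_t id_b,
                                  uint32_t from_a, uint32_t from_b)
{
  if (id_a != id_b)
    return 0;                       /* differing ORIGINATOR_IDs never match */

  if (!id_a && !id_b && (from_a != from_b))
    return 0;                       /* no IDs at all: fall back to the source address */

  return 1;
}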
rte_storage *new; - int count = 0; + uint count = 0; + int is_flow = net_is_flow(n->n.addr); struct rte_storage *old_best = n->routes; if (!old_best) return 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (rta_next_hop_outdated(e->rte.attrs)) - count++; + count++; if (!count) return 0; struct rte_multiupdate { - struct rte_storage *old, *new; - } *updates = alloca(sizeof(struct rte_multiupdate) * count); + struct rte_storage *old, *new_stored; + rte new; + } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1)); + + struct rt_pending_export *last_pending = n->last; - int pos = 0; + uint pos = 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (rta_next_hop_outdated(e->rte.attrs)) - { - struct rte_storage *new = rt_next_hop_update_rte(tab, n, &e->rte); + updates[pos++].old = e; + + /* This is an exceptional place where table can be unlocked while keeping its data: + * the reason why this is safe is that NHU must be always run from the same + * thread as cleanup routines, therefore the only real problem may arise when + * some importer does a change on this particular net (destination) while NHU + * is being computed. Statistically, this should almost never happen. In such + * case, we just drop all the computed changes and do it once again. + * */ + RT_UNLOCK(tab); - /* Call a pre-comparison hook */ - /* Not really an efficient way to compute this */ - if (e->rte.src->owner->rte_recalculate) - e->rte.src->owner->rte_recalculate(tab, n, &new->rte, &e->rte, &old_best->rte); + uint mod = 0; + if (is_flow) + for (uint i = 0; i < pos; i++) + mod += rt_flowspec_update_rte(RT_PUB(tab), &updates[i].old->rte, &updates[i].new); - updates[pos++] = (struct rte_multiupdate) { - .old = e, - .new = new, - }; + else + for (uint i = 0; i < pos; i++) + mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new); - /* Replace the route in the list */ - new->next = e->next; - *k = e = new; + RT_LOCK(RT_PUB(tab)); + + if (!mod) + return 0; + /* Something has changed inbetween, retry NHU. 
*/ + if (last_pending != n->last) + return rt_next_hop_update_net(tab, n); + + /* Now we reconstruct the original linked list */ + struct rte_storage **nptr = &n->routes; + for (uint i = 0; i < pos; i++) + { + updates[i].old->next = NULL; + + struct rte_storage *put; + if (updates[i].new.attrs) + put = updates[i].new_stored = rte_store(&updates[i].new, n, tab); + else + put = updates[i].old; + + *nptr = put; + nptr = &put->next; + } + *nptr = NULL; + + /* Call the pre-comparison hooks */ + for (uint i = 0; i < pos; i++) + if (updates[i].new_stored) + { /* Get a new ID for the route */ - new->rte.lastmod = current_time(); - new->rte.id = hmap_first_zero(&tab->id_map); - hmap_set(&tab->id_map, new->rte.id); + updates[i].new_stored->rte.lastmod = current_time(); + updates[i].new_stored->rte.id = hmap_first_zero(&tab->id_map); + hmap_set(&tab->id_map, updates[i].new_stored->rte.id); - lp_flush(tab->nhu_lp); + /* Call a pre-comparison hook */ + /* Not really an efficient way to compute this */ + if (updates[i].old->rte.src->owner->rte_recalculate) + updates[i].old->rte.src->owner->rte_recalculate(tab, n, &updates[i].new_stored->rte, &updates[i].old->rte, &old_best->rte); } - ASSERT_DIE(pos == count); +#if DEBUGGING + { + uint t = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + t++; + ASSERT_DIE(t == pos); + ASSERT_DIE(pos == count); + } +#endif /* Find the new best route */ struct rte_storage **new_best = NULL; @@ -2797,7 +3845,7 @@ rt_next_hop_update_net(rtable_private *tab, net *n) } /* Relink the new best route to the first position */ - new = *new_best; + struct rte_storage *new = *new_best; if (new != n->routes) { *new_best = new->next; @@ -2805,95 +3853,170 @@ rt_next_hop_update_net(rtable_private *tab, net *n) n->routes = new; } + uint total = 0; /* Announce the changes */ - for (int i=0; i<count; i++) + for (uint i=0; i<count; i++) { - _Bool nb = (new == updates[i].new), ob = (old_best == updates[i].old); + if (!updates[i].new_stored) + continue; + + _Bool nb = (new->rte.src == updates[i].new.src), ob = (i == 0); const char *best_indicator[2][2] = { { "autoupdated", "autoupdated [-best]" }, { "autoupdated [+best]", "autoupdated [best]" } }; - rt_rte_trace_in(D_ROUTES, updates[i].new->rte.sender->req, &updates[i].new->rte, best_indicator[nb][ob]); - rte_announce(tab, n, updates[i].new, updates[i].old, new, old_best); + rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, best_indicator[nb][ob]); + rte_announce(tab, n, updates[i].new_stored, updates[i].old, new, old_best); + + total++; } - return count; + return total; +} + +static void +rt_nhu_uncork(void *_tab) +{ + RT_LOCKED((rtable *) _tab, tab) + { + ASSERT_DIE(tab->nhu_corked); + ASSERT_DIE(tab->nhu_state == 0); + + /* Reset the state */ + tab->nhu_state = tab->nhu_corked; + tab->nhu_corked = 0; + rt_trace(tab, D_STATES, "Next hop updater uncorked"); + + birdloop_flag(tab->loop, RTF_NHU); + } } static void -rt_next_hop_update(void *data) +rt_next_hop_update(struct rtable_private *tab) { - rtable_private *tab = data; ASSERT_DIE(birdloop_inside(tab->loop)); - struct fib_iterator *fit = &tab->nhu_fit; - int max_feed = 32; + if (tab->nhu_corked) + return; - if (atomic_load_explicit(&tab->nhu_state, memory_order_acquire) == NHU_CLEAN) + if (!tab->nhu_state) return; - rt_lock_table(tab); + /* Check corkedness */ + if (rt_cork_check(tab->nhu_uncork_event)) + { + rt_trace(tab, D_STATES, "Next hop updater corked"); + if ((tab->nhu_state & NHU_RUNNING) + && !EMPTY_LIST(tab->exporter.pending)) + 
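/*
 * Standalone sketch of the optimistic pattern described in
 * rt_next_hop_update_net() above: the expensive recomputation runs with the
 * table unlocked, and a change marker (a plain counter here, n->last in the
 * real code) is re-checked after relocking; if it moved, the computed changes
 * are dropped and the step is retried. Names and the lock calls are
 * placeholders.
 */
struct optimistic_sketch { unsigned change_marker; };

static void recompute_sketch(struct optimistic_sketch *s)
{
  for (;;)
  {
    unsigned seen = s->change_marker;   /* snapshot taken under the lock */

    /* unlock(); ... recompute from the snapshot ... lock(); */

    if (s->change_marker == seen)
      break;                            /* nothing changed inbetween: commit */
    /* otherwise drop the result and retry */
  }
}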
rt_kick_export_settle(tab); - if (atomic_load_explicit(&tab->nhu_state, memory_order_acquire) == NHU_SCHEDULED) - { - FIB_ITERATE_INIT(fit, &tab->fib); - ASSERT_DIE(atomic_exchange_explicit(&tab->nhu_state, NHU_RUNNING, memory_order_acq_rel) == NHU_SCHEDULED); - } + tab->nhu_corked = tab->nhu_state; + tab->nhu_state = 0; + return; + } + + struct fib_iterator *fit = &tab->nhu_fit; + int max_feed = 32; + /* Initialize a new run */ + if (tab->nhu_state == NHU_SCHEDULED) + { + FIB_ITERATE_INIT(fit, &tab->fib); + tab->nhu_state = NHU_RUNNING; + + if (tab->flowspec_trie) + rt_flowspec_reset_trie(tab); + } + + /* Walk the fib one net after another */ FIB_ITERATE_START(&tab->fib, fit, net, n) { if (max_feed <= 0) { FIB_ITERATE_PUT(fit); - ev_send_loop(tab->loop, tab->nhu_event); - rt_unlock_table(tab); + birdloop_flag(tab->loop, RTF_NHU); return; } + lp_state lps; + lp_save(tmp_linpool, &lps); max_feed -= rt_next_hop_update_net(tab, n); + lp_restore(tmp_linpool, &lps); } FIB_ITERATE_END; + /* Finished NHU, cleanup */ + rt_trace(tab, D_EVENTS, "NHU done, scheduling export timer"); + rt_kick_export_settle(tab); + /* State change: * NHU_DIRTY -> NHU_SCHEDULED * NHU_RUNNING -> NHU_CLEAN */ - if (atomic_fetch_and_explicit(&tab->nhu_state, NHU_SCHEDULED, memory_order_acq_rel) != NHU_RUNNING) - ev_send_loop(tab->loop, tab->nhu_event); + if ((tab->nhu_state &= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); +} - ev_send_loop(tab->loop, tab->announce_event); +void +rt_new_default_table(struct symbol *s) +{ + for (uint addr_type = 0; addr_type < NET_MAX; addr_type++) + if (s == new_config->def_tables[addr_type]) + { + ASSERT_DIE(!s->table); + s->table = rt_new_table(s, addr_type); + return; + } - rt_unlock_table(tab); + bug("Requested an unknown new default table: %s", s->name); } +struct rtable_config * +rt_get_default_table(struct config *cf, uint addr_type) +{ + struct symbol *ts = cf->def_tables[addr_type]; + if (!ts) + return NULL; + + if (!ts->table) + rt_new_default_table(ts); + + return ts->table; +} struct rtable_config * rt_new_table(struct symbol *s, uint addr_type) { - /* Hack that allows to 'redefine' the master table */ - if ((s->class == SYM_TABLE) && - (s->table == new_config->def_tables[addr_type]) && - ((addr_type == NET_IP4) || (addr_type == NET_IP6))) - return s->table; + if (s->table) + cf_error("Duplicate configuration of table %s", s->name); struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config)); - cf_define_symbol(s, SYM_TABLE, table, c); + if (s == new_config->def_tables[addr_type]) + s->table = c; + else + cf_define_symbol(s, SYM_TABLE, table, c); + c->name = s->name; c->addr_type = addr_type; - c->gc_max_ops = 1000; - c->gc_min_time = 5; - c->min_settle_time = 1 S; - c->max_settle_time = 20 S; - c->min_rr_settle_time = 30 S; - c->max_rr_settle_time = 90 S; - c->cork_limit = 4 * page_size / sizeof(struct rt_pending_export); - c->owner = new_config; + c->gc_threshold = 1000; + c->gc_period = (uint) -1; /* set in rt_postconfig() */ + c->cork_threshold.low = 1024; + c->cork_threshold.high = 8192; + c->export_settle = (struct settle_config) { + .min = 1 MS, + .max = 100 MS, + }; + c->export_rr_settle = (struct settle_config) { + .min = 100 MS, + .max = 3 S, + }; + c->debug = new_config->table_debug; add_tail(&new_config->tables, &c->n); /* First table of each type is kept as default */ if (! 
new_config->def_tables[addr_type]) - new_config->def_tables[addr_type] = c; + new_config->def_tables[addr_type] = s; return c; } @@ -2907,8 +4030,9 @@ rt_new_table(struct symbol *s, uint addr_type) * configuration. */ void -rt_lock_table(rtable_private *r) +rt_lock_table_priv(struct rtable_private *r, const char *file, uint line) { + rt_trace(r, D_STATES, "Locked at %s:%d", file, line); r->use_count++; } @@ -2921,35 +4045,110 @@ rt_lock_table(rtable_private *r) * for deletion by configuration changes. */ void -rt_unlock_table(rtable_private *r) +rt_unlock_table_priv(struct rtable_private *r, const char *file, uint line) { - if (!--r->use_count && r->delete && - !r->prune_state && !atomic_load_explicit(&r->nhu_state, memory_order_acquire)) - birdloop_stop_self(r->loop, r->delete, r); + rt_trace(r, D_STATES, "Unlocked at %s:%d", file, line); + if (!--r->use_count && r->deleted) + /* Stop the service thread to finish this up */ + ev_send(&global_event_list, ev_new_init(r->rp, rt_shutdown, r)); } -static struct rtable_config * -rt_find_table_config(struct config *cf, char *name) +static void +rt_shutdown(void *tab_) { - struct symbol *sym = cf_find_symbol(cf, name); - return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; + struct rtable_private *r = tab_; + birdloop_stop(r->loop, rt_delete, r); +} + +static void +rt_delete(void *tab_) +{ + birdloop_enter(&main_birdloop); + + /* We assume that nobody holds the table reference now as use_count is zero. + * Anyway the last holder may still hold the lock. Therefore we lock and + * unlock it the last time to be sure that nobody is there. */ + struct rtable_private *tab = RT_LOCK((rtable *) tab_); + struct config *conf = tab->deleted; + + RT_UNLOCK(RT_PUB(tab)); + + birdloop_free(tab->loop); + rfree(tab->rp); + config_del_obstacle(conf); + + birdloop_leave(&main_birdloop); } + static void -rt_done(void *data) +rt_check_cork_low(struct rtable_private *tab) { - RT_LOCKED((rtable *) data, t) + if (!tab->cork_active) + return; + + if (tab->deleted || !tab->exporter.first || (tab->exporter.first->seq + tab->cork_threshold.low > tab->exporter.next_seq)) { - struct rtable_config *tc = t->config; - struct config *c = tc->owner; + tab->cork_active = 0; + rt_cork_release(); - tc->table = NULL; - rem_node(&t->n); + rt_trace(tab, D_STATES, "Uncorked"); + } +} - config_del_obstacle(c); +static void +rt_check_cork_high(struct rtable_private *tab) +{ + if (!tab->deleted && !tab->cork_active && tab->exporter.first && (tab->exporter.first->seq + tab->cork_threshold.high <= tab->exporter.next_seq)) + { + tab->cork_active = 1; + rt_cork_acquire(); + rt_export_used(&tab->exporter, tab->name, "corked"); + + rt_trace(tab, D_STATES, "Corked"); } } + +static int +rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old) +{ + if ((new->addr_type != old->addr_type) || + (new->sorted != old->sorted) || + (new->trie_used != old->trie_used)) + return 0; + + DBG("\t%s: same\n", new->name); + new->table = RT_PUB(tab); + tab->name = new->name; + tab->config = new; + + if (tab->hostcache) + tab->hostcache->req.trace_routes = new->debug; + + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, tab->exporter.e.hooks, h.n) + if (hook->h.req->export_one == rt_flowspec_export_one) + hook->h.req->trace_routes = new->debug; + + tab->cork_threshold = new->cork_threshold; + + if (new->cork_threshold.high != old->cork_threshold.high) + rt_check_cork_high(tab); + + if (new->cork_threshold.low != old->cork_threshold.low) + 
rt_check_cork_low(tab); + + return 1; +} + +static struct rtable_config * +rt_find_table_config(struct config *cf, char *name) +{ + struct symbol *sym = cf_find_symbol(cf, name); + return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; +} + /** * rt_commit - commit new routing table configuration * @new: new configuration @@ -2972,30 +4171,34 @@ rt_commit(struct config *new, struct config *old) { WALK_LIST(o, old->tables) { - RT_LOCK(o->table); - rtable_private *ot = RT_PRIV(o->table); - if (!ot->delete) - { - r = rt_find_table_config(new, o->name); - if (r && (r->addr_type == o->addr_type) && !new->shutdown) - { - DBG("\t%s: same\n", o->name); - r->table = (rtable *) ot; - ot->name = r->name; - ot->config = r; - if (o->sorted != r->sorted) - log(L_WARN "Reconfiguration of rtable sorted flag not implemented"); - } - else - { - DBG("\t%s: deleted\n", o->name); - rt_lock_table(ot); - ot->delete = rt_done; - config_add_obstacle(old); - rt_unlock_table(ot); - } - } - RT_UNLOCK(o->table); + struct rtable_private *tab = RT_LOCK(o->table); + + if (tab->deleted) + { + RT_UNLOCK(tab); + continue; + } + + r = rt_find_table_config(new, o->name); + if (r && !new->shutdown && rt_reconfigure(tab, r, o)) + { + RT_UNLOCK(tab); + continue; + } + + DBG("\t%s: deleted\n", o->name); + tab->deleted = old; + config_add_obstacle(old); + rt_lock_table(tab); + + rt_check_cork_low(tab); + + if (tab->hostcache && ev_get_list(&tab->hostcache->update) == &rt_cork.queue) + ev_postpone(&tab->hostcache->update); + + /* Force one more loop run */ + birdloop_flag(tab->loop, RTF_DELETE); + RT_UNLOCK(tab); } } @@ -3009,8 +4212,98 @@ rt_commit(struct config *new, struct config *old) DBG("\tdone\n"); } +static void +rt_feed_done(struct rt_export_hook *c) +{ + c->event.hook = rt_export_hook; + + rt_set_export_state(c, TES_READY); + + rt_send_export_event(c); +} + +#define MAX_FEED_BLOCK 1024 +typedef struct { + uint cnt, pos; + union { + struct rt_pending_export *rpe; + struct { + const rte **feed; + struct rt_feed_block_aux { + struct rt_pending_export *first, *last; + uint start; + } *aux; + }; + }; +} rt_feed_block; + +static int +rt_prepare_feed(struct rt_table_export_hook *c, net *n, rt_feed_block *b) +{ + if (n->routes) + { + if (c->h.req->export_bulk) + { + uint cnt = rte_feed_count(n); + if (b->cnt && (b->cnt + cnt > MAX_FEED_BLOCK)) + return 0; + + if (!b->cnt) + { + b->feed = tmp_alloc(sizeof(rte *) * MAX(MAX_FEED_BLOCK, cnt)); + + uint aux_block_size = (cnt >= MAX_FEED_BLOCK) ? 
2 : (MAX_FEED_BLOCK + 2 - cnt); + b->aux = tmp_alloc(sizeof(struct rt_feed_block_aux) * aux_block_size); + } + + rte_feed_obtain(n, &b->feed[b->cnt], cnt); + + b->aux[b->pos++] = (struct rt_feed_block_aux) { + .start = b->cnt, + .first = n->first, + .last = n->last, + }; + + b->cnt += cnt; + } + else if (b->pos == MAX_FEED_BLOCK) + return 0; + else + { + if (!b->pos) + b->rpe = tmp_alloc(sizeof(struct rt_pending_export) * MAX_FEED_BLOCK); + + b->rpe[b->pos++] = (struct rt_pending_export) { .new = n->routes, .new_best = n->routes }; + } + } + + return 1; +} + +static void +rt_process_feed(struct rt_table_export_hook *c, rt_feed_block *b) +{ + if (!b->pos) + return; + + if (c->h.req->export_bulk) + { + b->aux[b->pos].start = b->cnt; + for (uint p = 0; p < b->pos; p++) + { + struct rt_feed_block_aux *aux = &b->aux[p]; + const rte **feed = &b->feed[aux->start]; + + c->h.req->export_bulk(c->h.req, feed[0]->net, aux->first, aux->last, feed, (aux+1)->start - aux->start); + } + } + else + for (uint p = 0; p < b->pos; p++) + c->h.req->export_one(c->h.req, b->rpe[p].new->rte.net, &b->rpe[p]); +} + /** - * rt_feed_channel - advertise all routes to a channel + * rt_feed_by_fib - advertise all routes to a channel by walking a fib * @c: channel to be fed * * This function performs one pass of advertisement of routes to a channel that @@ -3019,119 +4312,154 @@ rt_commit(struct config *new, struct config *old) * order not to monopolize CPU time.) */ static void -rt_feed_channel(void *data) +rt_feed_by_fib(void *data) { - struct rt_export_hook *c = data; - + struct rt_table_export_hook *c = data; struct fib_iterator *fit = &c->feed_fit; - int max_feed = 256; - - rtable_private *tab; - if (c->export_state == TES_HUNGRY) - { - rt_set_export_state(c, TES_FEEDING); - - tab = RT_LOCK(c->table); + rt_feed_block block = {}; - struct rt_pending_export *rpe = rt_last_export(tab); - DBG("store hook=%p last_export=%p seq=%lu\n", c, rpe, rpe ? 
rpe->seq : 0); - atomic_store_explicit(&c->last_export, rpe, memory_order_relaxed); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - FIB_ITERATE_INIT(&c->feed_fit, &tab->fib); - } - else - tab = RT_LOCK(c->table); - - ASSERT_DIE(c->export_state == TES_FEEDING); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { -redo: FIB_ITERATE_START(&tab->fib, fit, net, n) { - if (max_feed <= 0) + if ((c->h.req->addr_mode == TE_ADDR_NONE) || net_in_netX(n->n.addr, c->h.req->addr)) + { + if (!rt_prepare_feed(c, n, &block)) { FIB_ITERATE_PUT(fit); - rt_send_export_event(c); - - RT_UNLOCK(c->table); + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); return; } - - if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_FEEDING) - { - RT_UNLOCK(c->table); - return; } + } + FIB_ITERATE_END; + } - if (!n->routes || !rte_is_valid(&n->routes->rte)) - ; /* if no route, do nothing */ - else if (c->req->export_bulk) - { - uint count = rte_feed_count(n); - if (count) - { - rte **feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(n, feed, count); + rt_process_feed(c, &block); + rt_feed_done(&c->h); +} + +static void +rt_feed_by_trie(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; - struct rt_pending_export *rpe_last, *rpe_first = n->first; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - rpe_last = rpe; + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - RT_UNLOCK(c->table); + ASSERT_DIE(c->walk_state); + struct f_trie_walk_state *ws = c->walk_state; - c->req->export_bulk(c->req, n->n.addr, NULL, feed, count); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - RT_LOCK(c->table); + do { + if (!c->walk_last.type) + continue; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - { - rpe_mark_seen(c, rpe); - if (rpe == rpe_last) - break; - ASSERT_DIE(rpe->seq < rpe_last->seq); - } + net *n = net_find(tab, &c->walk_last); + if (!n) + continue; - max_feed -= count; + if (!rt_prepare_feed(c, n, &block)) + { + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); + return; + } + } + while (trie_walk_next(ws, &c->walk_last)); - goto redo; - } - } - else if (c->req->export_one) - { - struct rt_pending_export rpe = { .new = n->routes, .new_best = n->routes }; + rt_unlock_trie(tab, c->walk_lock); + c->walk_lock = NULL; - struct rt_pending_export *rpe_last, *rpe_first = n->first; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - rpe_last = rpe; + mb_free(c->walk_state); + c->walk_state = NULL; - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - RT_UNLOCK(c->table); + c->walk_last.type = 0; - c->req->export_one(c->req, n->n.addr, &rpe); + } - RT_LOCK(c->table); - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - { - rpe_mark_seen(c, rpe); - if (rpe == rpe_last) - break; - ASSERT_DIE(rpe->seq < rpe_last->seq); - } + rt_process_feed(c, &block); + rt_feed_done(&c->h); +} - max_feed--; - goto redo; - } - else - bug("Export request must always provide an export method"); - } - FIB_ITERATE_END; +static void +rt_feed_equal(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; - c->event->hook = rt_export_hook; - rt_send_export_event(c); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, 
exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_EQUAL); - RT_UNLOCK(c->table); + if (n = net_find(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } - rt_set_export_state(c, TES_READY); + if (n) + rt_process_feed(c, &block); + + rt_feed_done(&c->h); +} + +static void +rt_feed_for(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; + + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_FOR); + + if (n = net_route(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } + + if (n) + rt_process_feed(c, &block); + + rt_feed_done(&c->h); +} + + +/* + * Import table + */ + +void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) +{ + struct channel *c = SKIP_BACK(struct channel, reload_req, req); + + for (uint i=0; i<count; i++) + if (feed[i]->sender == c->in_req.hook) + { + /* Strip the table-specific information */ + rte new = rte_init_from(feed[i]); + + /* Strip the later attribute layers */ + while (new.attrs->next) + new.attrs = new.attrs->next; + + /* And reload the route */ + rte_update(c, net, &new, new.src); + } + + rpe_mark_seen_all(req->hook, first, last, NULL); } @@ -3230,7 +4558,7 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) rem_node(&he->ln); hc_remove(hc, he); - sl_free(hc->slab, he); + sl_free(he); hc->hash_items--; if (hc->hash_items < hc->hash_min) @@ -3238,7 +4566,56 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) } static void -rt_init_hostcache(rtable_private *tab) +hc_notify_dump_req(struct rt_export_request *req) +{ + debug(" Table %s (%p)\n", req->name, req); +} + +static void +hc_notify_log_state_change(struct rt_export_request *req, u8 state) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + rt_trace((rtable *) hc->update.data, D_STATES, "HCU Export state changed to %s", rt_export_state_name(state)); +} + +static void +hc_notify_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + + /* No interest in this update, mark seen only */ + int interested = 1; + RT_LOCKED((rtable *) hc->update.data, tab) + if (ev_active(&hc->update) || !trie_match_net(hc->trie, net)) + { + rpe_mark_seen_all(req->hook, first, NULL, NULL); + interested = 0; + } + + if (!interested) + return; + + /* This net may affect some hostentries, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Do the hostcache update. 
*/ + if (o != RTE_VALID_OR_NULL(new_best)) + RT_LOCKED((rtable *) hc->update.data, tab) + if ((atomic_load_explicit(&req->hook->export_state, memory_order_acquire) == TES_READY) + && !ev_active(&hc->update)) + ev_send_loop(tab->loop, &hc->update); +} + + +static void +rt_init_hostcache(struct rtable_private *tab) { struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache)); init_list(&hc->hostentries); @@ -3247,14 +4624,30 @@ rt_init_hostcache(rtable_private *tab) hc_alloc_table(hc, tab->rp, HC_DEF_ORDER); hc->slab = sl_new(tab->rp, sizeof(struct hostentry)); - hc->lp = lp_new(tab->rp, LP_GOOD_SIZE(1024)); + hc->lp = lp_new(tab->rp); hc->trie = f_new_trie(hc->lp, 0); + hc->update = (event) { + .hook = rt_update_hostcache, + .data = tab, + }; + + hc->req = (struct rt_export_request) { + .name = mb_sprintf(tab->rp, "%s.hcu.notifier", tab->name), + .list = birdloop_event_list(tab->loop), + .trace_routes = tab->config->debug, + .dump_req = hc_notify_dump_req, + .log_state_change = hc_notify_log_state_change, + .export_one = hc_notify_export_one, + }; + + rt_table_export_start_locked(tab, &hc->req); + tab->hostcache = hc; } static void -rt_free_hostcache(rtable_private *tab) +rt_free_hostcache(struct rtable_private *tab) { struct hostcache *hc = tab->hostcache; @@ -3276,16 +4669,6 @@ rt_free_hostcache(rtable_private *tab) */ } -static void -rt_notify_hostcache(rtable_private *tab, net *net) -{ - if (ev_active(tab->hcu_event)) - return; - - if (trie_match_net(tab->hostcache->trie, net->n.addr)) - ev_send_loop(tab->loop, tab->hcu_event); -} - static int if_local_addr(ip_addr a, struct iface *i) { @@ -3299,14 +4682,14 @@ if_local_addr(ip_addr a, struct iface *i) } u32 -rt_get_igp_metric(rte *rt) +rt_get_igp_metric(const rte *rt) { - eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC); + eattr *ea = ea_find(rt->attrs, "igp_metric"); if (ea) return ea->u.data; - if (rt->attrs->source == RTS_DEVICE) + if (rt_get_source_attr(rt) == RTS_DEVICE) return 0; if (rt->src->owner->class->rte_igp_metric) @@ -3316,15 +4699,14 @@ rt_get_igp_metric(rte *rt) } static int -rt_update_hostentry(rtable_private *tab, struct hostentry *he) +rt_update_hostentry(struct rtable_private *tab, struct hostentry *he) { - rta *old_src = he->src; + ea_list *old_src = he->src; int direct = 0; int pxlen = 0; /* Reset the hostentry */ he->src = NULL; - he->dest = RTD_UNREACHABLE; he->nexthop_linkable = 0; he->igp_metric = 0; @@ -3334,11 +4716,13 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) if (n) { struct rte_storage *e = n->routes; - rta *a = e->rte.attrs; - word pref = a->pref; + ea_list *a = e->rte.attrs; + u32 pref = rt_get_preference(&e->rte); for (struct rte_storage *ee = n->routes; ee; ee = ee->next) - if ((ee->rte.attrs->pref >= pref) && ee->rte.attrs->hostentry) + if (rte_is_valid(&ee->rte) && + (rt_get_preference(&ee->rte) >= pref) && + ea_find(ee->rte.attrs, &ea_gen_hostentry)) { /* Recursive route should not depend on another recursive route */ log(L_WARN "Next hop address %I resolvable through recursive route for %N", @@ -3348,9 +4732,12 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) pxlen = n->n.addr->pxlen; - if (a->dest == RTD_UNICAST) - { - for (struct nexthop *nh = &(a->nh); nh; nh = nh->next) + eattr *nhea = ea_find(a, &ea_gen_nexthop); + ASSERT_DIE(nhea); + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + + if (NEXTHOP_IS_REACHABLE(nhad)) + NEXTHOP_WALK(nh, nhad) if (ipa_zero(nh->gw)) { if (if_local_addr(he->addr, nh->iface)) @@ -3363,10 +4750,8 @@ 
rt_update_hostentry(rtable_private *tab, struct hostentry *he) direct++; } - } he->src = rta_clone(a); - he->dest = a->dest; he->nexthop_linkable = !direct; he->igp_metric = rt_get_igp_metric(&e->rte); } @@ -3382,10 +4767,26 @@ done: static void rt_update_hostcache(void *data) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); + rtable **nhu_pending; + + RT_LOCKED((rtable *) data, tab) + { struct hostcache *hc = tab->hostcache; + + /* Shutdown shortcut */ + if (!hc->req.hook) + RT_RETURN(tab); + + if (rt_cork_check(&hc->update)) + { + rt_trace(tab, D_STATES, "Hostcache update corked"); + RT_RETURN(tab); + } + + /* Destination schedule map */ + nhu_pending = tmp_allocz(sizeof(rtable *) * rtable_max_id); + struct hostentry *he; node *n, *x; @@ -3403,31 +4804,33 @@ rt_update_hostcache(void *data) } if (rt_update_hostentry(tab, he)) - rt_schedule_nhu(he->tab); + nhu_pending[he->tab->id] = he->tab; } + } + + for (uint i=0; i<rtable_max_id; i++) + if (nhu_pending[i]) + RT_LOCKED(nhu_pending[i], dst) + rt_schedule_nhu(dst); } -struct hostentry * -rt_get_hostentry(rtable *t, ip_addr a, ip_addr ll, rtable *dep) +static struct hostentry * +rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep) { + ip_addr link = ipa_zero(ll) ? a : ll; struct hostentry *he; - rtable_private *tab = RT_LOCK(t); - if (!tab->hostcache) rt_init_hostcache(tab); u32 k = hc_hash(a, dep); struct hostcache *hc = tab->hostcache; for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next) - if (ipa_equal(he->addr, a) && (he->tab == dep)) - goto done; + if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep)) + return he; - he = hc_new_hostentry(hc, tab->rp, a, ipa_zero(ll) ? a : ll, dep, k); + he = hc_new_hostentry(hc, tab->rp, a, link, dep, k); rt_update_hostentry(tab, he); - -done: - RT_UNLOCK(t); return he; } |
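
Editor's note: the cork thresholds set up in rt_new_table() above (cork_threshold.low = 1024, cork_threshold.high = 8192) and tested by rt_check_cork_high()/rt_check_cork_low() form a simple hysteresis on the pending-export backlog: imports are corked once the backlog of not-yet-processed exports reaches the high mark, and released only after it drains below the low mark, so the table does not flap between the two states when the backlog hovers near a single limit. The following is a minimal standalone sketch of that idea, not BIRD code; the toy_* names are invented here, with plain sequence counters standing in for tab->exporter.first->seq and tab->exporter.next_seq.

#include <stdio.h>

/* Toy stand-in for the table state consulted by rt_check_cork_high()/_low() */
struct toy_table {
  unsigned long first_seq;   /* sequence number of the oldest pending export */
  unsigned long next_seq;    /* sequence number to be assigned to the next export */
  unsigned low, high;        /* cork thresholds, defaults as in rt_new_table() */
  int cork_active;
};

/* Cork when the backlog reaches the high mark ... */
static void toy_check_cork_high(struct toy_table *t)
{
  if (!t->cork_active && (t->first_seq + t->high <= t->next_seq))
  {
    t->cork_active = 1;
    printf("corked at backlog %lu\n", t->next_seq - t->first_seq);
  }
}

/* ... and uncork only after it has drained below the low mark */
static void toy_check_cork_low(struct toy_table *t)
{
  if (t->cork_active && (t->first_seq + t->low > t->next_seq))
  {
    t->cork_active = 0;
    printf("uncorked at backlog %lu\n", t->next_seq - t->first_seq);
  }
}

int main(void)
{
  struct toy_table t = { .low = 1024, .high = 8192 };

  /* Imports outpace exports: the backlog grows until the table corks at 8192 */
  for (int i = 0; i < 10000; i++) { t.next_seq++; toy_check_cork_high(&t); }

  /* Exports catch up: the table stays corked until the backlog drops below 1024 */
  while (t.first_seq < t.next_seq) { t.first_seq++; toy_check_cork_low(&t); }

  return 0;
}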