diff options
-rw-r--r-- | doc/bird.sgml | 15 | ||||
-rw-r--r-- | nest/proto-hooks.c | 20 | ||||
-rw-r--r-- | nest/protocol.h | 2 | ||||
-rw-r--r-- | nest/route.h | 5 | ||||
-rw-r--r-- | nest/rt-table.c | 45 | ||||
-rw-r--r-- | proto/bgp/attrs.c | 172 | ||||
-rw-r--r-- | proto/bgp/bgp.c | 4 | ||||
-rw-r--r-- | proto/bgp/bgp.h | 2 | ||||
-rw-r--r-- | proto/bgp/config.Y | 3 |
9 files changed, 243 insertions, 25 deletions
diff --git a/doc/bird.sgml b/doc/bird.sgml index 7f53f028..b8dabf47 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1316,6 +1316,21 @@ for each neighbor using the following configuration parameters: received from the same AS (which is the standard behavior). Default: off. + <tag>deterministic med <m/switch/</tag> BGP route selection + algorithm is often viewed as a comparison between individual + routes (e.g. if a new route appears and is better than the + current best one, it is chosen as the new best one). But the + proper route selection, as specified by RFC 4271, cannot be + fully implemented in that way. The problem is mainly in + handling the MED attribute. BIRD, by default, uses an + simplification based on individual route comparison, which in + some cases may lead to temporally dependent behavior (i.e. the + selection is dependent on the order in which routes appeared). + This option enables a different (and slower) algorithm + implementing proper RFC 4271 route selection, which is + deterministic. Alternative way how to get deterministic + behavior is to use <cf/med metric/ option. Default: off. + <tag>igp metric <m/switch/</tag> Enable comparison of internal distances to boundary routers during best route selection. Default: on. diff --git a/nest/proto-hooks.c b/nest/proto-hooks.c index f0261922..2582c48b 100644 --- a/nest/proto-hooks.c +++ b/nest/proto-hooks.c @@ -268,6 +268,26 @@ int import_control(struct proto *p, rte **e, ea_list **attrs, struct linpool *po { DUMMY; } /** + * rte_recalculate - prepare routes for comparison + * @table: a routing table + * @net: a network entry + * @new: new route for the network + * @old: old route for the network + * @old_best: old best route for the network (may be NULL) + * + * This hook is called when a route change (from @old to @new for a + * @net entry) is propagated to a @table. It may be used to prepare + * routes for comparison by rte_better() in the best route + * selection. @new may or may not be in @net->routes list, + * @old is not there. + * + * Result: 1 if the ordering implied by rte_better() changes enough + * that full best route calculation have to be done, 0 otherwise. + */ +int rte_recalculate(struct rtable *table, struct network *net, struct rte *new, struct rte *old, struct rte *old_best) +{ DUMMY; } + +/** * rte_better - compare metrics of two routes * @new: the new route * @old: the original route diff --git a/nest/protocol.h b/nest/protocol.h index a7518c2a..3766e15e 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -178,12 +178,14 @@ struct proto { /* * Routing entry hooks (called only for rte's belonging to this protocol): * + * rte_recalculate Called at the beginning of the best route selection * rte_better Compare two rte's and decide which one is better (1=first, 0=second). * rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no). * rte_insert Called whenever a rte is inserted to a routing table. * rte_remove Called whenever a rte is removed from the routing table. */ + int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_better)(struct rte *, struct rte *); int (*rte_same)(struct rte *, struct rte *); void (*rte_insert)(struct network *, struct rte *); diff --git a/nest/route.h b/nest/route.h index a4c01548..e6712c64 100644 --- a/nest/route.h +++ b/nest/route.h @@ -201,6 +201,11 @@ typedef struct rte { u32 router_id; /* Router that originated this route */ } ospf; #endif +#ifdef CONFIG_BGP + struct { + u8 suppressed; /* Used for deterministic MED comparison */ + } bgp; +#endif struct { /* Routes generated by krt sync (both temporary and inherited ones) */ s8 src; /* Alleged route source (see krt.h) */ u8 proto; /* Kernel source protocol ID */ diff --git a/nest/rt-table.c b/nest/rt-table.c index e20d2f6d..377687de 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -498,6 +498,9 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte rte_announce(table, RA_ANY, net, new, old, tmpa); + if (src->rte_recalculate && src->rte_recalculate(table, net, new, old, old_best)) + goto do_recalculate; + if (new && rte_better(new, old_best)) { /* The first case - the new route is cleary optimal, we link it @@ -516,6 +519,7 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte that route at the first position and announce it. New optimal route might be NULL if there is no more routes */ + do_recalculate: /* Add the new route to the list */ if (new) { @@ -1015,27 +1019,36 @@ rt_next_hop_update_net(rtable *tab, net *n) if (!old_best) return 0; - new_best = NULL; - for (k = &n->routes; e = *k; k = &e->next) - { - if (rta_next_hop_outdated(e->attrs)) - { - new = rt_next_hop_update_rte(tab, e); - *k = new; + if (rta_next_hop_outdated(e->attrs)) + { + new = rt_next_hop_update_rte(tab, e); + *k = new; - rte_announce_i(tab, RA_ANY, n, new, e); - rte_trace_in(D_ROUTES, new->sender, new, "updated"); + rte_announce_i(tab, RA_ANY, n, new, e); + rte_trace_in(D_ROUTES, new->sender, new, "updated"); - if (e != old_best) - rte_free_quick(e); - else /* Freeing of the old best rte is postponed */ - free_old_best = 1; + /* Call a pre-comparison hook */ + /* Not really an efficient way to compute this */ + if (e->attrs->proto->rte_recalculate) + e->attrs->proto->rte_recalculate(tab, n, new, e, NULL); - e = new; - count++; - } + if (e != old_best) + rte_free_quick(e); + else /* Freeing of the old best rte is postponed */ + free_old_best = 1; + e = new; + count++; + } + + if (!count) + return 0; + + /* Find the new best route */ + new_best = NULL; + for (k = &n->routes; e = *k; k = &e->next) + { if (!new_best || rte_better(e, *new_best)) new_best = k; } diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 2832f428..93c1f6d6 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1125,6 +1125,14 @@ bgp_rte_better(rte *new, rte *old) eattr *x, *y; u32 n, o; + /* Skip suppressed routes (see bgp_rte_recalculate()) */ + n = new->u.bgp.suppressed; + o = old->u.bgp.suppressed; + if (n > o) + return 0; + if (n < o) + return 1; + /* RFC 4271 9.1.2.1. Route resolvability test */ n = rte_resolvable(new); o = rte_resolvable(old); @@ -1167,14 +1175,15 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 4271 9.1.2.2. c) Compare MED's */ - /* This is noncompliant. Proper RFC 4271 path selection cannot be - * interpreted as finding the best path in some ordering. - * Therefore, it cannot be implemented in BIRD without some ugly - * hacks. This is just an approximation, which in specific - * situations may lead to persistent routing loops, because it is - * nondeterministic - it depends on the order in which routes - * appeared. But it is also the same behavior as used by default in - * Cisco routers, so it is probably not a big issue. + /* Proper RFC 4271 path selection cannot be interpreted as finding + * the best path in some ordering. It is implemented partially in + * bgp_rte_recalculate() when deterministic_med option is + * active. Without that option, the behavior is just an + * approximation, which in specific situations may lead to + * persistent routing loops, because it is nondeterministic - it + * depends on the order in which routes appeared. But it is also the + * same behavior as used by default in Cisco routers, so it is + * probably not a big issue. */ if (new_bgp->cf->med_metric || old_bgp->cf->med_metric || (bgp_get_neighbor(new) == bgp_get_neighbor(old))) @@ -1236,6 +1245,148 @@ bgp_rte_better(rte *new, rte *old) return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0); } + +static inline int +same_group(rte *r, u32 lpref, u32 lasn) +{ + return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn); +} + +static inline int +use_deterministic_med(rte *r) +{ + return ((struct bgp_proto *) r->attrs->proto)->cf->deterministic_med; +} + +int +bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) +{ + rte *r, *s; + rte *key = new ? new : old; + u32 lpref = key->pref; + u32 lasn = bgp_get_neighbor(key); + int old_is_group_best = 0; + + /* + * Proper RFC 4271 path selection is a bit complicated, it cannot be + * implemented just by rte_better(), because it is not a linear + * ordering. But it can be splitted to two levels, where the lower + * level chooses the best routes in each group of routes from the + * same neighboring AS and higher level chooses the best route (with + * a slightly different ordering) between the best-in-group routes. + * + * When deterministic_med is disabled, we just ignore this issue and + * choose the best route by bgp_rte_better() alone. If enabled, the + * lower level of the route selection is done here (for the group + * to which the changed route belongs), all routes in group are + * marked as suppressed, just chosen best-in-group is not. + * + * Global best route selection then implements higher level by + * choosing between non-suppressed routes (as they are always + * preferred over suppressed routes). Routes from BGP protocols + * that do not set deterministic_med are just never suppressed. As + * they do not participate in the lower level selection, it is OK + * that this fn is not called for them. + * + * The idea is simple, the implementation is more problematic, + * mostly because of optimizations in rte_recalculate() that + * avoids full recalculation in most cases. + * + * We can assume that at least one of new, old is non-NULL and both + * are from the same protocol with enabled deterministic_med. We + * group routes by both neighbor AS (lasn) and preference (lpref), + * because bgp_rte_better() does not handle preference itself. + */ + + /* If new and old are from different groups, we just process that + as two independent events */ + if (new && old && !same_group(old, lpref, lasn)) + { + int i1, i2; + i1 = bgp_rte_recalculate(table, net, NULL, old, old_best); + i2 = bgp_rte_recalculate(table, net, new, NULL, old_best); + return i1 || i2; + } + + /* + * We could find the best-in-group and then make some shortcuts like + * in rte_recalculate, but as we would have to walk through all + * net->routes just to find it, it is probably not worth. So we + * just have two simpler fast cases that use just the old route. + * We also set suppressed flag to avoid using it in bgp_rte_better(). + */ + + if (new) + new->u.bgp.suppressed = 1; + + if (old) + { + old_is_group_best = !old->u.bgp.suppressed; + old->u.bgp.suppressed = 1; + int new_is_better = new && bgp_rte_better(new, old); + + /* The first case - replace not best with worse (or remove not best) */ + if (!old_is_group_best && !new_is_better) + return 0; + + /* The second case - replace the best with better */ + if (old_is_group_best && new_is_better) + { + /* new is best-in-group, the see discussion below - this is + a special variant of NBG && OBG. From OBG we can deduce + that same_group(old_best) iff (old == old_best) */ + new->u.bgp.suppressed = 0; + return (old == old_best); + } + } + + /* The default case - find a new best-in-group route */ + r = new; /* new may not be in the list */ + for (s=net->routes; s; s=s->next) + if (use_deterministic_med(s) && same_group(s, lpref, lasn)) + { + s->u.bgp.suppressed = 1; + if (!r || bgp_rte_better(s, r)) + r = s; + } + + /* Simple case - the last route in group disappears */ + if (!r) + return 0; + + /* Found best-in-group */ + r->u.bgp.suppressed = 0; + + /* + * There are generally two reasons why we have to force + * recalculation (return 1): First, the new route may be wrongfully + * chosen to be the best in the first case check in + * rte_recalculate(), this may happen only if old_best is from the + * same group. Second, another (different than new route) + * best-in-group is chosen and that may be the proper best (although + * rte_recalculate() without ignore that possibility). + * + * There are three possible cases according to whether the old route + * was the best in group (OBG, stored in old_is_group_best) and + * whether the new route is the best in group (NBG, tested by r == new). + * These cases work even if old or new is NULL. + * + * NBG -> new is a possible candidate for the best route, so we just + * check for the first reason using same_group(). + * + * !NBG && OBG -> Second reason applies, return 1 + * + * !NBG && !OBG -> Best in group does not change, old != old_best, + * rte_better(new, old_best) is false and therefore + * the first reason does not apply, return 0 + */ + + if (r == new) + return old_best && same_group(old_best, lpref, lasn); + else + return old_is_group_best; +} + static struct adata * bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool) { @@ -1614,6 +1765,11 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs) eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN)); u32 origas; + /* + if (e->u.bgp.suppressed) + buf += bsprintf(buf, " -"); + */ + buf += bsprintf(buf, " (%d", e->pref); if (e->attrs->hostentry) { diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 675342de..28396a52 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -908,6 +908,10 @@ bgp_init(struct proto_config *C) P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; + + if (c->deterministic_med) + P->rte_recalculate = bgp_rte_recalculate; + p->cf = c; p->local_as = c->local_as; p->remote_as = c->remote_as; diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 437ba335..0c50583b 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -29,6 +29,7 @@ struct bgp_config { int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */ int igp_metric; /* Use IGP metrics when selecting best route */ int prefer_older; /* Prefer older routes according to RFC 5004 */ + int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */ u32 default_local_pref; /* Default value for LOCAL_PREF attribute */ u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */ int capabilities; /* Enable capability handshake [RFC3392] */ @@ -185,6 +186,7 @@ byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned att struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, unsigned int len, struct linpool *pool, int mandatory); int bgp_get_attr(struct eattr *e, byte *buf, int buflen); int bgp_rte_better(struct rte *, struct rte *); +int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); void bgp_attr_init(struct bgp_proto *); diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 03c233d6..3ef9b290 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -25,7 +25,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, - TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY) + TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC) CF_GRAMMAR @@ -82,6 +82,7 @@ bgp_proto: | bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; } | bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; } | bgp_proto PREFER OLDER bool ';' { BGP_CFG->prefer_older = $4; } + | bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; } | bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; } | bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; } | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; } |