diff options
-rw-r--r-- | doc/bird.sgml | 28 | ||||
-rw-r--r-- | nest/route.h | 15 | ||||
-rw-r--r-- | nest/rt-table.c | 358 | ||||
-rw-r--r-- | proto/bgp/attrs.c | 4 | ||||
-rw-r--r-- | proto/bgp/bgp.c | 54 | ||||
-rw-r--r-- | proto/bgp/bgp.h | 6 | ||||
-rw-r--r-- | proto/bgp/config.Y | 17 | ||||
-rw-r--r-- | proto/bgp/packets.c | 28 | ||||
-rw-r--r-- | proto/pipe/pipe.c | 3 |
9 files changed, 487 insertions, 26 deletions
diff --git a/doc/bird.sgml b/doc/bird.sgml index 39dadaf2..d1d2bdae 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -2274,6 +2274,7 @@ avoid routing loops. <item> <rfc id="8092"> - BGP Large Communities Attribute <item> <rfc id="8203"> - BGP Administrative Shutdown Communication <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies +<item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications </itemize> <sect1>Route selection rules @@ -2659,7 +2660,7 @@ using the following configuration parameters: <tag><label id="bgp-error-wait-time">error wait time <m/number/,<m/number/</tag> Minimum and maximum delay in seconds between a protocol failure (either - local or reported by the peer) and automatic restart. Doesn't apply + local or reported by the peer) and automatic restart. Doesn not apply when <cf/disable after error/ is configured. If consecutive errors happen, the delay is increased exponentially until it reaches the maximum. Default: 60, 300. @@ -2837,6 +2838,31 @@ be used in explicit configuration. explicitly (to conserve memory). This option requires that the connected routing table is <ref id="dsc-table-sorted" name="sorted">. Default: off. + <tag><label id="bgp-validate">validate <m/switch/</tag> + Apply flowspec validation procedure as described in <rfc id="8955"> + section 6 and <rfc id="9117">. The Validation procedure enforces that + only routers in the forwarding path for a network can originate flowspec + rules for that network. The validation procedure should be used for EBGP + to prevent injection of malicious flowspec rules from outside, but it + should also be used for IBGP to ensure that selected flowspec rules are + consistent with selected IP routes. The validation procedure uses an IP + routing table (<ref id="bgp-base-table" name="base table">, see below) + against which flowspec rules are validated. This option is limited to + flowspec channels. Default: off (for compatibility reasons). + + Note that currently the flowspec validation does not work reliably + together with <ref id="bgp-import-table" name="import table"> option + enabled on flowspec channels. + + <tag><label id="bgp-base-table">base table <m/name/</tag> + Specifies an IP table used for the flowspec validation procedure. The + table must have enabled <cf/trie/ option, otherwise the validation + procedure would not work. The type of the table must be <cf/ipv4/ for + <cf/flow4/ channels and <cf/ipv6/ for <cf/flow6/ channels. This option + is limited to flowspec channels. Default: the main table of the + <cf/ipv4/ / <cf/ipv6/ channel of the same BGP instance, or the + <cf/master4/ / <cf/master6/ table if there is no such channel. + <tag><label id="bgp-extended-next-hop">extended next hop <m/switch/</tag> BGP expects that announced next hops have the same address family as associated network prefixes. This option provides an extension to use diff --git a/nest/route.h b/nest/route.h index a1732bc7..102ea0ea 100644 --- a/nest/route.h +++ b/nest/route.h @@ -192,6 +192,8 @@ typedef struct rtable { list subscribers; /* Subscribers for notifications */ struct timer *settle_timer; /* Settle time for notifications */ + list flowspec_links; /* List of flowspec links, src for NET_IPx and dst for NET_FLOWx */ + struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */ } rtable; struct rt_subscription { @@ -201,6 +203,13 @@ struct rt_subscription { void *data; }; +struct rt_flowspec_link { + node n; + rtable *src; + rtable *dst; + u32 uc; +}; + #define NHU_CLEAN 0 #define NHU_SCHEDULED 1 #define NHU_RUNNING 2 @@ -267,6 +276,7 @@ typedef struct rte { struct { u8 suppressed; /* Used for deterministic MED comparison */ s8 stale; /* Route is LLGR_STALE, -1 if unknown */ + struct rtable *base_table; /* Base table for Flowspec validation */ } bgp; #endif #ifdef CONFIG_BABEL @@ -322,6 +332,8 @@ void rt_lock_table(rtable *); void rt_unlock_table(rtable *); void rt_subscribe(rtable *tab, struct rt_subscription *s); void rt_unsubscribe(struct rt_subscription *s); +void rt_flowspec_link(rtable *src, rtable *dst); +void rt_flowspec_unlink(rtable *src, rtable *dst); rtable *rt_setup(pool *, struct rtable_config *); static inline void rt_shutdown(rtable *r) { rfree(r->rp); } @@ -743,6 +755,9 @@ rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; } static inline void rt_unlock_hostentry(struct hostentry *he) { if (he) he->uc--; } +int rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior); + + /* * Default protocol preferences */ diff --git a/nest/rt-table.c b/nest/rt-table.c index 94ae987b..ee679898 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -26,6 +26,66 @@ * (see the route attribute module for a precise explanation) holding the * remaining route attributes which are expected to be shared by multiple * routes in order to conserve memory. + * + * There are several mechanisms that allow automatic update of routes in one + * routing table (dst) as a result of changes in another routing table (src). + * They handle issues of recursive next hop resolving, flowspec validation and + * RPKI validation. + * + * The first such mechanism is handling of recursive next hops. A route in the + * dst table has an indirect next hop address, which is resolved through a route + * in the src table (which may also be the same table) to get an immediate next + * hop. This is implemented using structure &hostcache attached to the src + * table, which contains &hostentry structures for each tracked next hop + * address. These structures are linked from recursive routes in dst tables, + * possibly multiple routes sharing one hostentry (as many routes may have the + * same indirect next hop). There is also a trie in the hostcache, which matches + * all prefixes that may influence resolving of tracked next hops. + * + * When a best route changes in the src table, the hostcache is notified using + * rt_notify_hostcache(), which immediately checks using the trie whether the + * change is relevant and if it is, then it schedules asynchronous hostcache + * recomputation. The recomputation is done by rt_update_hostcache() (called + * from rt_event() of src table), it walks through all hostentries and resolves + * them (by rt_update_hostentry()). It also updates the trie. If a change in + * hostentry resolution was found, then it schedules asynchronous nexthop + * recomputation of associated dst table. That is done by rt_next_hop_update() + * (called from rt_event() of dst table), it iterates over all routes in the dst + * table and re-examines their hostentries for changes. Note that in contrast to + * hostcache update, next hop update can be interrupted by main loop. These two + * full-table walks (over hostcache and dst table) are necessary due to absence + * of direct lookups (route -> affected nexthop, nexthop -> its route). + * + * The second mechanism is for flowspec validation, where validity of flowspec + * routes depends of resolving their network prefixes in IP routing tables. This + * is similar to the recursive next hop mechanism, but simpler as there are no + * intermediate hostcache and hostentries (because flows are less likely to + * share common net prefix than routes sharing a common next hop). In src table, + * there is a list of dst tables (list flowspec_links), this list is updated by + * flowpsec channels (by rt_flowspec_link() and rt_flowspec_unlink() during + * channel start/stop). Each dst table has its own trie of prefixes that may + * influence validation of flowspec routes in it (flowspec_trie). + * + * When a best route changes in the src table, rt_flowspec_notify() immediately + * checks all dst tables from the list using their tries to see whether the + * change is relevant for them. If it is, then an asynchronous re-validation of + * flowspec routes in the dst table is scheduled. That is also done by function + * rt_next_hop_update(), like nexthop recomputation above. It iterates over all + * flowspec routes and re-validates them. It also recalculates the trie. + * + * Note that in contrast to the hostcache update, here the trie is recalculated + * during the rt_next_hop_update(), which may be interleaved with IP route + * updates. The trie is flushed at the beginning of recalculation, which means + * that such updates may use partial trie to see if they are relevant. But it + * works anyway! Either affected flowspec was already re-validated and added to + * the trie, then IP route change would match the trie and trigger a next round + * of re-validation, or it was not yet re-validated and added to the trie, but + * will be re-validated later in this round anyway. + * + * The third mechanism is used for RPKI re-validation of IP routes and it is the + * simplest. It is just a list of subscribers in src table, who are notified + * when any change happened, but only after a settle time. Also, in RPKI case + * the dst is not a table, but a channel, who refeeds routes through a filter. */ #undef LOCAL_DEBUG @@ -44,6 +104,7 @@ #include "lib/hash.h" #include "lib/string.h" #include "lib/alloca.h" +#include "lib/flowspec.h" #ifdef CONFIG_BGP #include "proto/bgp/bgp.h" @@ -62,6 +123,7 @@ static void rt_update_hostcache(rtable *tab); static void rt_next_hop_update(rtable *tab); static inline void rt_prune_table(rtable *tab); static inline void rt_schedule_notify(rtable *tab); +static void rt_flowspec_notify(rtable *tab, net *net); static void @@ -1193,6 +1255,9 @@ rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old, if (tab->hostcache) rt_notify_hostcache(tab, net); + + if (!EMPTY_LIST(tab->flowspec_links)) + rt_flowspec_notify(tab, net); } rt_schedule_notify(tab); @@ -1254,6 +1319,10 @@ rte_validate(rte *e) if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest) { + /* Exception for flowspec that failed validation */ + if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE)) + return 1; + log(L_WARN "Ignoring route %N with invalid dest %d received via %s", n->n.addr, e->attrs->dest, e->sender->proto->name); return 0; @@ -2097,6 +2166,90 @@ rt_unsubscribe(struct rt_subscription *s) rt_unlock_table(s->tab); } +static struct rt_flowspec_link * +rt_flowspec_find_link(rtable *src, rtable *dst) +{ + struct rt_flowspec_link *ln; + WALK_LIST(ln, src->flowspec_links) + if ((ln->src == src) && (ln->dst == dst)) + return ln; + + return NULL; +} + +void +rt_flowspec_link(rtable *src, rtable *dst) +{ + ASSERT(rt_is_ip(src)); + ASSERT(rt_is_flow(dst)); + + struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst); + + if (!ln) + { + rt_lock_table(src); + rt_lock_table(dst); + + ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link)); + ln->src = src; + ln->dst = dst; + add_tail(&src->flowspec_links, &ln->n); + } + + ln->uc++; +} + +void +rt_flowspec_unlink(rtable *src, rtable *dst) +{ + struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst); + + ASSERT(ln && (ln->uc > 0)); + + ln->uc--; + + if (!ln->uc) + { + rem_node(&ln->n); + mb_free(ln); + + rt_unlock_table(src); + rt_unlock_table(dst); + } +} + +static void +rt_flowspec_notify(rtable *src, net *net) +{ + /* Only IP tables are src links */ + ASSERT(rt_is_ip(src)); + + struct rt_flowspec_link *ln; + WALK_LIST(ln, src->flowspec_links) + { + rtable *dst = ln->dst; + ASSERT(rt_is_flow(dst)); + + /* No need to inspect it further if recalculation is already active */ + if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY)) + continue; + + if (trie_match_net(dst->flowspec_trie, net->n.addr)) + rt_schedule_nhu(dst); + } +} + +static void +rt_flowspec_reset_trie(rtable *tab) +{ + linpool *lp = tab->flowspec_trie->lp; + int ipv4 = tab->flowspec_trie->ipv4; + + lp_flush(lp); + tab->flowspec_trie = f_new_trie(lp, 0); + tab->flowspec_trie->ipv4 = ipv4; +} + static void rt_free(resource *_r) { @@ -2167,16 +2320,23 @@ rt_setup(pool *pp, struct rtable_config *cf) t->fib.init = net_init_with_trie; } + init_list(&t->channels); + init_list(&t->flowspec_links); + init_list(&t->subscribers); + if (!(t->internal = cf->internal)) { - init_list(&t->channels); hmap_init(&t->id_map, p, 1024); hmap_set(&t->id_map, 0); - init_list(&t->subscribers); - t->rt_event = ev_new_init(p, rt_event, t); t->last_rt_change = t->gc_time = current_time(); + + if (rt_is_flow(t)) + { + t->flowspec_trie = f_new_trie(lp_new_default(p), 0); + t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); + } } return t; @@ -2331,21 +2491,6 @@ rt_preconfig(struct config *c) * triggered by rt_schedule_nhu(). */ -static inline int -rta_next_hop_outdated(rta *a) -{ - struct hostentry *he = a->hostentry; - - if (!he) - return 0; - - if (!he->src) - return a->dest != RTD_UNREACHABLE; - - return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) || - (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh)); -} - void rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls) { @@ -2437,9 +2582,27 @@ no_nexthop: } } +static inline int +rta_next_hop_outdated(rta *a) +{ + struct hostentry *he = a->hostentry; + + if (!he) + return 0; + + if (!he->src) + return a->dest != RTD_UNREACHABLE; + + return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) || + (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh)); +} + static inline rte * rt_next_hop_update_rte(rtable *tab UNUSED, rte *old) { + if (!rta_next_hop_outdated(old->attrs)) + return NULL; + rta *a = alloca(RTA_MAX_SIZE); memcpy(a, old->attrs, rta_size(old->attrs)); @@ -2456,6 +2619,152 @@ rt_next_hop_update_rte(rtable *tab UNUSED, rte *old) return e; } + +#ifdef CONFIG_BGP + +static inline int +net_flow_has_dst_prefix(const net_addr *n) +{ + ASSUME(net_is_flow(n)); + + if (n->pxlen) + return 1; + + if (n->type == NET_FLOW4) + { + const net_addr_flow4 *n4 = (void *) n; + return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX); + } + else + { + const net_addr_flow6 *n6 = (void *) n; + return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX); + } +} + +static inline int +rta_as_path_is_empty(rta *a) +{ + eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + return !e || (as_path_getlen(e->u.ptr) == 0); +} + +static inline u32 +rta_get_first_asn(rta *a) +{ + eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + u32 asn; + + return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0; +} + +int +rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior) +{ + ASSERT(rt_is_ip(tab_ip)); + ASSERT(rt_is_flow(tab_flow)); + ASSERT(tab_ip->trie); + + /* RFC 8955 6. a) Flowspec has defined dst prefix */ + if (!net_flow_has_dst_prefix(n)) + return 0; + + /* RFC 9117 4.1. Accept AS_PATH is empty (fr */ + if (interior && rta_as_path_is_empty(a)) + return 1; + + + /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */ + + /* Find flowspec dst prefix */ + net_addr dst; + if (n->type == NET_FLOW4) + net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n)); + else + net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n)); + + /* Find best-match BGP unicast route for flowspec dst prefix */ + net *nb = net_route(tab_ip, &dst); + rte *rb = nb ? nb->routes : NULL; + + /* Register prefix to trie for tracking further changes */ + int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; + trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen); + + /* No best-match BGP route -> no flowspec */ + if (!rb || (rb->attrs->source != RTS_BGP)) + return 0; + + /* Find ORIGINATOR_ID values */ + u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0); + u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0); + + /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */ + if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from))) + return 0; + + + /* Find ASN of the best-match route, for use in next checks */ + u32 asn_b = rta_get_first_asn(rb->attrs); + if (!asn_b) + return 0; + + /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */ + if (!interior && (rta_get_first_asn(a) != asn_b)) + return 0; + + /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */ + TRIE_WALK(tab_ip->trie, subnet, &dst) + { + net *nc = net_find_valid(tab_ip, &subnet); + if (!nc) + continue; + + rte *rc = nc->routes; + if (rc->attrs->source != RTS_BGP) + return 0; + + if (rta_get_first_asn(rc->attrs) != asn_b) + return 0; + } + TRIE_WALK_END; + + return 1; +} + +#endif /* CONFIG_BGP */ + +static rte * +rt_flowspec_update_rte(rtable *tab, rte *r) +{ +#ifdef CONFIG_BGP + if ((r->attrs->source != RTS_BGP) || !r->u.bgp.base_table) + return NULL; + + const net_addr *n = r->net->n.addr; + struct bgp_proto *p = (void *) r->attrs->src->proto; + int valid = rt_flowspec_check(r->u.bgp.base_table, tab, n, r->attrs, p->is_interior); + int dest = valid ? RTD_NONE : RTD_UNREACHABLE; + + if (dest == r->attrs->dest) + return NULL; + + rta *a = alloca(RTA_MAX_SIZE); + memcpy(a, r->attrs, rta_size(r->attrs)); + a->dest = dest; + a->aflags = 0; + + rte *new = sl_alloc(rte_slab); + memcpy(new, r, sizeof(rte)); + new->attrs = rta_lookup(a); + + return new; +#else + return NULL; +#endif +} + + static inline int rt_next_hop_update_net(rtable *tab, net *n) { @@ -2468,9 +2777,14 @@ rt_next_hop_update_net(rtable *tab, net *n) return 0; for (k = &n->routes; e = *k; k = &e->next) - if (rta_next_hop_outdated(e->attrs)) + { + if (!net_is_flow(n->n.addr)) + new = rt_next_hop_update_rte(tab, e); + else + new = rt_flowspec_update_rte(tab, e); + + if (new) { - new = rt_next_hop_update_rte(tab, e); *k = new; rte_trace_in(D_ROUTES, new->sender, new, "updated"); @@ -2489,6 +2803,7 @@ rt_next_hop_update_net(rtable *tab, net *n) e = new; count++; } + } if (!count) return 0; @@ -2536,6 +2851,9 @@ rt_next_hop_update(rtable *tab) { FIB_ITERATE_INIT(fit, &tab->fib); tab->nhu_state = NHU_RUNNING; + + if (tab->flowspec_trie) + rt_flowspec_reset_trie(tab); } FIB_ITERATE_START(&tab->fib, fit, net, n) diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 24ba00ba..b28cbd55 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1676,6 +1676,10 @@ bgp_preexport(struct proto *P, rte **new, struct linpool *pool UNUSED) if (src == NULL) return 0; + /* Reject flowspec that failed validation */ + if ((e->attrs->dest == RTD_UNREACHABLE) && net_is_flow(e->net->n.addr)) + return -1; + /* IBGP route reflection, RFC 4456 */ if (p->is_internal && src->is_internal && (p->local_as == src->local_as)) { diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index e4d754b1..58084136 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -101,6 +101,7 @@ * RFC 8203 - BGP Administrative Shutdown Communication * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP + * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications * draft-ietf-idr-ext-opt-param-07 * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 @@ -1735,6 +1736,9 @@ bgp_channel_init(struct channel *C, struct channel_config *CF) if (cf->igp_table_ip6) c->igp_table_ip6 = cf->igp_table_ip6->table; + + if (cf->base_table) + c->base_table = cf->base_table->table; } static int @@ -1750,6 +1754,12 @@ bgp_channel_start(struct channel *C) if (c->igp_table_ip6) rt_lock_table(c->igp_table_ip6); + if (c->base_table) + { + rt_lock_table(c->base_table); + rt_flowspec_link(c->base_table, c->c.table); + } + c->pool = p->p.pool; // XXXX bgp_init_bucket_table(c); bgp_init_prefix_table(c); @@ -1834,6 +1844,12 @@ bgp_channel_cleanup(struct channel *C) if (c->igp_table_ip6) rt_unlock_table(c->igp_table_ip6); + if (c->base_table) + { + rt_flowspec_unlink(c->base_table, c->c.table); + rt_unlock_table(c->base_table); + } + c->index = 0; /* Cleanup rest of bgp_channel starting at pool field */ @@ -1881,6 +1897,25 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 cf_error("Undefined IGP table"); } +static struct rtable_config * +bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc) +{ + /* Expected table type */ + u32 type = (cc->afi == BGP_AF_FLOW4) ? NET_IP4 : NET_IP6; + + /* First, try appropriate IP channel */ + u32 afi2 = BGP_AF(BGP_AFI(cc->afi), BGP_SAFI_UNICAST); + struct bgp_channel_config *cc2 = bgp_find_channel_config(cf, afi2); + if (cc2 && (cc2->c.table->addr_type == type)) + return cc2->c.table; + + /* Last, try default table of given type */ + struct rtable_config *tab = cf->c.global->def_tables[type]; + if (tab) + return tab; + + cf_error("Undefined base table"); +} void bgp_postconfig(struct proto_config *CF) @@ -2025,6 +2060,14 @@ bgp_postconfig(struct proto_config *CF) cf_error("Mismatched IGP table type"); } + /* Default value of base table */ + if ((BGP_SAFI(cc->afi) == BGP_SAFI_FLOW) && cc->validate && !cc->base_table) + cc->base_table = bgp_default_base_table(cf, cc); + + if (cc->base_table && !cc->base_table->trie_used) + cf_error("Flowspec validation requires base table (%s) with trie", + cc->base_table->name); + if (cf->multihop && (cc->gw_mode == GW_DIRECT)) cf_error("Multihop BGP cannot use direct gateway mode"); @@ -2093,7 +2136,7 @@ bgp_reconfigure(struct proto *P, struct proto_config *CF) return same; } -#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL ) +#define TABLE(cf, NAME) ((cf)->NAME ? (cf)->NAME->table : NULL ) static int bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *import_changed, int *export_changed) @@ -2104,6 +2147,7 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor struct bgp_channel_config *old = c->cf; if ((new->secondary != old->secondary) || + (new->validate != old->validate) || (new->gr_able != old->gr_able) || (new->llgr_able != old->llgr_able) || (new->llgr_time != old->llgr_time) || @@ -2111,8 +2155,9 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor (new->add_path != old->add_path) || (new->import_table != old->import_table) || (new->export_table != old->export_table) || - (IGP_TABLE(new, ip4) != IGP_TABLE(old, ip4)) || - (IGP_TABLE(new, ip6) != IGP_TABLE(old, ip6))) + (TABLE(new, igp_table_ip4) != TABLE(old, igp_table_ip4)) || + (TABLE(new, igp_table_ip6) != TABLE(old, igp_table_ip6)) || + (TABLE(new, base_table) != TABLE(old, base_table))) return 0; if (new->mandatory && !old->mandatory && (C->channel_state != CS_UP)) @@ -2526,6 +2571,9 @@ bgp_show_proto_info(struct proto *P) if (c->igp_table_ip6) cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name); + + if (c->base_table) + cli_msg(-1006, " Base table: %s", c->base_table->name); } } } diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index cca4b448..e894d632 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -146,6 +146,7 @@ struct bgp_channel_config { u8 mandatory; /* Channel is mandatory in capability negotiation */ u8 gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */ u8 secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ + u8 validate; /* Validate Flowspec per RFC 8955 (6) */ u8 gr_able; /* Allow full graceful restart for the channel */ u8 llgr_able; /* Allow full long-lived GR for the channel */ uint llgr_time; /* Long-lived graceful restart stale time */ @@ -159,6 +160,7 @@ struct bgp_channel_config { struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + struct rtable_config *base_table; /* Base table for Flowspec validation */ }; #define BGP_PT_INTERNAL 1 @@ -340,6 +342,7 @@ struct bgp_channel { rtable *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ rtable *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + rtable *base_table; /* Base table for Flowspec validation */ /* Rest are zeroed when down */ pool *pool; @@ -449,6 +452,7 @@ struct bgp_parse_state { jmp_buf err_jmpbuf; struct hostentry *hostentry; + struct rtable *base_table; adata *mpls_labels; /* Cached state for bgp_rte_update() */ @@ -515,7 +519,7 @@ struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id); static inline int rte_resolvable(rte *rt) { - return rt->attrs->dest == RTD_UNICAST; + return rt->attrs->dest != RTD_UNREACHABLE; } diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 2dfbdca9..27c352c5 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -31,7 +31,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, - FIRST) + FIRST, VALIDATE, BASE) %type <i> bgp_nh %type <i32> bgp_afi @@ -255,6 +255,11 @@ bgp_channel_item: | GATEWAY DIRECT { BGP_CC->gw_mode = GW_DIRECT; } | GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; } | SECONDARY bool { BGP_CC->secondary = $2; } + | VALIDATE bool { + BGP_CC->validate = $2; + if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW) + cf_error("Validate option limited to flowspec channels"); + } | GRACEFUL RESTART bool { BGP_CC->gr_able = $3; } | LONG LIVED GRACEFUL RESTART bool { BGP_CC->llgr_able = $5; } | LONG LIVED STALE TIME expr { BGP_CC->llgr_time = $5; } @@ -278,6 +283,16 @@ bgp_channel_item: else cf_error("Mismatched IGP table type"); } + | BASE TABLE rtable { + if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW) + cf_error("Base table option limited to flowspec channels"); + + if (((BGP_CC->afi == BGP_AF_FLOW4) && ($3->addr_type == NET_IP4)) || + ((BGP_CC->afi == BGP_AF_FLOW6) && ($3->addr_type == NET_IP6))) + BGP_CC->base_table = $3; + else + cf_error("Mismatched base table type"); + } ; bgp_channel_opts: diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 99b5d5b4..e536f873 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -1009,6 +1009,23 @@ bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum) } } +static void +bgp_apply_flow_validation(struct bgp_parse_state *s, const net_addr *n, rta *a) +{ + struct bgp_channel *c = s->channel; + int valid = rt_flowspec_check(c->base_table, c->c.table, n, a, s->proto->is_interior); + a->dest = valid ? RTD_NONE : RTD_UNREACHABLE; + + /* Set rte.bgp.base_table later from this state variable */ + s->base_table = c->base_table; + + /* Invalidate cached rta if dest changes */ + if (s->cached_rta && (s->cached_rta->dest != a->dest)) + { + rta_free(s->cached_rta); + s->cached_rta = NULL; + } +} static int bgp_match_src(struct bgp_export_state *s, int mode) @@ -1370,6 +1387,7 @@ bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) e->pflags = 0; e->u.bgp.suppressed = 0; e->u.bgp.stale = -1; + e->u.bgp.base_table = s->base_table; rte_update3(&s->channel->c, n, e, s->last_src); } @@ -1884,6 +1902,10 @@ bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) net_fill_flow4(n, px, pxlen, pos, flen); ADVANCE(pos, len, flen); + /* Apply validation procedure per RFC 8955 (6) */ + if (a && s->channel->cf->validate) + bgp_apply_flow_validation(s, n, a); + bgp_rte_update(s, n, path_id, a); } } @@ -1972,6 +1994,10 @@ bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) net_fill_flow6(n, px, pxlen, pos, flen); ADVANCE(pos, len, flen); + /* Apply validation procedure per RFC 8955 (6) */ + if (a && s->channel->cf->validate) + bgp_apply_flow_validation(s, n, a); + bgp_rte_update(s, n, path_id, a); } } @@ -2425,6 +2451,8 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis s->last_id = 0; s->last_src = s->proto->p.main_source; + s->base_table = NULL; + /* * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index 3532f114..f991d09a 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -81,7 +81,10 @@ pipe_rt_notify(struct proto *P, struct channel *src_ch, net *n, rte *new, rte *o #ifdef CONFIG_BGP /* Hack to cleanup cached value */ if (e->attrs->src->proto->proto == &proto_bgp) + { e->u.bgp.stale = -1; + e->u.bgp.base_table = NULL; + } #endif src = a->src; |