BGP: Implement flowspec validation procedure

Implement flowspec validation procedure as described in RFC 8955 sec. 6 and RFC 9117. The validation procedure enforces that only routers in the forwarding path for a network can originate flowspec rules for that network. The patch adds a new mechanism for tracking inter-table dependencies, which is necessary as the flowspec validation depends on IP routes, and flowspec rules must be revalidated when best IP routes change. The validation procedure is disabled by default and requires that the relevant IP table uses a trie, as it uses interval queries for subnets.
author: Ondrej Zajicek (work) <santiago@crfreenet.org> 2021-12-20 20:25:35 +0100
committer: Ondrej Zajicek (work) <santiago@crfreenet.org> 2022-02-06 23:27:13 +0100
commit: 1f2eb2aca8e348fefc1822ec2adcad0cc97768d8 (patch)
tree: 11494fc2f2dbc8b7aeb2a4a172fec6d2263af4ab
parent: 1ae42e522374ae60c23fe4c419c62b2209fbeea8 (diff)
9 files changed, 487 insertions, 26 deletions
diff --git a/doc/bird.sgml b/doc/bird.sgml
index 39dadaf2..d1d2bdae 100644
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@@ -2274,6 +2274,7 @@ avoid routing loops.
 <item> <rfc id="8092"> - BGP Large Communities Attribute
 <item> <rfc id="8203"> - BGP Administrative Shutdown Communication
 <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies
+<item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications
 </itemize>
 
 <sect1>Route selection rules
@@ -2659,7 +2660,7 @@ using the following configuration parameters:
 
 	<tag><label id="bgp-error-wait-time">error wait time <m/number/,<m/number/</tag>
 	Minimum and maximum delay in seconds between a protocol failure (either
-	local or reported by the peer) and automatic restart. Doesn't apply
+	local or reported by the peer) and automatic restart. Does not apply
 	when <cf/disable after error/ is configured. If consecutive errors
 	happen, the delay is increased exponentially until it reaches the
 	maximum. Default: 60, 300.
@@ -2837,6 +2838,31 @@ be used in explicit configuration.
 	explicitly (to conserve memory). This option requires that the connected
 	routing table is <ref id="dsc-table-sorted" name="sorted">. Default: off.
 
+	<tag><label id="bgp-validate">validate <m/switch/</tag>
+	Apply flowspec validation procedure as described in <rfc id="8955">
+	section 6 and <rfc id="9117">. The Validation procedure enforces that
+	only routers in the forwarding path for a network can originate flowspec
+	rules for that network. The validation procedure should be used for EBGP
+	to prevent injection of malicious flowspec rules from outside, but it
+	should also be used for IBGP to ensure that selected flowspec rules are
+	consistent with selected IP routes. The validation procedure uses an IP
+	routing table (<ref id="bgp-base-table" name="base table">, see below)
+	against which flowspec rules are validated. This option is limited to
+	flowspec channels. Default: off (for compatibility reasons).
+
+	Note that currently the flowspec validation does not work reliably
+	together with <ref id="bgp-import-table" name="import table"> option
+	enabled on flowspec channels.
+
+	<tag><label id="bgp-base-table">base table <m/name/</tag>
+	Specifies an IP table used for the flowspec validation procedure. The
+	table must have the <cf/trie/ option enabled, otherwise the validation
+	procedure would not work. The type of the table must be <cf/ipv4/ for
+	<cf/flow4/ channels and <cf/ipv6/ for <cf/flow6/ channels. This option
+	is limited to flowspec channels. Default: the main table of the
+	<cf/ipv4/ / <cf/ipv6/ channel of the same BGP instance, or the
+	<cf/master4/ / <cf/master6/ table if there is no such channel.
+
 	<tag><label id="bgp-extended-next-hop">extended next hop <m/switch/</tag>
 	BGP expects that announced next hops have the same address family as
 	associated network prefixes. This option provides an extension to use
diff --git a/nest/route.h b/nest/route.h
index a1732bc7..102ea0ea 100644
--- a/nest/route.h
+++ b/nest/route.h
@@ -192,6 +192,8 @@ typedef struct rtable {
 
   list subscribers;			/* Subscribers for notifications */
   struct timer *settle_timer;		/* Settle time for notifications */
+  list flowspec_links;			/* List of flowspec links, src for NET_IPx and dst for NET_FLOWx */
+  struct f_trie *flowspec_trie;		/* Trie for evaluation of flowspec notifications */
 } rtable;
 
 struct rt_subscription {
@@ -201,6 +203,13 @@ struct rt_subscription {
   void *data;
 };
 
+struct rt_flowspec_link {
+  node n;
+  rtable *src;
+  rtable *dst;
+  u32 uc;
+};
+
 #define NHU_CLEAN	0
 #define NHU_SCHEDULED	1
 #define NHU_RUNNING	2
@@ -267,6 +276,7 @@ typedef struct rte {
     struct {
       u8 suppressed;			/* Used for deterministic MED comparison */
       s8 stale;				/* Route is LLGR_STALE, -1 if unknown */
+      struct rtable *base_table;	/* Base table for Flowspec validation */
     } bgp;
 #endif
 #ifdef CONFIG_BABEL
@@ -322,6 +332,8 @@ void rt_lock_table(rtable *);
 void rt_unlock_table(rtable *);
 void rt_subscribe(rtable *tab, struct rt_subscription *s);
 void rt_unsubscribe(struct rt_subscription *s);
+void rt_flowspec_link(rtable *src, rtable *dst);
+void rt_flowspec_unlink(rtable *src, rtable *dst);
 rtable *rt_setup(pool *, struct rtable_config *);
 static inline void rt_shutdown(rtable *r) { rfree(r->rp); }
 
@@ -743,6 +755,9 @@ rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr
 static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; }
 static inline void rt_unlock_hostentry(struct hostentry *he) { if (he) he->uc--; }
 
+int rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior);
+
+
 /*
  *	Default protocol preferences
  */
diff --git a/nest/rt-table.c b/nest/rt-table.c
index 94ae987b..ee679898 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -26,6 +26,66 @@
  * (see the route attribute module for a precise explanation) holding the
  * remaining route attributes which are expected to be shared by multiple
  * routes in order to conserve memory.
+ *
+ * There are several mechanisms that allow automatic update of routes in one
+ * routing table (dst) as a result of changes in another routing table (src).
+ * They handle issues of recursive next hop resolving, flowspec validation and
+ * RPKI validation.
+ *
+ * The first such mechanism is handling of recursive next hops. A route in the
+ * dst table has an indirect next hop address, which is resolved through a route
+ * in the src table (which may also be the same table) to get an immediate next
+ * hop. This is implemented using structure &hostcache attached to the src
+ * table, which contains &hostentry structures for each tracked next hop
+ * address. These structures are linked from recursive routes in dst tables,
+ * possibly multiple routes sharing one hostentry (as many routes may have the
+ * same indirect next hop). There is also a trie in the hostcache, which matches
+ * all prefixes that may influence resolving of tracked next hops.
+ *
+ * When a best route changes in the src table, the hostcache is notified using
+ * rt_notify_hostcache(), which immediately checks using the trie whether the
+ * change is relevant and if it is, then it schedules asynchronous hostcache
+ * recomputation. The recomputation is done by rt_update_hostcache() (called
+ * from rt_event() of src table), it walks through all hostentries and resolves
+ * them (by rt_update_hostentry()). It also updates the trie. If a change in
+ * hostentry resolution was found, then it schedules asynchronous nexthop
+ * recomputation of associated dst table. That is done by rt_next_hop_update()
+ * (called from rt_event() of dst table), it iterates over all routes in the dst
+ * table and re-examines their hostentries for changes. Note that in contrast to
+ * hostcache update, next hop update can be interrupted by main loop. These two
+ * full-table walks (over hostcache and dst table) are necessary due to absence
+ * of direct lookups (route -> affected nexthop, nexthop -> its route).
+ *
+ * The second mechanism is for flowspec validation, where validity of flowspec
+ * routes depends on resolving their network prefixes in IP routing tables. This
+ * is similar to the recursive next hop mechanism, but simpler as there are no
+ * intermediate hostcache and hostentries (because flows are less likely to
+ * share common net prefix than routes sharing a common next hop). In src table,
+ * there is a list of dst tables (list flowspec_links), this list is updated by
+ * flowspec channels (by rt_flowspec_link() and rt_flowspec_unlink() during
+ * channel start/stop). Each dst table has its own trie of prefixes that may
+ * influence validation of flowspec routes in it (flowspec_trie).
+ *
+ * When a best route changes in the src table, rt_flowspec_notify() immediately
+ * checks all dst tables from the list using their tries to see whether the
+ * change is relevant for them. If it is, then an asynchronous re-validation of
+ * flowspec routes in the dst table is scheduled. That is also done by function
+ * rt_next_hop_update(), like nexthop recomputation above. It iterates over all
+ * flowspec routes and re-validates them. It also recalculates the trie.
+ *
+ * Note that in contrast to the hostcache update, here the trie is recalculated
+ * during the rt_next_hop_update(), which may be interleaved with IP route
+ * updates. The trie is flushed at the beginning of recalculation, which means
+ * that such updates may use partial trie to see if they are relevant. But it
+ * works anyway! Either affected flowspec was already re-validated and added to
+ * the trie, then IP route change would match the trie and trigger a next round
+ * of re-validation, or it was not yet re-validated and added to the trie, but
+ * will be re-validated later in this round anyway.
+ *
+ * The third mechanism is used for RPKI re-validation of IP routes and it is the
+ * simplest. It is just a list of subscribers in src table, who are notified
+ * when any change happened, but only after a settle time. Also, in RPKI case
+ * the dst is not a table, but a channel, which refeeds routes through a filter.
  */
 
 #undef LOCAL_DEBUG
@@ -44,6 +104,7 @@
 #include "lib/hash.h"
 #include "lib/string.h"
 #include "lib/alloca.h"
+#include "lib/flowspec.h"
 
 #ifdef CONFIG_BGP
 #include "proto/bgp/bgp.h"
@@ -62,6 +123,7 @@ static void rt_update_hostcache(rtable *tab);
 static void rt_next_hop_update(rtable *tab);
 static inline void rt_prune_table(rtable *tab);
 static inline void rt_schedule_notify(rtable *tab);
+static void rt_flowspec_notify(rtable *tab, net *net);
 
 
 static void
@@ -1193,6 +1255,9 @@ rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
 
     if (tab->hostcache)
       rt_notify_hostcache(tab, net);
+
+    if (!EMPTY_LIST(tab->flowspec_links))
+      rt_flowspec_notify(tab, net);
   }
 
   rt_schedule_notify(tab);
@@ -1254,6 +1319,10 @@ rte_validate(rte *e)
 
   if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
   {
+    /* Exception for flowspec that failed validation */
+    if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
+      return 1;
+
     log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
 	n->n.addr, e->attrs->dest, e->sender->proto->name);
     return 0;
@@ -2097,6 +2166,90 @@ rt_unsubscribe(struct rt_subscription *s)
   rt_unlock_table(s->tab);
 }
 
+static struct rt_flowspec_link *
+rt_flowspec_find_link(rtable *src, rtable *dst)
+{
+  struct rt_flowspec_link *ln;
+  WALK_LIST(ln, src->flowspec_links)
+    if ((ln->src == src) && (ln->dst == dst))
+      return ln;
+
+  return NULL;
+}
+
+void
+rt_flowspec_link(rtable *src, rtable *dst)
+{
+  ASSERT(rt_is_ip(src));
+  ASSERT(rt_is_flow(dst));
+
+  struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
+
+  if (!ln)
+  {
+    rt_lock_table(src);
+    rt_lock_table(dst);
+
+    ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link));
+    ln->src = src;
+    ln->dst = dst;
+    add_tail(&src->flowspec_links, &ln->n);
+  }
+
+  ln->uc++;
+}
+
+void
+rt_flowspec_unlink(rtable *src, rtable *dst)
+{
+  struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
+
+  ASSERT(ln && (ln->uc > 0));
+
+  ln->uc--;
+
+  if (!ln->uc)
+  {
+    rem_node(&ln->n);
+    mb_free(ln);
+
+    rt_unlock_table(src);
+    rt_unlock_table(dst);
+  }
+}
+
+static void
+rt_flowspec_notify(rtable *src, net *net)
+{
+  /* Only IP tables are src links */
+  ASSERT(rt_is_ip(src));
+
+  struct rt_flowspec_link *ln;
+  WALK_LIST(ln, src->flowspec_links)
+  {
+    rtable *dst = ln->dst;
+    ASSERT(rt_is_flow(dst));
+
+    /* No need to inspect it further if recalculation is already active */
+    if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY))
+      continue;
+
+    if (trie_match_net(dst->flowspec_trie, net->n.addr))
+      rt_schedule_nhu(dst);
+  }
+}
+
+static void
+rt_flowspec_reset_trie(rtable *tab)
+{
+  linpool *lp = tab->flowspec_trie->lp;
+  int ipv4 = tab->flowspec_trie->ipv4;
+
+  lp_flush(lp);
+  tab->flowspec_trie = f_new_trie(lp, 0);
+  tab->flowspec_trie->ipv4 = ipv4;
+}
+
 static void
 rt_free(resource *_r)
 {
@@ -2167,16 +2320,23 @@ rt_setup(pool *pp, struct rtable_config *cf)
     t->fib.init = net_init_with_trie;
   }
 
+  init_list(&t->channels);
+  init_list(&t->flowspec_links);
+  init_list(&t->subscribers);
+
   if (!(t->internal = cf->internal))
   {
-    init_list(&t->channels);
     hmap_init(&t->id_map, p, 1024);
     hmap_set(&t->id_map, 0);
 
-    init_list(&t->subscribers);
-
     t->rt_event = ev_new_init(p, rt_event, t);
     t->last_rt_change = t->gc_time = current_time();
+
+    if (rt_is_flow(t))
+    {
+      t->flowspec_trie = f_new_trie(lp_new_default(p), 0);
+      t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4);
+    }
   }
 
   return t;
@@ -2331,21 +2491,6 @@ rt_preconfig(struct config *c)
  * triggered by rt_schedule_nhu().
  */
 
-static inline int
-rta_next_hop_outdated(rta *a)
-{
-  struct hostentry *he = a->hostentry;
-
-  if (!he)
-    return 0;
-
-  if (!he->src)
-    return a->dest != RTD_UNREACHABLE;
-
-  return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
-    (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
-}
-
 void
 rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls)
 {
@@ -2437,9 +2582,27 @@ no_nexthop:
     }
 }
 
+static inline int
+rta_next_hop_outdated(rta *a)
+{
+  struct hostentry *he = a->hostentry;
+
+  if (!he)
+    return 0;
+
+  if (!he->src)
+    return a->dest != RTD_UNREACHABLE;
+
+  return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
+    (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
+}
+
 static inline rte *
 rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
 {
+  if (!rta_next_hop_outdated(old->attrs))
+    return NULL;
+
   rta *a = alloca(RTA_MAX_SIZE);
   memcpy(a, old->attrs, rta_size(old->attrs));
 
@@ -2456,6 +2619,152 @@ rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
   return e;
 }
 
+
+#ifdef CONFIG_BGP
+
+static inline int
+net_flow_has_dst_prefix(const net_addr *n)
+{
+  ASSUME(net_is_flow(n));
+
+  if (n->pxlen)
+    return 1;
+
+  if (n->type == NET_FLOW4)
+  {
+    const net_addr_flow4 *n4 = (void *) n;
+    return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
+  }
+  else
+  {
+    const net_addr_flow6 *n6 = (void *) n;
+    return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
+  }
+}
+
+static inline int
+rta_as_path_is_empty(rta *a)
+{
+  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
+  return !e || (as_path_getlen(e->u.ptr) == 0);
+}
+
+static inline u32
+rta_get_first_asn(rta *a)
+{
+  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
+  u32 asn;
+
+  return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
+}
+
+int
+rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
+{
+  ASSERT(rt_is_ip(tab_ip));
+  ASSERT(rt_is_flow(tab_flow));
+  ASSERT(tab_ip->trie);
+
+  /* RFC 8955 6. a) Flowspec has defined dst prefix */
+  if (!net_flow_has_dst_prefix(n))
+    return 0;
+
+  /* RFC 9117 4.1. Accept if AS_PATH is empty (interior sessions only) */
+  if (interior && rta_as_path_is_empty(a))
+    return 1;
+
+
+  /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */
+
+  /* Find flowspec dst prefix */
+  net_addr dst;
+  if (n->type == NET_FLOW4)
+    net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
+  else
+    net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));
+
+  /* Find best-match BGP unicast route for flowspec dst prefix */
+  net *nb = net_route(tab_ip, &dst);
+  rte *rb = nb ? nb->routes : NULL;
+
+  /* Register prefix to trie for tracking further changes */
+  int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
+  trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);
+
+  /* No best-match BGP route -> no flowspec */
+  if (!rb || (rb->attrs->source != RTS_BGP))
+    return 0;
+
+  /* Find ORIGINATOR_ID values */
+  u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
+  u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
+
+  /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
+  if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
+    return 0;
+
+
+  /* Find ASN of the best-match route, for use in next checks */
+  u32 asn_b = rta_get_first_asn(rb->attrs);
+  if (!asn_b)
+    return 0;
+
+  /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
+  if (!interior && (rta_get_first_asn(a) != asn_b))
+    return 0;
+
+  /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
+  TRIE_WALK(tab_ip->trie, subnet, &dst)
+  {
+    net *nc = net_find_valid(tab_ip, &subnet);
+    if (!nc)
+      continue;
+
+    rte *rc = nc->routes;
+    if (rc->attrs->source != RTS_BGP)
+      return 0;
+
+    if (rta_get_first_asn(rc->attrs) != asn_b)
+      return 0;
+  }
+  TRIE_WALK_END;
+
+  return 1;
+}
+
+#endif /* CONFIG_BGP */
+
+static rte *
+rt_flowspec_update_rte(rtable *tab, rte *r)
+{
+#ifdef CONFIG_BGP
+  if ((r->attrs->source != RTS_BGP) || !r->u.bgp.base_table)
+    return NULL;
+
+  const net_addr *n = r->net->n.addr;
+  struct bgp_proto *p = (void *) r->attrs->src->proto;
+  int valid = rt_flowspec_check(r->u.bgp.base_table, tab, n, r->attrs, p->is_interior);
+  int dest = valid ? RTD_NONE : RTD_UNREACHABLE;
+
+  if (dest == r->attrs->dest)
+    return NULL;
+
+  rta *a = alloca(RTA_MAX_SIZE);
+  memcpy(a, r->attrs, rta_size(r->attrs));
+  a->dest = dest;
+  a->aflags = 0;
+
+  rte *new = sl_alloc(rte_slab);
+  memcpy(new, r, sizeof(rte));
+  new->attrs = rta_lookup(a);
+
+  return new;
+#else
+  return NULL;
+#endif
+}
+
+
 static inline int
 rt_next_hop_update_net(rtable *tab, net *n)
 {
@@ -2468,9 +2777,14 @@ rt_next_hop_update_net(rtable *tab, net *n)
     return 0;
 
   for (k = &n->routes; e = *k; k = &e->next)
-    if (rta_next_hop_outdated(e->attrs))
+  {
+    if (!net_is_flow(n->n.addr))
+      new = rt_next_hop_update_rte(tab, e);
+    else
+      new = rt_flowspec_update_rte(tab, e);
+
+    if (new)
       {
-	new = rt_next_hop_update_rte(tab, e);
 	*k = new;
 
 	rte_trace_in(D_ROUTES, new->sender, new, "updated");
@@ -2489,6 +2803,7 @@ rt_next_hop_update_net(rtable *tab, net *n)
 	e = new;
 	count++;
       }
+  }
 
   if (!count)
     return 0;
@@ -2536,6 +2851,9 @@ rt_next_hop_update(rtable *tab)
     {
       FIB_ITERATE_INIT(fit, &tab->fib);
       tab->nhu_state = NHU_RUNNING;
+
+      if (tab->flowspec_trie)
+	rt_flowspec_reset_trie(tab);
     }
 
   FIB_ITERATE_START(&tab->fib, fit, net, n)
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index 24ba00ba..b28cbd55 100644
--- a/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@ -1676,6 +1676,10 @@ bgp_preexport(struct proto *P, rte **new, struct linpool *pool UNUSED)
   if (src == NULL)
     return 0;
 
+  /* Reject flowspec that failed validation */
+  if ((e->attrs->dest == RTD_UNREACHABLE) && net_is_flow(e->net->n.addr))
+      return -1;
+
   /* IBGP route reflection, RFC 4456 */
   if (p->is_internal && src->is_internal && (p->local_as == src->local_as))
   {
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index e4d754b1..58084136 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -101,6 +101,7 @@
  * RFC 8203 - BGP Administrative Shutdown Communication
  * RFC 8212 - Default EBGP Route Propagation Behavior without Policies
  * RFC 8654 - Extended Message Support for BGP
+ * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications
  * draft-ietf-idr-ext-opt-param-07
  * draft-uttaro-idr-bgp-persistence-04
  * draft-walton-bgp-hostname-capability-02
@@ -1735,6 +1736,9 @@ bgp_channel_init(struct channel *C, struct channel_config *CF)
 
   if (cf->igp_table_ip6)
     c->igp_table_ip6 = cf->igp_table_ip6->table;
+
+  if (cf->base_table)
+    c->base_table = cf->base_table->table;
 }
 
 static int
@@ -1750,6 +1754,12 @@ bgp_channel_start(struct channel *C)
   if (c->igp_table_ip6)
     rt_lock_table(c->igp_table_ip6);
 
+  if (c->base_table)
+  {
+    rt_lock_table(c->base_table);
+    rt_flowspec_link(c->base_table, c->c.table);
+  }
+
   c->pool = p->p.pool; // XXXX
   bgp_init_bucket_table(c);
   bgp_init_prefix_table(c);
@@ -1834,6 +1844,12 @@ bgp_channel_cleanup(struct channel *C)
   if (c->igp_table_ip6)
     rt_unlock_table(c->igp_table_ip6);
 
+  if (c->base_table)
+  {
+    rt_flowspec_unlink(c->base_table, c->c.table);
+    rt_unlock_table(c->base_table);
+  }
+
   c->index = 0;
 
   /* Cleanup rest of bgp_channel starting at pool field */
@@ -1881,6 +1897,25 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32
   cf_error("Undefined IGP table");
 }
 
+static struct rtable_config *
+bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc)
+{
+  /* Expected table type */
+  u32 type = (cc->afi == BGP_AF_FLOW4) ? NET_IP4 : NET_IP6;
+
+  /* First, try appropriate IP channel */
+  u32 afi2 = BGP_AF(BGP_AFI(cc->afi), BGP_SAFI_UNICAST);
+  struct bgp_channel_config *cc2 = bgp_find_channel_config(cf, afi2);
+  if (cc2 && (cc2->c.table->addr_type == type))
+    return cc2->c.table;
+
+  /* Last, try default table of given type */
+  struct rtable_config *tab = cf->c.global->def_tables[type];
+  if (tab)
+    return tab;
+
+  cf_error("Undefined base table");
+}
 
 void
 bgp_postconfig(struct proto_config *CF)
@@ -2025,6 +2060,14 @@ bgp_postconfig(struct proto_config *CF)
 	cf_error("Mismatched IGP table type");
     }
 
+    /* Default value of base table */
+    if ((BGP_SAFI(cc->afi) == BGP_SAFI_FLOW) && cc->validate && !cc->base_table)
+      cc->base_table = bgp_default_base_table(cf, cc);
+
+    if (cc->base_table && !cc->base_table->trie_used)
+      cf_error("Flowspec validation requires base table (%s) with trie",
+	       cc->base_table->name);
+
     if (cf->multihop && (cc->gw_mode == GW_DIRECT))
       cf_error("Multihop BGP cannot use direct gateway mode");
 
@@ -2093,7 +2136,7 @@ bgp_reconfigure(struct proto *P, struct proto_config *CF)
   return same;
 }
 
-#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL )
+#define TABLE(cf, NAME) ((cf)->NAME ? (cf)->NAME->table : NULL )
 
 static int
 bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *import_changed, int *export_changed)
@@ -2104,6 +2147,7 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor
   struct bgp_channel_config *old = c->cf;
 
   if ((new->secondary != old->secondary) ||
+      (new->validate != old->validate) ||
       (new->gr_able != old->gr_able) ||
       (new->llgr_able != old->llgr_able) ||
       (new->llgr_time != old->llgr_time) ||
@@ -2111,8 +2155,9 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor
       (new->add_path != old->add_path) ||
       (new->import_table != old->import_table) ||
       (new->export_table != old->export_table) ||
-      (IGP_TABLE(new, ip4) != IGP_TABLE(old, ip4)) ||
-      (IGP_TABLE(new, ip6) != IGP_TABLE(old, ip6)))
+      (TABLE(new, igp_table_ip4) != TABLE(old, igp_table_ip4)) ||
+      (TABLE(new, igp_table_ip6) != TABLE(old, igp_table_ip6)) ||
+      (TABLE(new, base_table) != TABLE(old, base_table)))
     return 0;
 
   if (new->mandatory && !old->mandatory && (C->channel_state != CS_UP))
@@ -2526,6 +2571,9 @@ bgp_show_proto_info(struct proto *P)
 
       if (c->igp_table_ip6)
 	cli_msg(-1006, "    IGP IPv6 table: %s", c->igp_table_ip6->name);
+
+      if (c->base_table)
+	cli_msg(-1006, "    Base table:     %s", c->base_table->name);
     }
   }
 }
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index cca4b448..e894d632 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -146,6 +146,7 @@ struct bgp_channel_config {
   u8 mandatory;				/* Channel is mandatory in capability negotiation */
   u8 gw_mode;				/* How we compute route gateway from next_hop attr, see GW_* */
   u8 secondary;				/* Accept also non-best routes (i.e. RA_ACCEPTED) */
+  u8 validate;				/* Validate Flowspec per RFC 8955 (6) */
   u8 gr_able;				/* Allow full graceful restart for the channel */
   u8 llgr_able;				/* Allow full long-lived GR for the channel */
   uint llgr_time;			/* Long-lived graceful restart stale time */
@@ -159,6 +160,7 @@ struct bgp_channel_config {
 
   struct rtable_config *igp_table_ip4;	/* Table for recursive IPv4 next hop lookups */
   struct rtable_config *igp_table_ip6;	/* Table for recursive IPv6 next hop lookups */
+  struct rtable_config *base_table;	/* Base table for Flowspec validation */
 };
 
 #define BGP_PT_INTERNAL		1
@@ -340,6 +342,7 @@ struct bgp_channel {
 
   rtable *igp_table_ip4;		/* Table for recursive IPv4 next hop lookups */
   rtable *igp_table_ip6;		/* Table for recursive IPv6 next hop lookups */
+  rtable *base_table;			/* Base table for Flowspec validation */
 
   /* Rest are zeroed when down */
   pool *pool;
@@ -449,6 +452,7 @@ struct bgp_parse_state {
   jmp_buf err_jmpbuf;
 
   struct hostentry *hostentry;
+  struct rtable *base_table;
   adata *mpls_labels;
 
   /* Cached state for bgp_rte_update() */
@@ -515,7 +519,7 @@ struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id);
 static inline int
 rte_resolvable(rte *rt)
 {
-  return rt->attrs->dest == RTD_UNICAST;
+  return rt->attrs->dest != RTD_UNREACHABLE;
 }
 
 
diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y
index 2dfbdca9..27c352c5 100644
--- a/proto/bgp/config.Y
+++ b/proto/bgp/config.Y
@@ -31,7 +31,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE,
 	STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG,
 	LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS,
 	DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE,
-	FIRST)
+	FIRST, VALIDATE, BASE)
 
 %type <i> bgp_nh
 %type <i32> bgp_afi
@@ -255,6 +255,11 @@ bgp_channel_item:
  | GATEWAY DIRECT { BGP_CC->gw_mode = GW_DIRECT; }
  | GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; }
  | SECONDARY bool { BGP_CC->secondary = $2; }
+ | VALIDATE bool {
+    BGP_CC->validate = $2;
+    if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW)
+      cf_error("Validate option limited to flowspec channels");
+   }
  | GRACEFUL RESTART bool { BGP_CC->gr_able = $3; }
  | LONG LIVED GRACEFUL RESTART bool { BGP_CC->llgr_able = $5; }
  | LONG LIVED STALE TIME expr { BGP_CC->llgr_time = $5; }
@@ -278,6 +283,16 @@ bgp_channel_item:
     else
       cf_error("Mismatched IGP table type");
    }
+ | BASE TABLE rtable {
+    if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW)
+      cf_error("Base table option limited to flowspec channels");
+
+    if (((BGP_CC->afi == BGP_AF_FLOW4) && ($3->addr_type == NET_IP4)) ||
+        ((BGP_CC->afi == BGP_AF_FLOW6) && ($3->addr_type == NET_IP6)))
+      BGP_CC->base_table = $3;
+    else
+      cf_error("Mismatched base table type");
+   }
  ;
 
 bgp_channel_opts:
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index 99b5d5b4..e536f873 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -1009,6 +1009,23 @@ bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
   }
 }
 
+static void
+bgp_apply_flow_validation(struct bgp_parse_state *s, const net_addr *n, rta *a)
+{
+  struct bgp_channel *c = s->channel;
+  int valid = rt_flowspec_check(c->base_table, c->c.table, n, a, s->proto->is_interior);
+  a->dest = valid ? RTD_NONE : RTD_UNREACHABLE;
+
+  /* Set rte.bgp.base_table later from this state variable */
+  s->base_table = c->base_table;
+
+  /* Invalidate cached rta if dest changes */
+  if (s->cached_rta && (s->cached_rta->dest != a->dest))
+  {
+    rta_free(s->cached_rta);
+    s->cached_rta = NULL;
+  }
+}
 
 static int
 bgp_match_src(struct bgp_export_state *s, int mode)
@@ -1370,6 +1387,7 @@ bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
   e->pflags = 0;
   e->u.bgp.suppressed = 0;
   e->u.bgp.stale = -1;
+  e->u.bgp.base_table = s->base_table;
   rte_update3(&s->channel->c, n, e, s->last_src);
 }
 
@@ -1884,6 +1902,10 @@ bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
     net_fill_flow4(n, px, pxlen, pos, flen);
     ADVANCE(pos, len, flen);
 
+    /* Apply validation procedure per RFC 8955 (6) */
+    if (a && s->channel->cf->validate)
+      bgp_apply_flow_validation(s, n, a);
+
     bgp_rte_update(s, n, path_id, a);
   }
 }
@@ -1972,6 +1994,10 @@ bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
     net_fill_flow6(n, px, pxlen, pos, flen);
     ADVANCE(pos, len, flen);
 
+    /* Apply validation procedure per RFC 8955 (6) */
+    if (a && s->channel->cf->validate)
+      bgp_apply_flow_validation(s, n, a);
+
     bgp_rte_update(s, n, path_id, a);
   }
 }
@@ -2425,6 +2451,8 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis
   s->last_id = 0;
   s->last_src = s->proto->p.main_source;
 
+  s->base_table = NULL;
+
   /*
    * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
    * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c
index 3532f114..f991d09a 100644
--- a/proto/pipe/pipe.c
+++ b/proto/pipe/pipe.c
@@ -81,7 +81,10 @@ pipe_rt_notify(struct proto *P, struct channel *src_ch, net *n, rte *new, rte *o
 #ifdef CONFIG_BGP
       /* Hack to cleanup cached value */
       if (e->attrs->src->proto->proto == &proto_bgp)
+      {
 	e->u.bgp.stale = -1;
+	e->u.bgp.base_table = NULL;
+      }
 #endif
 
       src = a->src;
author	Ondrej Zajicek (work) <santiago@crfreenet.org>	2021-12-20 20:25:35 +0100
committer	Ondrej Zajicek (work) <santiago@crfreenet.org>	2022-02-06 23:27:13 +0100
commit	1f2eb2aca8e348fefc1822ec2adcad0cc97768d8 (patch)
tree	11494fc2f2dbc8b7aeb2a4a172fec6d2263af4ab
parent	1ae42e522374ae60c23fe4c419c62b2209fbeea8 (diff)