summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/bird.sgml29
-rw-r--r--lib/route.h24
-rw-r--r--nest/proto.c17
-rw-r--r--nest/rt-attr.c41
-rw-r--r--nest/rt-table.c72
-rw-r--r--nest/rt.h4
-rw-r--r--proto/bgp/attrs.c205
-rw-r--r--proto/bgp/bgp.c37
-rw-r--r--proto/bgp/bgp.h46
-rw-r--r--proto/bgp/config.Y16
-rw-r--r--proto/bgp/packets.c56
-rw-r--r--sysdep/cf/README1
-rw-r--r--sysdep/cf/linux.h1
-rw-r--r--sysdep/linux/netlink.c49
-rw-r--r--sysdep/unix/krt.c72
-rw-r--r--sysdep/unix/krt.h9
16 files changed, 458 insertions, 221 deletions
diff --git a/doc/bird.sgml b/doc/bird.sgml
index a4d7cb0c..c29353fc 100644
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@@ -2379,6 +2379,7 @@ avoid routing loops.
<item> <rfc id="8203"> - BGP Administrative Shutdown Communication
<item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies
<item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications
+<item> <rfc id="9234"> - Route Leak Prevention and Detection Using Roles
</itemize>
<sect1>Route selection rules
@@ -2819,6 +2820,29 @@ using the following configuration parameters:
protocol itself (for example, if a route is received through eBGP and
therefore does not have such attribute). Default: 100 (0 in pre-1.2.0
versions of BIRD).
+
+ <tag><label id="bgp-local-role">local role <m/role-name/</tag>
+ BGP roles are a mechanism for route leak prevention and automatic route
+ filtering based on common BGP topology relationships. They are defined
+ in <rfc id="9234">. Instead of manually configuring filters and
+ communities, automatic filtering is done with the help of the OTC
+ attribute - a flag for routes that should be sent only to customers.
+ The same attribute is also used to automatically detect and filter route
+ leaks created by third parties.
+
+ This option is valid for EBGP sessions, but it is not recommended to be
+ used within AS confederations (which would require manual filtering of
+ <cf/bgp_otc/ attribute on confederation boundaries).
+
+ Possible <cf><m/role-name/</cf> values are: <cf/provider/,
+ <cf/rs_server/, <cf/rs_client/, <cf/customer/ and <cf/peer/.
+ Default: No local role assigned.
+
+ <tag><label id="bgp-require-roles">require roles <m/switch/</tag>
+ If this option is set, the BGP roles must be defined on both sides,
+ otherwise the session will not be established. This behavior is defined
+ in <rfc id="9234"> as "strict mode" and is used to enforce corresponding
+ configuration at your conterpart side. Default: disabled.
</descrip>
<sect1>Channel configuration
@@ -3126,6 +3150,11 @@ some of them (marked with `<tt/O/') are optional.
This attribute contains accumulated IGP metric, which is a total
distance to the destination through multiple autonomous systems.
Currently, the attribute is not accessible from filters.
+
+ <tag><label id="bgp-otc">int bgp_otc [O]</tag>
+ This attribute is defined in <rfc id="9234">. OTC is a flag that marks
+ routes that should be sent only to customers. If <ref id="bgp-role"
+ name="local Role"> is configured it set automatically.
</descrip>
<sect1>Example
diff --git a/lib/route.h b/lib/route.h
index cf3c70ba..f5456396 100644
--- a/lib/route.h
+++ b/lib/route.h
@@ -10,6 +10,8 @@
#ifndef _BIRD_LIB_ROUTE_H_
#define _BIRD_LIB_ROUTE_H_
+#undef RT_SOURCE_DEBUG
+
#include "lib/type.h"
#include "lib/rcu.h"
#include "lib/hash.h"
@@ -87,6 +89,11 @@ struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id);
struct rte_src *rt_find_source_global(u32 id);
+#ifdef RT_SOURCE_DEBUG
+#define rt_lock_source _rt_lock_source_internal
+#define rt_unlock_source _rt_unlock_source_internal
+#endif
+
static inline void rt_lock_source(struct rte_src *src)
{
/* Locking a source is trivial; somebody already holds it so we just increase
@@ -139,6 +146,14 @@ static inline void rt_unlock_source(struct rte_src *src)
rcu_read_unlock();
}
+#ifdef RT_SOURCE_DEBUG
+#undef rt_lock_source
+#undef rt_unlock_source
+
+#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) )
+#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) )
+#endif
+
void rt_init_sources(struct rte_owner *, const char *name, event_list *list);
void rt_destroy_sources(struct rte_owner *, event *);
@@ -242,7 +257,7 @@ typedef struct ea_list {
struct ea_storage {
struct ea_storage *next_hash; /* Next in hash chain */
struct ea_storage **pprev_hash; /* Previous in hash chain */
- u32 uc; /* Use count */
+ _Atomic u32 uc; /* Use count */
u32 hash_key; /* List hash */
ea_list l[0]; /* The list itself */
};
@@ -511,12 +526,15 @@ static inline struct ea_storage *ea_get_storage(ea_list *r)
return SKIP_BACK(struct ea_storage, l[0], r);
}
-static inline ea_list *ea_clone(ea_list *r) { ea_get_storage(r)->uc++; return r; }
+static inline ea_list *ea_clone(ea_list *r) {
+ ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel));
+ return r;
+}
void ea__free(struct ea_storage *r);
static inline void ea_free(ea_list *l) {
if (!l) return;
struct ea_storage *r = ea_get_storage(l);
- if (!--r->uc) ea__free(r);
+ if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) ea__free(r);
}
void ea_dump(ea_list *);
diff --git a/nest/proto.c b/nest/proto.c
index e64cef28..853b1cf9 100644
--- a/nest/proto.c
+++ b/nest/proto.c
@@ -313,9 +313,9 @@ proto_remove_channels(struct proto *p)
}
static void
-channel_roa_in_changed(struct rt_subscription *s)
+channel_roa_in_changed(void *_data)
{
- struct channel *c = s->data;
+ struct channel *c = _data;
int active = !!c->reload_req.hook;
CD(c, "Reload triggered by RPKI change%s", active ? " - already active" : "");
@@ -327,9 +327,9 @@ channel_roa_in_changed(struct rt_subscription *s)
}
static void
-channel_roa_out_changed(struct rt_subscription *s)
+channel_roa_out_changed(void *_data)
{
- struct channel *c = s->data;
+ struct channel *c = _data;
CD(c, "Feeding triggered by RPKI change");
c->refeed_pending = 1;
@@ -347,14 +347,14 @@ struct roa_subscription {
static int
channel_roa_is_subscribed(struct channel *c, rtable *tab, int dir)
{
- void (*hook)(struct rt_subscription *) =
+ void (*hook)(void *) =
dir ? channel_roa_in_changed : channel_roa_out_changed;
struct roa_subscription *s;
node *n;
WALK_LIST2(s, n, c->roa_subscriptions, roa_node)
- if ((s->s.tab == tab) && (s->s.hook == hook))
+ if ((s->s.tab == tab) && (s->s.event->hook == hook))
return 1;
return 0;
@@ -368,9 +368,9 @@ channel_roa_subscribe(struct channel *c, rtable *tab, int dir)
return;
struct roa_subscription *s = mb_allocz(c->proto->pool, sizeof(struct roa_subscription));
+ s->s.event = ev_new_init(c->proto->pool, dir ? channel_roa_in_changed : channel_roa_out_changed, c);
+ s->s.list = proto_work_list(c->proto);
- s->s.hook = dir ? channel_roa_in_changed : channel_roa_out_changed;
- s->s.data = c;
rt_subscribe(tab, &s->s);
add_tail(&c->roa_subscriptions, &s->roa_node);
@@ -381,6 +381,7 @@ channel_roa_unsubscribe(struct roa_subscription *s)
{
rt_unsubscribe(&s->s);
rem_node(&s->roa_node);
+ rfree(s->s.event);
mb_free(s);
}
diff --git a/nest/rt-attr.c b/nest/rt-attr.c
index b3a4c7a1..471209ee 100644
--- a/nest/rt-attr.c
+++ b/nest/rt-attr.c
@@ -1043,6 +1043,8 @@ ea_list_ref(ea_list *l)
}
}
+static void ea_free_nested(ea_list *l);
+
static void
ea_list_unref(ea_list *l)
{
@@ -1063,7 +1065,7 @@ ea_list_unref(ea_list *l)
}
if (l->next)
- ea_free(l->next);
+ ea_free_nested(l->next);
}
void
@@ -1472,9 +1474,15 @@ ea_lookup(ea_list *o, int overlay)
o = ea_normalize(o, overlay);
h = ea_hash(o);
+ RTA_LOCK;
+
for(r=rta_hash_table[h & rta_cache_mask]; r; r=r->next_hash)
if (r->hash_key == h && ea_same(r->l, o))
- return ea_clone(r->l);
+ {
+ atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel);
+ RTA_UNLOCK;
+ return r->l;
+ }
uint elen = ea_list_size(o);
r = mb_alloc(rta_pool, elen + sizeof(struct ea_storage));
@@ -1490,12 +1498,17 @@ ea_lookup(ea_list *o, int overlay)
if (++rta_cache_count > rta_cache_limit)
rta_rehash();
+ RTA_UNLOCK;
return r->l;
}
-void
-ea__free(struct ea_storage *a)
+static void
+ea_free_locked(struct ea_storage *a)
{
+ /* Somebody has cloned this rta inbetween. This sometimes happens. */
+ if (atomic_load_explicit(&a->uc, memory_order_acquire))
+ return;
+
ASSERT(rta_cache_count);
rta_cache_count--;
*a->pprev_hash = a->next_hash;
@@ -1506,6 +1519,22 @@ ea__free(struct ea_storage *a)
mb_free(a);
}
+static void
+ea_free_nested(struct ea_list *l)
+{
+ struct ea_storage *r = ea_get_storage(l);
+ if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel))
+ ea_free_locked(r);
+}
+
+void
+ea__free(struct ea_storage *a)
+{
+ RTA_LOCK;
+ ea_free_locked(a);
+ RTA_UNLOCK;
+}
+
/**
* rta_dump_all - dump attribute cache
*
@@ -1515,6 +1544,8 @@ ea__free(struct ea_storage *a)
void
ea_dump_all(void)
{
+ RTA_LOCK;
+
debug("Route attribute cache (%d entries, rehash at %d):\n", rta_cache_count, rta_cache_limit);
for (uint h=0; h < rta_cache_size; h++)
for (struct ea_storage *a = rta_hash_table[h]; a; a = a->next_hash)
@@ -1524,6 +1555,8 @@ ea_dump_all(void)
debug("\n");
}
debug("\n");
+
+ RTA_UNLOCK;
}
void
diff --git a/nest/rt-table.c b/nest/rt-table.c
index b8f0e61d..3ade4237 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -114,8 +114,6 @@
pool *rt_table_pool;
-static linpool *rte_update_pool;
-
list routing_tables;
list deleted_routing_tables;
@@ -153,9 +151,6 @@ static void rt_cork_release_hook(void *);
static inline void rt_export_used(struct rt_exporter *);
static void rt_export_cleanup(rtable *tab);
-static inline void rte_update_lock(void);
-static inline void rte_update_unlock(void);
-
static int rte_same(rte *x, rte *y);
const char *rt_import_state_name_array[TIS_MAX] = {
@@ -966,13 +961,10 @@ done:
}
/* Nothing to export */
- if (!new_best && !old_best)
- {
+ if (new_best || old_best)
+ do_rt_notify(c, n, new_best, old_best);
+ else
DBG("rt_notify_accepted: nothing to export\n");
- return;
- }
-
- do_rt_notify(c, n, new_best, old_best);
}
rte *
@@ -1079,7 +1071,7 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen
}
/* Prepare new merged route */
- rte *new_merged = count ? rt_export_merged(c, feed, count, rte_update_pool, 0) : NULL;
+ rte *new_merged = count ? rt_export_merged(c, feed, count, tmp_linpool, 0) : NULL;
if (new_merged || old_best)
do_rt_notify(c, n, new_merged, old_best);
@@ -1089,7 +1081,6 @@ void
rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe)
{
struct channel *c = SKIP_BACK(struct channel, out_req, req);
-
rte *o = RTE_VALID_OR_NULL(rpe->old_best);
struct rte_storage *new_best = rpe->new_best;
@@ -1482,14 +1473,10 @@ rt_export_hook(void *_data)
/* Process the export */
for (uint i=0; i<RT_EXPORT_BULK; i++)
{
- rte_update_lock();
-
rte_export(c, c->rpe_next);
if (!c->rpe_next)
break;
-
- rte_update_unlock();
}
rt_send_export_event(c);
@@ -1775,21 +1762,6 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr
}
-static int rte_update_nest_cnt; /* Nesting counter to allow recursive updates */
-
-static inline void
-rte_update_lock(void)
-{
- rte_update_nest_cnt++;
-}
-
-static inline void
-rte_update_unlock(void)
-{
- if (!--rte_update_nest_cnt)
- lp_flush(rte_update_pool);
-}
-
int
channel_preimport(struct rt_import_request *req, rte *new, rte *old)
{
@@ -1839,7 +1811,6 @@ rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
const struct filter *filter = c->in_filter;
struct channel_import_stats *stats = &c->import_stats;
- rte_update_lock();
if (new)
{
new->net = n;
@@ -1890,7 +1861,6 @@ rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
ea_free(a);
}
- rte_update_unlock();
}
void
@@ -1921,25 +1891,6 @@ rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rt
rte_recalculate(hook, nn, new, src);
}
-/* Independent call to rte_announce(), used from next hop
- recalculation, outside of rte_update(). new must be non-NULL */
-static inline void
-rte_announce_i(rtable *tab, net *net, struct rte_storage *new, struct rte_storage *old,
- struct rte_storage *new_best, struct rte_storage *old_best)
-{
- rte_update_lock();
- rte_announce(tab, net, new, old, new_best, old_best);
- rte_update_unlock();
-}
-
-static inline void
-rte_discard(net *net, rte *old) /* Non-filtered route deletion, used during garbage collection */
-{
- rte_update_lock();
- rte_recalculate(old->sender, net, NULL, old->src);
- rte_update_unlock();
-}
-
/* Check rtable for best route to given net whether it would be exported do p */
int
rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
@@ -1951,15 +1902,11 @@ rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filte
rte rt = n->routes->rte;
- rte_update_lock();
-
/* Rest is stripped down export_filter() */
int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0;
if (v == RIC_PROCESS)
v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT);
- rte_update_unlock();
-
return v > 0;
}
@@ -2489,7 +2436,7 @@ rt_settle_timer(timer *t)
struct rt_subscription *s;
WALK_LIST(s, tab->subscribers)
- s->hook(s);
+ ev_send(s->list, s->event);
}
static void
@@ -2731,7 +2678,6 @@ rt_init(void)
{
rta_init();
rt_table_pool = rp_new(&root_pool, "Routing tables");
- rte_update_pool = lp_new_default(rt_table_pool);
init_list(&routing_tables);
init_list(&deleted_routing_tables);
ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release");
@@ -2816,7 +2762,7 @@ again:
(e->rte.stale_cycle < s->stale_valid) ||
(e->rte.stale_cycle > s->stale_set))
{
- rte_discard(n, &e->rte);
+ rte_recalculate(e->rte.sender, n, NULL, e->rte.src);
limit--;
goto rescan;
@@ -3590,7 +3536,7 @@ rt_next_hop_update_net(rtable *tab, net *n)
{ "autoupdated [+best]", "autoupdated [best]" }
};
rt_rte_trace_in(D_ROUTES, updates[i].new->rte.sender->req, &updates[i].new->rte, best_indicator[nb][ob]);
- rte_announce_i(tab, n, updates[i].new, updates[i].old, new, old_best);
+ rte_announce(tab, n, updates[i].new, updates[i].old, new, old_best);
}
return count;
@@ -3940,20 +3886,16 @@ rt_feed_net(struct rt_export_hook *c, net *n)
count = rte_feed_count(n);
if (count)
{
- rte_update_lock();
rte **feed = alloca(count * sizeof(rte *));
rte_feed_obtain(n, feed, count);
c->req->export_bulk(c->req, n->n.addr, NULL, feed, count);
- rte_update_unlock();
}
}
else if (n->routes)
{
- rte_update_lock();
struct rt_pending_export rpe = { .new = n->routes, .new_best = n->routes };
c->req->export_one(c->req, n->n.addr, &rpe);
- rte_update_unlock();
count = 1;
}
diff --git a/nest/rt.h b/nest/rt.h
index 66111dde..1a6b7a93 100644
--- a/nest/rt.h
+++ b/nest/rt.h
@@ -139,8 +139,8 @@ typedef struct rtable {
struct rt_subscription {
node n;
rtable *tab;
- void (*hook)(struct rt_subscription *b);
- void *data;
+ event *event;
+ event_list *list;
};
struct rt_flowspec_link {
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index e96b175d..2543ee73 100644
--- a/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@ -939,6 +939,18 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla
bgp_set_attr_ptr(to, BA_LARGE_COMMUNITY, flags, ad);
}
+
+static void
+bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to)
+{
+ if (len != 4)
+ WITHDRAW(BAD_LENGTH, "OTC", len);
+
+ u32 val = get_u32(data);
+ bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, flags, val);
+}
+
+
static void
bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a)
{
@@ -1175,6 +1187,13 @@ static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = {
.encode = bgp_encode_u32s,
.decode = bgp_decode_large_community,
},
+ [BA_ONLY_TO_CUSTOMER] = {
+ .name = "otc",
+ .type = T_INT,
+ .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+ .encode = bgp_encode_u32,
+ .decode = bgp_decode_otc,
+ },
[BA_MPLS_LABEL_STACK] = {
.name = "bgp_mpls_label_stack",
.type = T_CLIST,
@@ -1504,6 +1523,29 @@ bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to)
REPORT("Discarding AIGP attribute received on non-AIGP session");
bgp_unset_attr(to, BA_AIGP);
}
+
+ /* Handle OTC ingress procedure, RFC 9234 */
+ if (bgp_channel_is_role_applicable(s->channel))
+ {
+ struct bgp_proto *p = s->proto;
+ eattr *e = bgp_find_attr(*to, BA_ONLY_TO_CUSTOMER);
+
+ /* Reject routes from downstream if they are leaked */
+ if (e && (p->cf->local_role == BGP_ROLE_PROVIDER ||
+ p->cf->local_role == BGP_ROLE_RS_SERVER))
+ WITHDRAW("Route leak detected - OTC attribute from downstream");
+
+ /* Reject routes from peers if they are leaked */
+ if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as))
+ WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)",
+ (uint) e->u.data);
+
+ /* Mark routes from upstream if it did not happened before */
+ if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER ||
+ p->cf->local_role == BGP_ROLE_PEER ||
+ p->cf->local_role == BGP_ROLE_RS_CLIENT))
+ bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as);
+ }
}
@@ -1522,8 +1564,8 @@ bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to)
HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket)
-void
-bgp_init_bucket_table(struct bgp_channel *c)
+static void
+bgp_init_bucket_table(struct bgp_pending_tx *c)
{
HASH_INIT(c->bucket_hash, c->pool, 8);
@@ -1531,24 +1573,8 @@ bgp_init_bucket_table(struct bgp_channel *c)
c->withdraw_bucket = NULL;
}
-void
-bgp_free_bucket_table(struct bgp_channel *c)
-{
- HASH_FREE(c->bucket_hash);
-
- struct bgp_bucket *b;
- WALK_LIST_FIRST(b, c->bucket_queue)
- {
- rem_node(&b->send_node);
- mb_free(b);
- }
-
- mb_free(c->withdraw_bucket);
- c->withdraw_bucket = NULL;
-}
-
static struct bgp_bucket *
-bgp_get_bucket(struct bgp_channel *c, ea_list *new)
+bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new)
{
/* Hash and lookup */
u32 hash = ea_hash(new);
@@ -1577,7 +1603,7 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new)
}
static struct bgp_bucket *
-bgp_get_withdraw_bucket(struct bgp_channel *c)
+bgp_get_withdraw_bucket(struct bgp_pending_tx *c)
{
if (!c->withdraw_bucket)
{
@@ -1589,15 +1615,17 @@ bgp_get_withdraw_bucket(struct bgp_channel *c)
}
static void
-bgp_free_bucket_xx(struct bgp_channel *c, struct bgp_bucket *b)
+bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b)
{
HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b);
mb_free(b);
}
int
-bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b)
+bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b)
{
+ struct bgp_pending_tx *c = bc->ptx;
+
/* Won't free the withdraw bucket */
if (b == c->withdraw_bucket)
return 0;
@@ -1608,21 +1636,23 @@ bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b)
if (b->px_uc || !EMPTY_LIST(b->prefixes))
return 0;
- bgp_free_bucket_xx(c, b);
+ bgp_free_bucket(c, b);
return 1;
}
void
-bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b)
+bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b)
{
+ struct bgp_pending_tx *c = bc->ptx;
rem_node(&b->send_node);
add_tail(&c->bucket_queue, &b->send_node);
}
void
-bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b)
+bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b)
{
- struct bgp_proto *p = (void *) c->c.proto;
+ struct bgp_proto *p = (void *) bc->c.proto;
+ struct bgp_pending_tx *c = bc->ptx;
struct bgp_bucket *wb = bgp_get_withdraw_bucket(c);
log(L_ERR "%s: Attribute list too long", p->p.name);
@@ -1643,7 +1673,7 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b)
#define PXH_KEY(px) px->net, px->path_id, px->hash
#define PXH_NEXT(px) px->next
-#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (c->add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2)
+#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2)
#define PXH_FN(n,i,h) h
#define PXH_REHASH bgp_pxh_rehash
@@ -1652,29 +1682,21 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b)
HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix)
-void
-bgp_init_prefix_table(struct bgp_channel *c)
+static void
+bgp_init_prefix_table(struct bgp_channel *bc)
{
+ struct bgp_pending_tx *c = bc->ptx;
HASH_INIT(c->prefix_hash, c->pool, 8);
- uint alen = net_addr_length[c->c.net_type];
+ uint alen = net_addr_length[bc->c.net_type];
c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL;
}
-void
-bgp_free_prefix_table(struct bgp_channel *c)
-{
- HASH_FREE(c->prefix_hash);
-
- rfree(c->prefix_slab);
- c->prefix_slab = NULL;
-}
-
static struct bgp_prefix *
-bgp_get_prefix(struct bgp_channel *c, const net_addr *net, struct rte_src *src)
+bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx)
{
u32 path_id = src->global_id;
- u32 path_id_hash = c->add_path_tx ? path_id : 0;
+ u32 path_id_hash = add_path_tx ? path_id : 0;
/* We must use a different hash function than the rtable */
u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash));
struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash);
@@ -1698,15 +1720,16 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, struct rte_src *src)
return px;
}
-static void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px);
+static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px);
static inline int
bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b)
{
+#define IS_WITHDRAW_BUCKET(b) ((b) == c->ptx->withdraw_bucket)
#define BPX_TRACE(what) do { \
if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \
c->c.proto->name, c->c.name, what, \
- px->net, px->path_id, (b == c->withdraw_bucket) ? "withdraw" : "update"); } while (0)
+ px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0)
px->lastmod = current_time();
/* Already queued for the same bucket */
@@ -1724,7 +1747,7 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke
}
/* The new bucket is the same as we sent before */
- if ((px->last == b) || c->c.out_table && !px->last && (b == c->withdraw_bucket))
+ if ((px->last == b) || c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b))
{
if (px->cur)
BPX_TRACE("reverted");
@@ -1733,15 +1756,15 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke
/* Well, we haven't sent anything yet */
if (!px->last)
- bgp_free_prefix(c, px);
+ bgp_free_prefix(c->ptx, px);
px->cur = NULL;
return 0;
}
/* Enqueue the bucket if it has been empty */
- if ((b != c->withdraw_bucket) && EMPTY_LIST(b->prefixes))
- add_tail(&c->bucket_queue, &b->send_node);
+ if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes))
+ add_tail(&c->ptx->bucket_queue, &b->send_node);
/* Enqueue to the new bucket and indicate the change */
add_tail(&b->prefixes, &px->buck_node_xx);
@@ -1754,7 +1777,7 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke
}
static void
-bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px)
+bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px)
{
HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px);
@@ -1784,7 +1807,7 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket
px->last->px_uc--;
/* Ref the current sent version */
- if (buck != c->withdraw_bucket)
+ if (!IS_WITHDRAW_BUCKET(buck))
{
px->last = buck;
px->last->px_uc++;
@@ -1794,7 +1817,49 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket
/* Prefixes belonging to the withdraw bucket are freed always */
}
- bgp_free_prefix(c, px);
+ bgp_free_prefix(c->ptx, px);
+}
+
+static void
+bgp_pending_tx_rfree(resource *r)
+{
+ struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r);
+
+ HASH_WALK(ptx->prefix_hash, next, n)
+ rt_unlock_source(rt_find_source_global(n->path_id));
+ HASH_WALK_END;
+}
+
+static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); }
+
+static struct resclass bgp_pending_tx_class = {
+ .name = "BGP Pending TX",
+ .size = sizeof(struct bgp_pending_tx),
+ .free = bgp_pending_tx_rfree,
+ .dump = bgp_pending_tx_dump,
+};
+
+void
+bgp_init_pending_tx(struct bgp_channel *c)
+{
+ ASSERT_DIE(!c->ptx);
+
+ pool *p = rp_new(c->pool, "BGP Pending TX");
+ c->ptx = ralloc(p, &bgp_pending_tx_class);
+ c->ptx->pool = p;
+
+ bgp_init_bucket_table(c->ptx);
+ bgp_init_prefix_table(c);
+}
+
+void
+bgp_free_pending_tx(struct bgp_channel *c)
+{
+ ASSERT_DIE(c->ptx);
+ ASSERT_DIE(c->ptx->pool);
+
+ rfree(c->ptx->pool);
+ c->ptx = NULL;
}
@@ -1806,7 +1871,8 @@ static void
bgp_out_table_feed(void *data)
{
struct rt_export_hook *hook = data;
- struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table);
+ struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table);
+ struct bgp_pending_tx *c = bc->ptx;
int max = 512;
@@ -1901,8 +1967,8 @@ bgp_out_table_feed(void *data)
static struct rt_export_hook *
bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req UNUSED)
{
- struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, re);
- pool *p = rp_new(c->c.proto->pool, "Export hook");
+ struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, re);
+ pool *p = rp_new(bc->c.proto->pool, "Export hook");
struct rt_export_hook *hook = mb_allocz(p, sizeof(struct rt_export_hook));
hook->pool = p;
hook->event = ev_new_init(p, bgp_out_table_feed, hook);
@@ -1937,6 +2003,7 @@ bgp_preexport(struct channel *C, rte *e)
{
struct bgp_proto *p = (struct bgp_proto *) C->proto;
struct bgp_proto *src = bgp_rte_proto(e);
+ struct bgp_channel *c = (struct bgp_channel *) C;
/* Reject our routes */
if (src == p)
@@ -1976,11 +2043,11 @@ bgp_preexport(struct channel *C, rte *e)
}
/* Handle well-known communities, RFC 1997 */
- struct eattr *com;
+ struct eattr *a;
if (p->cf->interpret_communities &&
- (com = ea_find(e->attrs, BGP_EA_ID(BA_COMMUNITY))))
+ (a = bgp_find_attr(e->attrs, BA_COMMUNITY)))
{
- const struct adata *d = com->u.ptr;
+ const struct adata *d = a->u.ptr;
/* Do not export anywhere */
if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
@@ -1999,6 +2066,16 @@ bgp_preexport(struct channel *C, rte *e)
return -1;
}
+ /* Do not export routes marked with OTC to upstream, RFC 9234 */
+ if (bgp_channel_is_role_applicable(c))
+ {
+ a = bgp_find_attr(e->attrs, BA_ONLY_TO_CUSTOMER);
+ if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER ||
+ p->cf->local_role==BGP_ROLE_PEER ||
+ p->cf->local_role==BGP_ROLE_RS_CLIENT))
+ return -1;
+ }
+
return 0;
}
@@ -2107,6 +2184,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at
}
}
+ /* Mark routes for downstream with OTC, RFC 9234 */
+ if (bgp_channel_is_role_applicable(c))
+ {
+ a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER);
+ if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER ||
+ p->cf->local_role == BGP_ROLE_PEER ||
+ p->cf->local_role == BGP_ROLE_RS_SERVER))
+ bgp_set_attr_u32(&attrs, BA_ONLY_TO_CUSTOMER, 0, p->public_as);
+ }
+
/*
* Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above
* conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute
@@ -2134,16 +2221,16 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c
log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n);
/* If attributes are invalid, we fail back to withdraw */
- buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c);
+ buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx);
path = new->src;
}
else
{
- buck = bgp_get_withdraw_bucket(c);
+ buck = bgp_get_withdraw_bucket(c->ptx);
path = old->src;
}
- if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck))
+ if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck))
bgp_schedule_packet(p->conn, c, PKT_UPDATE);
}
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index d240112c..68c788ea 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -102,6 +102,7 @@
* RFC 8212 - Default EBGP Route Propagation Behavior without Policies
* RFC 8654 - Extended Message Support for BGP
* RFC 9117 - Revised Validation Procedure for BGP Flow Specifications
+ * RFC 9234 - Route Leak Prevention and Detection Using Roles
* draft-ietf-idr-ext-opt-param-07
* draft-uttaro-idr-bgp-persistence-04
* draft-walton-bgp-hostname-capability-02
@@ -518,6 +519,12 @@ bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len)
p->uncork_ev->data = NULL;
bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
+
+ struct bgp_channel *c;
+ WALK_LIST(c, p->p.channels)
+ if (c->ptx)
+ bgp_free_pending_tx(c);
+
ev_schedule(p->event);
}
@@ -787,10 +794,8 @@ bgp_handle_graceful_restart(struct bgp_proto *p)
}
/* Reset bucket and prefix tables */
- bgp_free_bucket_table(c);
- bgp_free_prefix_table(c);
- bgp_init_bucket_table(c);
- bgp_init_prefix_table(c);
+ bgp_free_pending_tx(c);
+ bgp_init_pending_tx(c);
c->packets_to_send = 0;
}
@@ -1806,8 +1811,7 @@ bgp_channel_start(struct channel *C)
if (c->cf->export_table)
bgp_setup_out_table(c);
- bgp_init_bucket_table(c);
- bgp_init_prefix_table(c);
+ bgp_init_pending_tx(c);
c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0);
@@ -2017,6 +2021,15 @@ bgp_postconfig(struct proto_config *CF)
if (internal && cf->rs_client)
cf_error("Only external neighbor can be RS client");
+ if (internal && (cf->local_role != BGP_ROLE_UNDEFINED))
+ cf_error("Local role cannot be set on IBGP sessions");
+
+ if (interior && (cf->local_role != BGP_ROLE_UNDEFINED))
+ log(L_WARN "BGP roles are not recommended to be used within AS confederations");
+
+ if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED))
+ cf_error("Local role must be set if roles are required");
+
if (!cf->confederation && cf->confederation_member)
cf_error("Confederation ID must be set for member sessions");
@@ -2379,6 +2392,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count)
cli_msg(code, b.start);
}
+static const char *
+bgp_format_role_name(u8 role)
+{
+ static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" };
+ if (role == BGP_ROLE_UNDEFINED) return "undefined";
+ if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role];
+ return "?";
+}
+
static void
bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
{
@@ -2507,6 +2529,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
if (caps->hostname)
cli_msg(-1006, " Hostname: %s", caps->hostname);
+
+ if (caps->role != BGP_ROLE_UNDEFINED)
+ cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role));
}
static void
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index 81382099..0cd327a2 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -113,6 +113,8 @@ struct bgp_config {
int gr_mode; /* Graceful restart mode (BGP_GR_*) */
int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */
int setkey; /* Set MD5 password to system SA/SP database */
+ u8 local_role; /* Set peering role with neighbor [RFC 9234] */
+ int require_roles; /* Require configured roles on both sides */
/* Times below are in seconds */
unsigned gr_time; /* Graceful restart timeout */
unsigned llgr_time; /* Long-lived graceful restart stale time */
@@ -166,6 +168,13 @@ struct bgp_channel_config {
#define BGP_PT_INTERNAL 1
#define BGP_PT_EXTERNAL 2
+#define BGP_ROLE_UNDEFINED 255
+#define BGP_ROLE_PROVIDER 0
+#define BGP_ROLE_RS_SERVER 1
+#define BGP_ROLE_RS_CLIENT 2
+#define BGP_ROLE_CUSTOMER 3
+#define BGP_ROLE_PEER 4
+
#define NH_NO 0
#define NH_ALL 1
#define NH_IBGP 2
@@ -226,6 +235,7 @@ struct bgp_caps {
u8 ext_messages; /* Extended message length, RFC draft */
u8 route_refresh; /* Route refresh capability, RFC 2918 */
u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */
+ u8 role; /* BGP role capability, RFC 9234 */
u8 gr_aware; /* Graceful restart capability, RFC 4724 */
u8 gr_flags; /* Graceful restart flags */
@@ -351,14 +361,8 @@ struct bgp_channel {
/* Rest are zeroed when down */
pool *pool;
- HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */
- struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */
- list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */
-
- HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */
- slab *prefix_slab; /* Slab holding prefix nodes */
-
- struct rt_exporter prefix_exporter; /* Table-like exporter for prefix_hash */
+ struct bgp_pending_tx *ptx; /* Routes waiting to be sent */
+ struct rt_exporter prefix_exporter; /* Table-like exporter for ptx */
ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */
ip_addr link_addr; /* Link-local version of next_hop_addr */
@@ -401,6 +405,18 @@ struct bgp_bucket {
ea_list eattrs[0]; /* Per-bucket extended attributes */
};
+struct bgp_pending_tx {
+ resource r;
+ pool *pool;
+
+ HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */
+ struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */
+ list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */
+
+ HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */
+ slab *prefix_slab; /* Slab holding prefix nodes */
+};
+
struct bgp_export_state {
struct bgp_proto *proto;
struct bgp_channel *channel;
@@ -494,6 +510,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c)
static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c)
{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; }
+static inline int bgp_channel_is_role_applicable(struct bgp_channel *c)
+{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); }
+
+static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c)
+{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); }
+
static inline uint bgp_max_packet_length(struct bgp_conn *conn)
{ return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; }
@@ -567,13 +589,12 @@ void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to);
void bgp_setup_out_table(struct bgp_channel *c);
-void bgp_init_bucket_table(struct bgp_channel *c);
-void bgp_free_bucket_table(struct bgp_channel *c);
+void bgp_init_pending_tx(struct bgp_channel *c);
+void bgp_free_pending_tx(struct bgp_channel *c);
+
void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b);
int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b);
-void bgp_init_prefix_table(struct bgp_channel *c);
-void bgp_free_prefix_table(struct bgp_channel *c);
void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck);
int bgp_rte_better(struct rte *, struct rte *);
@@ -662,6 +683,7 @@ enum bgp_attr_id {
BA_AS4_AGGREGATOR = 0x12, /* RFC 6793 */
BA_AIGP = 0x1a, /* RFC 7311 */
BA_LARGE_COMMUNITY = 0x20, /* RFC 8092 */
+#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */
/* Bird's private internal BGP attributes */
BA_MPLS_LABEL_STACK = 0x100, /* MPLS label stack transfer attribute */
diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y
index 24f3ec8f..9f0d2306 100644
--- a/proto/bgp/config.Y
+++ b/proto/bgp/config.Y
@@ -30,7 +30,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE,
STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG,
LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS,
DYNAMIC, RANGE, NAME, DIGITS, AIGP, ORIGINATE, COST, ENFORCE,
- FIRST, FREE, VALIDATE, BASE)
+ FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER,
+ RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC)
%type <i> bgp_nh
%type <i32> bgp_afi
@@ -39,7 +40,7 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER,
CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION,
OUT, OF, RESOURCES)
-%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag
+%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name
CF_GRAMMAR
@@ -73,6 +74,7 @@ bgp_proto_start: proto_start BGP {
BGP_CFG->llgr_mode = -1;
BGP_CFG->llgr_time = 3600;
BGP_CFG->setkey = 1;
+ BGP_CFG->local_role = BGP_ROLE_UNDEFINED;
BGP_CFG->dynamic_name = "dynbgp";
BGP_CFG->check_link = -1;
}
@@ -115,6 +117,14 @@ bgp_cease_flag:
| OUT OF RESOURCES { $$ = 1 << 8; }
;
+bgp_role_name:
+ PEER { $$ = BGP_ROLE_PEER; }
+ | PROVIDER { $$ = BGP_ROLE_PROVIDER; }
+ | CUSTOMER { $$ = BGP_ROLE_CUSTOMER; }
+ | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; }
+ | RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; }
+ ;
+
bgp_proto:
bgp_proto_start proto_name '{'
| bgp_proto proto_item ';'
@@ -198,6 +208,8 @@ bgp_proto:
| bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; }
| bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';'
| bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; }
+ | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; }
+ | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; }
;
bgp_afi:
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index 867be75f..d4d2d0b0 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn)
caps->ext_messages = p->cf->enable_extended_messages;
caps->route_refresh = p->cf->enable_refresh;
caps->enhanced_refresh = p->cf->enable_refresh;
+ caps->role = p->cf->local_role;
if (caps->as4_support)
caps->as4_number = p->public_as;
@@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
*buf++ = 0; /* Capability data length */
}
+ if (caps->role != BGP_ROLE_UNDEFINED)
+ {
+ *buf++ = 9; /* Capability 9: Announce chosen BGP role */
+ *buf++ = 1; /* Capability data length */
+ *buf++ = caps->role;
+ }
+
if (caps->gr_aware)
{
*buf++ = 64; /* Capability 64: Support for graceful restart */
@@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
struct bgp_proto *p = conn->bgp;
struct bgp_caps *caps;
struct bgp_af_caps *ac;
+ uint err_subcode = 0;
int i, cl;
u32 af;
if (!conn->remote_caps)
+ {
caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps));
+ caps->role = BGP_ROLE_UNDEFINED;
+ }
else
{
caps = conn->remote_caps;
@@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
caps->ext_messages = 1;
break;
+ case 9: /* BGP role capability, RFC 9234 */
+ if (cl != 1)
+ goto err;
+
+ /* Reserved value */
+ if (pos[2] == BGP_ROLE_UNDEFINED)
+ { err_subcode = 11; goto err; }
+
+ /* Multiple inconsistent values */
+ if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2]))
+ { err_subcode = 11; goto err; }
+
+ caps->role = pos[2];
+ break;
+
case 64: /* Graceful restart capability, RFC 4724 */
if (cl % 4 != 2)
goto err;
@@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
err:
mb_free(caps);
- bgp_error(conn, 2, 0, NULL, 0);
+ bgp_error(conn, 2, err_subcode, NULL, 0);
return -1;
}
@@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
conn->received_as = asn;
}
+ /* RFC 9234 4.2 - check role agreement */
+ u8 local_role = p->cf->local_role;
+ u8 neigh_role = caps->role;
+
+ if ((local_role != BGP_ROLE_UNDEFINED) &&
+ (neigh_role != BGP_ROLE_UNDEFINED) &&
+ !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) ||
+ (local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) ||
+ (local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) ||
+ (local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) ||
+ (local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT)))
+ { bgp_error(conn, 2, 11, NULL, 0); return; }
+
+ if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED))
+ { bgp_error(conn, 2, 11, NULL, 0); return; }
+
/* Check the other connection */
other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
switch (other->state)
@@ -2169,7 +2212,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
* var IPv4 Network Layer Reachability Information
*/
- ASSERT_DIE(s->channel->withdraw_bucket != buck);
+ ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck);
int lr, la;
@@ -2192,7 +2235,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu
static byte *
bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
- ASSERT_DIE(s->channel->withdraw_bucket != buck);
+ ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck);
/*
* 2 B IPv4 Withdrawn Routes Length (zero)
@@ -2332,7 +2375,7 @@ again: ;
};
/* Try unreachable bucket */
- if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
+ if ((buck = c->ptx->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
{
res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ?
bgp_create_ip_unreach(&s, buck, buf, end):
@@ -2342,9 +2385,9 @@ again: ;
}
/* Try reachable buckets */
- if (!EMPTY_LIST(c->bucket_queue))
+ if (!EMPTY_LIST(c->ptx->bucket_queue))
{
- buck = HEAD(c->bucket_queue);
+ buck = HEAD(c->ptx->bucket_queue);
/* Cleanup empty buckets */
if (bgp_done_bucket(c, buck))
@@ -2977,6 +3020,7 @@ static struct {
{ 2, 6, "Unacceptable hold time" },
{ 2, 7, "Required capability missing" }, /* [RFC5492] */
{ 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
+ { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */
{ 3, 0, "Invalid UPDATE message" },
{ 3, 1, "Malformed attribute list" },
{ 3, 2, "Unrecognized well-known attribute" },
diff --git a/sysdep/cf/README b/sysdep/cf/README
index 9a7a4afa..68078bbe 100644
--- a/sysdep/cf/README
+++ b/sysdep/cf/README
@@ -4,7 +4,6 @@ Available configuration variables:
CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel
CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us
CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables
-CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once
CONFIG_SINGLE_ROUTE There is only one route per network
CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field
diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h
index 047d3764..c640bef4 100644
--- a/sysdep/cf/linux.h
+++ b/sysdep/cf/linux.h
@@ -9,7 +9,6 @@
#define CONFIG_AUTO_ROUTES
#define CONFIG_SELF_CONSCIOUS
#define CONFIG_MULTIPLE_TABLES
-#define CONFIG_ALL_TABLES_AT_ONCE
#define CONFIG_IP6_SADR_KERNEL
#define CONFIG_MC_PROPER_SRC
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index 656202ac..099ae6e9 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -257,16 +257,13 @@ nl_open_sock(struct nl_sock *nl)
}
}
-static void
+static int
nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED)
{
- /*
- * Strict checking is not necessary, it improves behavior on newer kernels.
- * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT
- * run-time), we can just ignore it.
- */
#ifdef SOL_NETLINK
- setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
+ return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
+#else
+ return -1;
#endif
}
@@ -294,10 +291,17 @@ nl_cfg_rx_buffer_size(struct config *cfg)
static void
nl_open(void)
{
+ if ((nl_scan.fd >= 0) && (nl_req.fd >= 0))
+ return;
+
nl_open_sock(&nl_scan);
nl_open_sock(&nl_req);
- nl_set_strict_dump(&nl_scan, 1);
+ if (nl_set_strict_dump(&nl_scan, 1) < 0)
+ {
+ log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once");
+ krt_use_shared_scan();
+ }
}
static void
@@ -352,11 +356,13 @@ nl_request_dump_addr(int af)
}
static void
-nl_request_dump_route(int af)
+nl_request_dump_route(int af, int table_id)
{
struct {
struct nlmsghdr nh;
struct rtmsg rtm;
+ struct rtattr rta;
+ u32 table_id;
} req = {
.nh.nlmsg_type = RTM_GETROUTE,
.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
@@ -365,7 +371,17 @@ nl_request_dump_route(int af)
.rtm.rtm_family = af,
};
- send(nl_scan.fd, &req, sizeof(req), 0);
+ if (table_id < 256)
+ req.rtm.rtm_table = table_id;
+ else
+ {
+ req.rta.rta_type = RTA_TABLE;
+ req.rta.rta_len = RTA_LENGTH(4);
+ req.table_id = table_id;
+ req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len;
+ }
+
+ send(nl_scan.fd, &req, req.nh.nlmsg_len, 0);
nl_scan.last_hdr = NULL;
}
@@ -2072,18 +2088,27 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
}
void
-krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
+krt_do_scan(struct krt_proto *p)
{
struct nlmsghdr *h;
struct nl_parse_state s;
nl_parse_begin(&s, 1);
- nl_request_dump_route(AF_UNSPEC);
+
+ /* Table-specific scan or shared scan */
+ if (p)
+ nl_request_dump_route(p->af, krt_table_id(p));
+ else
+ nl_request_dump_route(AF_UNSPEC, 0);
+
while (h = nl_get_scan())
+ {
if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
nl_parse_route(&s, h);
else
log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
+ }
+
nl_parse_end(&s);
}
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index 471efeba..cd44a1e5 100644
--- a/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@@ -578,18 +578,17 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src)
}
}
+
/*
* Periodic scanning
*/
-
-#ifdef CONFIG_ALL_TABLES_AT_ONCE
-
-static timer *krt_scan_timer;
-static int krt_scan_count;
+static timer *krt_scan_all_timer;
+static int krt_scan_all_count;
+static _Bool krt_scan_all_tables;
static void
-krt_scan(timer *t UNUSED)
+krt_scan_all(timer *t UNUSED)
{
struct krt_proto *p;
node *n;
@@ -610,35 +609,42 @@ krt_scan(timer *t UNUSED)
}
static void
-krt_scan_timer_start(struct krt_proto *p)
+krt_scan_all_timer_start(struct krt_proto *p)
{
- if (!krt_scan_count)
- krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0);
+ if (!krt_scan_all_count)
+ krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0);
- krt_scan_count++;
+ krt_scan_all_count++;
- tm_start(krt_scan_timer, 1 S);
+ tm_start(krt_scan_all_timer, 1 S);
}
static void
-krt_scan_timer_stop(struct krt_proto *p UNUSED)
+krt_scan_all_timer_stop(void)
{
- krt_scan_count--;
+ ASSERT(krt_scan_all_count > 0);
+
+ krt_scan_all_count--;
- if (!krt_scan_count)
+ if (!krt_scan_all_count)
{
- rfree(krt_scan_timer);
- krt_scan_timer = NULL;
+ rfree(krt_scan_all_timer);
+ krt_scan_all_timer = NULL;
}
}
static void
-krt_scan_timer_kick(struct krt_proto *p UNUSED)
+krt_scan_all_timer_kick(void)
{
- tm_start(krt_scan_timer, 0);
+ tm_start(krt_scan_all_timer, 0);
+}
+
+void
+krt_use_shared_scan(void)
+{
+ krt_scan_all_tables = 1;
}
-#else
static void
krt_scan(timer *t)
@@ -656,26 +662,33 @@ krt_scan(timer *t)
static void
krt_scan_timer_start(struct krt_proto *p)
{
- p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
- tm_start(p->scan_timer, 1 S);
+ if (krt_scan_all_tables)
+ krt_scan_all_timer_start(p);
+ else
+ {
+ p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
+ tm_start(p->scan_timer, 1 S);
+ }
}
static void
krt_scan_timer_stop(struct krt_proto *p)
{
- tm_stop(p->scan_timer);
+ if (krt_scan_all_tables)
+ krt_scan_all_timer_stop();
+ else
+ tm_stop(p->scan_timer);
}
static void
krt_scan_timer_kick(struct krt_proto *p)
{
- tm_start(p->scan_timer, 0);
+ if (krt_scan_all_tables)
+ krt_scan_all_timer_kick();
+ else
+ tm_start(p->scan_timer, 0);
}
-#endif
-
-
-
/*
* Updates
@@ -792,11 +805,6 @@ krt_postconfig(struct proto_config *CF)
if (! proto_cf_main_channel(CF))
cf_error("Channel not specified");
-#ifdef CONFIG_ALL_TABLES_AT_ONCE
- if (krt_cf->scan_time != cf->scan_time)
- cf_error("All kernel syncers must use the same table scan interval");
-#endif
-
struct channel_config *cc = proto_cf_main_channel(CF);
struct rtable_config *tab = cc->table;
if (tab->krt_attached)
diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h
index e0d60cbd..9f7ebb4f 100644
--- a/sysdep/unix/krt.h
+++ b/sysdep/unix/krt.h
@@ -21,11 +21,6 @@ struct kif_proto;
#define KRT_DEFAULT_ECMP_LIMIT 16
-#if 0
-#define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0)
-#define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1)
-#endif
-
extern struct ea_class ea_krt_source, ea_krt_metric;
#define KRT_REF_SEEN 0x1 /* Seen in table */
@@ -55,10 +50,7 @@ struct krt_proto {
struct proto p;
struct krt_state sys; /* Sysdep state */
-#ifndef CONFIG_ALL_TABLES_AT_ONCE
timer *scan_timer;
-#endif
-
struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */
struct bmap seen_map; /* Routes seen during last periodic scan */
node krt_node; /* Node in krt_proto_list */
@@ -80,6 +72,7 @@ extern pool *krt_pool;
struct proto_config * kif_init_config(int class);
void kif_request_scan(void);
+void krt_use_shared_scan(void);
void krt_got_route(struct krt_proto *p, struct rte *e, s8 src);
void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src);