diff options
Diffstat (limited to 'proto/bgp')
-rw-r--r-- | proto/bgp/Makefile | 1 | ||||
-rw-r--r-- | proto/bgp/attrs.c | 363 | ||||
-rw-r--r-- | proto/bgp/bgp.c | 133 | ||||
-rw-r--r-- | proto/bgp/bgp.h | 60 | ||||
-rw-r--r-- | proto/bgp/config.Y | 16 | ||||
-rw-r--r-- | proto/bgp/packets.c | 83 |
6 files changed, 496 insertions, 160 deletions
diff --git a/proto/bgp/Makefile b/proto/bgp/Makefile index 2a4cc99c..f6a38678 100644 --- a/proto/bgp/Makefile +++ b/proto/bgp/Makefile @@ -2,6 +2,5 @@ src := attrs.c bgp.c packets.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,bgp_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 084c9b63..bfdd9ac5 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -939,6 +939,18 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla bgp_set_attr_ptr(to, BA_LARGE_COMMUNITY, flags, ad); } + +static void +bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "OTC", len); + + u32 val = get_u32(data); + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, flags, val); +} + + static void bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) { @@ -1175,6 +1187,13 @@ static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = { .encode = bgp_encode_u32s, .decode = bgp_decode_large_community, }, + [BA_ONLY_TO_CUSTOMER] = { + .name = "otc", + .type = T_INT, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_u32, + .decode = bgp_decode_otc, + }, [BA_MPLS_LABEL_STACK] = { .name = "bgp_mpls_label_stack", .type = T_CLIST, @@ -1504,6 +1523,29 @@ bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) REPORT("Discarding AIGP attribute received on non-AIGP session"); bgp_unset_attr(to, BA_AIGP); } + + /* Handle OTC ingress procedure, RFC 9234 */ + if (bgp_channel_is_role_applicable(s->channel)) + { + struct bgp_proto *p = s->proto; + eattr *e = bgp_find_attr(*to, BA_ONLY_TO_CUSTOMER); + + /* Reject routes from downstream if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + WITHDRAW("Route leak detected - OTC attribute from downstream"); + + /* Reject routes from peers if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as)) + WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)", + (uint) e->u.data); + + /* Mark routes from upstream if it did not happened before */ + if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_CLIENT)) + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); + } } @@ -1522,8 +1564,8 @@ bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) -void -bgp_init_bucket_table(struct bgp_channel *c) +static void +bgp_init_bucket_table(struct bgp_pending_tx *c) { HASH_INIT(c->bucket_hash, c->pool, 8); @@ -1531,24 +1573,8 @@ bgp_init_bucket_table(struct bgp_channel *c) c->withdraw_bucket = NULL; } -void -bgp_free_bucket_table(struct bgp_channel *c) -{ - HASH_FREE(c->bucket_hash); - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, c->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); - } - - mb_free(c->withdraw_bucket); - c->withdraw_bucket = NULL; -} - static struct bgp_bucket * -bgp_get_bucket(struct bgp_channel *c, ea_list *new) +bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new) { /* Hash and lookup */ u32 hash = ea_hash(new); @@ -1577,7 +1603,7 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new) } static struct bgp_bucket * -bgp_get_withdraw_bucket(struct bgp_channel *c) +bgp_get_withdraw_bucket(struct bgp_pending_tx *c) { if (!c->withdraw_bucket) { @@ -1589,15 +1615,17 @@ bgp_get_withdraw_bucket(struct bgp_channel *c) } static void -bgp_free_bucket_xx(struct bgp_channel *c, struct bgp_bucket *b) +bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b) { HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); mb_free(b); } int -bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; + /* Won't free the withdraw bucket */ if (b == c->withdraw_bucket) return 0; @@ -1608,21 +1636,23 @@ bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b) if (b->px_uc || !EMPTY_LIST(b->prefixes)) return 0; - bgp_free_bucket_xx(c, b); + bgp_free_bucket(c, b); return 1; } void -bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; rem_node(&b->send_node); add_tail(&c->bucket_queue, &b->send_node); } void -bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { - struct bgp_proto *p = (void *) c->c.proto; + struct bgp_proto *p = (void *) bc->c.proto; + struct bgp_pending_tx *c = bc->ptx; struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); log(L_ERR "%s: Attribute list too long", p->p.name); @@ -1643,7 +1673,7 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) #define PXH_KEY(px) px->net, px->path_id, px->hash #define PXH_NEXT(px) px->next -#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (c->add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) #define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash @@ -1652,28 +1682,21 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) -void -bgp_init_prefix_table(struct bgp_channel *c) +static void +bgp_init_prefix_table(struct bgp_channel *bc) { + struct bgp_pending_tx *c = bc->ptx; HASH_INIT(c->prefix_hash, c->pool, 8); - uint alen = net_addr_length[c->c.net_type]; + uint alen = net_addr_length[bc->c.net_type]; c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } -void -bgp_free_prefix_table(struct bgp_channel *c) -{ - HASH_FREE(c->prefix_hash); - - rfree(c->prefix_slab); - c->prefix_slab = NULL; -} - static struct bgp_prefix * -bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) +bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx) { - u32 path_id_hash = c->add_path_tx ? path_id : 0; + u32 path_id = src->global_id; + u32 path_id_hash = add_path_tx ? path_id : 0; /* We must use a different hash function than the rtable */ u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash)); struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash); @@ -1690,21 +1713,23 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) px->hash = hash; px->path_id = path_id; net_copy(px->net, net); + rt_lock_source(src); HASH_INSERT2(c->prefix_hash, PXH, c->pool, px); return px; } -static void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px); +static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px); static inline int bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b) { +#define IS_WITHDRAW_BUCKET(b) ((b) == c->ptx->withdraw_bucket) #define BPX_TRACE(what) do { \ if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \ c->c.proto->name, c->c.name, what, \ - px->net, px->path_id, (b == c->withdraw_bucket) ? "withdraw" : "update"); } while (0) + px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0) px->lastmod = current_time(); /* Already queued for the same bucket */ @@ -1722,7 +1747,7 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke } /* The new bucket is the same as we sent before */ - if ((px->last == b) || c->c.out_table && !px->last && (b == c->withdraw_bucket)) + if ((px->last == b) || c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b)) { if (px->cur) BPX_TRACE("reverted"); @@ -1731,15 +1756,15 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke /* Well, we haven't sent anything yet */ if (!px->last) - bgp_free_prefix(c, px); + bgp_free_prefix(c->ptx, px); px->cur = NULL; return 0; } /* Enqueue the bucket if it has been empty */ - if ((b != c->withdraw_bucket) && EMPTY_LIST(b->prefixes)) - add_tail(&c->bucket_queue, &b->send_node); + if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes)) + add_tail(&c->ptx->bucket_queue, &b->send_node); /* Enqueue to the new bucket and indicate the change */ add_tail(&b->prefixes, &px->buck_node_xx); @@ -1752,10 +1777,12 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke } static void -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) +bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px) { HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); + rt_unlock_source(rt_find_source_global(px->path_id)); + if (c->prefix_slab) sl_free(px); else @@ -1780,7 +1807,7 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket px->last->px_uc--; /* Ref the current sent version */ - if (buck != c->withdraw_bucket) + if (!IS_WITHDRAW_BUCKET(buck)) { px->last = buck; px->last->px_uc++; @@ -1790,7 +1817,49 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket /* Prefixes belonging to the withdraw bucket are freed always */ } - bgp_free_prefix(c, px); + bgp_free_prefix(c->ptx, px); +} + +static void +bgp_pending_tx_rfree(resource *r) +{ + struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r); + + HASH_WALK(ptx->prefix_hash, next, n) + rt_unlock_source(rt_find_source_global(n->path_id)); + HASH_WALK_END; +} + +static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); } + +static struct resclass bgp_pending_tx_class = { + .name = "BGP Pending TX", + .size = sizeof(struct bgp_pending_tx), + .free = bgp_pending_tx_rfree, + .dump = bgp_pending_tx_dump, +}; + +void +bgp_init_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(!c->ptx); + + pool *p = rp_new(c->pool, "BGP Pending TX"); + c->ptx = ralloc(p, &bgp_pending_tx_class); + c->ptx->pool = p; + + bgp_init_bucket_table(c->ptx); + bgp_init_prefix_table(c); +} + +void +bgp_free_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(c->ptx); + ASSERT_DIE(c->ptx->pool); + + rfree(c->ptx->pool); + c->ptx = NULL; } @@ -1798,24 +1867,30 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket * Prefix hash table exporter */ +struct bgp_out_export_hook { + struct rt_export_hook h; + u32 hash_iter; /* Iterator over hash */ +}; + static void bgp_out_table_feed(void *data) { - struct rt_export_hook *hook = data; - struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table); + struct bgp_out_export_hook *hook = data; + struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->h.table); + struct bgp_pending_tx *c = bc->ptx; int max = 512; - const net_addr *neq = (hook->req->addr_mode == TE_ADDR_EQUAL) ? hook->req->addr : NULL; + const net_addr *neq = (hook->h.req->addr_mode == TE_ADDR_EQUAL) ? hook->h.req->addr : NULL; const net_addr *cand = NULL; do { HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter) { - switch (hook->req->addr_mode) + switch (hook->h.req->addr_mode) { case TE_ADDR_IN: - if (!net_in_netX(n->net, hook->req->addr)) + if (!net_in_netX(n->net, hook->h.req->addr)) continue; /* fall through */ case TE_ADDR_NONE: @@ -1827,7 +1902,7 @@ bgp_out_table_feed(void *data) case TE_ADDR_FOR: if (!neq) { - if (net_in_netX(hook->req->addr, n->net) && (!cand || (n->net->length > cand->length))) + if (net_in_netX(hook->h.req->addr, n->net) && (!cand || (n->net->length > cand->length))) cand = n->net; continue; } @@ -1872,13 +1947,13 @@ bgp_out_table_feed(void *data) .new = &es, .new_best = &es, }; - if (hook->req->export_bulk) + if (hook->h.req->export_bulk) { rte *feed = &es.rte; - hook->req->export_bulk(hook->req, n->net, &rpe, &feed, 1); + hook->h.req->export_bulk(hook->h.req, n->net, &rpe, &feed, 1); } - else if (hook->req->export_one) - hook->req->export_one(hook->req, n->net, &rpe); + else if (hook->h.req->export_one) + hook->h.req->export_one(hook->h.req, n->net, &rpe); else bug("No export method in export request"); } @@ -1889,36 +1964,51 @@ bgp_out_table_feed(void *data) } while (neq); if (hook->hash_iter) - ev_schedule_work(hook->event); + ev_schedule_work(&hook->h.event); else - rt_set_export_state(hook, TES_READY); + rt_set_export_state(&hook->h, TES_READY); +} + +static void +bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + req->hook = rt_alloc_export(re, sizeof(struct bgp_out_export_hook)); + req->hook->req = req; + + struct bgp_out_export_hook *hook = SKIP_BACK(struct bgp_out_export_hook, h, req->hook); + + hook->h.event.hook = bgp_out_table_feed; + rt_init_export(re, req->hook); } -static struct rt_export_hook * -bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req UNUSED) +static void +bgp_out_table_export_done(void *data) { - struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, re); - pool *p = rp_new(c->c.proto->pool, "Export hook"); - struct rt_export_hook *hook = mb_allocz(p, sizeof(struct rt_export_hook)); - hook->pool = p; - hook->lp = lp_new_default(p); - hook->event = ev_new_init(p, bgp_out_table_feed, hook); - hook->feed_type = TFT_HASH; + struct bgp_out_export_hook *hook = data; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; - return hook; + rt_export_stopped(&hook->h); + CALL(stopped, req); } +static const struct rt_exporter_class bgp_out_table_export_class = { + .start = bgp_out_table_export_start, + .done = bgp_out_table_export_done, +}; + void bgp_setup_out_table(struct bgp_channel *c) { ASSERT_DIE(c->c.out_table == NULL); c->prefix_exporter = (struct rt_exporter) { + .class = &bgp_out_table_export_class, .addr_type = c->c.table->addr_type, - .start = bgp_out_table_export_start, + .rp = c->c.proto->pool, }; - init_list(&c->prefix_exporter.hooks); + rt_exporter_init(&c->prefix_exporter); c->c.out_table = &c->prefix_exporter; } @@ -1929,11 +2019,11 @@ bgp_setup_out_table(struct bgp_channel *c) */ int -bgp_preexport(struct channel *c, rte *e) +bgp_preexport(struct channel *C, rte *e) { - struct proto *SRC = e->src->proto; - struct bgp_proto *p = (struct bgp_proto *) (c->proto); - struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL; + struct bgp_proto *p = (struct bgp_proto *) C->proto; + struct bgp_proto *src = bgp_rte_proto(e); + struct bgp_channel *c = (struct bgp_channel *) C; /* Reject our routes */ if (src == p) @@ -1973,11 +2063,11 @@ bgp_preexport(struct channel *c, rte *e) } /* Handle well-known communities, RFC 1997 */ - struct eattr *com; + struct eattr *a; if (p->cf->interpret_communities && - (com = ea_find(e->attrs, BGP_EA_ID(BA_COMMUNITY)))) + (a = bgp_find_attr(e->attrs, BA_COMMUNITY))) { - const struct adata *d = com->u.ptr; + const struct adata *d = a->u.ptr; /* Do not export anywhere */ if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) @@ -1996,14 +2086,23 @@ bgp_preexport(struct channel *c, rte *e) return -1; } + /* Do not export routes marked with OTC to upstream, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(e->attrs, BA_ONLY_TO_CUSTOMER); + if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER || + p->cf->local_role==BGP_ROLE_PEER || + p->cf->local_role==BGP_ROLE_RS_CLIENT)) + return -1; + } + return 0; } static ea_list * bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *attrs0, struct linpool *pool) { - struct proto *SRC = e->src->proto; - struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (void *) SRC : NULL; + struct bgp_proto *src = bgp_rte_proto(e); struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls }; ea_list *attrs = attrs0; eattr *a; @@ -2105,6 +2204,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at } } + /* Mark routes for downstream with OTC, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER); + if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + bgp_set_attr_u32(&attrs, BA_ONLY_TO_CUSTOMER, 0, p->public_as); + } + /* * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute @@ -2121,7 +2230,7 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c struct bgp_proto *p = (void *) P; struct bgp_channel *c = (void *) C; struct bgp_bucket *buck; - u32 path; + struct rte_src *path; if (new) { @@ -2132,16 +2241,16 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n); /* If attributes are invalid, we fail back to withdraw */ - buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); - path = new->src->global_id; + buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx); + path = new->src; } else { - buck = bgp_get_withdraw_bucket(c); - path = old->src->global_id; + buck = bgp_get_withdraw_bucket(c->ptx); + path = old->src; } - if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck)) + if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck)) bgp_schedule_packet(p->conn, c, PKT_UPDATE); } @@ -2156,7 +2265,7 @@ bgp_get_neighbor(rte *r) return as; /* If AS_PATH is not defined, we treat rte as locally originated */ - struct bgp_proto *p = (void *) r->src->proto; + struct bgp_proto *p = bgp_rte_proto(r); return p->cf->confederation ?: p->local_as; } @@ -2186,8 +2295,8 @@ rte_stale(rte *r) int bgp_rte_better(rte *new, rte *old) { - struct bgp_proto *new_bgp = (struct bgp_proto *) new->src->proto; - struct bgp_proto *old_bgp = (struct bgp_proto *) old->src->proto; + struct bgp_proto *new_bgp = bgp_rte_proto(new); + struct bgp_proto *old_bgp = bgp_rte_proto(old); eattr *x, *y; u32 n, o; @@ -2331,8 +2440,8 @@ bgp_rte_better(rte *new, rte *old) int bgp_rte_mergable(rte *pri, rte *sec) { - struct bgp_proto *pri_bgp = (struct bgp_proto *) pri->src->proto; - struct bgp_proto *sec_bgp = (struct bgp_proto *) sec->src->proto; + struct bgp_proto *pri_bgp = bgp_rte_proto(pri); + struct bgp_proto *sec_bgp = bgp_rte_proto(sec); eattr *x, *y; u32 p, s; @@ -2416,12 +2525,12 @@ same_group(rte *r, u32 lpref, u32 lasn) static inline int use_deterministic_med(struct rte_storage *r) { - struct proto *P = r->rte.src->proto; - return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med; + struct bgp_proto *p = bgp_rte_proto(&r->rte); + return p && p->cf->deterministic_med; } int -bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) +bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best) { rte *key = new ? new : old; u32 lpref = rt_get_preference(key); @@ -2546,27 +2655,57 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) return !old_suppressed; } -rte * -bgp_rte_modify_stale(struct rte *r, struct linpool *pool) +void +bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { - eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); - const struct adata *ad = ea ? ea->u.ptr : NULL; - uint flags = ea ? ea->flags : BAF_PARTIAL; + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct rt_import_hook *irh = c->c.in_req.hook; - if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) - return NULL; + /* Find our routes among others */ + for (uint i=0; i<count; i++) + { + rte *r = feed[i]; + + if ( + !rte_is_valid(r) || /* Not a valid route */ + (r->sender != irh) || /* Not our route */ + (r->stale_cycle == irh->stale_set)) /* A new route, do not mark as stale */ + continue; + + eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); + const struct adata *ad = ea ? ea->u.ptr : NULL; + uint flags = ea ? ea->flags : BAF_PARTIAL; - if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) - return r; + /* LLGR not allowed, withdraw the route */ + if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) + { + rte_import(&c->c.in_req, n, NULL, r->src); + continue; + } + + /* Route already marked as LLGR, do nothing */ + if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) + continue; - _Thread_local static rte e0; - e0 = *r; + /* Store the tmp_linpool state to aggresively save memory */ + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); - bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, - int_set_add(pool, ad, BGP_COMM_LLGR_STALE)); - e0.pflags |= BGP_REF_STALE; + /* Mark the route as LLGR */ + rte e0 = *r; + bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, int_set_add(tmp_linpool, ad, BGP_COMM_LLGR_STALE)); + e0.pflags &= ~BGP_REF_NOT_STALE; + e0.pflags |= BGP_REF_STALE; - return &e0; + /* We need to update the route but keep it stale. */ + ASSERT_DIE(irh->stale_set == irh->stale_valid + 1); + irh->stale_set--; + rte_import(&c->c.in_req, n, &e0, r->src); + irh->stale_set++; + + /* Restore the memory state */ + lp_restore(tmp_linpool, &tmpp); + } } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 6ffe8824..573e3d25 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -102,6 +102,7 @@ * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications + * RFC 9234 - Route Leak Prevention and Detection Using Roles * draft-ietf-idr-ext-opt-param-07 * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 @@ -139,6 +140,9 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd); static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); +static void bgp_graceful_restart_feed(struct bgp_channel *c); + + /** * bgp_open - open a BGP instance * @p: BGP instance @@ -373,6 +377,7 @@ bgp_close_conn(struct bgp_conn *conn) conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; + rfree(conn->tx_ev); conn->tx_ev = NULL; rfree(conn->sk); @@ -511,8 +516,15 @@ void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len) { proto_notify_state(&p->p, PS_STOP); + p->uncork_ev->data = NULL; bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len); bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len); + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->ptx) + bgp_free_pending_tx(c); + ev_schedule(p->event); } @@ -760,32 +772,30 @@ bgp_handle_graceful_restart(struct bgp_proto *p) { case BGP_GRS_NONE: c->gr_active = BGP_GRS_ACTIVE; - rt_refresh_begin(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_ACTIVE: - rt_refresh_end(c->c.table, &c->c.in_req); - rt_refresh_begin(c->c.table, &c->c.in_req); + rt_refresh_end(&c->c.in_req); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_LLGR: - rt_refresh_begin(c->c.table, &c->c.in_req); - rt_modify_stale(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); + bgp_graceful_restart_feed(c); break; } } else { /* Just flush the routes */ - rt_refresh_begin(c->c.table, &c->c.in_req); - rt_refresh_end(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); + rt_refresh_end(&c->c.in_req); } /* Reset bucket and prefix tables */ - bgp_free_bucket_table(c); - bgp_free_prefix_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_free_pending_tx(c); + bgp_init_pending_tx(c); c->packets_to_send = 0; } @@ -796,6 +806,53 @@ bgp_handle_graceful_restart(struct bgp_proto *p) tm_start(p->gr_timer, p->conn->remote_caps->gr_time S); } +static void +bgp_graceful_restart_feed_done(struct rt_export_request *req) +{ + req->hook = NULL; +} + +static void +bgp_graceful_restart_feed_dump_req(struct rt_export_request *req) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + debug(" BGP-GR %s.%s export request %p\n", c->c.proto->name, c->c.name, req); +} + +static void +bgp_graceful_restart_feed_log_state_change(struct rt_export_request *req, u8 state) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct bgp_proto *p = (void *) c->c.proto; + BGP_TRACE(D_EVENTS, "Long-lived graceful restart export state changed to %s", rt_export_state_name(state)); + + if (state == TES_READY) + rt_stop_export(req, bgp_graceful_restart_feed_done); +} + +static void +bgp_graceful_restart_drop_export(struct rt_export_request *req UNUSED, const net_addr *n UNUSED, struct rt_pending_export *rpe UNUSED) +{ /* Nothing to do */ } + +static void +bgp_graceful_restart_feed(struct bgp_channel *c) +{ + c->stale_feed = (struct rt_export_request) { + .name = "BGP-GR", + .list = &global_work_list, + .trace_routes = c->c.debug | c->c.proto->debug, + .dump_req = bgp_graceful_restart_feed_dump_req, + .log_state_change = bgp_graceful_restart_feed_log_state_change, + .export_bulk = bgp_rte_modify_stale, + .export_one = bgp_graceful_restart_drop_export, + }; + + rt_request_export(c->c.table, &c->stale_feed); +} + + + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -819,7 +876,7 @@ bgp_graceful_restart_done(struct bgp_channel *c) BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); tm_stop(c->stale_timer); - rt_refresh_end(c->c.table, &c->c.in_req); + rt_refresh_end(&c->c.in_req); } /** @@ -861,7 +918,7 @@ bgp_graceful_restart_timeout(timer *t) /* Channel is in GR, and supports LLGR -> start LLGR */ c->gr_active = BGP_GRS_LLGR; tm_start(c->stale_timer, c->stale_time S); - rt_modify_stale(c->c.table, &c->c.in_req); + bgp_graceful_restart_feed(c); } } else @@ -899,7 +956,7 @@ bgp_refresh_begin(struct bgp_channel *c) { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } c->load_state = BFS_REFRESHING; - rt_refresh_begin(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); } /** @@ -920,7 +977,7 @@ bgp_refresh_end(struct bgp_channel *c) { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } c->load_state = BFS_NONE; - rt_refresh_end(c->c.table, &c->c.in_req); + rt_refresh_end(&c->c.in_req); } @@ -1526,6 +1583,8 @@ bgp_start(struct proto *P) p->last_rx_update = 0; p->event = ev_new_init(p->p.pool, bgp_decision, p); + p->uncork_ev = ev_new_init(p->p.pool, bgp_uncork, p); + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); @@ -1656,6 +1715,13 @@ done: return p->p.proto_state; } +struct rte_owner_class bgp_rte_owner_class = { + .get_route_info = bgp_get_route_info, + .rte_better = bgp_rte_better, + .rte_mergable = bgp_rte_mergable, + .rte_igp_metric = bgp_rte_igp_metric, +}; + static struct proto * bgp_init(struct proto_config *CF) { @@ -1669,11 +1735,9 @@ bgp_init(struct proto_config *CF) P->reload_routes = bgp_reload_routes; P->feed_begin = bgp_feed_begin; P->feed_end = bgp_feed_end; - P->rte_better = bgp_rte_better; - P->rte_mergable = bgp_rte_mergable; - P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; - P->rte_modify = bgp_rte_modify_stale; - P->rte_igp_metric = bgp_rte_igp_metric; + + P->sources.class = &bgp_rte_owner_class; + P->sources.rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; p->cf = cf; p->is_internal = (cf->local_as == cf->remote_as); @@ -1747,8 +1811,7 @@ bgp_channel_start(struct channel *C) if (c->cf->export_table) bgp_setup_out_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_init_pending_tx(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); @@ -1871,7 +1934,7 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 return cc2->c.table; /* Last, try default table of given type */ - if (tab = cf->c.global->def_tables[type]) + if (tab = rt_get_default_table(cf->c.global, type)) return tab; cf_error("Undefined IGP table"); @@ -1890,7 +1953,7 @@ bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc) return cc2->c.table; /* Last, try default table of given type */ - struct rtable_config *tab = cf->c.global->def_tables[type]; + struct rtable_config *tab = rt_get_default_table(cf->c.global, type); if (tab) return tab; @@ -1958,6 +2021,15 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); + if (internal && (cf->local_role != BGP_ROLE_UNDEFINED)) + cf_error("Local role cannot be set on IBGP sessions"); + + if (interior && (cf->local_role != BGP_ROLE_UNDEFINED)) + log(L_WARN "BGP roles are not recommended to be used within AS confederations"); + + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) + cf_error("Local role must be set if roles are required"); + if (!cf->confederation && cf->confederation_member) cf_error("Confederation ID must be set for member sessions"); @@ -2320,6 +2392,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count) cli_msg(code, b.start); } +static const char * +bgp_format_role_name(u8 role) +{ + static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; + if (role == BGP_ROLE_UNDEFINED) return "undefined"; + if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role]; + return "?"; +} + static void bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) { @@ -2448,6 +2529,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) if (caps->hostname) cli_msg(-1006, " Hostname: %s", caps->hostname); + + if (caps->role != BGP_ROLE_UNDEFINED) + cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role)); } static void @@ -2585,7 +2669,6 @@ struct protocol proto_bgp = { .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, - .get_route_info = bgp_get_route_info, .show_proto_info = bgp_show_proto_info }; diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 003893e0..1bcfb915 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -113,6 +113,8 @@ struct bgp_config { int gr_mode; /* Graceful restart mode (BGP_GR_*) */ int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + u8 local_role; /* Set peering role with neighbor [RFC 9234] */ + int require_roles; /* Require configured roles on both sides */ /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned llgr_time; /* Long-lived graceful restart stale time */ @@ -166,6 +168,13 @@ struct bgp_channel_config { #define BGP_PT_INTERNAL 1 #define BGP_PT_EXTERNAL 2 +#define BGP_ROLE_UNDEFINED 255 +#define BGP_ROLE_PROVIDER 0 +#define BGP_ROLE_RS_SERVER 1 +#define BGP_ROLE_RS_CLIENT 2 +#define BGP_ROLE_CUSTOMER 3 +#define BGP_ROLE_PEER 4 + #define NH_NO 0 #define NH_ALL 1 #define NH_IBGP 2 @@ -226,6 +235,7 @@ struct bgp_caps { u8 ext_messages; /* Extended message length, RFC draft */ u8 route_refresh; /* Route refresh capability, RFC 2918 */ u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + u8 role; /* BGP role capability, RFC 9234 */ u8 gr_aware; /* Graceful restart capability, RFC 4724 */ u8 gr_flags; /* Graceful restart flags */ @@ -319,6 +329,7 @@ struct bgp_proto { struct bgp_socket *sock; /* Shared listening socket */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ struct birdsock *postponed_sk; /* Postponed incoming socket for dynamic BGP */ + event *uncork_ev; /* Uncork event in case of congestion */ struct bgp_stats stats; /* BGP statistics */ btime last_established; /* Last time of enter/leave of established state */ btime last_rx_update; /* Last time of RX update */ @@ -350,14 +361,8 @@ struct bgp_channel { /* Rest are zeroed when down */ pool *pool; - HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ - - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ - - struct rt_exporter prefix_exporter; /* Table-like exporter for prefix_hash */ + struct bgp_pending_tx *ptx; /* Routes waiting to be sent */ + struct rt_exporter prefix_exporter; /* Table-like exporter for ptx */ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ ip_addr link_addr; /* Link-local version of next_hop_addr */ @@ -371,6 +376,7 @@ struct bgp_channel { timer *stale_timer; /* Long-lived stale timer for LLGR */ u32 stale_time; /* Stored LLGR stale time from last session */ + struct rt_export_request stale_feed; /* Feeder request for stale route modification */ u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ @@ -399,6 +405,18 @@ struct bgp_bucket { ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_pending_tx { + resource r; + pool *pool; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ +}; + struct bgp_export_state { struct bgp_proto *proto; struct bgp_channel *channel; @@ -492,6 +510,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) { return BGP_AFI(c->afi) == BGP_AFI_IPV6; } +static inline int bgp_channel_is_role_applicable(struct bgp_channel *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + +static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + static inline uint bgp_max_packet_length(struct bgp_conn *conn) { return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } @@ -533,6 +557,7 @@ rte_resolvable(const rte *rt) return NEXTHOP_IS_REACHABLE(nhad) || (nhad->dest != RTD_UNREACHABLE); } +extern struct rte_owner_class bgp_rte_owner_class; #ifdef LOCAL_DEBUG #define BGP_FORCE_DEBUG 1 @@ -564,25 +589,30 @@ void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to); void bgp_setup_out_table(struct bgp_channel *c); -void bgp_init_bucket_table(struct bgp_channel *c); -void bgp_free_bucket_table(struct bgp_channel *c); +void bgp_init_pending_tx(struct bgp_channel *c); +void bgp_free_pending_tx(struct bgp_channel *c); + void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_init_prefix_table(struct bgp_channel *c); -void bgp_free_prefix_table(struct bgp_channel *c); void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck); int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); -int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); -struct rte *bgp_rte_modify_stale(struct rte *r, struct linpool *pool); +int bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best); +void bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count); u32 bgp_rte_igp_metric(const rte *); void bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old); int bgp_preexport(struct channel *, struct rte *); void bgp_get_route_info(struct rte *, byte *); int bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad); +static inline struct bgp_proto *bgp_rte_proto(struct rte *rte) +{ + return (rte->src->owner->class == &bgp_rte_owner_class) ? + SKIP_BACK(struct bgp_proto, p.sources, rte->src->owner) : NULL; +} + #define BGP_AIGP_METRIC 1 #define BGP_AIGP_MAX U64(0xffffffffffffffff) @@ -609,6 +639,7 @@ void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) void bgp_kick_tx(void *vconn); void bgp_tx(struct birdsock *sk); int bgp_rx(struct birdsock *sk, uint size); +void bgp_uncork(void *vp); const char * bgp_error_dsc(unsigned code, unsigned subcode); void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len); @@ -652,6 +683,7 @@ enum bgp_attr_id { BA_AS4_AGGREGATOR = 0x12, /* RFC 6793 */ BA_AIGP = 0x1a, /* RFC 7311 */ BA_LARGE_COMMUNITY = 0x20, /* RFC 8092 */ +#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */ /* Bird's private internal BGP attributes */ BA_MPLS_LABEL_STACK = 0x100, /* MPLS label stack transfer attribute */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 24f3ec8f..9f0d2306 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -30,7 +30,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, DYNAMIC, RANGE, NAME, DIGITS, AIGP, ORIGINATE, COST, ENFORCE, - FIRST, FREE, VALIDATE, BASE) + FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, + RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC) %type <i> bgp_nh %type <i32> bgp_afi @@ -39,7 +40,7 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER, CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION, OUT, OF, RESOURCES) -%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag +%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name CF_GRAMMAR @@ -73,6 +74,7 @@ bgp_proto_start: proto_start BGP { BGP_CFG->llgr_mode = -1; BGP_CFG->llgr_time = 3600; BGP_CFG->setkey = 1; + BGP_CFG->local_role = BGP_ROLE_UNDEFINED; BGP_CFG->dynamic_name = "dynbgp"; BGP_CFG->check_link = -1; } @@ -115,6 +117,14 @@ bgp_cease_flag: | OUT OF RESOURCES { $$ = 1 << 8; } ; +bgp_role_name: + PEER { $$ = BGP_ROLE_PEER; } + | PROVIDER { $$ = BGP_ROLE_PROVIDER; } + | CUSTOMER { $$ = BGP_ROLE_CUSTOMER; } + | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; } + | RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; } + ; + bgp_proto: bgp_proto_start proto_name '{' | bgp_proto proto_item ';' @@ -198,6 +208,8 @@ bgp_proto: | bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; } | bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';' | bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; } + | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; } + | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; } ; bgp_afi: diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 4d4ae3eb..d4d2d0b0 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn) caps->ext_messages = p->cf->enable_extended_messages; caps->route_refresh = p->cf->enable_refresh; caps->enhanced_refresh = p->cf->enable_refresh; + caps->role = p->cf->local_role; if (caps->as4_support) caps->as4_number = p->public_as; @@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf) *buf++ = 0; /* Capability data length */ } + if (caps->role != BGP_ROLE_UNDEFINED) + { + *buf++ = 9; /* Capability 9: Announce chosen BGP role */ + *buf++ = 1; /* Capability data length */ + *buf++ = caps->role; + } + if (caps->gr_aware) { *buf++ = 64; /* Capability 64: Support for graceful restart */ @@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) struct bgp_proto *p = conn->bgp; struct bgp_caps *caps; struct bgp_af_caps *ac; + uint err_subcode = 0; int i, cl; u32 af; if (!conn->remote_caps) + { caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps)); + caps->role = BGP_ROLE_UNDEFINED; + } else { caps = conn->remote_caps; @@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) caps->ext_messages = 1; break; + case 9: /* BGP role capability, RFC 9234 */ + if (cl != 1) + goto err; + + /* Reserved value */ + if (pos[2] == BGP_ROLE_UNDEFINED) + { err_subcode = 11; goto err; } + + /* Multiple inconsistent values */ + if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2])) + { err_subcode = 11; goto err; } + + caps->role = pos[2]; + break; + case 64: /* Graceful restart capability, RFC 4724 */ if (cl % 4 != 2) goto err; @@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) err: mb_free(caps); - bgp_error(conn, 2, 0, NULL, 0); + bgp_error(conn, 2, err_subcode, NULL, 0); return -1; } @@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) conn->received_as = asn; } + /* RFC 9234 4.2 - check role agreement */ + u8 local_role = p->cf->local_role; + u8 neigh_role = caps->role; + + if ((local_role != BGP_ROLE_UNDEFINED) && + (neigh_role != BGP_ROLE_UNDEFINED) && + !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) || + (local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) || + (local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) || + (local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) || + (local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT))) + { bgp_error(conn, 2, 11, NULL, 0); return; } + + if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED)) + { bgp_error(conn, 2, 11, NULL, 0); return; } + /* Check the other connection */ other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; switch (other->state) @@ -1372,6 +1415,8 @@ bgp_rte_update(struct bgp_parse_state *s, const net_addr *n, u32 path_id, ea_lis { if (path_id != s->last_id) { + rt_unlock_source(s->last_src); + s->last_src = rt_get_source(&s->proto->p, path_id); s->last_id = path_id; @@ -2167,7 +2212,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu * var IPv4 Network Layer Reachability Information */ - ASSERT_DIE(s->channel->withdraw_bucket != buck); + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); int lr, la; @@ -2190,7 +2235,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu static byte * bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - ASSERT_DIE(s->channel->withdraw_bucket != buck); + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); /* * 2 B IPv4 Withdrawn Routes Length (zero) @@ -2330,7 +2375,7 @@ again: ; }; /* Try unreachable bucket */ - if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + if ((buck = c->ptx->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? bgp_create_ip_unreach(&s, buck, buf, end): @@ -2340,9 +2385,9 @@ again: ; } /* Try reachable buckets */ - if (!EMPTY_LIST(c->bucket_queue)) + if (!EMPTY_LIST(c->ptx->bucket_queue)) { - buck = HEAD(c->bucket_queue); + buck = HEAD(c->ptx->bucket_queue); /* Cleanup empty buckets */ if (bgp_done_bucket(c, buck)) @@ -2449,6 +2494,7 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis s->last_id = 0; s->last_src = s->proto->p.main_source; + rt_lock_source(s->last_src); /* * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not @@ -2475,6 +2521,8 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis rta_free(s->cached_ea); s->cached_ea = NULL; + + rt_unlock_source(s->last_src); } static void @@ -2972,6 +3020,7 @@ static struct { { 2, 6, "Unacceptable hold time" }, { 2, 7, "Required capability missing" }, /* [RFC5492] */ { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */ + { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, { 3, 2, "Unrecognized well-known attribute" }, @@ -3175,6 +3224,21 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) } } +void +bgp_uncork(void *vp) +{ + struct bgp_proto *p = vp; + + if (p && p->conn && (p->conn->state == BS_ESTABLISHED) && !p->conn->sk->rx_hook) + { + struct birdsock *sk = p->conn->sk; + ASSERT_DIE(sk->rpos > sk->rbuf); + sk->rx_hook = bgp_rx; + bgp_rx(sk, sk->rpos - sk->rbuf); + BGP_TRACE(D_PACKETS, "Uncorked"); + } +} + /** * bgp_rx - handle received data * @sk: socket @@ -3189,6 +3253,7 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; + struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; uint i, len; @@ -3198,6 +3263,12 @@ bgp_rx(sock *sk, uint size) { if ((conn->state == BS_CLOSE) || (conn->sk != sk)) return 0; + if ((conn->state == BS_ESTABLISHED) && rt_cork_check(conn->bgp->uncork_ev)) + { + sk->rx_hook = NULL; + BGP_TRACE(D_PACKETS, "Corked"); + return 0; + } for(i=0; i<16; i++) if (pkt_start[i] != 0xff) { |