diff options
Diffstat (limited to 'proto/bgp/bgp.c')
-rw-r--r-- | proto/bgp/bgp.c | 682 |
1 files changed, 433 insertions, 249 deletions
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index e2754649..cda0eb8d 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -101,7 +101,9 @@ * RFC 8203 - BGP Administrative Shutdown Communication * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP - * draft-ietf-idr-ext-opt-param-07 + * RFC 9072 - Extended Optional Parameters Length for BGP OPEN Message + * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications + * RFC 9234 - Route Leak Prevention and Detection Using Roles * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 */ @@ -113,7 +115,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "conf/conf.h" @@ -124,8 +126,13 @@ #include "bgp.h" -/* Global list of listening sockets */ -static list STATIC_LIST_INIT(bgp_sockets); +static void bgp_listen_create(void *); + +static list STATIC_LIST_INIT(bgp_sockets); /* Global list of listening sockets */ +static list STATIC_LIST_INIT(bgp_listen_pending); /* Global list of listening socket open requests */ +static event bgp_listen_event = { .hook = bgp_listen_create }; + +DOMAIN(rtable) bgp_listen_domain; static void bgp_connect(struct bgp_proto *p); static void bgp_active(struct bgp_proto *p); @@ -136,14 +143,69 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd); static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); +static void bgp_initiate_disable(struct bgp_proto *p, int err_val); static void bgp_graceful_restart_feed(struct bgp_channel *c); -static inline void channel_refresh_end_reload(struct channel *c) + + +static inline int +bgp_setup_auth(struct bgp_proto *p, int enable) { - channel_refresh_end(c); + /* Beware. This is done from main_birdloop and protocol birdloop is NOT ENTERED. + * Anyway, we are only accessing: + * - protocol config which can be changed only from main_birdloop (reconfig) + * - protocol listen socket which is always driven by main_birdloop + * - protocol name which is set on reconfig + */ + + if (p->cf->password && p->listen.sock) + { + ip_addr prefix = p->cf->remote_ip; + int pxlen = -1; + + if (p->cf->remote_range) + { + prefix = net_prefix(p->cf->remote_range); + pxlen = net_pxlen(p->cf->remote_range); + } + + int rv = sk_set_md5_auth(p->listen.sock->sk, + p->cf->local_ip, prefix, pxlen, p->cf->iface, + enable ? p->cf->password : NULL, p->cf->setkey); + + if (rv < 0) + sk_log_error(p->listen.sock->sk, p->p.name); - if (c->in_table) - channel_request_reload(c); + return rv; + } + else + return 0; +} + +/** + * bgp_close - close a BGP instance + * @p: BGP instance + * + * This function frees and deconfigures shared BGP resources. + */ +static void +bgp_close(struct bgp_proto *p) +{ + LOCK_DOMAIN(rtable, bgp_listen_domain); + + struct bgp_listen_request *req = &p->listen; + struct bgp_socket *bs = req->sock; + + if (bs) + { + req->sock = NULL; + rem_node(&req->n); + + if (bs && EMPTY_LIST(bs->requests)) + ev_send(&global_event_list, &bgp_listen_event); + } + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } /** @@ -155,121 +217,136 @@ static inline void channel_refresh_end_reload(struct channel *c) * is acquired and neighbor is ready). When error, caller should change state to * PS_DOWN and return immediately. */ -static int +static void bgp_open(struct bgp_proto *p) { - ASSERT_DIE(birdloop_inside(&main_birdloop)); - - struct bgp_socket *bs = NULL; - struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL; - ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : - (p->ipv4 ? IPA_NONE4 : IPA_NONE6); - uint port = p->cf->local_port; - uint flags = p->cf->free_bind ? SKF_FREEBIND : 0; - uint flag_mask = SKF_FREEBIND; + LOCK_DOMAIN(rtable, bgp_listen_domain); + struct bgp_listen_request *req = &p->listen; /* We assume that cf->iface is defined iff cf->local_ip is link-local */ + req->iface = p->cf->strict_bind ? p->cf->iface : NULL; + req->vrf = p->p.vrf; + req->addr = p->cf->strict_bind ? p->cf->local_ip : + (p->ipv4 ? IPA_NONE4 : IPA_NONE6); + req->port = p->cf->local_port; + req->flags = p->cf->free_bind ? SKF_FREEBIND : 0; - WALK_LIST(bs, bgp_sockets) - if (ipa_equal(bs->sk->saddr, addr) && - (bs->sk->sport == port) && - (bs->sk->iface == ifa) && - (bs->sk->vrf == p->p.vrf) && - ((bs->sk->flags & flag_mask) == flags)) - { - bs->uc++; - p->sock = bs; - return 0; - } - - sock *sk = sk_new(proto_pool); - sk->type = SK_TCP_PASSIVE; - sk->ttl = 255; - sk->saddr = addr; - sk->sport = port; - sk->iface = ifa; - sk->vrf = p->p.vrf; - sk->flags = flags | SKF_PASSIVE_THREAD; - sk->tos = IP_PREC_INTERNET_CONTROL; - sk->rbsize = BGP_RX_BUFFER_SIZE; - sk->tbsize = BGP_TX_BUFFER_SIZE; - sk->rx_hook = bgp_incoming_connection; - sk->err_hook = bgp_listen_sock_err; - - if (sk_open(sk) < 0) - goto err; - - bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); - bs->sk = sk; - bs->uc = 1; - p->sock = bs; - sk->data = bs; + BGP_TRACE(D_EVENTS, "Requesting listen socket at %I%J port %u", req->addr, req->iface, req->port); - add_tail(&bgp_sockets, &bs->n); + add_tail(&bgp_listen_pending, &req->n); + ev_send(&global_event_list, &bgp_listen_event); - return 0; - -err: - sk_log_error(sk, p->p.name); - log(L_ERR "%s: Cannot open listening socket", p->p.name); - rfree(sk); - return -1; + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } -/** - * bgp_close - close a BGP instance - * @p: BGP instance - * - * This function frees and deconfigures shared BGP resources. - */ static void -bgp_close(struct bgp_proto *p) +bgp_listen_create(void *_ UNUSED) { ASSERT_DIE(birdloop_inside(&main_birdloop)); - struct bgp_socket *bs = p->sock; - - ASSERT(bs && bs->uc); + uint flag_mask = SKF_FREEBIND; - if (--bs->uc) - return; + while (1) { + LOCK_DOMAIN(rtable, bgp_listen_domain); - rfree(bs->sk); - rem_node(&bs->n); - mb_free(bs); -} + if (EMPTY_LIST(bgp_listen_pending)) + { + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + break; + } -static inline int -bgp_setup_auth(struct bgp_proto *p, int enable) -{ - if (p->cf->password) - { - ip_addr prefix = p->cf->remote_ip; - int pxlen = -1; + /* Get the first request to match */ + struct bgp_listen_request *req = HEAD(bgp_listen_pending); + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, listen, req); + rem_node(&req->n); + + /* First try to find existing socket */ + struct bgp_socket *bs; + WALK_LIST(bs, bgp_sockets) + if (ipa_equal(bs->sk->saddr, req->addr) && + (bs->sk->sport == req->port) && + (bs->sk->iface == req->iface) && + (bs->sk->vrf == req->vrf) && + ((bs->sk->flags & flag_mask) == req->flags)) + break; - if (p->cf->remote_range) + /* Not found any */ + if (NODE_VALID(bs)) + BGP_TRACE(D_EVENTS, "Found a listening socket: %p", bs); + else { - prefix = net_prefix(p->cf->remote_range); - pxlen = net_pxlen(p->cf->remote_range); + /* Allocating new socket from global protocol pool. + * We can do this in main_birdloop. */ + sock *sk = sk_new(proto_pool); + sk->type = SK_TCP_PASSIVE; + sk->ttl = 255; + sk->saddr = req->addr; + sk->sport = req->port; + sk->iface = req->iface; + sk->vrf = req->vrf; + sk->flags = req->flags; + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->rbsize = BGP_RX_BUFFER_SIZE; + sk->tbsize = BGP_TX_BUFFER_SIZE; + sk->rx_hook = bgp_incoming_connection; + sk->err_hook = bgp_listen_sock_err; + + if (sk_open(sk, &main_birdloop) < 0) + { + sk_log_error(sk, p->p.name); + log(L_ERR "%s: Cannot open listening socket", p->p.name); + rfree(sk); + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + + bgp_initiate_disable(p, BEM_NO_SOCKET); + continue; + } + + bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); + bs->sk = sk; + sk->data = bs; + + init_list(&bs->requests); + add_tail(&bgp_sockets, &bs->n); + + BGP_TRACE(D_EVENTS, "Created new listening socket: %p", bs); } - int rv = sk_set_md5_auth(p->sock->sk, - p->cf->local_ip, prefix, pxlen, p->cf->iface, - enable ? p->cf->password : NULL, p->cf->setkey); + req->sock = bs; + add_tail(&bs->requests, &req->n); - if (rv < 0) - sk_log_error(p->sock->sk, p->p.name); + if (bgp_setup_auth(p, 1) < 0) + { + rem_node(&req->n); + req->sock = NULL; - return rv; + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + + bgp_initiate_disable(p, BEM_INVALID_MD5); + continue; + } + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } - else - return 0; + + /* Cleanup leftover listening sockets */ + LOCK_DOMAIN(rtable, bgp_listen_domain); + struct bgp_socket *bs; + node *nxt; + WALK_LIST_DELSAFE(bs, nxt, bgp_sockets) + if (EMPTY_LIST(bs->requests)) + { + rfree(bs->sk); + rem_node(&bs->n); + mb_free(bs); + } + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } static inline struct bgp_channel * bgp_find_channel(struct bgp_proto *p, u32 afi) { struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) if (c->afi == afi) return c; @@ -288,6 +365,8 @@ bgp_startup(struct bgp_proto *p) if (p->postponed_sk) { /* Apply postponed incoming connection */ + sk_reloop(p->postponed_sk, p->p.loop); + bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, p->postponed_sk); bgp_send_open(&p->incoming_conn); @@ -305,13 +384,7 @@ bgp_startup_timeout(timer *t) static void bgp_initiate(struct bgp_proto *p) { - int err_val; - - if (bgp_open(p) < 0) - { err_val = BEM_NO_SOCKET; goto err1; } - - if (bgp_setup_auth(p, 1) < 0) - { err_val = BEM_INVALID_MD5; goto err2; } + bgp_open(p); if (p->cf->bfd) bgp_update_bfd(p, p->cf->bfd); @@ -324,24 +397,28 @@ bgp_initiate(struct bgp_proto *p) } else bgp_startup(p); +} - return; - -err2: - bgp_close(p); -err1: - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, err_val); - - p->neigh = NULL; - proto_notify_state(&p->p, PS_DOWN); - - return; +static void +bgp_initiate_disable(struct bgp_proto *p, int err_val) +{ + PROTO_LOCKED_FROM_MAIN(&p->p) + { + /* The protocol may be already down for another reason. + * Shutdown the protocol only if it isn't already shutting down. */ + switch (p->p.proto_state) + { + case PS_START: + case PS_UP: + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, err_val); + bgp_stop(p, err_val, NULL, 0); + } + } } /** * bgp_start_timer - start a BGP timer - * @p: bgp_proto which the timer belongs to * @t: timer * @value: time (in seconds) to fire (0 to disable the timer) * @@ -352,8 +429,6 @@ err1: void bgp_start_timer(struct bgp_proto *p, timer *t, uint value) { - BGP_ASSERT_INSIDE(p); - if (value) { /* The randomization procedure is specified in RFC 4271 section 10 */ @@ -375,7 +450,7 @@ bgp_start_timer(struct bgp_proto *p, timer *t, uint value) void bgp_close_conn(struct bgp_conn *conn) { - BGP_ASSERT_INSIDE(conn->bgp); + // struct bgp_proto *p = conn->bgp; DBG("BGP: Closing connection\n"); conn->packets_to_send = 0; @@ -386,8 +461,10 @@ bgp_close_conn(struct bgp_conn *conn) conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; + rfree(conn->tx_ev); conn->tx_ev = NULL; + rfree(conn->sk); conn->sk = NULL; @@ -467,8 +544,6 @@ bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len static void bgp_down(struct bgp_proto *p) { - bgp_start_timer(p, p->startup_timer, 0); - if (p->start_state > BSS_PREPARE) { bgp_setup_auth(p, 0); @@ -482,34 +557,21 @@ bgp_down(struct bgp_proto *p) } static void -bgp_active_event(void *vp) +bgp_decision(void *vp) { struct bgp_proto *p = vp; - BGP_ASSERT_INSIDE(p); - - DBG("%s: Decision start\n", p->p.name); + DBG("BGP: Decision start\n"); if ((p->p.proto_state == PS_START) && (p->outgoing_conn.state == BS_IDLE) && (p->incoming_conn.state != BS_OPENCONFIRM) && !p->passive) bgp_active(p); -} - -static void -bgp_down_event(void *vp) -{ - struct bgp_proto *p = vp; - - BGP_ENTER(p); - DBG("%s: Down event\n", p->p.name); if ((p->p.proto_state == PS_STOP) && (p->outgoing_conn.state == BS_IDLE) && (p->incoming_conn.state == BS_IDLE)) bgp_down(p); - - BGP_LEAVE(p); } static struct bgp_proto * @@ -539,9 +601,16 @@ void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len) { proto_notify_state(&p->p, PS_STOP); + p->uncork_ev->data = NULL; bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len); bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len); - ev_send_loop(&main_birdloop, p->down_event); + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->ptx) + bgp_free_pending_tx(c); + + proto_send_event(&p->p, p->event); } static inline void @@ -588,7 +657,6 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->link_addr = p->neigh->iface->llv6->ip; conn->sk->fast_rx = 0; - conn->sk->cork = &rt_cork; p->conn = conn; p->last_error_class = 0; @@ -614,7 +682,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) /* Summary state of ADD_PATH RX for active channels */ uint summary_add_path_rx = 0; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi); const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi); @@ -635,7 +703,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) int active = loc->ready && rem->ready; c->c.disabled = !active; - c->c.reloadable = p->route_refresh || c->cf->import_table; + c->c.reloadable = p->route_refresh || ((c->c.in_keep & RIK_PREFILTER) == RIK_PREFILTER); c->index = active ? num++ : 0; @@ -696,7 +764,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->channel_count = num; p->summary_add_path_rx = summary_add_path_rx; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { if (c->c.disabled) continue; @@ -747,8 +815,7 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn) bgp_close_conn(conn); bgp_conn_set_state(conn, BS_IDLE); - ev_send_loop(p->p.loop, p->active_event); - ev_send_loop(&main_birdloop, p->down_event); + proto_send_event(&p->p, p->event); if (os == BS_ESTABLISHED) bgp_conn_leave_established_state(p); @@ -776,7 +843,7 @@ bgp_handle_graceful_restart(struct bgp_proto *p) p->gr_active_num = 0; struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { /* FIXME: perhaps check for channel state instead of disabled flag? */ if (c->c.disabled) @@ -790,16 +857,14 @@ bgp_handle_graceful_restart(struct bgp_proto *p) { case BGP_GRS_NONE: c->gr_active = BGP_GRS_ACTIVE; - channel_refresh_begin(&c->c); - break; + /* fall through */ case BGP_GRS_ACTIVE: - channel_refresh_end(&c->c); - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_LLGR: - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); bgp_graceful_restart_feed(c); break; } @@ -807,15 +872,13 @@ bgp_handle_graceful_restart(struct bgp_proto *p) else { /* Just flush the routes */ - channel_refresh_begin(&c->c); - channel_refresh_end(&c->c); + rt_refresh_begin(&c->c.in_req); + rt_refresh_end(&c->c.in_req); } /* Reset bucket and prefix tables */ - bgp_free_bucket_table(c); - bgp_free_prefix_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_free_pending_tx(c); + bgp_init_pending_tx(c); c->packets_to_send = 0; } @@ -871,6 +934,8 @@ bgp_graceful_restart_feed(struct bgp_channel *c) } + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -893,11 +958,8 @@ bgp_graceful_restart_done(struct bgp_channel *c) if (!p->gr_active_num) BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); - if (c->stale_feed.hook) - rt_stop_export(&c->stale_feed, bgp_graceful_restart_feed_done); - tm_stop(c->stale_timer); - channel_refresh_end_reload(&c->c); + rt_refresh_end(&c->c.in_req); } /** @@ -919,7 +981,7 @@ bgp_graceful_restart_timeout(timer *t) if (p->llgr_ready) { struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { /* Channel is not in GR and is already flushed */ if (!c->gr_active) @@ -976,11 +1038,8 @@ bgp_refresh_begin(struct bgp_channel *c) if (c->load_state == BFS_LOADING) { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } - if (c->load_state == BFS_REFRESHING) - channel_refresh_end(&c->c); - c->load_state = BFS_REFRESHING; - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); } /** @@ -1001,7 +1060,7 @@ bgp_refresh_end(struct bgp_channel *c) { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } c->load_state = BFS_NONE; - channel_refresh_end_reload(&c->c); + rt_refresh_end(&c->c.in_req); } @@ -1137,12 +1196,10 @@ bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn) static void bgp_setup_sk(struct bgp_conn *conn, sock *s) { - ASSERT_DIE(s->flags & SKF_THREAD); s->data = conn; s->err_hook = bgp_sock_err; s->fast_rx = 1; conn->sk = s; - sk_start(s); } static void @@ -1151,8 +1208,6 @@ bgp_active(struct bgp_proto *p) int delay = MAX(1, p->cf->connect_delay_time); struct bgp_conn *conn = &p->outgoing_conn; - BGP_ASSERT_INSIDE(p); - BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay); bgp_setup_conn(p, conn); bgp_conn_set_state(conn, BS_ACTIVE); @@ -1173,12 +1228,9 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c struct bgp_conn *conn = &p->outgoing_conn; int hops = p->cf->multihop ? : 1; - BGP_ASSERT_INSIDE(p); - DBG("BGP: Connecting\n"); sock *s = sk_new(p->p.pool); s->type = SK_TCP_ACTIVE; - s->flags |= SKF_THREAD; s->saddr = p->local_ip; s->daddr = p->remote_ip; s->dport = p->cf->remote_port; @@ -1190,6 +1242,7 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c s->tos = IP_PREC_INTERNET_CONTROL; s->password = p->cf->password; s->tx_hook = bgp_connected; + s->flags = p->cf->free_bind ? SKF_FREEBIND : 0; BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, ipa_is_link_local(s->daddr) ? p->cf->iface : NULL, s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL); @@ -1197,7 +1250,7 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c bgp_setup_sk(conn, s); bgp_conn_set_state(conn, BS_CONNECT); - if (sk_open(s) < 0) + if (sk_open(s, p->p.loop) < 0) goto err; /* Set minimal receive TTL if needed */ @@ -1227,12 +1280,17 @@ static struct bgp_proto * bgp_find_proto(sock *sk) { struct bgp_proto *best = NULL; - struct bgp_proto *p; + struct bgp_socket *bs = sk->data; + struct bgp_listen_request *req; /* sk->iface is valid only if src or dst address is link-local */ int link = ipa_is_link_local(sk->saddr) || ipa_is_link_local(sk->daddr); - WALK_LIST(p, proto_list) + LOCK_DOMAIN(rtable, bgp_listen_domain); + + WALK_LIST(req, bs->requests) + { + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, listen, req); if ((p->p.proto == &proto_bgp) && (ipa_equal(p->remote_ip, sk->daddr) || bgp_is_dynamic(p)) && (!p->cf->remote_range || ipa_in_netX(sk->daddr, p->cf->remote_range)) && @@ -1246,7 +1304,9 @@ bgp_find_proto(sock *sk) if (!bgp_is_dynamic(p)) break; } + } + UNLOCK_DOMAIN(rtable, bgp_listen_domain); return best; } @@ -1265,6 +1325,8 @@ bgp_find_proto(sock *sk) static int bgp_incoming_connection(sock *sk, uint dummy UNUSED) { + ASSERT_DIE(birdloop_inside(&main_birdloop)); + struct bgp_proto *p; int acc, hops; @@ -1278,17 +1340,7 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) return 0; } - if (p->p.loop == &main_birdloop) - { - /* Protocol is down for whatever reason. No need for locking. */ - BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) rejected (protocol is down)", - sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, - sk->dport); - rfree(sk); - return 0; - } - - BGP_ENTER(p); + birdloop_enter(p->p.loop); /* * BIRD should keep multiple incoming connections in OpenSent state (for @@ -1319,8 +1371,7 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) if (!acc) { rfree(sk); - BGP_LEAVE(p); - return 0; + goto leave; } hops = p->cf->multihop ? : 1; @@ -1345,22 +1396,24 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) p = bgp_spawn(p, sk->daddr); p->postponed_sk = sk; rmove(sk, p->p.pool); - BGP_LEAVE(p); - return 0; + goto leave; } rmove(sk, p->p.pool); + sk_reloop(sk, p->p.loop); + bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, sk); bgp_send_open(&p->incoming_conn); - BGP_LEAVE(p); - return 0; + goto leave; err: sk_log_error(sk, p->p.name); log(L_ERR "%s: Incoming connection aborted", p->p.name); rfree(sk); - BGP_LEAVE(p); + +leave: + birdloop_leave(p->p.loop); return 0; } @@ -1395,9 +1448,10 @@ bgp_neigh_notify(neighbor *n) struct bgp_proto *p = (struct bgp_proto *) n->proto; int ps = p->p.proto_state; - BGP_ASSERT_INSIDE(p); + if (n != p->neigh) + return; - if ((n != p->neigh) || (ps == PS_DOWN) || (ps == PS_STOP)) + if ((ps == PS_DOWN) || (ps == PS_STOP)) return; int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE); @@ -1470,15 +1524,22 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd) { if (bfd && p->bfd_req) + { + BGP_TRACE(D_EVENTS, "Updating existing BFD request"); bfd_update_request(p->bfd_req, bfd); + } if (bfd && !p->bfd_req && !bgp_is_dynamic(p)) + { p->bfd_req = bfd_request_session(p->p.pool, p->remote_ip, p->local_ip, p->cf->multihop ? NULL : p->neigh->iface, - p->p.vrf, bgp_bfd_notify, p, birdloop_event_list(p->p.loop), bfd); + p->p.vrf, bgp_bfd_notify, p, p->p.loop, bfd); + BGP_TRACE(D_EVENTS, "Requesting a new BFD session"); + } if (!bfd && p->bfd_req) { + BGP_TRACE(D_EVENTS, "Retracting the BFD request"); rfree(p->bfd_req); p->bfd_req = NULL; } @@ -1490,8 +1551,11 @@ bgp_reload_routes(struct channel *C) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; - ASSERT(p->conn && (p->route_refresh)); + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return; + ASSERT(p->conn && p->route_refresh); bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } @@ -1501,6 +1565,10 @@ bgp_feed_begin(struct channel *C, int initial) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return; + /* This should not happen */ if (!p->conn) return; @@ -1508,6 +1576,12 @@ bgp_feed_begin(struct channel *C, int initial) if (initial && p->cf->gr_mode) c->feed_state = BFS_LOADING; + if (!initial && C->out_table) + { + c->feed_out_table = 1; + return; + } + /* It is refeed and both sides support enhanced route refresh */ if (!initial && p->enhanced_refresh) { @@ -1526,6 +1600,16 @@ bgp_feed_end(struct channel *C) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return; + + if (c->feed_out_table) + { + c->feed_out_table = 0; + return; + } + /* This should not happen */ if (!p->conn) return; @@ -1548,17 +1632,14 @@ bgp_feed_end(struct channel *C) static void -bgp_start_locked(struct object_lock *lock) +bgp_start_locked(void *_p) { - struct bgp_proto *p = lock->data; + struct bgp_proto *p = _p; const struct bgp_config *cf = p->cf; - BGP_ENTER(p); - if (p->p.proto_state != PS_START) { DBG("BGP: Got lock in different state %d\n", p->p.proto_state); - BGP_LEAVE(p); return; } @@ -1568,11 +1649,10 @@ bgp_start_locked(struct object_lock *lock) { /* Multi-hop sessions do not use neighbor entries */ bgp_initiate(p); - BGP_LEAVE(p); return; } - neighbor *n = neigh_find(&p->p, p->remote_ip, cf->iface, NEF_STICKY | NEF_NOTIFY_MAIN); + neighbor *n = neigh_find(&p->p, p->remote_ip, cf->iface, NEF_STICKY); if (!n) { log(L_ERR "%s: Invalid remote address %I%J", p->p.name, p->remote_ip, cf->iface); @@ -1580,7 +1660,6 @@ bgp_start_locked(struct object_lock *lock) p->p.disabled = 1; bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); proto_notify_state(&p->p, PS_DOWN); - BGP_LEAVE(p); return; } @@ -1592,8 +1671,6 @@ bgp_start_locked(struct object_lock *lock) BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name); else bgp_start_neighbor(p); - - BGP_LEAVE(p); } static int @@ -1632,13 +1709,12 @@ bgp_start(struct proto *P) p->stats.rx_bytes = p->stats.tx_bytes = 0; p->last_rx_update = 0; - p->active_event = ev_new_init(p->p.pool, bgp_active_event, p); - p->down_event = ev_new_init(p->p.pool, bgp_down_event, p); + p->event = ev_new_init(p->p.pool, bgp_decision, p); + p->uncork_ev = ev_new_init(p->p.pool, bgp_uncork, p); + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); - p->rx_lp = lp_new_default(p->p.pool); - p->local_id = proto_get_router_id(P->cf); if (p->rr_client) p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id; @@ -1650,7 +1726,7 @@ bgp_start(struct proto *P) if (p->p.gr_recovery && p->cf->gr_mode) { struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) channel_graceful_restart_lock(&c->c); } @@ -1665,8 +1741,11 @@ bgp_start(struct proto *P) lock->iface = p->cf->iface; lock->vrf = p->cf->iface ? NULL : p->p.vrf; lock->type = OBJLOCK_TCP; - lock->hook = bgp_start_locked; - lock->data = p; + lock->event = (event) { + .hook = bgp_start_locked, + .data = p, + }; + lock->target = proto_event_list(P); /* For dynamic BGP, we use inst 1 to avoid collisions with regular BGP */ if (bgp_is_dynamic(p)) @@ -1782,7 +1861,7 @@ bgp_init(struct proto_config *CF) P->rt_notify = bgp_rt_notify; P->preexport = bgp_preexport; - P->neigh_notify = bgp_neigh_notify; + P->iface_sub.neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; P->feed_begin = bgp_feed_begin; P->feed_end = bgp_feed_end; @@ -1809,7 +1888,7 @@ bgp_init(struct proto_config *CF) /* Add all channels */ struct bgp_channel_config *cc; - WALK_LIST(cc, CF->channels) + BGP_CF_WALK_CHANNELS(cf, cc) proto_add_channel(P, &cc->c); return P; @@ -1830,6 +1909,9 @@ bgp_channel_init(struct channel *C, struct channel_config *CF) if (cf->igp_table_ip6) c->igp_table_ip6 = cf->igp_table_ip6->table; + + if (cf->base_table) + c->base_table = cf->base_table->table; } static int @@ -1840,22 +1922,23 @@ bgp_channel_start(struct channel *C) ip_addr src = p->local_ip; if (c->igp_table_ip4) - RT_LOCKED(c->igp_table_ip4, t) - rt_lock_table(t); + rt_lock_table(c->igp_table_ip4); if (c->igp_table_ip6) - RT_LOCKED(c->igp_table_ip6, t) - rt_lock_table(t); + rt_lock_table(c->igp_table_ip6); - c->pool = p->p.pool; // XXXX - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + if (c->base_table) + { + rt_lock_table(c->base_table); + rt_flowspec_link(c->base_table, c->c.table); + } - if (c->cf->import_table) - channel_setup_in_table(C, 0); + c->pool = p->p.pool; // XXXX if (c->cf->export_table) - channel_setup_out_table(C); + bgp_setup_out_table(c); + + bgp_init_pending_tx(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); @@ -1926,12 +2009,16 @@ bgp_channel_cleanup(struct channel *C) struct bgp_channel *c = (void *) C; if (c->igp_table_ip4) - RT_LOCKED(c->igp_table_ip4, t) - rt_unlock_table(t); + rt_unlock_table(c->igp_table_ip4); if (c->igp_table_ip6) - RT_LOCKED(c->igp_table_ip6, t) - rt_unlock_table(t); + rt_unlock_table(c->igp_table_ip6); + + if (c->base_table) + { + rt_flowspec_unlink(c->base_table, c->c.table); + rt_unlock_table(c->base_table); + } c->index = 0; @@ -1944,7 +2031,7 @@ bgp_find_channel_config(struct bgp_config *cf, u32 afi) { struct bgp_channel_config *cc; - WALK_LIST(cc, cf->c.channels) + BGP_CF_WALK_CHANNELS(cf, cc) if (cc->afi == afi) return cc; @@ -1974,12 +2061,31 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 return cc2->c.table; /* Last, try default table of given type */ - if (tab = cf->c.global->def_tables[type]) + if (tab = rt_get_default_table(cf->c.global, type)) return tab; cf_error("Undefined IGP table"); } +static struct rtable_config * +bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc) +{ + /* Expected table type */ + u32 type = (cc->afi == BGP_AF_FLOW4) ? NET_IP4 : NET_IP6; + + /* First, try appropriate IP channel */ + u32 afi2 = BGP_AF(BGP_AFI(cc->afi), BGP_SAFI_UNICAST); + struct bgp_channel_config *cc2 = bgp_find_channel_config(cf, afi2); + if (cc2 && (cc2->c.table->addr_type == type)) + return cc2->c.table; + + /* Last, try default table of given type */ + struct rtable_config *tab = rt_get_default_table(cf->c.global, type); + if (tab) + return tab; + + cf_error("Undefined base table"); +} void bgp_postconfig(struct proto_config *CF) @@ -2042,6 +2148,15 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); + if (internal && (cf->local_role != BGP_ROLE_UNDEFINED)) + cf_error("Local role cannot be set on IBGP sessions"); + + if (interior && (cf->local_role != BGP_ROLE_UNDEFINED)) + log(L_WARN "BGP roles are not recommended to be used within AS confederations"); + + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) + cf_error("Local role must be set if roles are required"); + if (!cf->confederation && cf->confederation_member) cf_error("Confederation ID must be set for member sessions"); @@ -2064,9 +2179,24 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->enforce_first_as) cf_error("Enforce first AS check is requires EBGP sessions"); + if (cf->keepalive_time > cf->hold_time) + cf_error("Keepalive time must be at most hold time"); + + if (cf->keepalive_time > (cf->hold_time / 2)) + log(L_WARN "Keepalive time should be at most 1/2 of hold time"); + + if (cf->min_hold_time > cf->hold_time) + cf_error("Min hold time (%u) exceeds hold time (%u)", + cf->min_hold_time, cf->hold_time); + + uint keepalive_time = cf->keepalive_time ?: cf->hold_time / 3; + if (cf->min_keepalive_time > keepalive_time) + cf_error("Min keepalive time (%u) exceeds keepalive time (%u)", + cf->min_keepalive_time, keepalive_time); + struct bgp_channel_config *cc; - WALK_LIST(cc, CF->channels) + BGP_CF_WALK_CHANNELS(cf, cc) { /* Handle undefined import filter */ if (cc->c.in_filter == FILTER_UNDEF) @@ -2094,6 +2224,10 @@ bgp_postconfig(struct proto_config *CF) if (!cc->gw_mode) cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT; + /* Different default for next_hop_prefer */ + if (!cc->next_hop_prefer) + cc->next_hop_prefer = (cc->gw_mode == GW_DIRECT) ? NHP_GLOBAL : NHP_LOCAL; + /* Defaults based on proto config */ if (cc->gr_able == 0xff) cc->gr_able = (cf->gr_mode == BGP_GR_ABLE); @@ -2124,6 +2258,14 @@ bgp_postconfig(struct proto_config *CF) cf_error("Mismatched IGP table type"); } + /* Default value of base table */ + if ((BGP_SAFI(cc->afi) == BGP_SAFI_FLOW) && cc->validate && !cc->base_table) + cc->base_table = bgp_default_base_table(cf, cc); + + if (cc->base_table && !cc->base_table->trie_used) + cf_error("Flowspec validation requires base table (%s) with trie", + cc->base_table->name); + if (cf->multihop && (cc->gw_mode == GW_DIRECT)) cf_error("Multihop BGP cannot use direct gateway mode"); @@ -2165,20 +2307,16 @@ bgp_reconfigure(struct proto *P, struct proto_config *CF) WALK_LIST(C, p->p.channels) C->stale = 1; - WALK_LIST(cc, new->c.channels) + BGP_CF_WALK_CHANNELS(new, cc) { C = (struct channel *) bgp_find_channel(p, cc->afi); same = proto_configure_channel(P, &C, &cc->c) && same; - - if (C) - C->stale = 0; } WALK_LIST_DELSAFE(C, C2, p->p.channels) if (C->stale) same = proto_configure_channel(P, &C, NULL) && same; - if (same && (p->start_state > BSS_PREPARE)) bgp_update_bfd(p, new->bfd); @@ -2192,7 +2330,7 @@ bgp_reconfigure(struct proto *P, struct proto_config *CF) return same; } -#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL ) +#define TABLE(cf, NAME) ((cf)->NAME ? (cf)->NAME->table : NULL ) static int bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *import_changed, int *export_changed) @@ -2203,29 +2341,33 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor struct bgp_channel_config *old = c->cf; if ((new->secondary != old->secondary) || + (new->validate != old->validate) || (new->gr_able != old->gr_able) || (new->llgr_able != old->llgr_able) || (new->llgr_time != old->llgr_time) || (new->ext_next_hop != old->ext_next_hop) || (new->add_path != old->add_path) || - (new->import_table != old->import_table) || (new->export_table != old->export_table) || - (IGP_TABLE(new, ip4) != IGP_TABLE(old, ip4)) || - (IGP_TABLE(new, ip6) != IGP_TABLE(old, ip6))) + (TABLE(new, igp_table_ip4) != TABLE(old, igp_table_ip4)) || + (TABLE(new, igp_table_ip6) != TABLE(old, igp_table_ip6)) || + (TABLE(new, base_table) != TABLE(old, base_table))) return 0; if (new->mandatory && !old->mandatory && (C->channel_state != CS_UP)) return 0; if ((new->gw_mode != old->gw_mode) || + (new->next_hop_prefer != old->next_hop_prefer) || (new->aigp != old->aigp) || (new->cost != old->cost)) { - /* import_changed itself does not force ROUTE_REFRESH when import_table is active */ - if (c->c.in_table && (c->c.channel_state == CS_UP)) + /* If import table is active and route refresh is possible, we just ask for route refresh */ + if ((c->c.in_keep & RIK_PREFILTER) && (c->c.channel_state == CS_UP) && p->route_refresh) bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); - *import_changed = 1; + /* Otherwise we do complete reload */ + else + *import_changed = 1; } if (!ipa_equal(new->next_hop_addr, old->next_hop_addr) || @@ -2394,6 +2536,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count) cli_msg(code, b.start); } +const char * +bgp_format_role_name(u8 role) +{ + static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; + if (role == BGP_ROLE_UNDEFINED) return "undefined"; + if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role]; + return "?"; +} + static void bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) { @@ -2522,6 +2673,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) if (caps->hostname) cli_msg(-1006, " Hostname: %s", caps->hostname); + + if (caps->role != BGP_ROLE_UNDEFINED) + cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role)); } static void @@ -2529,9 +2683,6 @@ bgp_show_proto_info(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - if (p->p.proto_state != PS_DOWN) - BGP_ASSERT_INSIDE(p); - cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p)); if (bgp_is_dynamic(p) && p->cf->remote_range) @@ -2539,6 +2690,9 @@ bgp_show_proto_info(struct proto *P) else cli_msg(-1006, " Neighbor address: %I%J", p->remote_ip, p->cf->iface); + if ((p->conn == &p->outgoing_conn) && (p->cf->remote_port != BGP_PORT)) + cli_msg(-1006, " Neighbor port: %u", p->cf->remote_port); + cli_msg(-1006, " Neighbor AS: %u", p->remote_as); cli_msg(-1006, " Local AS: %u", p->cf->local_as); @@ -2581,6 +2735,9 @@ bgp_show_proto_info(struct proto *P) tm_remains(p->conn->hold_timer), p->conn->hold_time); cli_msg(-1006, " Keepalive timer: %t/%u", tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time); + cli_msg(-1006, " TX pending: %d bytes%s", + p->conn->sk->tpos - p->conn->sk->ttx, + ev_active(p->conn->tx_ev) ? " (refill scheduled)" : ""); } #if 0 @@ -2609,6 +2766,9 @@ bgp_show_proto_info(struct proto *P) { channel_show_info(&c->c); + if (c->c.channel != &channel_bgp) + continue; + if (p->gr_active_num) cli_msg(-1006, " Neighbor GR: %s", bgp_gr_states[c->gr_active]); @@ -2628,6 +2788,25 @@ bgp_show_proto_info(struct proto *P) if (c->igp_table_ip6) cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name); + + if (c->base_table) + cli_msg(-1006, " Base table: %s", c->base_table->name); + + uint bucket_cnt = 0; + uint prefix_cnt = 0; + struct bgp_bucket *buck; + struct bgp_prefix *px; + if (c->ptx) + WALK_LIST(buck, c->ptx->bucket_queue) + { + bucket_cnt++; + WALK_LIST(px, buck->prefixes) + if (px->cur) + prefix_cnt++; + } + + cli_msg(-1006, " Pending %u attribute sets with total %u prefixes to send", + bucket_cnt, prefix_cnt); } } } @@ -2645,7 +2824,6 @@ struct channel_class channel_bgp = { struct protocol proto_bgp = { .name = "BGP", .template = "bgp%d", - .class = PROTOCOL_BGP, .preference = DEF_PREF_BGP, .channel_mask = NB_IP | NB_VPN | NB_FLOW, .proto_size = sizeof(struct bgp_proto), @@ -2657,6 +2835,12 @@ struct protocol proto_bgp = { .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, - .get_attr = bgp_get_attr, .show_proto_info = bgp_show_proto_info }; + +void bgp_build(void) +{ + proto_build(&proto_bgp); + bgp_register_attrs(); + bgp_listen_domain = DOMAIN_NEW(rtable, "BGP Listen Sockets"); +} |